├── .github └── workflows │ ├── benchmark.yml │ ├── example-client.yml │ ├── format.yml │ └── test.yml ├── .gitignore ├── .prettierignore ├── .python-version ├── Cargo.toml ├── README.md ├── cmd └── main.go ├── examples ├── README.md ├── benchmarkdiff │ └── main.go ├── client │ ├── editor.js │ ├── index.html │ ├── search.html │ ├── server.go │ └── styles.css ├── visualizer │ └── main.go └── workspace │ ├── fetch_csv.py │ ├── fetch_jsonl.py │ ├── palo-alto.jsonl │ └── requirements.txt ├── go.mod ├── go.sum ├── jest.config.ts ├── package-lock.json ├── package.json ├── pkg ├── appendable │ ├── appendable.go │ ├── appendable_test.go │ ├── index_file.go │ ├── index_file_test.go │ └── typescript.go ├── bptree │ ├── README.md │ ├── bptree.go │ ├── bptree_test.go │ ├── node.go │ └── node_test.go ├── btree │ ├── btree.go │ ├── btree_test.go │ ├── node.go │ └── node_test.go ├── buftest │ ├── buffer.go │ └── buffer_test.go ├── encoding │ ├── sizeVarint.go │ └── sizeVarint_test.go ├── handlers │ ├── csv.go │ ├── csv_test.go │ ├── equality_test.go │ ├── jsonl.go │ └── jsonl_test.go ├── hnsw │ ├── friends.go │ ├── friends_test.go │ ├── heap.go │ ├── heap_test.go │ ├── hnsw.go │ └── hnsw_test.go ├── linkedpage │ ├── linkedpage.go │ └── linkedpage_test.go ├── metapage │ └── metapage.go ├── mmap │ ├── mmap.go │ ├── mmap_test.go │ ├── mremap_darwin.go │ └── mremap_linux.go ├── mocks │ ├── btree.go │ ├── main.go │ ├── meta_page.go │ ├── metadata.go │ └── node.go ├── ngram │ ├── tokenizer.go │ └── tokenizer_test.go ├── pagefile │ ├── pagefile.go │ ├── pagefile_debug.go │ ├── pagefile_debug_test.go │ └── pagefile_test.go ├── pointer │ ├── pointer.go │ └── referenced_value.go └── vectorpage │ ├── manager.go │ └── manager_test.go ├── scripts └── jsonl2json │ ├── Cargo.toml │ └── src │ └── main.rs ├── src ├── bptree │ ├── bptree.ts │ ├── node.ts │ └── traversal.ts ├── db │ ├── database.ts │ ├── query-builder.ts │ ├── query-lang.ts │ └── query-validation.ts ├── file │ ├── data-file.ts │ ├── index-file.ts │ ├── meta.ts │ └── multi.ts ├── index.ts ├── ngram │ ├── table.ts │ └── tokenizer.ts ├── resolver │ ├── cache.ts │ ├── multipart.ts │ ├── range-request.ts │ └── resolver.ts ├── tests │ ├── bptree.test.ts │ ├── index-file.test.ts │ ├── mock_binaries │ │ ├── btree_1.bin │ │ ├── btree_1023.bin │ │ ├── btree_iterator.bin │ │ ├── filemeta.bin │ │ ├── filled_metadata.bin │ │ ├── indexmeta.bin │ │ ├── internalnode.bin │ │ └── leafnode.bin │ ├── multi.test.ts │ ├── multipart.test.ts │ ├── ngramtable.test.ts │ ├── node.test.ts │ ├── query-builder.test.ts │ ├── query-logic.test.ts │ ├── query-validation.test.ts │ ├── test-util.ts │ ├── tokenizer.test.ts │ └── varint.ts └── util │ └── uvarint.ts └── tsconfig.json /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Benchmark 2 | on: 3 | pull_request: 4 | 5 | jobs: 6 | baseline: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | with: 11 | path: "comparison" 12 | ref: ${{ github.event.pull_request.head.ref }} 13 | - uses: actions/checkout@v4 14 | with: 15 | path: "baseline" 16 | ref: ${{ github.event.pull_request.base.ref }} 17 | 18 | - uses: actions/setup-python@v4.7.1 19 | - run: | 20 | # Fetch the data in workspace 21 | cd examples/workspace 22 | python3 -m pip install -r requirements.txt 23 | python3 fetch_jsonl.py 24 | working-directory: comparison 25 | - run: | 26 | # Fetch the data in workspace 27 | cd examples/workspace 28 | python3 -m pip install -r requirements.txt 29 | 
python3 fetch_jsonl.py 30 | working-directory: baseline 31 | 32 | - uses: actions/setup-go@v4 33 | with: 34 | go-version: "1.21" 35 | 36 | - run: | 37 | go run cmd/main.go \ 38 | -i output.index \ 39 | -jsonl \ 40 | -b benchmark.txt \ 41 | examples/workspace/green_tripdata_2023-01.jsonl 42 | working-directory: comparison 43 | 44 | - run: | 45 | go run cmd/main.go \ 46 | -i output.index \ 47 | -jsonl \ 48 | -b benchmark.txt \ 49 | examples/workspace/green_tripdata_2023-01.jsonl 50 | working-directory: baseline 51 | 52 | - run: go run examples/benchmarkdiff/main.go ../baseline/benchmark.txt benchmark.txt 53 | working-directory: comparison 54 | 55 | - uses: actions/upload-artifact@v4 56 | id: upload-artifact 57 | with: 58 | name: benchmark-diff 59 | path: comparison/output.html 60 | 61 | # post a comment to the PR 62 | - name: Post comment 63 | uses: mshick/add-pr-comment@v2 64 | with: 65 | message: | 66 | Benchmark results: ${{ steps.upload-artifact.outputs.artifact-url }} 67 | -------------------------------------------------------------------------------- /.github/workflows/example-client.yml: -------------------------------------------------------------------------------- 1 | name: Deploy GitHub Pages Example 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | workflow_dispatch: 7 | 8 | permissions: 9 | contents: read 10 | pages: write 11 | id-token: write 12 | 13 | concurrency: 14 | group: "pages" 15 | cancel-in-progress: false 16 | 17 | jobs: 18 | build: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v3 22 | - uses: actions/configure-pages@v3 23 | - uses: actions/setup-go@v4.1.0 24 | with: 25 | go-version-file: go.mod 26 | - uses: actions/setup-node@v4.0.0 27 | - uses: actions/setup-python@v4.7.1 28 | - run: | 29 | # Fetch the data in workspace 30 | cd examples/workspace 31 | python3 -m pip install -r requirements.txt 32 | python3 fetch_jsonl.py 33 | cd - 34 | 35 | # Build the index 36 | go run cmd/main.go -i examples/client/green_tripdata_2023-01.index -jsonl examples/workspace/green_tripdata_2023-01.jsonl 37 | 38 | # Copy to client 39 | cp examples/workspace/green_tripdata_2023-01.jsonl examples/client 40 | 41 | # Build the index 42 | go run cmd/main.go -i examples/client/palo-alto.index -s description -jsonl examples/workspace/palo-alto.jsonl 43 | 44 | # Copy to client 45 | cp examples/workspace/palo-alto.jsonl examples/client 46 | 47 | # Build the js lib 48 | npm ci 49 | npm run build 50 | 51 | # Copy the js lib 52 | cp dist/appendable.min.js examples/client 53 | cp dist/appendable.min.js.map examples/client 54 | - uses: actions/upload-pages-artifact@v2 55 | with: 56 | path: examples/client 57 | deploy: 58 | environment: 59 | name: github-pages 60 | url: ${{ steps.deployment.outputs.page_url }} 61 | runs-on: ubuntu-latest 62 | needs: build 63 | steps: 64 | - name: Deploy to GitHub Pages 65 | id: deployment 66 | uses: actions/deploy-pages@v2 67 | -------------------------------------------------------------------------------- /.github/workflows/format.yml: -------------------------------------------------------------------------------- 1 | name: Format 2 | on: 3 | pull_request: 4 | 5 | jobs: 6 | go-fmt: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: actions/setup-go@v4 11 | with: 12 | go-version: "1.21" 13 | - run: if [ "$(gofmt -s -l . 
| wc -l)" -gt 0 ]; then exit 1; fi 14 | 15 | go-mod-tidy: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-go@v4 20 | with: 21 | go-version: "1.21" 22 | - run: | 23 | go mod tidy 24 | STATUS=$(git status --porcelain go.mod go.sum) 25 | if [ ! -z "$STATUS" ]; then 26 | echo "Running go mod tidy modified go.mod and/or go.sum" >> $GITHUB_STEP_SUMMARY 27 | exit 1 28 | fi 29 | exit 0 30 | 31 | prettier: 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Use Node.js 18 36 | uses: actions/setup-node@v3 37 | with: 38 | node-version: "18" 39 | - run: npx prettier --check "**/*.{js,jsx,ts,tsx,css,scss,md,json}" 2> $GITHUB_STEP_SUMMARY 40 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | pull_request: 4 | 5 | jobs: 6 | go-test: 7 | strategy: 8 | matrix: 9 | os: [ubuntu-latest, macos-latest] 10 | runs-on: ${{ matrix.os }} 11 | steps: 12 | - uses: actions/checkout@v3 13 | - uses: actions/setup-go@v4 14 | with: 15 | go-version: "1.21" 16 | - run: go test -v ./... 17 | - run: go vet -v ./... 18 | 19 | node-test: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Use Node.js 18 24 | uses: actions/setup-node@v3 25 | with: 26 | node-version: "18" 27 | - run: npm ci 28 | - run: npm run build 29 | - run: npm test 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | node_modules 3 | Cargo.lock 4 | 5 | .DS_Store 6 | pprof.out 7 | 8 | # Ignore files 9 | *.index 10 | *.jsonl 11 | *.csv 12 | examples/**/appendable.min.js 13 | examples/**/appendable.min.js.map 14 | 15 | # But include the palo-alto dataset 16 | !examples/workspace/palo-alto.jsonl 17 | 18 | # But include these files in src/tests/mock_binaries 19 | !src/tests/mock_binaries/*.jsonl 20 | !src/tests/mock_binaries/*.cs 21 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | package-lock.json -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.5 -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["scripts/jsonl2json"] 3 | 4 | resolver = "2" -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log/slog" 7 | "os" 8 | "runtime/pprof" 9 | "time" 10 | 11 | "github.com/kevmo314/appendable/pkg/appendable" 12 | "github.com/kevmo314/appendable/pkg/handlers" 13 | "github.com/kevmo314/appendable/pkg/mmap" 14 | ) 15 | 16 | type StringSlice []string 17 | 18 | func (s *StringSlice) String() string { 19 | return fmt.Sprintf("%v", *s) 20 | } 21 | 22 | func (s *StringSlice) Set(value string) error { 23 | *s = append(*s, value) 24 | return nil 25 | } 26 | 27 | func main() { 28 | var debugFlag, jsonlFlag, csvFlag, showTimings bool 29 | var indexFilename, 
pprofFilename, benchmarkFilename string
30 | var searchHeaders StringSlice
31 |
32 | flag.BoolVar(&debugFlag, "debug", false, "Use logger that prints at the debug-level")
33 | flag.BoolVar(&jsonlFlag, "jsonl", false, "Use JSONL handler")
34 | flag.BoolVar(&csvFlag, "csv", false, "Use CSV handler")
35 | flag.BoolVar(&showTimings, "t", false, "Show time-related metrics")
36 | flag.StringVar(&indexFilename, "i", "", "Specify the index file to create or open for the given data file")
37 | flag.StringVar(&pprofFilename, "pprof", "", "Specify the file to write the pprof data to")
38 | flag.StringVar(&benchmarkFilename, "b", "", "Specify the file to write the benchmark data to")
39 | flag.Var(&searchHeaders, "s", "Specify the headers you want to search")
40 |
41 | flag.Parse()
42 |
43 | logLevel := &slog.LevelVar{}
44 |
45 | if debugFlag {
46 | logLevel.Set(slog.LevelDebug)
47 | }
48 |
49 | if pprofFilename != "" {
50 | f, err := os.Create(pprofFilename)
51 | if err != nil {
52 | panic(err)
53 | }
54 | defer f.Close() // error handling omitted for example
55 | if err := pprof.StartCPUProfile(f); err != nil {
56 | panic(err)
57 | }
58 | defer pprof.StopCPUProfile()
59 | }
60 |
61 | logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: logLevel}))
62 | slog.SetDefault(logger)
63 |
64 | var totalStart, readStart, writeStart time.Time
65 | if showTimings {
66 | totalStart = time.Now()
67 | }
68 |
69 | flag.Usage = func() {
70 | fmt.Printf("Usage: %s [-debug] [-t] [-jsonl|-csv] [-i index] [-pprof file] [-b file] [-s header] filename\n", os.Args[0])
71 | flag.PrintDefaults()
72 | os.Exit(1)
73 | }
74 |
75 | args := flag.Args()
76 |
77 | if len(args) != 1 {
78 | flag.Usage()
79 | }
80 |
81 | // Open the data file
82 | df, err := mmap.OpenFile(args[0], os.O_RDONLY, 0)
83 | if err != nil {
84 | panic(err)
85 | }
86 | defer df.Close()
87 |
88 | var dataHandler appendable.DataHandler
89 |
90 | switch {
91 | case jsonlFlag:
92 | dataHandler = handlers.JSONLHandler{}
93 | case csvFlag:
94 | dataHandler = handlers.CSVHandler{}
95 | default:
96 | logger.Error("Please specify the file type with -jsonl or -csv.")
97 | os.Exit(1)
98 | }
99 | if showTimings {
100 | readStart = time.Now()
101 | }
102 | mmpif, err := mmap.OpenFile(indexFilename, os.O_RDWR|os.O_CREATE, 0666)
103 | if err != nil {
104 | panic(err)
105 | }
106 | defer mmpif.Close()
107 |
108 | // Open the index file
109 | i, err := appendable.NewIndexFile(mmpif, dataHandler, searchHeaders)
110 | if err != nil {
111 | panic(err)
112 | }
113 |
114 | if benchmarkFilename != "" {
115 | f, err := os.Create(benchmarkFilename)
116 | if err != nil {
117 | panic(err)
118 | }
119 | defer f.Close() // error handling omitted for example
120 | i.SetBenchmarkFile(f)
121 | }
122 |
123 | if err := i.Synchronize(df.Bytes()); err != nil {
124 | panic(err)
125 | }
126 |
127 | if showTimings {
128 | readDuration := time.Since(readStart)
129 | logger.Info("Opening + synchronizing index file took", slog.Duration("duration", readDuration))
130 | }
131 |
132 | // Write the index file
133 | if showTimings {
134 | writeStart = time.Now()
135 | }
136 |
137 | if err := mmpif.Close(); err != nil {
138 | panic(err)
139 | }
140 |
141 | if showTimings {
142 | writeDuration := time.Since(writeStart)
143 | logger.Info("Writing index file took", slog.Duration("duration", writeDuration))
144 |
145 | totalDuration := time.Since(totalStart)
146 | logger.Info("Total execution time", slog.Duration("duration", totalDuration))
147 | }
148 |
149 | logger.Info("Done!")
150 | }
151 |
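// Example invocation, as used by .github/workflows/benchmark.yml:
//
//	go run cmd/main.go \
//	    -i output.index \
//	    -jsonl \
//	    -b benchmark.txt \
//	    examples/workspace/green_tripdata_2023-01.jsonl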
-------------------------------------------------------------------------------- /examples/README.md: --------------------------------------------------------------------------------
1 | # kevmo314/appendable/examples
2 |
3 | These examples are hosted on this repository's GitHub pages.
4 |
5 | The datasets come from the NYC TLC trip record data. For example, the yellow
6 | tripdata set can be fetched and converted to JSONL by hand:
7 |
8 | ```
9 | # yellow tripdata
10 | wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
11 |
12 | python3 -c "import pandas; pandas.read_parquet('yellow_tripdata_2023-01.parquet').to_json('yellow_tripdata_2023-01.jsonl', orient='records', lines=True)"
13 | ```
14 |
15 | To build the examples locally, first download the data and convert it to
16 | `jsonl` (all commands below are run from the repository root):
17 |
18 | ```sh
19 | # green tripdata
20 | cd examples/workspace
21 | python3 -m pip install -r requirements.txt
22 |
23 | # fetch data with .jsonl format
24 | python3 fetch_jsonl.py
25 | cd -
26 | ```
27 |
28 | Then run the indexing process:
29 |
30 | ```sh
31 | # for jsonl:
32 | go run cmd/main.go -i examples/client/green_tripdata_2023-01.index -jsonl examples/workspace/green_tripdata_2023-01.jsonl
33 | ```
34 |
35 | Copy the `.jsonl` file to `examples/client`:
36 |
37 | ```sh
38 | cp examples/workspace/green_tripdata_2023-01.jsonl examples/client
39 | ```
40 |
41 | Build the AppendableDB client library:
42 |
43 | ```sh
44 | npm ci
45 | npm run build
46 | ```
47 |
48 | Copy the Appendable library to `examples/client`:
49 |
50 | ```sh
51 | cp dist/appendable.min.js examples/client
52 | cp dist/appendable.min.js.map examples/client
53 | ```
54 |
55 | Then run the development server:
56 |
57 | ```sh
58 | npm run client
59 | ```
60 |
61 | You should see the example running at http://localhost:3001
62 |
-------------------------------------------------------------------------------- /examples/benchmarkdiff/main.go: --------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bufio"
5 | "io"
6 | "os"
7 | "strconv"
8 | "strings"
9 |
10 | "github.com/go-echarts/go-echarts/v2/charts"
11 | "github.com/go-echarts/go-echarts/v2/components"
12 | "github.com/go-echarts/go-echarts/v2/opts"
13 | )
14 |
15 | type record struct {
16 | timestamp int
17 | n int
18 | size int
19 | }
20 |
21 | func readFile(f *os.File) ([]record, error) {
22 | // read the file and parse the (timestamp,n,size) tuples
23 | s := bufio.NewScanner(f)
24 | var records []record
25 | for s.Scan() {
26 | // parse the line
27 | line := s.Text()
28 | // split the line
29 | tokens := strings.Split(line, ",")
30 | // convert the tokens to integers
31 | timestamp, err := strconv.Atoi(tokens[0])
32 | if err != nil {
33 | return nil, err
34 | }
35 | n, err := strconv.Atoi(tokens[1])
36 | if err != nil {
37 | return nil, err
38 | }
39 | size, err := strconv.Atoi(tokens[2])
40 | if err != nil {
41 | return nil, err
42 | }
43 | records = append(records, record{timestamp, n, size})
44 | }
45 | return records, s.Err()
46 | }
47 |
48 | func generateXAxis(records []record) []int {
49 | var xAxis []int
50 | for _, r := range records {
51 | xAxis = append(xAxis, r.n)
52 | }
53 | return xAxis
54 | }
55 |
56 | func generateTimestampYAxis(records []record) []opts.LineData {
57 | var yAxis []opts.LineData
58 | for _, r := range records {
59 | yAxis = append(yAxis, opts.LineData{Value: r.timestamp})
60 | }
61 | return yAxis
62 | }
63 |
64 | func generateSizeYAxis(records []record) []opts.LineData {
65 | var yAxis []opts.LineData
66 | for _, r := range records {
67 | yAxis = append(yAxis, opts.LineData{Value: r.size})
68 | }
69 | return yAxis
70 | }
71 |
72 | func generateTimestampDeltaYAxis(r1, r2 []record) []opts.LineData {
73 | var yAxis []opts.LineData
74 | for i := range r1 {
75 | yAxis = append(yAxis, 
opts.LineData{Value: r2[i].timestamp - r1[i].timestamp}) 76 | } 77 | return yAxis 78 | } 79 | 80 | func generateSizeDeltaYAxis(r1, r2 []record) []opts.LineData { 81 | var yAxis []opts.LineData 82 | for i := range r1 { 83 | yAxis = append(yAxis, opts.LineData{Value: r2[i].size - r1[i].size}) 84 | } 85 | return yAxis 86 | } 87 | 88 | func main() { 89 | // read two arguments as files and parse the (timestamp,n,size) tuples 90 | f1, err := os.Open(os.Args[1]) 91 | if err != nil { 92 | panic(err) 93 | } 94 | defer f1.Close() 95 | records1, err := readFile(f1) 96 | if err != nil { 97 | panic(err) 98 | } 99 | f2, err := os.Open(os.Args[2]) 100 | if err != nil { 101 | panic(err) 102 | } 103 | defer f2.Close() 104 | records2, err := readFile(f2) 105 | if err != nil { 106 | panic(err) 107 | } 108 | 109 | // generate four charts: 110 | // 1. timestamp vs n 111 | // 2. pagefile size vs n 112 | // 3. timestamp delta vs n 113 | // 4. pagefile size delta vs n 114 | 115 | line1 := charts.NewLine() 116 | line1.SetGlobalOptions( 117 | charts.WithTooltipOpts(opts.Tooltip{Show: true, Trigger: "axis"}), 118 | charts.WithYAxisOpts(opts.YAxis{ 119 | Name: "Time (μs)", 120 | }), 121 | charts.WithXAxisOpts(opts.XAxis{ 122 | Name: "Bytes read", 123 | })) 124 | line1.SetXAxis(generateXAxis(records1)). 125 | AddSeries("Run 1", generateTimestampYAxis(records1)). 126 | AddSeries("Run 2", generateTimestampYAxis(records2)) 127 | 128 | line2 := charts.NewLine() 129 | line2.SetGlobalOptions( 130 | charts.WithTooltipOpts(opts.Tooltip{Show: true, Trigger: "axis"}), 131 | charts.WithYAxisOpts(opts.YAxis{ 132 | Name: "Size (pages)", 133 | }), 134 | charts.WithXAxisOpts(opts.XAxis{ 135 | Name: "Bytes read", 136 | })) 137 | line2.SetXAxis(generateXAxis(records1)). 138 | AddSeries("Run 1", generateSizeYAxis(records1)). 139 | AddSeries("Run 2", generateSizeYAxis(records2)) 140 | 141 | line3 := charts.NewLine() 142 | line3.SetGlobalOptions( 143 | charts.WithYAxisOpts(opts.YAxis{ 144 | Name: "Time delta (μs)", 145 | }), 146 | charts.WithXAxisOpts(opts.XAxis{ 147 | Name: "Bytes read", 148 | })) 149 | line3.SetXAxis(generateXAxis(records1)). 150 | AddSeries("Time delta", generateTimestampDeltaYAxis(records1, records2)) 151 | 152 | line4 := charts.NewLine() 153 | line4.SetGlobalOptions( 154 | charts.WithYAxisOpts(opts.YAxis{ 155 | Name: "Size delta (pages)", 156 | }), 157 | charts.WithXAxisOpts(opts.XAxis{ 158 | Name: "Bytes read", 159 | })) 160 | line4.SetXAxis(generateXAxis(records1)). 
161 | AddSeries("Size delta", generateSizeDeltaYAxis(records1, records2)) 162 | 163 | page := components.NewPage() 164 | page.PageTitle = "Benchmark diff" 165 | page.AddCharts( 166 | line1, 167 | line2, 168 | line3, 169 | line4, 170 | ) 171 | f, err := os.Create("output.html") 172 | if err != nil { 173 | panic(err) 174 | } 175 | page.Render(io.MultiWriter(f)) 176 | } 177 | -------------------------------------------------------------------------------- /examples/client/editor.js: -------------------------------------------------------------------------------- 1 | let activeEditor = "json"; 2 | 3 | var editor = ace.edit("editor"); 4 | editor.setTheme("ace/theme/chrome"); 5 | 6 | var jsonSession = ace.createEditSession( 7 | JSON.stringify( 8 | { 9 | where: [ 10 | { 11 | operation: ">=", 12 | key: "trip_distance", 13 | value: 10, 14 | }, 15 | ], 16 | orderBy: [ 17 | { 18 | key: "trip_distance", 19 | direction: "ASC", 20 | }, 21 | ], 22 | select: [ 23 | "trip_distance", 24 | "VendorID", 25 | "passenger_count", 26 | "fare_amount", 27 | "tip_amount", 28 | "mta_tax", 29 | ], 30 | }, 31 | null, 32 | 2, 33 | ), 34 | "ace/mode/json", 35 | ); 36 | 37 | var jsCode = 38 | "db\n" + 39 | " .where('trip_distance', '>=', 10)\n" + 40 | " .orderBy('trip_distance', 'ASC')\n" + 41 | " .select([\n" + 42 | " 'trip_distance',\n" + 43 | " 'VendorID',\n" + 44 | " 'passenger_count',\n" + 45 | " 'fare_amount',\n" + 46 | " 'tip_amount',\n" + 47 | " 'mta_tax'\n" + 48 | " ])\n" + 49 | " .get();"; 50 | 51 | var jsSession = ace.createEditSession(jsCode, "ace/mode/javascript"); 52 | 53 | editor.setSession(jsonSession); 54 | 55 | var jsonTab = document.getElementById("jsonTab"); 56 | var jsTab = document.getElementById("jsTab"); 57 | 58 | jsonTab.addEventListener("click", function () { 59 | editor.setSession(jsonSession); 60 | attachJsonEditorUX(); 61 | activeEditor = "json"; 62 | window.activeEditor = activeEditor; 63 | }); 64 | 65 | jsTab.addEventListener("click", function () { 66 | editor.setSession(jsSession); 67 | activeEditor = "javascript"; 68 | window.activeEditor = activeEditor; 69 | }); 70 | 71 | function attachJsonEditorUX() { 72 | // NOTE: when composite indexes get supported, remove this UX feature 73 | // <---- start of UX feature ----> 74 | let isProgramChange = false; 75 | let lastEdited = "none"; 76 | let prevWhereKey = "trip_distance"; 77 | let prevOrderByKey = "trip_distance"; 78 | 79 | function updateKey(editorContent) { 80 | try { 81 | let query = JSON.parse(editorContent); 82 | if (query.where && query.orderBy) { 83 | const whereKey = query.where[0].key; 84 | const orderByKey = query.orderBy[0].key; 85 | 86 | if (lastEdited === "where") { 87 | query.orderBy[0].key = whereKey; 88 | } else if (lastEdited === "orderBy") { 89 | query.where[0].key = orderByKey; 90 | } 91 | 92 | prevWhereKey = whereKey; 93 | prevOrderByKey = orderByKey; 94 | 95 | return JSON.stringify(query, null, 2); 96 | } 97 | } catch (e) { 98 | console.log("Error parsing JSON:", e.message); 99 | console.log("Incomplete string content:", editorContent); 100 | } 101 | return editorContent; 102 | } 103 | 104 | editor.getSession().on("change", function (e) { 105 | if (isProgramChange) { 106 | isProgramChange = false; 107 | return; 108 | } 109 | 110 | const cursorPosition = editor.getCursorPosition(); 111 | const editorContent = editor.getSession().getValue(); 112 | 113 | let query; 114 | try { 115 | query = JSON.parse(editorContent); 116 | } catch (e) { 117 | return; 118 | } 119 | 120 | const currentWhereKey = query.where ? 
query.where[0].key : ""; 121 | const currentOrderByKey = query.orderBy ? query.orderBy[0].key : ""; 122 | 123 | if (currentWhereKey !== prevWhereKey) { 124 | lastEdited = "where"; 125 | } else if (currentOrderByKey !== prevOrderByKey) { 126 | lastEdited = "orderBy"; 127 | } 128 | 129 | const updatedContent = updateKey(editorContent); 130 | 131 | if (updatedContent !== editorContent) { 132 | isProgramChange = true; 133 | 134 | const doc = editor.getSession().getDocument(); 135 | doc.setValue(updatedContent); 136 | 137 | editor.moveCursorToPosition(cursorPosition); 138 | editor.clearSelection(); 139 | } 140 | }); 141 | 142 | // <---- end of UX feature ----> 143 | } 144 | 145 | attachJsonEditorUX(); 146 | window.activeEditor = activeEditor; 147 | -------------------------------------------------------------------------------- /examples/client/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 38 | 39 | 40 | 41 |
42 | <!--
  [The markup of this file was garbled in extraction; only its text content is
  recoverable. The page body renders: a header, "Appendable - NYC Green Cab
  Trip Data in 01/2023"; download links ("Download the raw data here:" JSONL -
  Appendable Index - Source); the note "Appendable is querying the JSONL and
  index files that GitHub pages hosts directly. There is no server involved
  here!"; the caveat "Keep in mind that while the query syntax supports a lot
  of different operations, Appendable doesn't support composite indexes yet.
  Therefore, only one field at a time can be filtered on and that field must
  be used for sorting."; a "Fields" panel; a "Query" editor with JSON and
  JavaScript tabs; and a "Results" table. The page's inline script (roughly
  original lines 89-198) was not recovered.]
-->
199 | 200 | 201 | -------------------------------------------------------------------------------- /examples/client/server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | ) 7 | 8 | func main() { 9 | // Set the directory to serve 10 | fs := http.FileServer(http.Dir("./")) 11 | 12 | // Handle all requests by serving a file of the same name 13 | http.Handle("/", fs) 14 | 15 | http.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) { 16 | http.ServeFile(w, r, "./search.html") 17 | }) 18 | 19 | // Define the port to listen on 20 | port := "3001" 21 | log.Printf("Listening on http://localhost:%s/", port) 22 | 23 | // Start the server 24 | err := http.ListenAndServe(":"+port, nil) 25 | if err != nil { 26 | log.Fatal(err) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /examples/client/styles.css: -------------------------------------------------------------------------------- 1 | body, 2 | html { 3 | margin: 0; 4 | padding: 0px 0px 4px 4px; 5 | } 6 | .flex-1 { 7 | flex: 1; 8 | display: flex; 9 | gap: 0 30px; 10 | height: 100vh; 11 | width: 100vw; 12 | } 13 | .result-row { 14 | cursor: pointer; 15 | } 16 | .result-row:hover { 17 | background-color: yellow; 18 | } 19 | #fields { 20 | max-height: calc(100vh - 50px); 21 | overflow-y: auto; 22 | } 23 | #results { 24 | overflow-y: auto; 25 | max-height: calc(100vh - 670px); 26 | } 27 | #results-header { 28 | width: max-content; 29 | } 30 | .header-item, 31 | .result-cell { 32 | padding: 4px; 33 | text-align: left; 34 | min-width: 200px; 35 | } 36 | .header-item { 37 | background-color: #f0f0f0; 38 | font-weight: bold; 39 | } 40 | -------------------------------------------------------------------------------- /examples/visualizer/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "io" 7 | "os" 8 | "slices" 9 | 10 | "github.com/kevmo314/appendable/pkg/bptree" 11 | "github.com/kevmo314/appendable/pkg/handlers" 12 | "github.com/kevmo314/appendable/pkg/mmap" 13 | "github.com/kevmo314/appendable/pkg/pagefile" 14 | "golang.org/x/sys/unix" 15 | ) 16 | 17 | func main() { 18 | f, err := os.Open(os.Args[1]) 19 | if err != nil { 20 | panic(err) 21 | } 22 | defer f.Close() 23 | 24 | df, err := os.Open(os.Args[2]) 25 | if err != nil { 26 | panic(err) 27 | } 28 | defer df.Close() 29 | 30 | mmdf, err := mmap.NewMemoryMappedFile(df, unix.PROT_READ) 31 | if err != nil { 32 | panic(err) 33 | } 34 | 35 | // create a new pagefile 36 | pf, err := pagefile.NewPageFile(f) 37 | if err != nil { 38 | panic(err) 39 | } 40 | 41 | lmps := []int64{4096} // store a list of the linked meta pages. 42 | fps := []int64{} // store a list of the free pages. 43 | 44 | fmt.Printf("\n\n\nAppendable Visualizer\n\n\n
    \n") 45 | 46 | // read the free page index 47 | fmt.Printf("
  1. Free Page Index", 0) 48 | if _, err := f.Seek(0, io.SeekStart); err != nil { 49 | panic(err) 50 | } 51 | buf := make([]byte, pf.PageSize()) 52 | if _, err := f.Read(buf); err != nil { 53 | panic(err) 54 | } 55 | for j := 0; j < pf.PageSize()/8; j++ { 56 | val := binary.LittleEndian.Uint64(buf[j*8 : j*8+8]) 57 | if val == 0 { 58 | break 59 | } 60 | fmt.Printf("

    %d

    ", val) 61 | fps = append(fps, int64(val)) 62 | } 63 | fmt.Printf("
  2. ") 64 | 65 | slices.Sort(fps) 66 | 67 | for i := int64(0); i < pf.PageCount(); i++ { 68 | offset, err := pf.Page(int(i)) 69 | if err != nil { 70 | panic(err) 71 | } 72 | // read the page 73 | if _, err := f.Seek(offset, io.SeekStart); err != nil { 74 | panic(err) 75 | } 76 | if len(fps) > 0 && i == fps[0] { 77 | // this is a free page 78 | fps = fps[1:] 79 | fmt.Printf("
  3. Free Page", offset) 80 | fmt.Printf("
  4. ") 81 | } else if len(lmps) > 0 && offset == lmps[0] { 82 | // this is a linked meta page 83 | lmps = lmps[1:] 84 | 85 | // metaPage, err := linkedpage.NewMultiBPTree(pf, int(i)) 86 | // if err != nil { 87 | // panic(err) 88 | // } 89 | fmt.Printf("
  5. Linked Meta Page (TODO)
  6. ", offset) 90 | 91 | // root, err := metaPage.Root() 92 | // if err != nil { 93 | // panic(err) 94 | // } 95 | // next, err := metaPage.Next() 96 | // if err != nil { 97 | // panic(err) 98 | // } 99 | // exists, err := next.Exists() 100 | // if err != nil { 101 | // panic(err) 102 | // } 103 | // if exists { 104 | // fmt.Printf("

    Root (%x) - Next (%x)

    ", root.Offset, root.Offset, next.MemoryPointer().Offset, next.MemoryPointer().Offset) 105 | // lmps = append(lmps, int64(next.MemoryPointer().Offset)) 106 | // } else { 107 | // fmt.Printf("

    Root (%x) - Next (nil)

    ", root.Offset, root.Offset) 108 | // } 109 | // fmt.Printf("

    Metadata

    ") 110 | // md, err := metaPage.Metadata() 111 | // if err != nil { 112 | // panic(err) 113 | // } 114 | // fmt.Printf("
    %x
    ", md) 115 | // fmt.Printf("") 116 | } else { 117 | // try to read the page as a bptree node 118 | node := &bptree.BPTreeNode{} 119 | node.Data = mmdf.Bytes() 120 | node.DataParser = &handlers.JSONLHandler{} 121 | 122 | if _, err := f.Seek(offset, io.SeekStart); err != nil { 123 | panic(err) 124 | } 125 | buf := make([]byte, pf.PageSize()) 126 | if _, err := f.Read(buf); err != nil { 127 | panic(err) 128 | } 129 | if err := node.UnmarshalBinary(buf); err != nil { 130 | if err == io.EOF { 131 | break 132 | } 133 | panic(err) 134 | } 135 | 136 | if node.Leaf() { 137 | fmt.Printf("
  7. B+ Tree Leaf Node", offset) 138 | } else { 139 | fmt.Printf("
  8. B+ Tree Node", offset) 140 | } 141 | fmt.Printf("

    Keys

    ") 142 | for _, k := range node.Keys { 143 | fmt.Printf("
    %x
    ", k.Value) 144 | } 145 | fmt.Printf("

    Pointers

    ") 146 | for j := 0; j < node.NumPointers(); j++ { 147 | if node.Leaf() { 148 | fmt.Printf("

    [%x:%x]

    ", node.Pointer(j).Offset, node.Pointer(j).Offset+uint64(node.Pointer(j).Length)) 149 | } else { 150 | fmt.Printf("

    %x

    ", node.Pointer(j).Offset, node.Pointer(j).Offset) 151 | } 152 | } 153 | fmt.Printf("
  9. ") 154 | } 155 | } 156 | fmt.Printf("
\n\n\n") 157 | 158 | } 159 | -------------------------------------------------------------------------------- /examples/workspace/fetch_csv.py: -------------------------------------------------------------------------------- 1 | # Data taken from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page 2 | 3 | import io 4 | 5 | import pandas as pd 6 | import requests 7 | 8 | response = requests.get('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet') 9 | 10 | df = pd.read_parquet(io.BytesIO(response.content)) 11 | df.to_csv('green_tripdata_2023-01.csv', index=False) -------------------------------------------------------------------------------- /examples/workspace/fetch_jsonl.py: -------------------------------------------------------------------------------- 1 | # Data taken from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page 2 | 3 | import io 4 | 5 | import pandas as pd 6 | import requests 7 | 8 | response = requests.get('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet') 9 | 10 | pd.read_parquet(io.BytesIO(response.content)).to_json('green_tripdata_2023-01.jsonl', orient='records', lines=True) 11 | -------------------------------------------------------------------------------- /examples/workspace/requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow 2 | pandas 3 | requests -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/kevmo314/appendable 2 | 3 | go 1.22.0 4 | 5 | require ( 6 | github.com/go-echarts/go-echarts/v2 v2.3.3 7 | golang.org/x/sys v0.16.0 8 | golang.org/x/text v0.14.0 9 | ) 10 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/go-echarts/go-echarts/v2 v2.3.3 h1:uImZAk6qLkC6F9ju6mZ5SPBqTyK8xjZKwSmwnCg4bxg= 4 | github.com/go-echarts/go-echarts/v2 v2.3.3/go.mod h1:56YlvzhW/a+du15f3S2qUGNDfKnFOeJSThBIrVFHDtI= 5 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 7 | github.com/stretchr/testify v1.6.0 h1:jlIyCplCJFULU/01vCkhKuTyc3OorI3bJFuw6obfgho= 8 | github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 9 | golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= 10 | golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 11 | golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= 12 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 13 | gopkg.in/yaml.v3 v3.0.0 h1:hjy8E9ON/egN1tAYqKb61G10WtihqetD4sz2H+8nIeA= 14 | gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 15 | -------------------------------------------------------------------------------- /jest.config.ts: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: "ts-jest", 3 | testEnvironment: "node", 4 | }; 5 | -------------------------------------------------------------------------------- 
/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "appendable", 3 | "version": "1.0.0", 4 | "description": "AppendableDB is an append-only\\*, schemaless, service-less, client-facing database.", 5 | "main": "index.js", 6 | "scripts": { 7 | "build": "esbuild src/index.ts --bundle --minify --sourcemap --outfile=dist/appendable.min.js", 8 | "warp": "rm -rf dist examples/client/appendable.min.js examples/client/appendable.min.js.map && esbuild src/index.ts --bundle --minify --sourcemap --outfile=dist/appendable.min.js", 9 | "client": "cd examples/client && go run server.go", 10 | "test": "jest" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "git+https://github.com/kevmo314/appendable.git" 15 | }, 16 | "author": "Kevin Wang ", 17 | "license": "ISC", 18 | "bugs": { 19 | "url": "https://github.com/kevmo314/appendable/issues" 20 | }, 21 | "homepage": "https://github.com/kevmo314/appendable#readme", 22 | "dependencies": { 23 | "esbuild": "^0.19.7" 24 | }, 25 | "devDependencies": { 26 | "@types/jest": "^29.5.11", 27 | "prettier": "^3.2.1", 28 | "ts-jest": "^29.1.1", 29 | "ts-node": "^10.9.2" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /pkg/appendable/appendable.go: -------------------------------------------------------------------------------- 1 | package appendable 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "github.com/kevmo314/appendable/pkg/encoding" 7 | "strings" 8 | ) 9 | 10 | /** 11 | * The structure of an index file is characterized by some pages that point 12 | * to other pages. Each box below represents a (typically 4kB) page and 13 | * the arrows indicate that there is a pointer to the next page. 14 | * 15 | * +-----------+-----------+ +-------------+ +-------------+ +-------------+ 16 | * | Page GC | File Meta | -> | Index Meta | -> | Index Meta | -> | Index Meta | 17 | * +-----------+-----------+ +-------------+ +-------------+ +-------------+ 18 | * | | | 19 | * v v v 20 | * +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ 21 | * | B+ Tree | | B+ Tree | | B+ Tree | 22 | * +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ 23 | * 24 | * Note: By convention, the first FileMeta does not have a pointer to the 25 | * B+ tree. Instead, the first FileMeta is used to store metadata about the 26 | * file itself and only contains a next pointer. 27 | * 28 | * Additionally, the Page GC page is used by the page file to store free page 29 | * indexes for garbage collection. 30 | * 31 | * Consequentially, the index file cannot be smaller than two pages (typically 8kB). 32 | */ 33 | 34 | type Version byte 35 | 36 | type Format byte 37 | 38 | const ( 39 | FormatJSONL Format = iota 40 | FormatCSV 41 | ) 42 | 43 | // FieldType represents the type of data stored in the field, which follows 44 | // JSON types excluding Object and null. Object is broken down into subfields 45 | // and null is not stored. 
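//
// For example (illustrative only; the exact subfield naming scheme lives in
// the format handlers): a JSONL record {"loc": {"lat": 37.4}} is indexed
// under a subfield of "loc" with a numeric field type, rather than as one
// Object-typed value.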
46 | type FieldType byte 47 | 48 | const ( 49 | FieldTypeString FieldType = iota 50 | FieldTypeInt64 51 | FieldTypeUint64 52 | FieldTypeFloat64 53 | FieldTypeObject 54 | FieldTypeArray 55 | FieldTypeBoolean 56 | FieldTypeNull 57 | 58 | FieldTypeTrigram 59 | FieldTypeBigram 60 | 61 | FieldTypeUnigram 62 | 63 | FieldTypeVector 64 | ) 65 | 66 | func (t FieldType) TypescriptType() string { 67 | components := []string{} 68 | if t&FieldTypeString != 0 || t&FieldTypeTrigram != 0 || t&FieldTypeBigram != 0 || t&FieldTypeUnigram != 0 { 69 | components = append(components, "string") 70 | } 71 | if t&FieldTypeInt64 != 0 || t&FieldTypeFloat64 != 0 { 72 | components = append(components, "number") 73 | } 74 | if t&FieldTypeObject != 0 { 75 | components = append(components, "Record") 76 | } 77 | if t&FieldTypeArray != 0 { 78 | components = append(components, "any[]") 79 | } 80 | if t&FieldTypeBoolean != 0 { 81 | components = append(components, "boolean") 82 | } 83 | if t&FieldTypeNull != 0 { 84 | components = append(components, "null") 85 | } 86 | 87 | if t&FieldTypeVector != 0 { 88 | components = append(components, "number[]") 89 | } 90 | 91 | if len(components) == 0 { 92 | return "unknown" 93 | } 94 | return strings.Join(components, " | ") 95 | } 96 | 97 | type FileMeta struct { 98 | Version 99 | Format 100 | // An offset to indicate how much data is contained within 101 | // this index. Note that this is implementation-dependent, 102 | // so it is not guaranteed to have any uniform interpretation. 103 | // For example, in JSONL, this is the number of bytes read 104 | // and indexed so far. 105 | ReadOffset uint64 106 | Entries uint64 107 | } 108 | 109 | func (m *FileMeta) MarshalBinary() ([]byte, error) { 110 | buf := make([]byte, 10+encoding.SizeVarint(m.Entries)) 111 | buf[0] = byte(m.Version) 112 | buf[1] = byte(m.Format) 113 | binary.LittleEndian.PutUint64(buf[2:], m.ReadOffset) 114 | binary.PutUvarint(buf[10:], m.Entries) 115 | return buf, nil 116 | } 117 | 118 | func (m *FileMeta) UnmarshalBinary(buf []byte) error { 119 | if len(buf) < 10 { 120 | return fmt.Errorf("invalid metadata size: %d", len(buf)) 121 | } 122 | m.Version = Version(buf[0]) 123 | 124 | fileFormat := buf[1] 125 | 126 | switch fileFormat { 127 | case byte(0): 128 | m.Format = FormatJSONL 129 | case byte(1): 130 | m.Format = FormatCSV 131 | default: 132 | return fmt.Errorf("unrecognized file format: %v", buf[1]) 133 | } 134 | 135 | m.ReadOffset = binary.LittleEndian.Uint64(buf[2:]) 136 | 137 | e, _ := binary.Uvarint(buf[10:]) 138 | m.Entries = e 139 | 140 | return nil 141 | } 142 | 143 | type IndexMeta struct { 144 | FieldName string 145 | FieldType FieldType 146 | Width uint16 147 | 148 | // TotalFieldValueLength represents the cumulative sum of the lengths of the entries within this index, used for computing the average length. 
149 | TotalFieldValueLength uint64 150 | } 151 | 152 | func (m *IndexMeta) MarshalBinary() ([]byte, error) { 153 | buf := make([]byte, 2+2+len(m.FieldName)+2+encoding.SizeVarint(m.TotalFieldValueLength)) 154 | binary.LittleEndian.PutUint16(buf[0:], uint16(m.FieldType)) 155 | binary.LittleEndian.PutUint16(buf[2:], m.Width) 156 | binary.LittleEndian.PutUint16(buf[4:], uint16(len(m.FieldName))) 157 | copy(buf[6:], m.FieldName) 158 | binary.PutUvarint(buf[6+len(m.FieldName):], m.TotalFieldValueLength) 159 | return buf, nil 160 | } 161 | 162 | func (m *IndexMeta) UnmarshalBinary(buf []byte) error { 163 | if len(buf) < 4 { 164 | return fmt.Errorf("invalid metadata size: %d", len(buf)) 165 | } 166 | m.FieldType = FieldType(binary.LittleEndian.Uint16(buf[0:])) 167 | m.Width = binary.LittleEndian.Uint16(buf[2:]) 168 | nameLength := binary.LittleEndian.Uint16(buf[4:]) 169 | if len(buf) < 4+int(nameLength) { 170 | return fmt.Errorf("invalid metadata size: %d", len(buf)) 171 | } 172 | m.FieldName = string(buf[6 : 6+nameLength]) 173 | tl, _ := binary.Uvarint(buf[6+nameLength:]) 174 | m.TotalFieldValueLength = tl 175 | return nil 176 | } 177 | 178 | func DetermineType(ft FieldType) uint16 { 179 | shift := 1 // we'll dedicate 0 to be variable width, everything else is the fixed width + shift 180 | width := uint16(0) 181 | switch ft { 182 | case FieldTypeBoolean: 183 | width = uint16(shift + 1) 184 | case FieldTypeNull: 185 | width = uint16(shift + 0) 186 | case FieldTypeFloat64, FieldTypeInt64, FieldTypeUint64: 187 | width = uint16(shift + 8) 188 | case FieldTypeTrigram: 189 | width = uint16(shift + 3) 190 | case FieldTypeBigram: 191 | width = uint16(shift + 2) 192 | case FieldTypeUnigram: 193 | width = uint16(shift + 1) 194 | } 195 | 196 | return width 197 | } 198 | -------------------------------------------------------------------------------- /pkg/appendable/appendable_test.go: -------------------------------------------------------------------------------- 1 | package appendable 2 | 3 | import ( 4 | "github.com/kevmo314/appendable/pkg/buftest" 5 | "github.com/kevmo314/appendable/pkg/linkedpage" 6 | "github.com/kevmo314/appendable/pkg/pagefile" 7 | "reflect" 8 | "testing" 9 | ) 10 | 11 | func TestMarshalMetadata(t *testing.T) { 12 | t.Run("file meta", func(t *testing.T) { 13 | b := buftest.NewSeekableBuffer() 14 | p, err := pagefile.NewPageFile(b) 15 | if err != nil { 16 | t.Fatal(err) 17 | } 18 | 19 | tree, err := linkedpage.NewMultiBPTree(p, 0) 20 | if err != nil { 21 | t.Fatal(err) 22 | } 23 | 24 | page, err := tree.AddNext() 25 | if err != nil { 26 | t.Fatal(err) 27 | } 28 | 29 | fm := &FileMeta{ 30 | Version: 1, 31 | Format: 1, 32 | ReadOffset: 69, 33 | Entries: 38, 34 | } 35 | 36 | buf, err := fm.MarshalBinary() 37 | if err != nil { 38 | t.Fatalf("Failed to marshal binary: %v", err) 39 | } 40 | 41 | if err := page.SetMetadata(buf); err != nil { 42 | t.Fatal(err) 43 | } 44 | 45 | // finished marshaling 46 | // <--------> 47 | // start unmarshal 48 | 49 | buf, err = page.Metadata() 50 | if err != nil { 51 | t.Fatal(err) 52 | } 53 | 54 | fm2 := &FileMeta{} 55 | 56 | if err := fm2.UnmarshalBinary(buf); err != nil { 57 | t.Fatal(err) 58 | } 59 | 60 | if !reflect.DeepEqual(fm, fm2) { 61 | t.Fatal("not equal") 62 | } 63 | }) 64 | 65 | t.Run("file meta", func(t *testing.T) { 66 | b := buftest.NewSeekableBuffer() 67 | p, err := pagefile.NewPageFile(b) 68 | if err != nil { 69 | t.Fatal(err) 70 | } 71 | 72 | tree, err := linkedpage.NewMultiBPTree(p, 0) 73 | if err != nil { 74 | t.Fatal(err) 75 | } 76 | 77 
| page, err := tree.AddNext() 78 | if err != nil { 79 | t.Fatal(err) 80 | } 81 | 82 | im := &IndexMeta{ 83 | FieldName: "scarface", 84 | FieldType: FieldTypeString, 85 | Width: 0, 86 | TotalFieldValueLength: 938, 87 | } 88 | buf, err := im.MarshalBinary() 89 | if err != nil { 90 | t.Fatalf("Failed to marshal binary: %v", err) 91 | } 92 | 93 | if err := page.SetMetadata(buf); err != nil { 94 | t.Fatal(err) 95 | } 96 | 97 | // finished marshaling 98 | // <--------> 99 | // start unmarshal 100 | 101 | buf, err = page.Metadata() 102 | if err != nil { 103 | t.Fatal(err) 104 | } 105 | 106 | im2 := &IndexMeta{} 107 | 108 | if err := im2.UnmarshalBinary(buf); err != nil { 109 | t.Fatal(err) 110 | } 111 | 112 | if !reflect.DeepEqual(im, im2) { 113 | t.Fatal("not equal") 114 | } 115 | }) 116 | 117 | } 118 | -------------------------------------------------------------------------------- /pkg/appendable/index_file.go: -------------------------------------------------------------------------------- 1 | package appendable 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "github.com/kevmo314/appendable/pkg/linkedpage" 7 | "io" 8 | "time" 9 | 10 | "github.com/kevmo314/appendable/pkg/bptree" 11 | "github.com/kevmo314/appendable/pkg/pagefile" 12 | ) 13 | 14 | const CurrentVersion = 1 15 | 16 | type DataHandler interface { 17 | bptree.DataParser 18 | Synchronize(f *IndexFile, df []byte) error 19 | Format() Format 20 | } 21 | 22 | // IndexFile is a representation of the entire index file. 23 | type IndexFile struct { 24 | tree *linkedpage.LinkedPage 25 | dataHandler DataHandler 26 | 27 | pf *pagefile.PageFile 28 | BenchmarkCallback func(int) 29 | 30 | searchHeaders []string 31 | } 32 | 33 | func NewIndexFile(f io.ReadWriteSeeker, dataHandler DataHandler, searchHeaders []string) (*IndexFile, error) { 34 | pf, err := pagefile.NewPageFile(f) 35 | if err != nil { 36 | return nil, fmt.Errorf("failed to create page file: %w", err) 37 | } 38 | 39 | tree, err := linkedpage.NewMultiBPTree(pf, 0) 40 | if err != nil { 41 | return nil, fmt.Errorf("failed to create multi b+ tree: %w", err) 42 | } 43 | // ensure the first page is written. 
44 | node, err := tree.Next() 45 | if err != nil && !errors.Is(err, io.EOF) { 46 | return nil, fmt.Errorf("failed to get next meta page: %w", err) 47 | } 48 | if errors.Is(err, io.EOF) { 49 | // the page doesn't exist, so we need to create it 50 | created, err := tree.AddNext() 51 | if err != nil { 52 | return nil, fmt.Errorf("failed to add next meta page: %w", err) 53 | } 54 | metadata := &FileMeta{ 55 | Version: CurrentVersion, 56 | Format: dataHandler.Format(), 57 | } 58 | buf, err := metadata.MarshalBinary() 59 | if err != nil { 60 | return nil, fmt.Errorf("failed to marshal metadata: %w", err) 61 | } 62 | if err := created.SetMetadata(buf); err != nil { 63 | return nil, fmt.Errorf("failed to set metadata: %w", err) 64 | } 65 | return &IndexFile{tree: created, dataHandler: dataHandler, pf: pf, searchHeaders: searchHeaders}, nil 66 | } else { 67 | // validate the metadata 68 | buf, err := node.Metadata() 69 | if err != nil { 70 | return nil, fmt.Errorf("failed to read metadata: %w", err) 71 | } 72 | metadata := &FileMeta{} 73 | if err := metadata.UnmarshalBinary(buf); err != nil { 74 | return nil, fmt.Errorf("failed to unmarshal metadata: %w", err) 75 | } 76 | if metadata.Version != CurrentVersion { 77 | return nil, fmt.Errorf("unsupported version: %d", metadata.Version) 78 | } 79 | if metadata.Format != dataHandler.Format() { 80 | return nil, fmt.Errorf("unsupported format: %x", metadata.Format) 81 | } 82 | return &IndexFile{tree: node, dataHandler: dataHandler, pf: pf, searchHeaders: searchHeaders}, nil 83 | } 84 | } 85 | 86 | func (i *IndexFile) Metadata() (*FileMeta, error) { 87 | // the first page consists of associated metadata for the tree 88 | buf, err := i.tree.Metadata() 89 | if err != nil { 90 | return nil, fmt.Errorf("failed to read metadata: %w", err) 91 | } 92 | metadata := &FileMeta{} 93 | return metadata, metadata.UnmarshalBinary(buf) 94 | } 95 | 96 | func (i *IndexFile) SetMetadata(metadata *FileMeta) error { 97 | buf, err := metadata.MarshalBinary() 98 | if err != nil { 99 | return fmt.Errorf("failed to marshal metadata: %w", err) 100 | } 101 | return i.tree.SetMetadata(buf) 102 | } 103 | 104 | func (i *IndexFile) Indexes() (*linkedpage.LinkedPage, error) { 105 | return i.tree.Next() 106 | } 107 | 108 | func (i *IndexFile) IsEmpty() (bool, error) { 109 | n, err := i.tree.Next() 110 | if err != nil && !errors.Is(err, io.EOF) { 111 | return false, fmt.Errorf("failed to get next meta page: %w", err) 112 | } 113 | return n != nil, nil 114 | } 115 | 116 | func (i *IndexFile) IndexFieldNames() ([]string, error) { 117 | var fieldNames []string 118 | uniqueFieldNames := make(map[string]bool) 119 | 120 | mp := i.tree 121 | 122 | for { 123 | next, err := mp.Next() 124 | if err != nil { 125 | if errors.Is(err, io.EOF) { 126 | break 127 | } 128 | return nil, fmt.Errorf("failed to get next meta page: %w", err) 129 | } 130 | buf, err := next.Metadata() 131 | if err != nil { 132 | return nil, fmt.Errorf("failed to read metadata: %w", err) 133 | } 134 | metadata := &IndexMeta{} 135 | if err := metadata.UnmarshalBinary(buf); err != nil { 136 | return nil, fmt.Errorf("failed to unmarshal metadata: %w", err) 137 | } 138 | 139 | if _, ok := uniqueFieldNames[metadata.FieldName]; !ok { 140 | uniqueFieldNames[metadata.FieldName] = true 141 | fieldNames = append(fieldNames, metadata.FieldName) 142 | } 143 | mp = next 144 | } 145 | 146 | return fieldNames, nil 147 | } 148 | 149 | func (i *IndexFile) FindOrCreateIndex(name string, fieldType FieldType) (*linkedpage.LinkedPage, *IndexMeta, error) 
{ 150 | mp := i.tree 151 | for { 152 | next, err := mp.Next() 153 | if err != nil { 154 | if errors.Is(err, io.EOF) { 155 | break 156 | } 157 | return nil, nil, fmt.Errorf("failed to get next meta page: %w", err) 158 | } 159 | buf, err := next.Metadata() 160 | if err != nil { 161 | return nil, nil, fmt.Errorf("failed to read metadata: %w", err) 162 | } 163 | metadata := &IndexMeta{} 164 | if err := metadata.UnmarshalBinary(buf); err != nil { 165 | return nil, nil, fmt.Errorf("failed to unmarshal metadata: %w", err) 166 | } 167 | if metadata.FieldName == name && metadata.FieldType == fieldType { 168 | return next, metadata, nil 169 | } 170 | mp = next 171 | } 172 | // we haven't found the index, so we need to create it 173 | next, err := mp.AddNext() 174 | if err != nil { 175 | return nil, nil, fmt.Errorf("failed to add next meta page: %w", err) 176 | } 177 | metadata := &IndexMeta{} 178 | metadata.FieldName = name 179 | metadata.FieldType = fieldType 180 | metadata.Width = DetermineType(fieldType) 181 | metadata.TotalFieldValueLength = uint64(0) 182 | buf, err := metadata.MarshalBinary() 183 | if err != nil { 184 | return nil, nil, fmt.Errorf("failed to marshal metadata: %w", err) 185 | } 186 | return next, metadata, next.SetMetadata(buf) 187 | } 188 | 189 | // Synchronize will synchronize the index file with the data file. 190 | // This is a convenience method and is equivalent to calling 191 | // Synchronize() on the data handler itself. 192 | func (i *IndexFile) Synchronize(df []byte) error { 193 | return i.dataHandler.Synchronize(i, df) 194 | } 195 | 196 | func (i *IndexFile) SetBenchmarkFile(f io.Writer) { 197 | t0 := time.Now() 198 | i.BenchmarkCallback = func(n int) { 199 | // write timestamp, number of records, and number of pages 200 | dt := time.Since(t0) 201 | fmt.Fprintf(f, "%d,%d,%d\n", dt.Microseconds(), n, i.pf.PageCount()) 202 | } 203 | } 204 | 205 | func (i *IndexFile) IsSearch(fieldName string) bool { 206 | for _, sh := range i.searchHeaders { 207 | if fieldName == sh { 208 | return true 209 | } 210 | } 211 | 212 | return false 213 | } 214 | -------------------------------------------------------------------------------- /pkg/appendable/index_file_test.go: -------------------------------------------------------------------------------- 1 | package appendable 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/kevmo314/appendable/pkg/buftest" 7 | ) 8 | 9 | type FormatHandler struct{ ReturnsFormat Format } 10 | 11 | func (f FormatHandler) Format() Format { 12 | return f.ReturnsFormat 13 | } 14 | 15 | func (f FormatHandler) Synchronize(f1 *IndexFile, df []byte) error { 16 | return nil 17 | } 18 | 19 | func (f FormatHandler) Parse(data []byte) []byte { 20 | return nil 21 | } 22 | 23 | func TestIndexFile(t *testing.T) { 24 | t.Run("validate metadata throws error if format doesn't match on second read", func(t *testing.T) { 25 | f := buftest.NewSeekableBuffer() 26 | 27 | var em []string 28 | 29 | if _, err := NewIndexFile( 30 | f, 31 | &FormatHandler{ReturnsFormat: Format(1)}, 32 | em, 33 | ); err != nil { 34 | t.Fatal(err) 35 | } 36 | 37 | // try creating a new index file with a different format 38 | if _, err := NewIndexFile(f, &FormatHandler{ReturnsFormat: Format(2)}, em); err == nil { 39 | t.Fatal("expected error") 40 | } 41 | }) 42 | } 43 | 44 | func TestWidthAllocation(t *testing.T) { 45 | 46 | type Truth struct { 47 | Type FieldType 48 | Width uint16 49 | } 50 | 51 | t.Run("should correctly allocate the fixed width or else for a given type", func(t *testing.T) { 52 | 53 | 
ws := [8]Truth{
54 | {FieldTypeArray, 0},
55 | {FieldTypeBoolean, 2},
56 | {FieldTypeNull, 1},
57 | {FieldTypeFloat64, 9},
58 | {FieldTypeInt64, 9},
59 | {FieldTypeObject, 0},
60 | {FieldTypeString, 0},
61 | {FieldTypeUint64, 9},
62 | }
63 |
64 | for _, w := range ws {
65 | expected := w.Width
66 | actual := DetermineType(w.Type)
67 |
68 | if expected != actual {
69 | t.Errorf("For type: %v, expected: %v, got: %v", w.Type, expected, actual)
70 | }
71 | }
72 | })
73 | }
74 |
-------------------------------------------------------------------------------- /pkg/appendable/typescript.go: --------------------------------------------------------------------------------
1 | package appendable
2 |
3 | // func (f *IndexFile) WriteTypescriptDefinitions(w io.Writer) error {
4 | // _, err := w.Write([]byte(`// This file was generated by github.com/kevmo314/appendable/pkg/appendable/typescript.go`))
5 | // if err != nil {
6 | // return err
7 | // }
8 | // if _, err := w.Write([]byte("\n\nexport type Record = {\n")); err != nil {
9 | // return err
10 | // }
11 | // // iterate over each field in the index header and generate a field for it
12 | // for _, index := range f.Indexes {
13 | // _, err := w.Write([]byte("\t\"" + index.FieldName + "\": " + index.FieldType.TypescriptType() + ";\n"))
14 | // if err != nil {
15 | // return err
16 | // }
17 | // }
18 | // if _, err := w.Write([]byte("}\n")); err != nil {
19 | // return err
20 | // }
21 |
22 | // return nil
23 | // }
24 |
-------------------------------------------------------------------------------- /pkg/bptree/README.md: --------------------------------------------------------------------------------
1 | # kevmo314/appendable/bptree
2 |
3 | This package implements an on-disk B+ tree, taking some inspiration from
4 | https://github.com/spy16/kiwi/tree/master/index/bptree.
5 |
6 | ## On the significance of the 4kB page size
7 |
8 | The B+ tree is designed to be stored on disk and, as such, it is designed to
9 | take advantage of the 4kB page size of most disks. In practice, however, we
10 | don't see a material impact on performance when using alternative sizes. So
11 | why do we choose 4kB pages?
12 |
13 | In order to garbage collect old B+ tree nodes, we want pointers to freed
14 | pages so that we can deallocate them entirely. If we did not use fixed page
15 | sizes and instead stored nodes contiguously, it would be difficult to reclaim
16 | exactly the bytes a node occupied, and we would end up with fragmentation. By
17 | using fixed-size pages, we can simply keep a list of freed pages, reclaim
18 | each one wholesale, and be sure that any freed page is large enough to hold a
19 | new node.
20 |
21 | Therefore, we must choose a page size that is large enough to store a node.
22 | The choice of 4kB specifically is arbitrary, but it aligns nicely with the page size of most disks.
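
To make the free-list idea concrete, here is a minimal in-memory sketch (an
illustration of the scheme described above, not the actual `pagefile`
implementation, which persists the free page index inside the index file
itself):

```go
package main

import "fmt"

// freeList tracks page indexes that have been released. Because every
// allocation is exactly one fixed-size (e.g. 4kB) page, any freed page is
// guaranteed to fit any future node, so no size-fit bookkeeping is needed.
type freeList struct {
	free []int64 // indexes of freed pages, reused LIFO
	next int64   // next never-allocated page index
}

// alloc returns a page index, preferring recycled pages.
func (f *freeList) alloc() int64 {
	if n := len(f.free); n > 0 {
		page := f.free[n-1]
		f.free = f.free[:n-1]
		return page
	}
	page := f.next
	f.next++
	return page
}

// release marks a page as reusable; the whole page is reclaimed at once, so
// no fragmentation accumulates.
func (f *freeList) release(page int64) {
	f.free = append(f.free, page)
}

func main() {
	var fl freeList
	a := fl.alloc() // page 0
	b := fl.alloc() // page 1
	fl.release(a)
	fmt.Println(fl.alloc(), b) // prints "0 1": page 0 is recycled
}
```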
23 | -------------------------------------------------------------------------------- /pkg/bptree/node.go: -------------------------------------------------------------------------------- 1 | package bptree 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "github.com/kevmo314/appendable/pkg/encoding" 7 | "github.com/kevmo314/appendable/pkg/pointer" 8 | "io" 9 | ) 10 | 11 | type DataParser interface { 12 | Parse([]byte) []byte 13 | } 14 | 15 | type BPTreeNode struct { 16 | Data []byte 17 | DataParser DataParser 18 | // contains the offset of the child node or the offset of the record for leaf 19 | // if the node is a leaf, the last pointer is the offset of the next leaf 20 | LeafPointers []pointer.MemoryPointer 21 | InternalPointers []uint64 22 | Keys []pointer.ReferencedValue 23 | 24 | // the expected width for the BPTree's type 25 | Width uint16 26 | } 27 | 28 | func (n *BPTreeNode) Leaf() bool { 29 | return len(n.LeafPointers) > 0 30 | } 31 | 32 | func (n *BPTreeNode) Pointer(i int) pointer.MemoryPointer { 33 | if n.Leaf() { 34 | return n.LeafPointers[i] 35 | } 36 | return pointer.MemoryPointer{Offset: n.InternalPointers[i]} 37 | } 38 | 39 | func (n *BPTreeNode) NumPointers() int { 40 | return len(n.InternalPointers) + len(n.LeafPointers) 41 | } 42 | 43 | func (n *BPTreeNode) Size() int64 { 44 | size := 4 // number of keys 45 | for _, k := range n.Keys { 46 | o := encoding.SizeVarint(uint64(k.DataPointer.Offset)) 47 | l := encoding.SizeVarint(uint64(k.DataPointer.Length)) 48 | size += l + o 49 | 50 | if n.Width != uint16(0) { 51 | size += len(k.Value) 52 | } 53 | } 54 | for _, n := range n.LeafPointers { 55 | o := encoding.SizeVarint(uint64(n.Offset)) 56 | l := encoding.SizeVarint(uint64(n.Length)) 57 | size += o + l 58 | } 59 | for _, n := range n.InternalPointers { 60 | o := len(binary.AppendUvarint([]byte{}, n)) 61 | size += o 62 | } 63 | return int64(size) 64 | } 65 | 66 | func (n *BPTreeNode) MarshalBinary() ([]byte, error) { 67 | size := int32(len(n.Keys)) 68 | 69 | if size == 0 { 70 | panic("writing empty node") 71 | } 72 | buf := make([]byte, n.Size()) 73 | // set the first bit to 1 if it's a leaf 74 | if n.Leaf() { 75 | binary.LittleEndian.PutUint32(buf[:4], uint32(-size)) 76 | } else { 77 | binary.LittleEndian.PutUint32(buf[:4], uint32(size)) 78 | } 79 | ct := 4 80 | for _, k := range n.Keys { 81 | on := binary.PutUvarint(buf[ct:], k.DataPointer.Offset) 82 | ln := binary.PutUvarint(buf[ct+on:], uint64(k.DataPointer.Length)) 83 | ct += on + ln 84 | if n.Width != uint16(0) { 85 | m := copy(buf[ct:ct+len(k.Value)], k.Value) 86 | if m != len(k.Value) { 87 | return nil, fmt.Errorf("failed to copy key: %w", io.ErrShortWrite) 88 | } 89 | ct += m 90 | } 91 | } 92 | for _, p := range n.LeafPointers { 93 | on := binary.PutUvarint(buf[ct:], p.Offset) 94 | ln := binary.PutUvarint(buf[ct+on:], uint64(p.Length)) 95 | 96 | ct += on + ln 97 | } 98 | for _, p := range n.InternalPointers { 99 | on := binary.PutUvarint(buf[ct:], p) 100 | ct += on 101 | } 102 | if ct != int(n.Size()) { 103 | panic("size mismatch") 104 | } 105 | return buf, nil 106 | } 107 | 108 | func (n *BPTreeNode) WriteTo(w io.Writer) (int64, error) { 109 | buf, err := n.MarshalBinary() 110 | if err != nil { 111 | return 0, err 112 | } 113 | m, err := w.Write(buf) 114 | return int64(m), err 115 | } 116 | 117 | func (n *BPTreeNode) UnmarshalBinary(buf []byte) error { 118 | size := int32(binary.LittleEndian.Uint32(buf[:4])) 119 | leaf := size < 0 120 | if leaf { 121 | n.LeafPointers = make([]pointer.MemoryPointer, -size) 122 
| n.Keys = make([]pointer.ReferencedValue, -size) 123 | } else { 124 | n.InternalPointers = make([]uint64, size+1) 125 | n.Keys = make([]pointer.ReferencedValue, size) 126 | } 127 | if size == 0 { 128 | panic("empty node") 129 | } 130 | 131 | m := 4 132 | for i := range n.Keys { 133 | o, on := binary.Uvarint(buf[m:]) 134 | l, ln := binary.Uvarint(buf[m+on:]) 135 | 136 | n.Keys[i].DataPointer.Offset = o 137 | n.Keys[i].DataPointer.Length = uint32(l) 138 | 139 | m += on + ln 140 | 141 | if n.Width == uint16(0) { 142 | // read the key out of the memory pointer stored at this position 143 | dp := n.Keys[i].DataPointer 144 | n.Keys[i].Value = n.DataParser.Parse(n.Data[dp.Offset : dp.Offset+uint64(dp.Length)]) // resolving the data-file 145 | } else { 146 | n.Keys[i].Value = buf[m : m+int(n.Width-1)] 147 | m += int(n.Width - 1) 148 | } 149 | } 150 | for i := range n.LeafPointers { 151 | 152 | o, on := binary.Uvarint(buf[m:]) 153 | l, ln := binary.Uvarint(buf[m+on:]) 154 | 155 | n.LeafPointers[i].Offset = o 156 | n.LeafPointers[i].Length = uint32(l) 157 | m += on + ln 158 | } 159 | for i := range n.InternalPointers { 160 | o, on := binary.Uvarint(buf[m:]) 161 | n.InternalPointers[i] = o 162 | m += on 163 | } 164 | return nil 165 | } 166 | -------------------------------------------------------------------------------- /pkg/bptree/node_test.go: -------------------------------------------------------------------------------- 1 | package bptree 2 | 3 | import ( 4 | "bytes" 5 | "github.com/kevmo314/appendable/pkg/pointer" 6 | "reflect" 7 | "testing" 8 | ) 9 | 10 | func TestBPTreeNode_ReadWriteLeaf(t *testing.T) { 11 | // Create a test BPTreeNode 12 | node1 := &BPTreeNode{ 13 | LeafPointers: []pointer.MemoryPointer{ 14 | {Offset: 0, Length: 3}, 15 | {Offset: 3, Length: 3}, 16 | {Offset: 6, Length: 3}, 17 | }, 18 | Keys: []pointer.ReferencedValue{ 19 | {Value: []byte{0, 1, 2}}, 20 | {Value: []byte{1, 2, 3}}, 21 | {Value: []byte{3, 4, 5}}, 22 | }, 23 | Width: uint16(4), 24 | } 25 | 26 | buf := &bytes.Buffer{} 27 | if _, err := node1.WriteTo(buf); err != nil { 28 | t.Fatal(err) 29 | } 30 | 31 | node2 := &BPTreeNode{Width: uint16(4)} 32 | if err := node2.UnmarshalBinary(buf.Bytes()); err != nil { 33 | t.Fatal(err) 34 | } 35 | 36 | if !node2.Leaf() { 37 | t.Fatal("expected leaf node") 38 | } 39 | 40 | if !reflect.DeepEqual(node1, node2) { 41 | t.Fatalf("expected %#v\ngot %#v", node1, node2) 42 | } 43 | } 44 | 45 | func TestBPTreeNode_ReadWriteIntermediate(t *testing.T) { 46 | // Create a test BPTreeNode 47 | node1 := &BPTreeNode{ 48 | InternalPointers: []uint64{0, 1, 2, 3}, 49 | Keys: []pointer.ReferencedValue{ 50 | {Value: []byte{0, 1}}, 51 | {Value: []byte{1, 2}}, 52 | {Value: []byte{3, 4}}, 53 | }, 54 | Width: uint16(3), 55 | } 56 | 57 | buf := &bytes.Buffer{} 58 | if _, err := node1.WriteTo(buf); err != nil { 59 | t.Fatal(err) 60 | } 61 | 62 | node2 := &BPTreeNode{Width: uint16(3)} 63 | if err := node2.UnmarshalBinary(buf.Bytes()); err != nil { 64 | t.Fatal(err) 65 | } 66 | 67 | if node2.Leaf() { 68 | t.Fatal("expected intermediate node") 69 | } 70 | 71 | if !reflect.DeepEqual(node1, node2) { 72 | t.Fatalf("expected %#v, got %#v", node1, node2) 73 | } 74 | } 75 | 76 | func TestBPTreeNode_CompareReferencedValues(t *testing.T) { 77 | rv := []pointer.ReferencedValue{ 78 | { 79 | Value: []byte{0}, 80 | }, 81 | { 82 | Value: []byte{1}, 83 | DataPointer: pointer.MemoryPointer{Offset: 0}, 84 | }, { 85 | Value: []byte{1}, 86 | DataPointer: pointer.MemoryPointer{Offset: 1}, 87 | }, { 88 | Value: []byte{1}, 89 | 
DataPointer: pointer.MemoryPointer{Offset: 1, Length: 1}, 90 | }, 91 | } 92 | for i := 0; i < len(rv); i++ { 93 | for j := 0; j < len(rv); j++ { 94 | cmp := pointer.CompareReferencedValues(rv[i], rv[j]) 95 | if i < j && cmp >= 0 { 96 | t.Fatalf("expected %d < %d", i, j) 97 | } 98 | if i > j && cmp <= 0 { 99 | t.Fatalf("expected %d > %d", i, j) 100 | } 101 | if i == j && cmp != 0 { 102 | t.Fatalf("expected %d == %d", i, j) 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /pkg/btree/node.go: -------------------------------------------------------------------------------- 1 | package btree 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "github.com/kevmo314/appendable/pkg/encoding" 7 | "github.com/kevmo314/appendable/pkg/hnsw" 8 | "github.com/kevmo314/appendable/pkg/pointer" 9 | "io" 10 | "math" 11 | ) 12 | 13 | type BTreeNode struct { 14 | Ids []pointer.ReferencedId 15 | Vectors []hnsw.Point 16 | 17 | Offsets []uint64 18 | Width uint16 19 | VectorDim uint64 20 | } 21 | 22 | func (n *BTreeNode) Size() int64 { 23 | size := 4 24 | 25 | for _, k := range n.Ids { 26 | size += encoding.SizeVarint(k.DataPointer.Offset) 27 | size += encoding.SizeVarint(uint64(k.DataPointer.Length)) 28 | size += encoding.SizeVarint(uint64(k.Value)) 29 | } 30 | 31 | for _, n := range n.Offsets { 32 | size += encoding.SizeVarint(n) 33 | } 34 | 35 | if n.VectorDim == 0 { 36 | panic("VectorDim cannot be zero") 37 | } 38 | 39 | size += encoding.SizeVarint(n.VectorDim) 40 | size += len(n.Vectors) * (4 * int(n.VectorDim)) 41 | 42 | return int64(size) 43 | } 44 | 45 | func (n *BTreeNode) Leaf() bool { 46 | return len(n.Offsets) == 0 47 | } 48 | 49 | func (n *BTreeNode) MarshalBinary() ([]byte, error) { 50 | size := int32(len(n.Ids)) 51 | 52 | if size == 0 { 53 | panic("writing empty node, no ids found!") 54 | } 55 | 56 | buf := make([]byte, n.Size()) 57 | 58 | if n.Leaf() { 59 | binary.LittleEndian.PutUint32(buf[:4], uint32(-size)) 60 | } else { 61 | binary.LittleEndian.PutUint32(buf[:4], uint32(size)) 62 | } 63 | 64 | ct := 4 65 | for _, k := range n.Ids { 66 | on := binary.PutUvarint(buf[ct:], k.DataPointer.Offset) 67 | ln := binary.PutUvarint(buf[ct+on:], uint64(k.DataPointer.Length)) 68 | vn := binary.PutUvarint(buf[ct+on+ln:], uint64(k.Value)) 69 | ct += on + ln + vn 70 | } 71 | 72 | for _, n := range n.Offsets { 73 | on := binary.PutUvarint(buf[ct:], n) 74 | ct += on 75 | } 76 | 77 | vdn := binary.PutUvarint(buf[ct:], n.VectorDim) 78 | ct += vdn 79 | 80 | for _, v := range n.Vectors { 81 | for _, elem := range v { 82 | binary.LittleEndian.PutUint32(buf[ct:], math.Float32bits(elem)) 83 | ct += 4 84 | } 85 | } 86 | 87 | if ct != int(n.Size()) { 88 | panic(fmt.Sprintf("size mismatch. 
ct: %v, size: %v", ct, n.Size())) 89 | } 90 | 91 | return buf, nil 92 | } 93 | 94 | func (n *BTreeNode) UnmarshalBinary(buf []byte) error { 95 | size := int32(binary.LittleEndian.Uint32(buf[:4])) 96 | leaf := size < 0 97 | 98 | if leaf { 99 | n.Ids = make([]pointer.ReferencedId, -size) 100 | n.Vectors = make([]hnsw.Point, -size) 101 | n.Offsets = make([]uint64, 0) 102 | } else { 103 | n.Ids = make([]pointer.ReferencedId, size) 104 | n.Vectors = make([]hnsw.Point, size) 105 | n.Offsets = make([]uint64, size+1) 106 | } 107 | 108 | if size == 0 { 109 | panic("empty node") 110 | } 111 | 112 | m := 4 113 | for i := range n.Ids { 114 | o, on := binary.Uvarint(buf[m:]) 115 | l, ln := binary.Uvarint(buf[m+on:]) 116 | 117 | n.Ids[i].DataPointer.Offset = o 118 | n.Ids[i].DataPointer.Length = uint32(l) 119 | 120 | m += on + ln 121 | 122 | v, vn := binary.Uvarint(buf[m:]) 123 | n.Ids[i].Value = hnsw.Id(v) 124 | 125 | m += vn 126 | } 127 | 128 | if !leaf { 129 | for i := range n.Offsets { 130 | o, on := binary.Uvarint(buf[m:]) 131 | n.Offsets[i] = o 132 | m += on 133 | } 134 | } 135 | 136 | vecdim, vdn := binary.Uvarint(buf[m:]) 137 | n.VectorDim = vecdim 138 | m += vdn 139 | 140 | for i := range n.Vectors { 141 | vector := make(hnsw.Point, vecdim) 142 | 143 | for vi := range vector { 144 | vector[vi] = math.Float32frombits(binary.LittleEndian.Uint32(buf[m:])) 145 | m += 4 146 | } 147 | 148 | n.Vectors[i] = vector 149 | } 150 | 151 | return nil 152 | } 153 | 154 | func (n *BTreeNode) WriteTo(w io.Writer) (int64, error) { 155 | buf, err := n.MarshalBinary() 156 | if err != nil { 157 | return 0, err 158 | } 159 | m, err := w.Write(buf) 160 | return int64(m), err 161 | } 162 | -------------------------------------------------------------------------------- /pkg/btree/node_test.go: -------------------------------------------------------------------------------- 1 | package btree 2 | 3 | import ( 4 | "bytes" 5 | "github.com/kevmo314/appendable/pkg/hnsw" 6 | "github.com/kevmo314/appendable/pkg/pointer" 7 | "reflect" 8 | "testing" 9 | ) 10 | 11 | func TestBTreeNode_Size(t *testing.T) { 12 | t.Run("node size", func(t *testing.T) { 13 | n := &BTreeNode{ // 4 14 | Ids: []pointer.ReferencedId{{Value: 1}, {Value: 2}, {Value: 3}}, // 3 * (3) 15 | Vectors: []hnsw.Point{{1, 1}, {2, 2}, {3, 3}}, // 6 * 4 == 3 * 2 * 4 // 24 16 | Offsets: make([]uint64, 0), 17 | VectorDim: 2, // 1 18 | } 19 | 20 | if n.Size() != 38 { 21 | t.Fatalf("wrong size: %d", n.Size()) 22 | } 23 | }) 24 | } 25 | 26 | func TestBTreeNode_MarshalBinary(t *testing.T) { 27 | t.Run("leaf node", func(t *testing.T) { 28 | n := &BTreeNode{ 29 | Ids: []pointer.ReferencedId{ 30 | {Value: 1}, 31 | {Value: 2}, 32 | {Value: 3}, 33 | }, 34 | Vectors: []hnsw.Point{{0, 0}, {0, 0}, {0, 0}}, 35 | Offsets: make([]uint64, 0), 36 | VectorDim: 2, 37 | } 38 | 39 | buf := &bytes.Buffer{} 40 | if _, err := n.WriteTo(buf); err != nil { 41 | t.Fatal(err) 42 | } 43 | 44 | m := &BTreeNode{} 45 | if err := m.UnmarshalBinary(buf.Bytes()); err != nil { 46 | t.Fatal(err) 47 | } 48 | 49 | if !m.Leaf() { 50 | t.Fatalf("expected leaf node, but got %v offsets", len(m.Offsets)) 51 | } 52 | 53 | if !reflect.DeepEqual(n, m) { 54 | t.Fatalf("encoded\n%#v\ndecoded\n%#v", n, m) 55 | } 56 | }) 57 | 58 | t.Run("intermediate node", func(t *testing.T) { 59 | n := &BTreeNode{ 60 | Ids: []pointer.ReferencedId{ 61 | {Value: 1}, 62 | {Value: 2}, 63 | {Value: 3}, 64 | }, 65 | Vectors: []hnsw.Point{{0, 0}, {0, 0}, {0, 0}}, 66 | Offsets: []uint64{0, 4096, 8192, 6969}, 67 | VectorDim: 2, 68 | } 69 | 70 | buf := 
&bytes.Buffer{} 71 | if _, err := n.WriteTo(buf); err != nil { 72 | t.Fatal(err) 73 | } 74 | 75 | m := &BTreeNode{} 76 | if err := m.UnmarshalBinary(buf.Bytes()); err != nil { 77 | t.Fatal(err) 78 | } 79 | 80 | if m.Leaf() { 81 | t.Fatal("expected intermediate node") 82 | } 83 | 84 | if !reflect.DeepEqual(n, m) { 85 | t.Fatalf("encoded\n%#v\ndecoded\n%#v", n, m) 86 | } 87 | }) 88 | } 89 | -------------------------------------------------------------------------------- /pkg/buftest/buffer.go: -------------------------------------------------------------------------------- 1 | package buftest 2 | 3 | import ( 4 | "io" 5 | "os" 6 | ) 7 | 8 | // SeekableBuffer is a buffer that can be seeked into. 9 | // this replicates the behavior of a file on disk without having to write to disk 10 | // which is useful for testing. 11 | type SeekableBuffer struct { 12 | buf []byte 13 | pos int 14 | } 15 | 16 | func NewSeekableBuffer() *SeekableBuffer { 17 | return &SeekableBuffer{} 18 | } 19 | 20 | func (b *SeekableBuffer) Bytes() []byte { 21 | return b.buf 22 | } 23 | 24 | func (b *SeekableBuffer) Write(p []byte) (int, error) { 25 | n := copy(b.buf[b.pos:], p) 26 | if n < len(p) { 27 | b.buf = append(b.buf, p[n:]...) 28 | } 29 | b.pos += len(p) 30 | return len(p), nil 31 | } 32 | 33 | func (b *SeekableBuffer) Seek(offset int64, whence int) (int64, error) { 34 | switch whence { 35 | case io.SeekStart: 36 | b.pos = int(offset) 37 | case io.SeekCurrent: 38 | b.pos += int(offset) 39 | case io.SeekEnd: 40 | b.pos = len(b.buf) + int(offset) 41 | } 42 | if b.pos < 0 { 43 | b.pos = 0 44 | } 45 | if b.pos > len(b.buf) { 46 | b.pos = len(b.buf) 47 | } 48 | return int64(b.pos), nil 49 | } 50 | 51 | func (b *SeekableBuffer) Read(p []byte) (int, error) { 52 | if b.pos >= len(b.buf) { 53 | return 0, io.EOF 54 | } 55 | n := copy(p, b.buf[b.pos:]) 56 | b.pos += n 57 | return n, nil 58 | } 59 | 60 | func (b *SeekableBuffer) Truncate(size int64) error { 61 | if size < 0 { 62 | return io.ErrShortBuffer 63 | } 64 | if size > int64(len(b.buf)) { 65 | return io.ErrShortWrite 66 | } 67 | b.buf = b.buf[:size] 68 | return nil 69 | } 70 | 71 | func (b *SeekableBuffer) WriteAt(p []byte, off int64) (int, error) { 72 | if off < 0 { 73 | return 0, io.ErrShortBuffer 74 | } 75 | if off > int64(len(b.buf)) { 76 | return 0, io.ErrShortWrite 77 | } 78 | n := copy(b.buf[off:], p) 79 | if n < len(p) { 80 | b.buf = append(b.buf, p[n:]...) 
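// (bytes that spill past the current end are appended, so an in-bounds
// WriteAt can grow the buffer exactly like appending to a file on disk;
// writes that start past the end were already rejected above)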
81 | } 82 | return len(p), nil 83 | } 84 | 85 | func (b *SeekableBuffer) ReadAt(p []byte, off int64) (int, error) { 86 | if off < 0 { 87 | return 0, io.ErrShortBuffer 88 | } 89 | if off > int64(len(b.buf)) { 90 | return 0, io.EOF 91 | } 92 | n := copy(p, b.buf[off:]) 93 | return n, nil 94 | } 95 | 96 | func (b *SeekableBuffer) WriteToDisk(filename string) error { 97 | return os.WriteFile(filename, b.buf, 0644) 98 | } 99 | 100 | var _ io.ReadWriteSeeker = &SeekableBuffer{} 101 | var _ io.ReaderAt = &SeekableBuffer{} 102 | var _ io.WriterAt = &SeekableBuffer{} 103 | -------------------------------------------------------------------------------- /pkg/buftest/buffer_test.go: -------------------------------------------------------------------------------- 1 | package buftest 2 | 3 | import ( 4 | "io" 5 | "testing" 6 | ) 7 | 8 | func TestSeekableBuffer(t *testing.T) { 9 | t.Run("Write", func(t *testing.T) { 10 | b := NewSeekableBuffer() 11 | n, err := b.Write([]byte("hello")) 12 | if err != nil { 13 | t.Fatal(err) 14 | } 15 | if n != 5 { 16 | t.Fatalf("expected to write 5 bytes, wrote %d", n) 17 | } 18 | if string(b.buf) != "hello" { 19 | t.Fatalf("expected to write 'hello', wrote %s", string(b.buf)) 20 | } 21 | }) 22 | 23 | t.Run("write to end", func(t *testing.T) { 24 | b := NewSeekableBuffer() 25 | if _, err := b.Write([]byte("hello")); err != nil { 26 | t.Fatal(err) 27 | } 28 | if _, err := b.Seek(-2, io.SeekEnd); err != nil { 29 | t.Fatal(err) 30 | } 31 | if _, err := b.Write([]byte("world")); err != nil { 32 | t.Fatal(err) 33 | } 34 | if string(b.buf) != "helworld" { 35 | t.Fatalf("expected to write 'helworld', wrote %s", string(b.buf)) 36 | } 37 | }) 38 | 39 | t.Run("Seek", func(t *testing.T) { 40 | b := NewSeekableBuffer() 41 | if _, err := b.Write([]byte("helloo")); err != nil { 42 | t.Fatal(err) 43 | } 44 | if _, err := b.Seek(0, io.SeekStart); err != nil { 45 | t.Fatal(err) 46 | } 47 | if _, err := b.Write([]byte("world")); err != nil { 48 | t.Fatal(err) 49 | } 50 | if string(b.buf) != "worldo" { 51 | t.Fatalf("expected to write 'worldo', wrote %s", string(b.buf)) 52 | } 53 | }) 54 | 55 | t.Run("Read", func(t *testing.T) { 56 | b := NewSeekableBuffer() 57 | if _, err := b.Write([]byte("hello")); err != nil { 58 | t.Fatal(err) 59 | } 60 | if _, err := b.Seek(0, io.SeekStart); err != nil { 61 | t.Fatal(err) 62 | } 63 | buf := make([]byte, 5) 64 | n, err := b.Read(buf) 65 | if err != nil { 66 | t.Fatal(err) 67 | } 68 | if n != 5 { 69 | t.Fatalf("expected to read 5 bytes, read %d", n) 70 | } 71 | if string(buf) != "hello" { 72 | t.Fatalf("expected to read 'hello', read %s", string(buf)) 73 | } 74 | }) 75 | 76 | t.Run("read from middle", func(t *testing.T) { 77 | b := NewSeekableBuffer() 78 | if _, err := b.Write([]byte("hello")); err != nil { 79 | t.Fatal(err) 80 | } 81 | if _, err := b.Seek(2, io.SeekStart); err != nil { 82 | t.Fatal(err) 83 | } 84 | buf := make([]byte, 3) 85 | n, err := b.Read(buf) 86 | if err != nil { 87 | t.Fatal(err) 88 | } 89 | if n != 3 { 90 | t.Fatalf("expected to read 3 bytes, read %d", n) 91 | } 92 | if string(buf) != "llo" { 93 | t.Fatalf("expected to read 'llo', read %s", string(buf)) 94 | } 95 | }) 96 | 97 | t.Run("truncate", func(t *testing.T) { 98 | b := NewSeekableBuffer() 99 | if _, err := b.Write([]byte("hello")); err != nil { 100 | t.Fatal(err) 101 | } 102 | if err := b.Truncate(3); err != nil { 103 | t.Fatal(err) 104 | } 105 | if string(b.buf) != "hel" { 106 | t.Fatalf("expected to truncate to 'hel', truncated to %s", string(b.buf)) 107 | } 108 | }) 
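	// Sketch of one more case that could live here (not part of the original
	// suite): ReadAt reads at an absolute offset and, unlike Read, does not
	// move the internal position.
	//
	//	t.Run("ReadAt", func(t *testing.T) {
	//		b := NewSeekableBuffer()
	//		if _, err := b.Write([]byte("hello")); err != nil {
	//			t.Fatal(err)
	//		}
	//		buf := make([]byte, 3)
	//		if _, err := b.ReadAt(buf, 1); err != nil {
	//			t.Fatal(err)
	//		}
	//		if string(buf) != "ell" {
	//			t.Fatalf("expected to read 'ell', read %s", string(buf))
	//		}
	//	})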
109 | } 110 | -------------------------------------------------------------------------------- /pkg/encoding/sizeVarint.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | import "math/bits" 4 | 5 | func SizeVarint(v uint64) int { 6 | return int(9*uint32(bits.Len64(v))+64) / 64 7 | } 8 | -------------------------------------------------------------------------------- /pkg/encoding/sizeVarint_test.go: -------------------------------------------------------------------------------- 1 | package encoding 2 | 3 | import ( 4 | "encoding/binary" 5 | "math/rand" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | func TestSizeVariant(t *testing.T) { 11 | rand.Seed(time.Now().UnixNano()) 12 | 13 | const iterations = 1000 14 | 15 | for i := 0; i < iterations; i++ { 16 | randomNumber := rand.Uint64() 17 | 18 | x := len(binary.AppendUvarint([]byte{}, randomNumber)) 19 | y := SizeVarint(randomNumber) 20 | 21 | if x != y { 22 | t.Fatalf("Mismatch for %d: binary.AppendUvarint size = %d, SizeVarint size = %d", randomNumber, x, y) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /pkg/handlers/csv.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "encoding/csv" 7 | "fmt" 8 | "github.com/kevmo314/appendable/pkg/pointer" 9 | "io" 10 | "log/slog" 11 | "math" 12 | "strconv" 13 | "strings" 14 | 15 | "github.com/kevmo314/appendable/pkg/appendable" 16 | "github.com/kevmo314/appendable/pkg/bptree" 17 | ) 18 | 19 | type CSVHandler struct { 20 | io.ReadSeeker 21 | } 22 | 23 | var _ appendable.DataHandler = (*CSVHandler)(nil) 24 | 25 | func (c CSVHandler) Format() appendable.Format { 26 | return appendable.FormatCSV 27 | } 28 | 29 | func (c CSVHandler) Synchronize(f *appendable.IndexFile, df []byte) error { 30 | slog.Debug("Starting CSV synchronization") 31 | 32 | var headers []string 33 | var err error 34 | 35 | metadata, err := f.Metadata() 36 | if err != nil { 37 | return fmt.Errorf("failed to read metadata: %w", err) 38 | } 39 | 40 | fieldNames, err := f.IndexFieldNames() 41 | if err != nil { 42 | return fmt.Errorf("failed to retrieve index field names: %w", err) 43 | } 44 | headers = fieldNames 45 | 46 | for { 47 | i := bytes.IndexByte(df[metadata.ReadOffset:], '\n') 48 | if i == -1 { 49 | break 50 | } 51 | 52 | if len(headers) == 0 { 53 | slog.Info("Parsing CSV headers") 54 | dec := csv.NewReader(bytes.NewReader(df[metadata.ReadOffset : metadata.ReadOffset+uint64(i)])) 55 | headers, err = dec.Read() 56 | if err != nil { 57 | slog.Error("failed to parse CSV header", "error", err) 58 | return fmt.Errorf("failed to parse CSV header: %w", err) 59 | } 60 | metadata.ReadOffset += uint64(i) + 1 61 | continue 62 | } 63 | 64 | dec := csv.NewReader(bytes.NewReader(df[metadata.ReadOffset : metadata.ReadOffset+uint64(i)])) 65 | 66 | if err := c.handleCSVLine(f, df, dec, headers, []string{}, pointer.MemoryPointer{ 67 | Offset: metadata.ReadOffset, 68 | Length: uint32(i), 69 | }); err != nil { 70 | return fmt.Errorf("failed to handle object: %w", err) 71 | } 72 | 73 | metadata.ReadOffset += uint64(i) + 1 // include the newline 74 | } 75 | 76 | // update the metadata 77 | if err := f.SetMetadata(metadata); err != nil { 78 | return fmt.Errorf("failed to set metadata: %w", err) 79 | } 80 | 81 | slog.Debug("indexes", slog.Any("", f.Indexes)) 82 | slog.Debug("Ending CSV synchronization") 83 | slog.Debug("=========") 
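	// the loop above stops at the first incomplete line; because ReadOffset
	// has been persisted with the metadata, the next Synchronize call resumes
	// exactly where this one stopped, which is what makes the index safe to
	// rebuild incrementally as the data file is appended to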
84 | return nil 85 | } 86 | 87 | func fieldRankCsvField(fieldValue any) int { 88 | slog.Debug("serialize", slog.Any("fieldValue", fieldValue)) 89 | switch fieldValue.(type) { 90 | case nil: 91 | slog.Debug("nil", slog.Any("fieldValue", fieldValue)) 92 | return 1 93 | case bool: 94 | slog.Debug("bool", slog.Any("fieldValue", fieldValue)) 95 | return 2 96 | case int, int8, int16, int32, int64, float32, float64: 97 | slog.Debug("number", slog.Any("fieldValue", fieldValue)) 98 | return 3 99 | case string: 100 | slog.Debug("string", slog.Any("fieldValue", fieldValue)) 101 | return 4 102 | default: 103 | panic("unknown type") 104 | } 105 | } 106 | 107 | func InferCSVField(fieldValue string) (interface{}, appendable.FieldType) { 108 | if fieldValue == "" { 109 | return nil, appendable.FieldTypeNull 110 | } 111 | 112 | if i, err := strconv.Atoi(fieldValue); err == nil { 113 | 114 | return float64(i), appendable.FieldTypeFloat64 115 | } 116 | 117 | if f, err := strconv.ParseFloat(fieldValue, 64); err == nil { 118 | 119 | return float64(f), appendable.FieldTypeFloat64 120 | } 121 | 122 | if b, err := strconv.ParseBool(fieldValue); err == nil { 123 | return b, appendable.FieldTypeBoolean 124 | } 125 | 126 | return fieldValue, appendable.FieldTypeString 127 | } 128 | 129 | func (c CSVHandler) Parse(value []byte) []byte { 130 | parsed, fieldType := InferCSVField(string(value)) 131 | 132 | switch fieldType { 133 | case appendable.FieldTypeFloat64: 134 | buf := make([]byte, 8) 135 | binary.BigEndian.PutUint64(buf, math.Float64bits(parsed.(float64))) 136 | return buf 137 | case appendable.FieldTypeBoolean: 138 | if parsed.(bool) { 139 | return []byte{1} 140 | } else { 141 | return []byte{0} 142 | } 143 | case appendable.FieldTypeString: 144 | return []byte(parsed.(string)) 145 | case appendable.FieldTypeNull: 146 | // nil values are a bit of a degenerate case, we are essentially using the bptree 147 | // as a set. we store the value as an empty byte slice. 
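		// (two null keys therefore compare equal on the value itself and fall
		// back to their data pointers for ordering, as exercised by
		// pointer.CompareReferencedValues)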
148 | return []byte{} 149 | } 150 | panic("unknown type") 151 | } 152 | 153 | func (c CSVHandler) handleCSVLine(f *appendable.IndexFile, df []byte, dec *csv.Reader, headers []string, path []string, data pointer.MemoryPointer) error { 154 | record, err := dec.Read() 155 | if err != nil { 156 | slog.Error("Failed to read CSV record at index", "error", err) 157 | return fmt.Errorf("failed to read CSV record: %w", err) 158 | } 159 | 160 | cumulativeLength := uint64(0) 161 | 162 | for fieldIndex, fieldValue := range record { 163 | if fieldIndex >= len(headers) { 164 | slog.Error("Field index is out of bounds with headers", "fieldIndex", fieldIndex, "headers", slog.Any("headers", headers)) 165 | return fmt.Errorf("field index %d is out of bounds with header", fieldIndex) 166 | } 167 | 168 | fieldName := headers[fieldIndex] 169 | 170 | name := strings.Join(append(path, fieldName), ".") 171 | 172 | fieldOffset := data.Offset + cumulativeLength 173 | fieldLength := uint32(len(fieldValue)) 174 | 175 | _, fieldType := InferCSVField(fieldValue) 176 | page, _, err := f.FindOrCreateIndex(name, fieldType) 177 | 178 | if err != nil { 179 | return fmt.Errorf("failed to find or create index: %w", err) 180 | } 181 | 182 | mp := pointer.MemoryPointer{ 183 | Offset: fieldOffset, 184 | Length: fieldLength, 185 | } 186 | 187 | if err := page.BPTree(&bptree.BPTree{Data: df, DataParser: CSVHandler{}, Width: uint16(0)}).Insert(pointer.ReferencedValue{Value: c.Parse([]byte(fieldValue)), DataPointer: mp}, data); err != nil { 188 | return fmt.Errorf("failed to insert into b+tree: %w", err) 189 | } 190 | 191 | cumulativeLength += uint64(fieldLength + 1) 192 | } 193 | 194 | return nil 195 | } 196 | -------------------------------------------------------------------------------- /pkg/hnsw/friends.go: -------------------------------------------------------------------------------- 1 | package hnsw 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | "fmt" 7 | "math" 8 | ) 9 | 10 | type Point []float32 11 | 12 | type Friends struct { 13 | friends []*DistHeap 14 | maxLevels map[Id]int 15 | } 16 | 17 | // NewFriends creates a new vector, note the max level is inclusive. 
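//
// A small usage sketch (the id and distance here are illustrative only):
//
//	f := NewFriends(2)                    // allocates heaps for levels 0, 1 and 2
//	f.InsertFriendsAtLevel(1, Id(7), 0.5) // id 7 becomes a friend on levels 0 and 1
//	h, _ := f.GetFriendsAtLevel(0)
//	closest, _ := h.PeekMinItem()         // closest.id == Id(7)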
18 | func NewFriends(topLevel int) *Friends { 19 | friends := make([]*DistHeap, topLevel+1) 20 | 21 | for i := 0; i <= topLevel; i++ { 22 | friends[i] = NewDistHeap() 23 | } 24 | 25 | return &Friends{ 26 | friends: friends, 27 | maxLevels: make(map[Id]int), 28 | } 29 | } 30 | 31 | func (v *Friends) NumLevels() int { 32 | return len(v.friends) 33 | } 34 | 35 | func (v *Friends) TopLevel() int { 36 | return len(v.friends) - 1 37 | } 38 | 39 | func (v *Friends) HasLevel(level int) bool { 40 | if level < 0 { 41 | panic("level must be a non-negative integer") 42 | } 43 | 44 | return level <= v.TopLevel() 45 | } 46 | 47 | // InsertFriendsAtLevel takes a zero-indexed level that must already exist for this vector and inserts friendId at every level from 0 up to and including that level 48 | func (v *Friends) InsertFriendsAtLevel(level int, friendId Id, dist float32) { 49 | if !v.HasLevel(level) { 50 | panic("failed to insert friends at level, as level is not valid") 51 | } 52 | 53 | for i := 0; i <= level; i++ { 54 | v.friends[i].Insert(friendId, dist) 55 | } 56 | 57 | v.maxLevels[friendId] = level 58 | } 59 | 60 | func (v *Friends) GetFriendsAtLevel(level int) (*DistHeap, error) { 61 | if !v.HasLevel(level) { 62 | return nil, errors.New("failed to get friends at level") 63 | } 64 | 65 | return v.friends[level], nil 66 | } 67 | 68 | func (v *Friends) Flush(numNeighbors int) ([]byte, error) { 69 | if len(v.friends) == 0 { 70 | panic("no levels to be found") 71 | } 72 | 73 | // for every neighbor, we're going to serialize 74 | // +-------+----+ 75 | // | level | id | 76 | 77 | buf := make([]byte, (4+1)*numNeighbors) 78 | 79 | level0 := v.friends[0] 80 | copyLevel0 := level0.Clone() 81 | 82 | for i := 0; i < numNeighbors; i++ { 83 | if copyLevel0.IsEmpty() { 84 | // write out max values here 85 | continue 86 | } 87 | 88 | closestItem, err := copyLevel0.PopMinItem() 89 | if err != nil { 90 | return []byte{}, fmt.Errorf("failed to find closest item in friends: %v", err) 91 | } 92 | 93 | closestId := closestItem.id 94 | closestIdMaxLevel, ok := v.maxLevels[closestId] 95 | 96 | if !ok { 97 | return []byte{}, fmt.Errorf("failed to find id %v in maxLevels map", closestId) 98 | } 99 | 100 | buf[i*(1+4)] = byte(closestIdMaxLevel) 101 | binary.BigEndian.PutUint32(buf[i*(1+4)+1:], uint32(closestId)) 102 | } 103 | 104 | return buf, nil 105 | } 106 | 107 | func EuclidDistance(p0, p1 Point) float32 { 108 | var sum float32 109 | 110 | for i := range p0 { 111 | delta := p0[i] - p1[i] 112 | sum += delta * delta 113 | } 114 | 115 | return float32(math.Sqrt(float64(sum))) 116 | } 117 | 118 | // NearlyEqual is sourced from scalar package written by gonum 119 | // https://pkg.go.dev/gonum.org/v1/gonum/floats/scalar#EqualWithinAbsOrRel 120 | func NearlyEqual(a, b float32) bool { 121 | return EqualWithinAbs(float64(a), float64(b)) || EqualWithinRel(float64(a), float64(b)) 122 | } 123 | 124 | // EqualWithinAbs returns true when a and b have an absolute difference 125 | // not greater than tol. 126 | func EqualWithinAbs(a, b float64) bool { 127 | return a == b || math.Abs(a-b) <= 1e-6 128 | } 129 | 130 | // minNormalFloat64 is the smallest normal number. For 64 bit IEEE-754 131 | // floats this is 2^{-1022}. 132 | const minNormalFloat64 = 0x1p-1022 133 | 134 | // EqualWithinRel returns true when the difference between a and b 135 | // is not greater than tol times the greater absolute value of a and b, 136 | // 137 | // abs(a-b) <= tol * max(abs(a), abs(b)). 
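//
// A worked example with the hard-coded tolerance of 1e-6: a = 1.0 and
// b = 1.0000005 give |a-b| = 5e-7 and 5e-7/1.0000005 <= 1e-6, so the two
// values are treated as equal, while a = 1.0 and b = 1.01 are not.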
138 | func EqualWithinRel(a, b float64) bool { 139 | if a == b { 140 | return true 141 | } 142 | delta := math.Abs(a - b) 143 | if delta <= minNormalFloat64 { 144 | return delta <= 1e-6*minNormalFloat64 145 | } 146 | // We depend on the division in this relationship to identify 147 | // infinities (we rely on the NaN to fail the test) otherwise 148 | // we compare Infs of the same sign and evaluate Infs as equal 149 | // independent of sign. 150 | return delta/math.Max(math.Abs(a), math.Abs(b)) <= 1e-6 151 | } 152 | -------------------------------------------------------------------------------- /pkg/hnsw/friends_test.go: -------------------------------------------------------------------------------- 1 | package hnsw 2 | 3 | import ( 4 | "math" 5 | "reflect" 6 | "testing" 7 | ) 8 | 9 | func TestVector_LevelManagement(t *testing.T) { 10 | 11 | /* 12 | hex has 7 layers from [0..6] 13 | oct has 9 layers from [0..8] 14 | */ 15 | t.Run("check levels for oct and hex vectors", func(t *testing.T) { 16 | hexId := Id(1) 17 | hex := []float32{9, 2.0, 30} 18 | 19 | hexFriends := NewFriends(6) 20 | 21 | if hexFriends.TopLevel() != 6 { 22 | t.Fatalf("since 0-indexed, the top level is 6, got: %v", hexFriends.TopLevel()) 23 | } 24 | 25 | if hexFriends.NumLevels() != 7 { 26 | t.Fatalf("since 0-indexed, the number of levels is 7, got: %v", hexFriends.NumLevels()) 27 | } 28 | 29 | octId := Id(2) 30 | oct := []float32{0, 2, 3} 31 | octFriends := NewFriends(8) 32 | 33 | if octFriends.TopLevel() != 8 { 34 | t.Fatalf("since 0-indexed, the top level is 8, got: %v", octFriends.TopLevel()) 35 | } 36 | 37 | if octFriends.NumLevels() != 9 { 38 | t.Fatalf("since 0-indexed, the number of levels is 9, got: %v", octFriends.NumLevels()) 39 | } 40 | 41 | for i := 0; i <= 6; i++ { 42 | if !hexFriends.HasLevel(i) { 43 | t.Fatalf("since 0-indexed, the level #%v is missing", i) 44 | } 45 | } 46 | 47 | for i := 7; i <= 8; i++ { 48 | if hexFriends.HasLevel(i) { 49 | t.Fatalf("since 0-indexed, expected the level #%v to be missing", i) 50 | } 51 | } 52 | 53 | hexOctDist := EuclidDistance(oct, hex) 54 | 55 | hexFriends.InsertFriendsAtLevel(5, octId, hexOctDist) 56 | octFriends.InsertFriendsAtLevel(5, hexId, hexOctDist) 57 | 58 | for i := 0; i <= 5; i++ { 59 | hexFriends, err := hexFriends.GetFriendsAtLevel(i) 60 | if err != nil { 61 | t.Fatal(err) 62 | } 63 | 64 | octFriends, err := octFriends.GetFriendsAtLevel(i) 65 | if err != nil { 66 | t.Fatal(err) 67 | } 68 | 69 | if hexFriends.Len() != 1 || octFriends.Len() != 1 { 70 | t.Fatalf("expected hex and oct friends list at level %v to be 1, got: %v || %v", i, hexFriends.Len(), octFriends.Len()) 71 | } 72 | 73 | top, err := hexFriends.PeekMinItem() 74 | if err != nil { 75 | t.Fatal(err) 76 | } 77 | if top.id != octId { 78 | t.Fatalf("expected %v, got %v", octId, top.id) 79 | } 80 | 81 | top, err = octFriends.PeekMinItem() 82 | if err != nil { 83 | t.Fatal(err) 84 | } 85 | if top.id != hexId { 86 | t.Fatalf("expected %v, got %v", hexId, top.id) 87 | } 88 | } 89 | }) 90 | 91 | } 92 | 93 | func TestVector_EuclidDistance(t *testing.T) { 94 | 95 | type vectorPair struct { 96 | v0, v1 Point 97 | expected float32 98 | } 99 | 100 | basic := []vectorPair{ 101 | { 102 | v0: Point{5, 3, 0}, 103 | v1: Point{2, -2, float32(math.Sqrt(2))}, 104 | expected: 6, 105 | }, 106 | { 107 | v0: Point{1, 0, -5}, 108 | v1: Point{-3, 2, -1}, 109 | expected: 6, 110 | }, 111 | { 112 | v0: Point{1, 3}, 113 | v1: Point{5, 2}, 114 | expected: float32(math.Sqrt(17)), 115 | }, 116 | { 117 | v0: Point{0, 1, 4}, 118 | 
v1: Point{2, 9, 1}, 119 | expected: float32(math.Sqrt(77)), 120 | }, 121 | { 122 | v0: Point{0}, 123 | v1: Point{0}, 124 | expected: 0, 125 | }, 126 | { 127 | v0: Point{10, 20, 30, 40}, 128 | v1: Point{10, 20, 30, 40}, 129 | expected: 0, 130 | }, 131 | } 132 | 133 | t.Run("correctly computes the distance of two vectors", func(t *testing.T) { 134 | for i, pair := range basic { 135 | dist := EuclidDistance(pair.v1, pair.v0) 136 | 137 | if !NearlyEqual(dist, pair.expected) { 138 | t.Fatalf("iter i: %v, expected %v and %v to be equal", i, dist, pair.expected) 139 | } 140 | } 141 | }) 142 | } 143 | 144 | func TestFriends_Flush(t *testing.T) { 145 | t.Run("flush single friend", func(t *testing.T) { 146 | f := NewFriends(3) 147 | 148 | f.InsertFriendsAtLevel(2, 1, 4) 149 | 150 | buf, err := f.Flush(1) 151 | if err != nil { 152 | t.Fatal(err) 153 | } 154 | if !reflect.DeepEqual(buf, []byte{2, 0, 0, 0, 1}) { 155 | t.Fatalf("expected %v, got %v", []byte{2, 0, 0, 0, 1}, buf) 156 | } 157 | }) 158 | 159 | t.Run("flushes 8 friends exactly", func(t *testing.T) { 160 | f := NewFriends(4) 161 | f.InsertFriendsAtLevel(2, 1, 1) 162 | f.InsertFriendsAtLevel(3, 2, 2) 163 | f.InsertFriendsAtLevel(1, 3, 3) 164 | f.InsertFriendsAtLevel(0, 4, 4) 165 | f.InsertFriendsAtLevel(4, 5, 5) 166 | f.InsertFriendsAtLevel(2, 6, 6) 167 | f.InsertFriendsAtLevel(0, 7, 7) 168 | f.InsertFriendsAtLevel(2, 8, 8) 169 | 170 | buf, err := f.Flush(8) 171 | if err != nil { 172 | t.Fatal(err) 173 | } 174 | 175 | if !reflect.DeepEqual(buf, []byte{2, 0, 0, 0, 1, 3, 0, 0, 0, 2, 1, 0, 0, 0, 3, 0, 0, 0, 0, 4, 4, 0, 0, 0, 5, 2, 0, 0, 0, 6, 0, 0, 0, 0, 7, 2, 0, 0, 0, 8}) { 176 | t.Fatalf("expected %v, got %v", []byte{2, 0, 0, 0, 1, 3, 0, 0, 0, 2, 1, 0, 0, 0, 3, 0, 0, 0, 0, 4, 4, 0, 0, 0, 5, 2, 0, 0, 0, 6, 0, 0, 0, 0, 7, 2, 0, 0, 0, 8}, buf) 177 | } 178 | }) 179 | } 180 | -------------------------------------------------------------------------------- /pkg/hnsw/heap.go: -------------------------------------------------------------------------------- 1 | package hnsw 2 | 3 | import ( 4 | "fmt" 5 | "maps" 6 | "math/bits" 7 | ) 8 | 9 | type Item struct { 10 | id Id 11 | dist float32 12 | } 13 | 14 | var EmptyHeapError = fmt.Errorf("Empty Heap") 15 | 16 | type DistHeap struct { 17 | items []*Item 18 | visited map[Id]int 19 | } 20 | 21 | func level(i int) int { 22 | // floor(log2(i + 1)) 23 | return bits.Len(uint(i)+1) - 1 24 | } 25 | 26 | func isMinLevel(i int) bool { 27 | return level(i)%2 == 0 28 | } 29 | 30 | func lchild(i int) int { 31 | return i*2 + 1 32 | } 33 | 34 | func rchild(i int) int { 35 | return i*2 + 2 36 | } 37 | 38 | func parent(i int) int { 39 | return (i - 1) / 2 40 | } 41 | 42 | func hasParent(i int) bool { 43 | return i > 0 44 | } 45 | 46 | func hasGrandparent(i int) bool { 47 | return i > 2 48 | } 49 | 50 | func grandparent(i int) int { 51 | return parent(parent(i)) 52 | } 53 | 54 | func (d *DistHeap) down(i, n int) bool { 55 | min := isMinLevel(i) 56 | i0 := i 57 | for { 58 | m := i 59 | 60 | l := lchild(i) 61 | if l >= n || l < 0 /* overflow */ { 62 | break 63 | } 64 | if d.Less(l, m) == min { 65 | m = l 66 | } 67 | 68 | r := rchild(i) 69 | if r < n && d.Less(r, m) == min { 70 | m = r 71 | } 72 | 73 | // grandchildren are contiguous i*4+3+{0,1,2,3} 74 | for g := lchild(l); g < n && g <= rchild(r); g++ { 75 | if d.Less(g, m) == min { 76 | m = g 77 | } 78 | } 79 | 80 | if m == i { 81 | break 82 | } 83 | 84 | d.Swap(i, m) 85 | 86 | if m == l || m == r { 87 | break 88 | } 89 | 90 | // m is grandchild 91 | p := parent(m) 92 | if 
d.Less(p, m) == min { 93 | d.Swap(m, p) 94 | } 95 | i = m 96 | } 97 | return i > i0 98 | } 99 | 100 | func (d *DistHeap) up(i int) { 101 | min := isMinLevel(i) 102 | 103 | if hasParent(i) { 104 | p := parent(i) 105 | if d.Less(p, i) == min { 106 | d.Swap(i, p) 107 | min = !min 108 | i = p 109 | } 110 | } 111 | 112 | for hasGrandparent(i) { 113 | g := grandparent(i) 114 | if d.Less(i, g) != min { 115 | return 116 | } 117 | 118 | d.Swap(i, g) 119 | i = g 120 | } 121 | } 122 | 123 | func NewDistHeap() *DistHeap { 124 | d := &DistHeap{ 125 | items: make([]*Item, 0), 126 | visited: make(map[Id]int), 127 | } 128 | return d 129 | } 130 | 131 | func (d *DistHeap) Clone() *DistHeap { 132 | n := &DistHeap{ 133 | items: make([]*Item, len(d.items)), 134 | visited: make(map[Id]int, len(d.visited)), 135 | } 136 | 137 | copy(n.items, d.items) 138 | maps.Copy(n.visited, d.visited) 139 | 140 | return n 141 | } 142 | 143 | func (d *DistHeap) PeekMinItem() (*Item, error) { 144 | if d.IsEmpty() { 145 | return nil, EmptyHeapError 146 | } 147 | 148 | return d.items[0], nil 149 | } 150 | func (d *DistHeap) PeekMaxItem() (*Item, error) { 151 | if d.Len() == 0 { 152 | return nil, EmptyHeapError 153 | } 154 | 155 | // Find the maximum element without removing it 156 | n := d.Len() 157 | 158 | i := 0 159 | l := lchild(0) 160 | if l < n && !d.Less(l, i) { 161 | i = l 162 | } 163 | 164 | r := rchild(0) 165 | if r < n && !d.Less(r, i) { 166 | i = r 167 | } 168 | 169 | return d.items[i], nil 170 | } 171 | func (d *DistHeap) PopMinItem() (*Item, error) { 172 | if d.IsEmpty() { 173 | return nil, EmptyHeapError 174 | } 175 | 176 | n := d.Len() - 1 177 | d.Swap(0, n) 178 | d.down(0, n) 179 | return d.Pop(), nil 180 | } 181 | func (d *DistHeap) PopMaxItem() (*Item, error) { 182 | if d.IsEmpty() { 183 | return nil, EmptyHeapError 184 | } 185 | 186 | n := d.Len() 187 | i := 0 188 | l := lchild(0) 189 | 190 | if l < n && !d.Less(l, i) { 191 | i = l 192 | } 193 | 194 | r := rchild(0) 195 | if r < n && !d.Less(r, i) { 196 | i = r 197 | } 198 | 199 | d.Swap(i, n-1) 200 | d.down(i, n-1) 201 | 202 | return d.Pop(), nil 203 | } 204 | func (d *DistHeap) Insert(id Id, dist float32) { 205 | index, ok := d.visited[id] 206 | 207 | if !ok { 208 | d.Push(&Item{id: id, dist: dist}) 209 | d.visited[id] = d.Len() - 1 210 | d.up(d.Len() - 1) 211 | return 212 | } 213 | 214 | d.items[index].dist = dist 215 | d.Fix(index) 216 | } 217 | 218 | func (d *DistHeap) Fix(i int) { 219 | if !d.down(i, d.Len()) { 220 | d.up(i) 221 | } 222 | } 223 | 224 | func (d DistHeap) IsEmpty() bool { return len(d.items) == 0 } 225 | func (d DistHeap) Len() int { return len(d.items) } 226 | func (d DistHeap) Less(i, j int) bool { return d.items[i].dist < d.items[j].dist } 227 | func (d DistHeap) Swap(i, j int) { 228 | d.visited[d.items[i].id], d.visited[d.items[j].id] = j, i 229 | d.items[i], d.items[j] = d.items[j], d.items[i] 230 | } 231 | func (d *DistHeap) Push(x *Item) { 232 | (*d).items = append((*d).items, x) 233 | } 234 | func (d *DistHeap) Pop() *Item { 235 | old := (*d).items 236 | n := len(old) 237 | x := old[n-1] 238 | (*d).items = old[0 : n-1] 239 | delete(d.visited, x.id) 240 | return x 241 | } 242 | -------------------------------------------------------------------------------- /pkg/hnsw/heap_test.go: -------------------------------------------------------------------------------- 1 | package hnsw 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestHeap(t *testing.T) { 9 | 10 | t.Run("basic min max properties", func(t *testing.T) { 11 | h 
:= NewDistHeap() 12 | 13 | for i := 10; i > 0; i-- { 14 | h.Insert(Id(i), float32(10-i)) 15 | } 16 | 17 | if h.Len() != 10 { 18 | t.Fatalf("heap length should be 10, got %v", h.Len()) 19 | } 20 | 21 | expectedId := Id(10) 22 | for !h.IsEmpty() { 23 | peekMinItem, err := h.PeekMinItem() 24 | if err != nil { 25 | t.Fatalf("failed to peek min item: %v", err) 26 | } 27 | 28 | minItem, err := h.PopMinItem() 29 | if err != nil { 30 | t.Fatalf("failed to pop min item, err: %v", err) 31 | } 32 | 33 | if peekMinItem.id != minItem.id { 34 | t.Fatalf("mismatched item id, expected %v, got %v", expectedId, peekMinItem.id) 35 | } 36 | 37 | if minItem.id != expectedId { 38 | t.Fatalf("mismatched ids, expected %v, got: %v", expectedId, minItem.id) 39 | } 40 | 41 | expectedId -= 1 42 | } 43 | }) 44 | 45 | t.Run("basic min max properties 2", func(t *testing.T) { 46 | h := NewDistHeap() 47 | 48 | for i := 0; i <= 10; i++ { 49 | h.Insert(Id(i), float32(10-i)) 50 | } 51 | 52 | maxExpectedId := Id(0) 53 | minExpectedId := Id(10) 54 | 55 | for !h.IsEmpty() { 56 | peekMaxItem, err := h.PeekMaxItem() 57 | 58 | if err != nil { 59 | t.Fatalf("failed to peek max item, err: %v", err) 60 | } 61 | 62 | maxItem, err := h.PopMaxItem() 63 | 64 | if err != nil { 65 | t.Fatalf("failed to pop max item, err: %v", err) 66 | } 67 | 68 | if peekMaxItem.id != maxItem.id { 69 | t.Fatalf("mismatched max ids, expected %v, got: %v", maxItem.id, peekMaxItem.id) 70 | } 71 | 72 | if maxItem.id != maxExpectedId { 73 | t.Fatalf("expected id to be %v, got %v", maxExpectedId, maxItem.id) 74 | } 75 | 76 | if h.IsEmpty() { 77 | continue 78 | } 79 | 80 | peekMinItem, err := h.PeekMinItem() 81 | if err != nil { 82 | t.Fatalf("failed to peek min item, err: %v", err) 83 | } 84 | 85 | minItem, err := h.PopMinItem() 86 | 87 | if err != nil { 88 | t.Fatalf("failed to pop min item, err: %v", err) 89 | } 90 | 91 | if peekMinItem.id != minItem.id { 92 | t.Fatalf("mismatched min ids, expected %v, got: %v", maxItem.id, peekMaxItem.id) 93 | } 94 | 95 | if minItem.id != minExpectedId { 96 | t.Fatalf("expected id to be %v, got %v", minExpectedId, minItem.id) 97 | } 98 | 99 | minExpectedId -= 1 100 | maxExpectedId += 1 101 | } 102 | }) 103 | 104 | t.Run("bricks and ladders || min heap", func(t *testing.T) { 105 | type Case struct { 106 | heights []int 107 | bricks int 108 | ladders int 109 | expected int 110 | } 111 | 112 | cases := [3]Case{ 113 | { 114 | heights: []int{4, 2, 7, 6, 9, 14, 12}, 115 | bricks: 5, 116 | ladders: 1, 117 | expected: 4, 118 | }, 119 | { 120 | heights: []int{4, 12, 2, 7, 3, 18, 20, 3, 19}, 121 | bricks: 10, 122 | ladders: 2, 123 | expected: 7, 124 | }, 125 | { 126 | heights: []int{14, 3, 19, 3}, 127 | bricks: 17, 128 | ladders: 0, 129 | expected: 3, 130 | }, 131 | } 132 | 133 | for _, c := range cases { 134 | res, err := furthestBuildings(c.heights, c.bricks, c.ladders) 135 | if err != nil { 136 | t.Fatal(err) 137 | } 138 | 139 | if res != c.expected { 140 | t.Errorf("got %d, want %d", res, c.expected) 141 | } 142 | } 143 | }) 144 | 145 | t.Run("copy", func(t *testing.T) { 146 | m := NewDistHeap() 147 | 148 | for i := 0; i <= 10; i++ { 149 | m.Insert(Id(i), float32(10-i)) 150 | } 151 | 152 | n := m.Clone() 153 | 154 | reflect.DeepEqual(m.items, n.items) 155 | reflect.DeepEqual(m.visited, n.visited) 156 | 157 | expectedId := Id(10) 158 | 159 | for !n.IsEmpty() { 160 | item, err := n.PopMinItem() 161 | if err != nil { 162 | return 163 | } 164 | 165 | if item.id != expectedId { 166 | t.Fatalf("expected id to be %v, got %v", expectedId, 
item.id) 167 | } 168 | 169 | expectedId -= 1 170 | } 171 | }) 172 | } 173 | 174 | func furthestBuildings(heights []int, bricks, ladders int) (int, error) { 175 | 176 | ladderJumps := NewDistHeap() 177 | 178 | for idx := 0; idx < len(heights)-1; idx++ { 179 | height := heights[idx] 180 | nextHeight := heights[idx+1] 181 | 182 | if height >= nextHeight { 183 | continue 184 | } 185 | 186 | jump := nextHeight - height 187 | 188 | ladderJumps.Insert(Id(idx), float32(jump)) 189 | 190 | if ladderJumps.Len() > ladders { 191 | minLadderJump, err := ladderJumps.PopMinItem() 192 | if err != nil { 193 | return -1, err 194 | } 195 | 196 | if bricks-int(minLadderJump.dist) < 0 { 197 | return idx, nil 198 | } 199 | 200 | bricks -= int(minLadderJump.dist) 201 | } 202 | } 203 | 204 | return len(heights) - 1, nil 205 | } 206 | -------------------------------------------------------------------------------- /pkg/metapage/metapage.go: -------------------------------------------------------------------------------- 1 | package metapage 2 | 3 | import ( 4 | "github.com/kevmo314/appendable/pkg/pointer" 5 | "io" 6 | ) 7 | 8 | // MetaPage is an abstract interface over the root page of a bptree 9 | // This allows the caller to control the memory location of the meta 10 | // pointer 11 | type MetaPage interface { 12 | Root() (pointer.MemoryPointer, error) 13 | SetRoot(pointer.MemoryPointer) error 14 | } 15 | 16 | type NodeSerializable interface { 17 | Size() int64 18 | NumPointers() int 19 | MarshalBinary() ([]byte, error) 20 | UnmarshalBinary([]byte) error 21 | WriteTo(w io.Writer) (int64, error) 22 | } 23 | -------------------------------------------------------------------------------- /pkg/mmap/mmap.go: -------------------------------------------------------------------------------- 1 | // mmap contains utilities to memory map a file while still exposing file append operations. 2 | package mmap 3 | 4 | import ( 5 | "fmt" 6 | "io" 7 | "os" 8 | 9 | "golang.org/x/sys/unix" 10 | ) 11 | 12 | type MemoryMappedFile struct { 13 | file *os.File 14 | bytes []byte 15 | seek int64 16 | 17 | // parameters used for remapping. 18 | prot, flags int 19 | } 20 | 21 | var _ io.ReadWriteSeeker = &MemoryMappedFile{} 22 | var _ io.Closer = &MemoryMappedFile{} 23 | var _ io.ReaderAt = &MemoryMappedFile{} 24 | var _ io.WriterAt = &MemoryMappedFile{} 25 | 26 | func toProt(flag int) int { 27 | prot := unix.PROT_READ 28 | if flag&os.O_RDWR != 0 { 29 | prot |= unix.PROT_WRITE 30 | } 31 | return prot 32 | } 33 | 34 | func NewMemoryMappedFile(f *os.File, prot int) (*MemoryMappedFile, error) { 35 | fd := uintptr(f.Fd()) 36 | fi, err := f.Stat() 37 | if err != nil { 38 | return nil, fmt.Errorf("stat: %v", err) 39 | } 40 | if fi.Size() == 0 { 41 | return &MemoryMappedFile{file: f, bytes: nil, seek: 0, prot: prot, flags: unix.MAP_SHARED}, nil 42 | } 43 | b, err := unix.Mmap(int(fd), 0, int(fi.Size()), prot, unix.MAP_SHARED) 44 | if err != nil { 45 | return nil, fmt.Errorf("mmap: %v", err) 46 | } 47 | return &MemoryMappedFile{file: f, bytes: b, seek: 0, prot: prot, flags: unix.MAP_SHARED}, nil 48 | } 49 | 50 | // Open is a convenience function to open a file and memory map it. 51 | func Open(path string) (*MemoryMappedFile, error) { 52 | return OpenFile(path, os.O_RDWR, 0) 53 | } 54 | 55 | // OpenFile is a convenience function to open a file with the given flags and memory map it. 
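//
// A typical call looks like the following (the path and permissions are
// illustrative only); opening an empty file is fine, since the mapping is
// created lazily on the first write:
//
//	f, err := OpenFile("data.bin", os.O_RDWR|os.O_CREATE, 0644)
//	if err != nil {
//		// handle the error
//	}
//	defer f.Close()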
56 | func OpenFile(path string, flag int, perm os.FileMode) (*MemoryMappedFile, error) { 57 | f, err := os.OpenFile(path, flag, perm) 58 | if err != nil { 59 | return nil, fmt.Errorf("open: %v", err) 60 | } 61 | return NewMemoryMappedFile(f, toProt(flag)) 62 | } 63 | 64 | func (m *MemoryMappedFile) File() *os.File { 65 | return m.file 66 | } 67 | 68 | func (m *MemoryMappedFile) Bytes() []byte { 69 | return m.bytes 70 | } 71 | 72 | // Close closes the file and unmaps the memory. 73 | func (m *MemoryMappedFile) Close() error { 74 | if m.bytes == nil { 75 | return m.file.Close() 76 | } 77 | if err := unix.Munmap(m.bytes); err != nil { 78 | return fmt.Errorf("munmap: %v", err) 79 | } 80 | return m.file.Close() 81 | } 82 | 83 | // Seek sets the offset for the next Read or Write on file to offset. 84 | func (m *MemoryMappedFile) Seek(offset int64, whence int) (int64, error) { 85 | var abs int64 86 | switch whence { 87 | case io.SeekStart: 88 | abs = offset 89 | case io.SeekCurrent: 90 | abs = m.seek + offset 91 | case io.SeekEnd: 92 | abs = int64(len(m.bytes)) + offset 93 | default: 94 | return 0, fmt.Errorf("mmap: invalid whence") 95 | } 96 | if abs < 0 { 97 | return 0, fmt.Errorf("mmap: negative position") 98 | } else if abs > int64(len(m.bytes)) { 99 | return 0, fmt.Errorf("mmap: position out of bounds") 100 | } 101 | m.seek = abs 102 | return abs, nil 103 | } 104 | 105 | // Read reads up to len(b) bytes from the file. 106 | func (m *MemoryMappedFile) Read(b []byte) (int, error) { 107 | n := copy(b, m.bytes[m.seek:]) 108 | m.seek += int64(n) 109 | if n < len(b) { 110 | return n, io.EOF 111 | } 112 | return n, nil 113 | } 114 | 115 | // ReadAt reads len(b) bytes from the file starting at byte offset off. 116 | func (m *MemoryMappedFile) ReadAt(b []byte, off int64) (int, error) { 117 | n := copy(b, m.bytes[off:]) 118 | if n < len(b) { 119 | return n, io.EOF 120 | } 121 | return n, nil 122 | } 123 | 124 | // Write writes len(b) bytes to the file, appending to the file and remapping if necessary. 125 | func (m *MemoryMappedFile) Write(b []byte) (int, error) { 126 | n, err := m.WriteAt(b, m.seek) 127 | if err != nil { 128 | return 0, err 129 | } 130 | m.seek += int64(n) 131 | return n, nil 132 | } 133 | 134 | // WriteAt writes len(b) bytes to the file starting at byte offset off. 
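// If the write extends past the current mapping, the bytes are written
// through the underlying *os.File first and the mapping is then regrown to
// the new file size (mremap on Linux, munmap plus a fresh mmap on Darwin).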
135 | func (m *MemoryMappedFile) WriteAt(b []byte, off int64) (int, error) { 136 | // check if the file needs to be remapped 137 | if off+int64(len(b)) > int64(len(m.bytes)) { 138 | // write the data and remap the file 139 | if _, err := m.file.WriteAt(b, off); err != nil { 140 | return 0, err 141 | } 142 | fi, err := m.file.Stat() 143 | if err != nil { 144 | return 0, err 145 | } 146 | if m.bytes == nil { 147 | m.bytes, err = unix.Mmap(int(m.file.Fd()), 0, int(fi.Size()), m.prot, m.flags) 148 | if err != nil { 149 | return 0, fmt.Errorf("mmap: %v", err) 150 | } 151 | return len(b), nil 152 | } 153 | mapped, err := mremap(m.bytes, int(m.file.Fd()), int(fi.Size()), m.prot, m.flags) 154 | if err != nil { 155 | return 0, fmt.Errorf("mmap: %v", err) 156 | } 157 | m.bytes = mapped 158 | return len(b), nil 159 | } 160 | // write the data 161 | n := copy(m.bytes[off:], b) 162 | return n, nil 163 | } 164 | -------------------------------------------------------------------------------- /pkg/mmap/mremap_darwin.go: -------------------------------------------------------------------------------- 1 | package mmap 2 | 3 | import "golang.org/x/sys/unix" 4 | 5 | func mremap(oldAddress []byte, fd, newSize, prot, flags int) ([]byte, error) { 6 | // darwin doesn't have mremap, so we have to munmap and mmap the new size 7 | 8 | // unmap the old address 9 | if err := unix.Munmap(oldAddress); err != nil { 10 | return nil, err 11 | } 12 | return unix.Mmap(fd, 0, newSize, prot, flags) 13 | } 14 | -------------------------------------------------------------------------------- /pkg/mmap/mremap_linux.go: -------------------------------------------------------------------------------- 1 | package mmap 2 | 3 | import "golang.org/x/sys/unix" 4 | 5 | func mremap(oldAddress []byte, fd, newSize, prot, flags int) ([]byte, error) { 6 | return unix.Mremap(oldAddress, newSize, unix.MREMAP_MAYMOVE) 7 | } 8 | -------------------------------------------------------------------------------- /pkg/mocks/btree.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/binary" 5 | "github.com/kevmo314/appendable/pkg/bptree" 6 | "github.com/kevmo314/appendable/pkg/buftest" 7 | "github.com/kevmo314/appendable/pkg/pagefile" 8 | "github.com/kevmo314/appendable/pkg/pointer" 9 | "log" 10 | "math" 11 | ) 12 | 13 | func generateBasicBtree() { 14 | b := buftest.NewSeekableBuffer() 15 | p, err := pagefile.NewPageFile(b) 16 | if err != nil { 17 | log.Fatalf("%v", err) 18 | } 19 | mp, err := newTestMetaPage(p) 20 | 21 | if err != nil { 22 | log.Fatalf("%v", err) 23 | } 24 | 25 | tree := &bptree.BPTree{PageFile: p, MetaPage: mp, Width: uint16(6)} 26 | if err := tree.Insert(pointer.ReferencedValue{Value: []byte("hello")}, pointer.MemoryPointer{Offset: 1, Length: 5}); err != nil { 27 | log.Fatalf("%v", err) 28 | } 29 | if err := tree.Insert(pointer.ReferencedValue{Value: []byte("world")}, pointer.MemoryPointer{Offset: 2, Length: 5}); err != nil { 30 | log.Fatalf("%v", err) 31 | } 32 | if err := tree.Insert(pointer.ReferencedValue{Value: []byte("moooo")}, pointer.MemoryPointer{Offset: 3, Length: 5}); err != nil { 33 | log.Fatalf("%v", err) 34 | } 35 | if err := tree.Insert(pointer.ReferencedValue{Value: []byte("cooow")}, pointer.MemoryPointer{Offset: 4, Length: 5}); err != nil { 36 | log.Fatalf("%v", err) 37 | } 38 | 39 | if err := b.WriteToDisk("BPTree_1.bin"); err != nil { 40 | log.Fatalf("%v", err) 41 | } 42 | } 43 | 44 | type StubDataParser struct{} 45 | 46 | func (s 
*StubDataParser) Parse(value []byte) []byte { 47 | return []byte{1, 2, 3, 4, 5, 6, 7, 8} 48 | } 49 | 50 | func generateBtreeIterator() { 51 | 52 | b := buftest.NewSeekableBuffer() 53 | p, err := pagefile.NewPageFile(b) 54 | if err != nil { 55 | log.Fatalf("%v", err) 56 | } 57 | 58 | mp, err := newTestMetaPage(p) 59 | 60 | if err != nil { 61 | log.Fatalf("%v", err) 62 | } 63 | tree := &bptree.BPTree{PageFile: p, MetaPage: mp, Data: make([]byte, 16384*4+8), DataParser: &StubDataParser{}, Width: uint16(0)} 64 | for i := 0; i < 16384*4; i++ { 65 | if err := tree.Insert(pointer.ReferencedValue{ 66 | Value: []byte{1, 2, 3, 4, 5, 6, 7, 8}, 67 | // DataPointer is used as a disambiguator. 68 | DataPointer: pointer.MemoryPointer{Offset: uint64(i), Length: 8}, 69 | }, pointer.MemoryPointer{Offset: uint64(i)}); err != nil { 70 | log.Fatalf("%v", err) 71 | } 72 | } 73 | 74 | b.WriteToDisk("btree_iterator.bin") 75 | } 76 | 77 | func generate1023Btree() { 78 | b := buftest.NewSeekableBuffer() 79 | p, err := pagefile.NewPageFile(b) 80 | if err != nil { 81 | log.Fatalf("%v", err) 82 | } 83 | 84 | mp, err := newTestMetaPage(p) 85 | 86 | if err != nil { 87 | log.Fatalf("%v", err) 88 | } 89 | tree := &bptree.BPTree{PageFile: p, MetaPage: mp, Width: uint16(9)} 90 | count := 10 91 | 92 | for i := 0; i < count; i++ { 93 | buf := make([]byte, 8) 94 | binary.BigEndian.PutUint64(buf, math.Float64bits(23)) 95 | 96 | if err := tree.Insert(pointer.ReferencedValue{Value: buf, DataPointer: pointer.MemoryPointer{Offset: uint64(i)}}, pointer.MemoryPointer{Offset: uint64(i), Length: uint32(len(buf))}); err != nil { 97 | log.Fatal(err) 98 | } 99 | } 100 | 101 | b.WriteToDisk("BPTree_1023.bin") 102 | } 103 | -------------------------------------------------------------------------------- /pkg/mocks/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | func main() { 4 | // generateUVariantTestCases() 5 | //generateFilledMetadata() 6 | //generateBasicBtree() 7 | //generateInternalNode() 8 | //generateLeafNode() 9 | //generateBtreeIterator() 10 | // generateFileMeta() 11 | //generateIndexMeta() 12 | //generate1023Btree() 13 | } 14 | -------------------------------------------------------------------------------- /pkg/mocks/meta_page.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "github.com/kevmo314/appendable/pkg/pagefile" 7 | "github.com/kevmo314/appendable/pkg/pointer" 8 | "io" 9 | ) 10 | 11 | type testMetaPage struct { 12 | pf *pagefile.PageFile 13 | root pointer.MemoryPointer 14 | } 15 | 16 | func (m *testMetaPage) SetRoot(mp pointer.MemoryPointer) error { 17 | m.root = mp 18 | return m.write() 19 | } 20 | 21 | func (m *testMetaPage) Root() (pointer.MemoryPointer, error) { 22 | return m.root, nil 23 | } 24 | 25 | func (m *testMetaPage) write() error { 26 | buf := make([]byte, 8) 27 | binary.LittleEndian.PutUint64(buf, m.root.Offset) 28 | if _, err := m.pf.Seek(4096, io.SeekStart); err != nil { 29 | return err 30 | } 31 | if _, err := m.pf.Write(buf); err != nil { 32 | return err 33 | } 34 | return nil 35 | } 36 | 37 | func newTestMetaPage(pf *pagefile.PageFile) (*testMetaPage, error) { 38 | meta := &testMetaPage{pf: pf} 39 | offset, err := pf.NewPage([]byte{0, 0, 0, 0, 0, 0, 0, 0}) 40 | if err != nil { 41 | return nil, fmt.Errorf("%v", err) 42 | } 43 | // first page is garbage collection 44 | if offset != 4096 { 45 | return nil, fmt.Errorf("expected offset 
4096, got %d", offset) 46 | } 47 | return meta, nil 48 | } 49 | -------------------------------------------------------------------------------- /pkg/mocks/metadata.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | 7 | "github.com/kevmo314/appendable/pkg/appendable" 8 | "github.com/kevmo314/appendable/pkg/buftest" 9 | "github.com/kevmo314/appendable/pkg/linkedpage" 10 | "github.com/kevmo314/appendable/pkg/pagefile" 11 | ) 12 | 13 | func generateFilledMetadata() { 14 | b := buftest.NewSeekableBuffer() 15 | p, err := pagefile.NewPageFile(b) 16 | if err != nil { 17 | log.Fatalf("%v", err) 18 | } 19 | tree, err := linkedpage.NewMultiBPTree(p, 0) 20 | if err != nil { 21 | log.Fatalf("%v", err) 22 | } 23 | node, err := tree.AddNext() 24 | if err != nil { 25 | log.Fatalf("%v", err) 26 | } 27 | if err := node.SetMetadata([]byte("hello")); err != nil { 28 | log.Fatalf("%v", err) 29 | } 30 | 31 | b.WriteToDisk("filled_metadata.bin") 32 | } 33 | 34 | func writeByteToFile(data []byte, filename string) error { 35 | if err := os.WriteFile(filename, data, 0644); err != nil { 36 | return err 37 | } 38 | return nil 39 | } 40 | 41 | func generateFileMeta() { 42 | fm := appendable.FileMeta{} 43 | fm.Format = 1 44 | fm.Version = 1 45 | fm.ReadOffset = 4096 46 | fm.Entries = 34 47 | 48 | b, err := fm.MarshalBinary() 49 | if err != nil { 50 | log.Fatalf("failed to write file meta to disk") 51 | } 52 | 53 | if err := writeByteToFile(b, "filemeta.bin"); err != nil { 54 | log.Fatalf("failed to write bytes to disk") 55 | } 56 | } 57 | 58 | func generateIndexMeta() { 59 | im := appendable.IndexMeta{} 60 | im.FieldName = "howdydo" 61 | im.FieldType = appendable.FieldTypeBoolean 62 | im.Width = appendable.DetermineType(appendable.FieldTypeBoolean) 63 | im.TotalFieldValueLength = 773424601 64 | 65 | b, err := im.MarshalBinary() 66 | if err != nil { 67 | log.Fatal("failed to write index meta to disk") 68 | } 69 | 70 | if err := writeByteToFile(b, "indexmeta.bin"); err != nil { 71 | log.Fatalf("failed to write bytes to disk") 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /pkg/mocks/node.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "fmt" 7 | "github.com/kevmo314/appendable/pkg/bptree" 8 | "github.com/kevmo314/appendable/pkg/pointer" 9 | "log" 10 | "os" 11 | ) 12 | 13 | func writeBufferToFile(buf *bytes.Buffer, filename string) error { 14 | if err := os.WriteFile(filename, buf.Bytes(), 0644); err != nil { 15 | return err 16 | } 17 | return nil 18 | } 19 | 20 | func generateLeafNode() { 21 | // Create a test BPTreeNode 22 | node1 := &bptree.BPTreeNode{ 23 | LeafPointers: []pointer.MemoryPointer{ 24 | {Offset: 0, Length: 3}, 25 | {Offset: 3, Length: 3}, 26 | {Offset: 6, Length: 3}, 27 | }, 28 | Keys: []pointer.ReferencedValue{ 29 | {Value: []byte{0, 1, 2}}, 30 | {Value: []byte{1, 2, 3}}, 31 | {Value: []byte{3, 4, 5}}, 32 | }, 33 | Width: uint16(4), 34 | } 35 | 36 | buf := &bytes.Buffer{} 37 | if _, err := node1.WriteTo(buf); err != nil { 38 | log.Fatal(err) 39 | } 40 | 41 | writeBufferToFile(buf, "leafnode.bin") 42 | } 43 | 44 | func generateInternalNode() { 45 | // Create a test BPTreeNode 46 | node1 := &bptree.BPTreeNode{ 47 | InternalPointers: []uint64{0, 1, 2, 3}, 48 | Keys: []pointer.ReferencedValue{ 49 | {Value: []byte{0, 1}}, 50 | {Value: []byte{1, 2}}, 51 | {Value: 
-------------------------------------------------------------------------------- /pkg/ngram/tokenizer.go: -------------------------------------------------------------------------------- 1 | package ngram 2 | 3 | import ( 4 | "golang.org/x/text/unicode/norm" 5 | "hash/fnv" 6 | "math/rand" 7 | "strings" 8 | "unicode" 9 | "unicode/utf8" 10 | ) 11 | 12 | // NgramTokenizer generates tokens of lengths 1, 2, and 3. 13 | // These support two search modes: 14 | // by default we use 1-2 grams, that is, min-gram: 1 and max-gram: 2; 15 | // we also support trigrams, which have min-gram: 3 and max-gram: 3. 16 | 17 | type Token struct { 18 | Word string 19 | Offset uint64 20 | Length uint32 21 | } 22 | 23 | // BuildNgram makes two passes 24 | // 25 | // 1 - splits by white space and keeps track of the positions 26 | // 2 - performs a sliding window and builds ngrams 27 | 28 | func normalizeToAscii(s string) (string, map[int]int) { 29 | ogOffsets := make(map[int]int) 30 | 31 | var b strings.Builder 32 | norm := norm.NFKD.String(s) 33 | 34 | additionalOffsets := 0 35 | 36 | newIndex := 0 37 | 38 | for i, r := range norm { 39 | if utf8.RuneLen(r) > 1 { 40 | additionalOffsets += utf8.RuneLen(r) - 1 41 | } 42 | 43 | if r <= unicode.MaxASCII { 44 | b.WriteRune(r) 45 | ogOffsets[newIndex] = i - additionalOffsets 46 | newIndex++ 47 | } 48 | 49 | } 50 | return b.String(), ogOffsets 51 | } 52 | 53 | func combineHashes(tokens []Token) int64 { 54 | h := fnv.New32a() 55 | for _, t := range tokens { 56 | h.Write([]byte(t.Word)) 57 | } 58 | return int64(h.Sum32()) 59 | } 60 | 61 | func Shuffle(tokens []Token) []Token { 62 | soup := make([]Token, len(tokens)) 63 | copy(soup, tokens) 64 | 65 | seed := combineHashes(tokens) 66 | rng := rand.New(rand.NewSource(seed)) 67 | for i := len(tokens) - 1; i > 0; i-- { 68 | j := rng.Intn(i + 1) 69 | soup[i], soup[j] = soup[j], soup[i] 70 | } 71 | 72 | return soup 73 | } 74 | 75 | func BuildNgram(phrase string, gl int) []Token { 76 | var ngramTokens []Token 77 | 78 | var words [][]int 79 | var currWord []int 80 | 81 | clean, ogOffsets := normalizeToAscii(phrase) 82 | 83 | runes := []rune(clean) 84 | for i := 0; i < len(runes); i++ { 85 | r := runes[i] 86 | 87 | if unicode.IsLetter(r) || unicode.IsDigit(r) { 88 | currWord = append(currWord, i) 89 | } else if unicode.IsSpace(r) { 90 | if len(currWord) >= gl { 91 | words = append(words, currWord) 92 | } 93 | currWord = []int{} 94 | } 95 | } 96 | 97 | if len(currWord) >= gl { 98 | words = append(words, currWord) 99 | } 100 | 101 | for _, wOffsets := range words { 102 | for i := 0; i <= len(wOffsets)-gl; i++ { 103 | 104 | var str string 105 | 106 | p := 0 107 | for j := i; j < i+gl; j++ { 108 | str += string(runes[wOffsets[j]]) 109 | p = j 110 | } 111 | 112 | q := ogOffsets[wOffsets[i]] 113 | ngramTokens = append(ngramTokens, Token{ 114 | Word: strings.ToLower(str), 115 | Offset: uint64(q), 116 | Length: uint32(ogOffsets[wOffsets[p]] - q + 1), 117 | }) 118 | 119 | } 120 | } 121 | 122 | return ngramTokens 123 | } 124 | 
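125 | // For example, BuildNgram("San Francisco", 3) yields the trigrams "san", "fra", 126 | // "ran", "anc", "nci", "cis", "isc", "sco", each carrying the offset and length 127 | // of its source span in the original string. 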
-------------------------------------------------------------------------------- /pkg/pagefile/pagefile.go: -------------------------------------------------------------------------------- 1 | package pagefile 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | "io" 7 | ) 8 | 9 | type ReadWriteSeekPager interface { 10 | io.ReadWriteSeeker 11 | 12 | Page(int) (int64, error) 13 | NewPage([]byte) (int64, error) 14 | FreePage(int64) error 15 | 16 | LastPage() int64 17 | 18 | PageSize() int 19 | SlotSize() int 20 | 21 | PageCount() int64 22 | } 23 | 24 | type PageFile struct { 25 | io.ReadWriteSeeker 26 | pageSize int 27 | slotSize int 28 | 29 | // local cache of free pages to avoid reading from disk too often. 30 | freePageIndexes [512]int64 31 | freePageHead, freePageCount int 32 | 33 | lastPage int64 34 | } 35 | 36 | var _ ReadWriteSeekPager = &PageFile{} 37 | 38 | // const maxFreePageIndices = 512 39 | const pageSizeBytes = 4096 // 4kB by default. 40 | const slotSizeBytes = 256 41 | 42 | func NewPageFile(rws io.ReadWriteSeeker) (*PageFile, error) { 43 | // check if the rws is empty. if it is, allocate one page for the free page indexes 44 | // if it is not, read the free page indexes from the last page 45 | if _, err := rws.Seek(0, io.SeekStart); err != nil { 46 | return nil, err 47 | } 48 | buf := make([]byte, pageSizeBytes) 49 | _, err := rws.Read(buf) 50 | if err != nil && err != io.EOF { 51 | return nil, err 52 | } 53 | pf := &PageFile{ 54 | ReadWriteSeeker: rws, 55 | pageSize: pageSizeBytes, 56 | slotSize: slotSizeBytes, 57 | } 58 | if err == io.EOF { 59 | // allocate one page for the free page indexes 60 | if _, err := rws.Write(buf); err != nil { 61 | return nil, err 62 | } 63 | } else { 64 | for i := 0; i < len(pf.freePageIndexes); i++ { 65 | offset := int64(binary.LittleEndian.Uint64(buf[i*8 : (i+1)*8])) 66 | if offset != 0 { 67 | pf.freePageIndexes[pf.freePageHead] = offset 68 | pf.freePageHead = (pf.freePageHead + 1) % len(pf.freePageIndexes) 69 | pf.freePageCount++ 70 | } else { 71 | break 72 | } 73 | } 74 | } 75 | // figure out what the last page is 76 | n, err := rws.Seek(0, io.SeekEnd) 77 | if err != nil { 78 | return nil, err 79 | } 80 | if n%int64(pf.pageSize) != 0 { 81 | return nil, errors.New("file size is not a multiple of the page size") 82 | } 83 | pf.lastPage = n / int64(pf.pageSize) 84 | return pf, nil 85 | } 86 | 87 | func (pf *PageFile) LastPage() int64 { 88 | return pf.lastPage 89 | } 90 | 91 | func (pf *PageFile) Page(i int) (int64, error) { 92 | if i < 0 { 93 | return 0, errors.New("page index cannot be negative") 94 | } 95 | // i + 1 because the first page is reserved for the free page indexes 96 | return int64(i+1) * int64(pf.pageSize), nil 97 | } 98 | 99 | func (pf *PageFile) writeFreePageIndices() error { 100 | buf := make([]byte, len(pf.freePageIndexes)*8) 101 | tail := (pf.freePageHead - pf.freePageCount + len(pf.freePageIndexes)) % len(pf.freePageIndexes) 102 | for i := 0; i < pf.freePageCount; i++ { 103 | offset := pf.freePageIndexes[(tail+i)%len(pf.freePageIndexes)] 104 | binary.LittleEndian.PutUint64(buf[i*8:(i+1)*8], uint64(offset)) 105 | } 106 | if _, err := pf.ReadWriteSeeker.Seek(0, io.SeekStart); err != nil { 107 | return err 108 | } 109 | if _, err := pf.ReadWriteSeeker.Write(buf); err != nil { 110 | return err 
111 | } 112 | return nil 113 | } 114 | 115 | func (pf *PageFile) FreePageIndex() (int64, error) { 116 | // find the first free page index and return it 117 | if pf.freePageCount == 0 { 118 | return -1, nil 119 | } 120 | // pop from the tail 121 | tail := (pf.freePageHead - pf.freePageCount + len(pf.freePageIndexes)) % len(pf.freePageIndexes) 122 | offset := pf.freePageIndexes[tail] 123 | pf.freePageIndexes[tail] = 0 124 | pf.freePageCount-- 125 | 126 | if err := pf.writeFreePageIndices(); err != nil { 127 | return 0, err 128 | } 129 | 130 | return offset, nil 131 | } 132 | 133 | func (pf *PageFile) NewPage(buf []byte) (int64, error) { 134 | if buf != nil && len(buf) > pf.pageSize { 135 | return 0, errors.New("buffer is too large") 136 | } 137 | 138 | // if there are free pages, return the first one 139 | offset, err := pf.FreePageIndex() 140 | if err != nil { 141 | return 0, err 142 | } 143 | if offset != -1 { 144 | // seek to the free page 145 | if _, err := pf.ReadWriteSeeker.Seek(offset, io.SeekStart); err != nil { 146 | return 0, err 147 | } 148 | } else { 149 | n, err := pf.ReadWriteSeeker.Seek(0, io.SeekEnd) 150 | if err != nil { 151 | return 0, err 152 | } 153 | offset = n 154 | pf.lastPage++ 155 | } 156 | 157 | // if the offset is not a multiple of the page size, we need to pad the file 158 | // with zeros to the next page boundary. 159 | var pad int64 160 | if pf.pageSize > 0 && offset%int64(pf.pageSize) != 0 { 161 | // Calculate the number of bytes to pad 162 | pad = int64(pf.pageSize) - (offset % int64(pf.pageSize)) 163 | // Write the padding 164 | if _, err := pf.Write(make([]byte, pad)); err != nil { 165 | return 0, err 166 | } 167 | } 168 | page := make([]byte, pf.pageSize) 169 | if buf != nil { 170 | copy(page, buf) 171 | } 172 | if _, err := pf.ReadWriteSeeker.Write(page); err != nil { 173 | return 0, err 174 | } 175 | if _, err := pf.ReadWriteSeeker.Seek(offset, io.SeekStart); err != nil { 176 | return 0, err 177 | } 178 | return offset + pad, nil 179 | } 180 | 181 | func (pf *PageFile) FreePage(offset int64) error { 182 | if offset%int64(pf.pageSize) != 0 { 183 | return errors.New("offset is not a multiple of the page size") 184 | } 185 | if pf.freePageCount == len(pf.freePageIndexes) { 186 | return errors.New("free page index is full") 187 | } 188 | 189 | for i := range pf.freePageIndexes { 190 | if pf.freePageIndexes[i] == offset { 191 | return errors.New("offset already exists") 192 | } 193 | } 194 | 195 | // push to the head 196 | pf.freePageIndexes[pf.freePageHead] = offset 197 | pf.freePageHead = (pf.freePageHead + 1) % len(pf.freePageIndexes) 198 | pf.freePageCount++ 199 | 200 | return pf.writeFreePageIndices() 201 | } 202 | 203 | func (pf *PageFile) PageSize() int { 204 | return pf.pageSize 205 | } 206 | 207 | func (pf *PageFile) SlotSize() int { 208 | return slotSizeBytes 209 | } 210 | 211 | func (pf *PageFile) PageCount() int64 { 212 | return pf.lastPage 213 | } 214 | -------------------------------------------------------------------------------- /pkg/pagefile/pagefile_debug.go: -------------------------------------------------------------------------------- 1 | //go:build !release 2 | 3 | package pagefile 4 | 5 | import "io" 6 | 7 | func (pf *PageFile) Write(buf []byte) (int, error) { 8 | n, err := pf.ReadWriteSeeker.Seek(0, io.SeekCurrent) 9 | if err != nil { 10 | return 0, err 11 | } 12 | if n%int64(pf.pageSize)+int64(len(buf)) > int64(pf.pageSize) { 13 | panic("writing across page boundary not allowed") 14 | } 15 | return pf.ReadWriteSeeker.Write(buf) 16 | 
} 17 | 18 | func (pf *PageFile) Read(buf []byte) (int, error) { 19 | n, err := pf.ReadWriteSeeker.Seek(0, io.SeekCurrent) 20 | if err != nil { 21 | return 0, err 22 | } 23 | if n%int64(pf.pageSize)+int64(len(buf)) > int64(pf.pageSize) { 24 | panic("reading across page boundary not allowed") 25 | } 26 | return pf.ReadWriteSeeker.Read(buf) 27 | } 28 | -------------------------------------------------------------------------------- /pkg/pagefile/pagefile_debug_test.go: -------------------------------------------------------------------------------- 1 | package pagefile 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/kevmo314/appendable/pkg/buftest" 7 | ) 8 | 9 | func TestWriteAcrossBoundaryPanicsInDebug(t *testing.T) { 10 | defer func() { 11 | if r := recover(); r == nil { 12 | t.Errorf("The code did not panic") 13 | } 14 | }() 15 | 16 | buf := buftest.NewSeekableBuffer() 17 | pf, err := NewPageFile(buf) 18 | if err != nil { 19 | t.Fatal(err) 20 | } 21 | if _, err := pf.Write(make([]byte, pf.PageSize()+1)); err != nil { 22 | t.Fatal(err) 23 | } 24 | } 25 | 26 | func TestReadAcrossBoundaryPanicsInDebug(t *testing.T) { 27 | defer func() { 28 | if r := recover(); r == nil { 29 | t.Errorf("The code did not panic") 30 | } 31 | }() 32 | 33 | buf := buftest.NewSeekableBuffer() 34 | pf, err := NewPageFile(buf) 35 | if err != nil { 36 | t.Fatal(err) 37 | } 38 | if _, err := pf.Read(make([]byte, pf.PageSize()+1)); err != nil { 39 | t.Fatal(err) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pkg/pagefile/pagefile_test.go: -------------------------------------------------------------------------------- 1 | package pagefile 2 | 3 | import ( 4 | "io" 5 | "testing" 6 | 7 | "github.com/kevmo314/appendable/pkg/buftest" 8 | ) 9 | 10 | func TestPageFile(t *testing.T) { 11 | t.Run("allocates first page", func(t *testing.T) { 12 | buf := buftest.NewSeekableBuffer() 13 | pf, err := NewPageFile(buf) 14 | if err != nil { 15 | t.Fatal(err) 16 | } 17 | offset, err := pf.NewPage(nil) 18 | if err != nil { 19 | t.Fatal(err) 20 | } 21 | if offset != pageSizeBytes { 22 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset) 23 | } 24 | }) 25 | 26 | t.Run("page size allocates pages", func(t *testing.T) { 27 | buf := buftest.NewSeekableBuffer() 28 | pf, err := NewPageFile(buf) 29 | if err != nil { 30 | t.Fatal(err) 31 | } 32 | offset1, err := pf.NewPage(nil) 33 | if err != nil { 34 | t.Fatal(err) 35 | } 36 | if offset1 != pageSizeBytes { 37 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset1) 38 | } 39 | // check the seek location 40 | n, err := buf.Seek(0, io.SeekCurrent) 41 | if err != nil { 42 | t.Fatal(err) 43 | } 44 | if n != pageSizeBytes { 45 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, n) 46 | } 47 | offset2, err := pf.NewPage(nil) 48 | if err != nil { 49 | t.Fatal(err) 50 | } 51 | if offset2 != pageSizeBytes*2 { 52 | t.Fatalf("expected offset %d, got %d", pageSizeBytes*2, offset2) 53 | } 54 | m, err := buf.Seek(0, io.SeekCurrent) 55 | if err != nil { 56 | t.Fatal(err) 57 | } 58 | if m != pageSizeBytes*2 { 59 | t.Fatalf("expected offset %d, got %d", pageSizeBytes*2, m) 60 | } 61 | }) 62 | 63 | t.Run("page size allocates page with data", func(t *testing.T) { 64 | buf := buftest.NewSeekableBuffer() 65 | pf, err := NewPageFile(buf) 66 | if err != nil { 67 | t.Fatal(err) 68 | } 69 | data := []byte("hello") 70 | offset1, err := pf.NewPage(data) 71 | if err != nil { 72 | t.Fatal(err) 73 | } 74 | if offset1 != pageSizeBytes { 75 | 
t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset1) 76 | } 77 | if _, err := pf.Seek(offset1, io.SeekStart); err != nil { 78 | t.Fatal(err) 79 | } 80 | buf2 := make([]byte, len(data)) 81 | if _, err := pf.Read(buf2); err != nil { 82 | t.Fatal(err) 83 | } 84 | if string(buf2) != string(data) { 85 | t.Fatalf("expected %s, got %s", string(data), string(buf2)) 86 | } 87 | }) 88 | 89 | t.Run("new page seeks to page", func(t *testing.T) { 90 | buf := buftest.NewSeekableBuffer() 91 | pf, err := NewPageFile(buf) 92 | if err != nil { 93 | t.Fatal(err) 94 | } 95 | offset1, err := pf.NewPage(nil) 96 | if err != nil { 97 | t.Fatal(err) 98 | } 99 | offset2, err := pf.Seek(0, io.SeekCurrent) 100 | if err != nil { 101 | t.Fatal(err) 102 | } 103 | if offset1 != offset2 { 104 | t.Fatalf("expected offset %d, got %d", offset1, offset2) 105 | } 106 | }) 107 | 108 | t.Run("free page reuses page", func(t *testing.T) { 109 | buf := buftest.NewSeekableBuffer() 110 | pf, err := NewPageFile(buf) 111 | if err != nil { 112 | t.Fatal(err) 113 | } 114 | offset1, err := pf.NewPage(nil) 115 | if err != nil { 116 | t.Fatal(err) 117 | } 118 | if offset1 != pageSizeBytes { 119 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset1) 120 | } 121 | // need to write at least one byte to trigger a new page. 122 | if _, err := pf.Write(make([]byte, 1)); err != nil { 123 | t.Fatal(err) 124 | } 125 | offset2, err := pf.NewPage(nil) 126 | if err != nil { 127 | t.Fatal(err) 128 | } 129 | if offset2 != pageSizeBytes*2 { 130 | t.Fatalf("expected offset %d, got %d", 2*pageSizeBytes, offset2) 131 | } 132 | 133 | if err := pf.FreePage(offset1); err != nil { 134 | t.Fatal(err) 135 | } 136 | offset3, err := pf.NewPage(nil) 137 | if err != nil { 138 | t.Fatal(err) 139 | } 140 | if offset3 != offset1 { 141 | t.Fatalf("expected offset %d, got %d", offset2, offset3) 142 | } 143 | }) 144 | 145 | t.Run("free page behaves like a circular buffer", func(t *testing.T) { 146 | buf := buftest.NewSeekableBuffer() 147 | pf, err := NewPageFile(buf) 148 | if err != nil { 149 | t.Fatal(err) 150 | } 151 | offsets := make([]int64, 0, 10) 152 | for i := 0; i < 10; i++ { 153 | offset, err := pf.NewPage(nil) 154 | if err != nil { 155 | t.Fatal(err) 156 | } 157 | if i > 0 && offset != offsets[i-1]+pageSizeBytes { 158 | t.Fatalf("expected offset %d, got %d", offsets[i-1]+pageSizeBytes, offset) 159 | } 160 | offsets = append(offsets, offset) 161 | } 162 | for i := 0; i < 10; i++ { 163 | if err := pf.FreePage(offsets[i]); err != nil { 164 | t.Fatal(err) 165 | } 166 | } 167 | for i := 0; i < 10; i++ { 168 | offset, err := pf.NewPage(nil) 169 | if err != nil { 170 | t.Fatal(err) 171 | } 172 | if offset != offsets[i] { 173 | t.Fatalf("expected offset %d, got %d", offsets[i], offset) 174 | } 175 | } 176 | }) 177 | 178 | t.Run("cannot double free a page", func(t *testing.T) { 179 | buf := buftest.NewSeekableBuffer() 180 | pf, err := NewPageFile(buf) 181 | if err != nil { 182 | t.Fatal(err) 183 | } 184 | offset, err := pf.NewPage(nil) 185 | if err != nil { 186 | t.Fatal(err) 187 | } 188 | if err := pf.FreePage(offset); err != nil { 189 | t.Fatal(err) 190 | } 191 | if err := pf.FreePage(offset); err == nil { 192 | t.Fatal("expected error") 193 | } 194 | }) 195 | 196 | t.Run("track number of pages", func(t *testing.T) { 197 | buf := buftest.NewSeekableBuffer() 198 | pf, err := NewPageFile(buf) 199 | if err != nil { 200 | t.Fatal(err) 201 | } 202 | if pf.PageCount() != 1 { 203 | t.Fatalf("expected 1, got %d", pf.PageCount()) 204 | } 205 | offset, err := 
pf.NewPage(nil) 206 | if err != nil { 207 | t.Fatal(err) 208 | } 209 | if pf.PageCount() != 2 { 210 | t.Fatalf("expected 2, got %d", pf.PageCount()) 211 | } 212 | if err := pf.FreePage(offset); err != nil { 213 | t.Fatal(err) 214 | } 215 | if pf.PageCount() != 2 { 216 | t.Fatalf("expected 2, got %d", pf.PageCount()) 217 | } 218 | if _, err := pf.NewPage(nil); err != nil { 219 | t.Fatal(err) 220 | } 221 | if pf.PageCount() != 2 { 222 | t.Fatalf("expected 2, got %d", pf.PageCount()) 223 | } 224 | if _, err := pf.NewPage(nil); err != nil { 225 | t.Fatal(err) 226 | } 227 | if pf.PageCount() != 3 { 228 | t.Fatalf("expected 3, got %d", pf.PageCount()) 229 | } 230 | }) 231 | } 232 | -------------------------------------------------------------------------------- /pkg/pointer/pointer.go: -------------------------------------------------------------------------------- 1 | package pointer 2 | 3 | import "fmt" 4 | 5 | // MemoryPointer is a uint64 offset and uint32 length 6 | type MemoryPointer struct { 7 | Offset uint64 8 | Length uint32 9 | } 10 | 11 | func (mp MemoryPointer) String() string { 12 | return fmt.Sprintf("Pointer[%08x:%08x]", mp.Offset, mp.Offset+uint64(mp.Length)) 13 | } 14 | -------------------------------------------------------------------------------- /pkg/pointer/referenced_value.go: -------------------------------------------------------------------------------- 1 | package pointer 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "github.com/kevmo314/appendable/pkg/hnsw" 7 | ) 8 | 9 | type ReferencedValue struct { 10 | // it is generally optional to set the DataPointer. if it is not set, the 11 | // value is taken to be unreferenced and is stored directly in the node. 12 | // if it is set, the value is used for comparison but the value is stored 13 | // as a reference to the DataPointer. 14 | // 15 | // caveat: DataPointer is used as a disambiguator for the value. the b+ tree 16 | // implementation does not support duplicate keys and uses the DataPointer 17 | // to disambiguate between keys that compare as equal. 
18 | DataPointer MemoryPointer 19 | Value []byte 20 | } 21 | 22 | type ReferencedId struct { 23 | DataPointer MemoryPointer 24 | Value hnsw.Id 25 | } 26 | 27 | func (rv ReferencedValue) String() string { 28 | return fmt.Sprintf("ReferencedValue@%s{%s}", rv.DataPointer, rv.Value) 29 | } 30 | 31 | func (rv ReferencedId) String() string { 32 | return fmt.Sprintf("ReferencedId@%s{%d}", rv.DataPointer, rv.Value) 33 | } 34 | 35 | func CompareReferencedValues(a, b ReferencedValue) int { 36 | if cmp := bytes.Compare(a.Value, b.Value); cmp != 0 { 37 | return cmp 38 | } else if a.DataPointer.Offset < b.DataPointer.Offset { 39 | return -1 40 | } else if a.DataPointer.Offset > b.DataPointer.Offset { 41 | return 1 42 | } else if a.DataPointer.Length < b.DataPointer.Length { 43 | return -1 44 | } else if a.DataPointer.Length > b.DataPointer.Length { 45 | return 1 46 | } 47 | return 0 48 | } 49 | 50 | func CompareReferencedIds(a, b ReferencedId) int { 51 | if a.Value > b.Value { 52 | return 1 53 | } else if a.Value < b.Value { 54 | return -1 55 | } 56 | 57 | return 0 58 | } 59 | -------------------------------------------------------------------------------- /pkg/vectorpage/manager.go: -------------------------------------------------------------------------------- 1 | package vectorpage 2 | 3 | import ( 4 | "fmt" 5 | "github.com/kevmo314/appendable/pkg/bptree" 6 | "github.com/kevmo314/appendable/pkg/btree" 7 | "github.com/kevmo314/appendable/pkg/hnsw" 8 | "github.com/kevmo314/appendable/pkg/pointer" 9 | ) 10 | 11 | type HNSWAdjacencyPage [16][8]uint32 12 | 13 | type VectorPageManager struct { 14 | btree *btree.BTree 15 | // vectors []*hnsw.Point 16 | 17 | bptree *bptree.BPTree 18 | // neighborhood map[hnsw.Id]*hnsw.Friends 19 | 20 | hnsw *hnsw.Hnsw 21 | } 22 | 23 | func NewVectorPageManager(btree *btree.BTree, bptree *bptree.BPTree, hnsw *hnsw.Hnsw) *VectorPageManager { 24 | if btree == nil || bptree == nil { 25 | panic("btree and bptree must not be nil") 26 | } 27 | 28 | return &VectorPageManager{ 29 | btree: btree, 30 | bptree: bptree, 31 | hnsw: hnsw, 32 | } 33 | } 34 | 35 | func (vp *VectorPageManager) AddNode(x hnsw.Point) error { 36 | xId, err := vp.hnsw.InsertVector(x) 37 | if err != nil { 38 | return err 39 | } 40 | 41 | // write point to btree 42 | if err := vp.btree.Insert(pointer.ReferencedId{Value: xId}, x); err != nil { 43 | return err 44 | } 45 | 46 | // write friends to bptree 47 | xFriends, err := vp.hnsw.Neighborhood(xId) 48 | if err != nil { 49 | return fmt.Errorf("vector id %v not found in hnsw neighborhood", x) 50 | } 51 | xfriendsBuf, err := xFriends.Flush(8) 52 | if err != nil { 53 | return err 54 | } 55 | 56 | if err := vp.bptree.Insert(pointer.ReferencedValue{Value: xfriendsBuf}, pointer.MemoryPointer{}); err != nil { 57 | return fmt.Errorf("failed to insert buf: %v", err) 58 | } 59 | 60 | return nil 61 | } 62 | -------------------------------------------------------------------------------- /pkg/vectorpage/manager_test.go: -------------------------------------------------------------------------------- 1 | package vectorpage 2 | 3 | import ( 4 | "github.com/kevmo314/appendable/pkg/hnsw" 5 | "testing" 6 | ) 7 | 8 | func TestNewVectorPageManager(t *testing.T) { 9 | 10 | t.Run("", func(t *testing.T) { 11 | p0 := hnsw.Point{3, 3} 12 | 13 | h := hnsw.NewHnsw(2, 10, 8, p0) 14 | 15 | for i := 0; i < 100; i++ { 16 | id, err := h.InsertVector(hnsw.Point{float32(i), float32(i)}) 17 | if err != nil { 18 | t.Fatal(err) 19 | } 20 | 21 | if id != hnsw.Id(i+1) { 22 | t.Fatalf("expected id %d, 
got %d", id, i+1) 23 | } 24 | } 25 | }) 26 | } 27 | -------------------------------------------------------------------------------- /scripts/jsonl2json/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jsonl2json" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | anyhow = "1.0.82" 10 | clap = { version = "4.5.4", features = ["derive"] } 11 | serde = { version = "1.0.198", features = ["derive"] } 12 | serde_json = "1.0.116" 13 | -------------------------------------------------------------------------------- /scripts/jsonl2json/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{BufRead, BufReader}; 3 | use clap::Parser; 4 | use anyhow::{Context, Result}; 5 | use serde_json::Value; 6 | 7 | #[derive(Parser, Debug)] 8 | struct Args { 9 | #[arg(short)] 10 | file_path: String 11 | } 12 | 13 | fn main() -> Result<()>{ 14 | let args = Args::parse(); 15 | let file_path = args.file_path; 16 | 17 | let jsonl_file = File::open(&file_path)?; 18 | let reader = BufReader::new(jsonl_file); 19 | 20 | let mut array: Vec = vec![]; 21 | 22 | for line in reader.lines() { 23 | let line = line?; 24 | let json: Value = serde_json::from_str(&line)?; 25 | array.push(json); 26 | } 27 | 28 | let output_path = file_path.replace(".jsonl", ".json").to_owned(); 29 | let json_string = serde_json::to_string_pretty(&array) 30 | .with_context(|| "Failed to serialize JSON data")?; 31 | 32 | std::fs::write(&output_path, json_string.as_bytes()) 33 | .with_context(|| format!("Failed to write to file: {}", output_path))?; 34 | 35 | return Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /src/bptree/traversal.ts: -------------------------------------------------------------------------------- 1 | import { BPTree, ReferencedValue } from "./bptree"; 2 | import { BPTreeNode, MemoryPointer } from "./node"; 3 | 4 | export type TraversalRecord = { 5 | node: BPTreeNode; 6 | index: number; 7 | pointer: MemoryPointer; 8 | }; 9 | 10 | export class TraversalIterator { 11 | private tree: BPTree; 12 | private readonly key: ReferencedValue; 13 | private records: TraversalRecord[]; 14 | 15 | constructor(tree: BPTree, key: ReferencedValue) { 16 | this.tree = tree; 17 | this.key = key; 18 | this.records = []; // note this works iff all records are non-empty 19 | } 20 | 21 | async init(): Promise { 22 | const rootResponse = await this.tree.root(); 23 | 24 | if (rootResponse.rootNode === null) { 25 | return false; 26 | } 27 | 28 | const root = rootResponse.rootNode; 29 | const offset = rootResponse.pointer; 30 | this.records = await this.tree.traverse(this.key, root, offset); 31 | 32 | return true; 33 | } 34 | 35 | getKey(): ReferencedValue { 36 | return this.records[0].node.keys[this.records[0].index]; 37 | } 38 | 39 | getPointer(): MemoryPointer { 40 | return this.records[0].node.pointer(this.records[0].index); 41 | } 42 | 43 | async increment(i: number, delta: number): Promise { 44 | if (i === this.records.length) { 45 | return false; 46 | } 47 | 48 | this.records[i].index += delta; 49 | const rolloverLeft = this.records[i].index < 0; 50 | const rolloverRight = 51 | this.records[i].index >= this.records[i].node.numPointers(); 52 | 53 | if (rolloverLeft || rolloverRight) { 54 | if (!(await this.increment(i + 1, delta))) { 
-------------------------------------------------------------------------------- /src/bptree/traversal.ts: -------------------------------------------------------------------------------- 1 | import { BPTree, ReferencedValue } from "./bptree"; 2 | import { BPTreeNode, MemoryPointer } from "./node"; 3 | 4 | export type TraversalRecord = { 5 | node: BPTreeNode; 6 | index: number; 7 | pointer: MemoryPointer; 8 | }; 9 | 10 | export class TraversalIterator { 11 | private tree: BPTree; 12 | private readonly key: ReferencedValue; 13 | private records: TraversalRecord[]; 14 | 15 | constructor(tree: BPTree, key: ReferencedValue) { 16 | this.tree = tree; 17 | this.key = key; 18 | this.records = []; // note this works iff all records are non-empty 19 | } 20 | 21 | async init(): Promise<boolean> { 22 | const rootResponse = await this.tree.root(); 23 | 24 | if (rootResponse.rootNode === null) { 25 | return false; 26 | } 27 | 28 | const root = rootResponse.rootNode; 29 | const offset = rootResponse.pointer; 30 | this.records = await this.tree.traverse(this.key, root, offset); 31 | 32 | return true; 33 | } 34 | 35 | getKey(): ReferencedValue { 36 | return this.records[0].node.keys[this.records[0].index]; 37 | } 38 | 39 | getPointer(): MemoryPointer { 40 | return this.records[0].node.pointer(this.records[0].index); 41 | } 42 | 43 | async increment(i: number, delta: number): Promise<boolean> { 44 | if (i === this.records.length) { 45 | return false; 46 | } 47 | 48 | this.records[i].index += delta; 49 | const rolloverLeft = this.records[i].index < 0; 50 | const rolloverRight = 51 | this.records[i].index >= this.records[i].node.numPointers(); 52 | 53 | if (rolloverLeft || rolloverRight) { 54 | if (!(await this.increment(i + 1, delta))) { 55 | return false; 56 | } 57 | 58 | if (!this.records[i + 1]) { 59 | return false; 60 | } 61 | // propagate the rollover 62 | this.records[i].node = await this.records[i + 1].node.child( 63 | this.records[i + 1].index, 64 | ); 65 | 66 | if (rolloverLeft) { 67 | this.records[i].index = this.records[i].node.numPointers() - 1; 68 | } else { 69 | this.records[i].index = 0; 70 | } 71 | } 72 | 73 | return true; 74 | } 75 | 76 | async next(): Promise<boolean> { 77 | if (this.records.length === 0) { 78 | const res = await this.init(); 79 | 80 | return ( 81 | res && this.records[0].index !== this.records[0].node.numPointers() 82 | ); 83 | } 84 | 85 | return this.increment(0, 1); 86 | } 87 | 88 | async prev(): Promise<boolean> { 89 | if (this.records.length === 0) { 90 | const res = await this.init(); 91 | if (!res) { 92 | return false; 93 | } 94 | } 95 | 96 | return this.increment(0, -1); 97 | } 98 | } 99 | 
-------------------------------------------------------------------------------- /src/db/query-builder.ts: -------------------------------------------------------------------------------- 1 | import { Database } from "./database"; 2 | import { OrderBy, Query, Schema, WhereNode } from "./query-lang"; 3 | /** 4 | * A class for building and executing database queries in a flexible API style. 5 | * Allows chaining methods for 'where', 'orderBy', 'select', and 'limit' clauses. 6 | */ 7 | export class QueryBuilder<T extends Schema> { 8 | private queryObject: Query<T> = { 9 | where: [], 10 | orderBy: undefined, 11 | select: undefined, 12 | limit: undefined, 13 | }; 14 | 15 | constructor(private database: Database<T>) {} 16 | 17 | toQuery(): Query<T> { 18 | return { 19 | where: this.queryObject.where ? [...this.queryObject.where] : [], 20 | orderBy: this.queryObject.orderBy 21 | ? [...this.queryObject.orderBy] 22 | : undefined, 23 | select: this.queryObject.select 24 | ? [...this.queryObject.select] 25 | : undefined, 26 | limit: this.queryObject.limit, 27 | }; 28 | } 29 | 30 | /** 31 | * Executes the constructed query 32 | */ 33 | get() { 34 | return this.database.query(this.queryObject); 35 | } 36 | 37 | where( 38 | key: keyof T, 39 | operation: WhereNode<T>["operation"], 40 | value: T[keyof T], 41 | ): QueryBuilder<T> { 42 | const newQuery = new QueryBuilder<T>(this.database); 43 | newQuery.queryObject = { 44 | ...this.queryObject, 45 | where: [...(this.queryObject.where || []), { key, operation, value }], 46 | }; 47 | return newQuery; 48 | } 49 | 50 | orderBy(key: keyof T, direction: OrderBy<T>["direction"]): QueryBuilder<T> { 51 | const newQuery = new QueryBuilder<T>(this.database); 52 | newQuery.queryObject = { 53 | ...this.queryObject, 54 | orderBy: [...(this.queryObject.orderBy || []), { key, direction }], 55 | }; 56 | return newQuery; 57 | } 58 | 59 | select(keys: (keyof T)[]): QueryBuilder<T> { 60 | const newQuery = new QueryBuilder<T>(this.database); 61 | newQuery.queryObject = { 62 | ...this.queryObject, 63 | select: keys, 64 | }; 65 | return newQuery; 66 | } 67 | 68 | limit(limit: number): QueryBuilder<T> { 69 | const newQuery = new QueryBuilder<T>(this.database); 70 | newQuery.queryObject = { 71 | ...this.queryObject, 72 | limit: limit, 73 | }; 74 | return newQuery; 75 | } 76 | } 77 | 
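78 | // Sketched usage (the field name is a hypothetical schema key; each call 79 | // returns a fresh builder, so partial queries can be reused): 80 | // 81 | //   const q = new QueryBuilder(db).where("name", "==", "Alice").limit(10).toQuery(); 82 | //   // → { where: [{ key: "name", operation: "==", value: "Alice" }], 83 | //   //     orderBy: undefined, select: undefined, limit: 10 } 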
-------------------------------------------------------------------------------- /src/db/query-lang.ts: -------------------------------------------------------------------------------- 1 | import { FieldType } from "./database"; 2 | 3 | export type Schema = { 4 | [key: string]: {}; 5 | }; 6 | 7 | export type WhereNode<T extends Schema, K extends keyof T = keyof T> = { 8 | operation: "<" | "<=" | "==" | ">=" | ">"; 9 | key: keyof T; 10 | value: T[K]; 11 | }; 12 | 13 | export type SearchConfig = { 14 | minGram: number; 15 | maxGram: number; 16 | }; 17 | 18 | export type Search<T extends Schema> = { 19 | key: keyof T; 20 | like: string; 21 | config?: SearchConfig; 22 | }; 23 | 24 | export type OrderBy<T extends Schema> = { 25 | key: keyof T; 26 | direction: "ASC" | "DESC"; 27 | }; 28 | 29 | export type SelectField<T extends Schema> = keyof T; 30 | 31 | export type Query<T extends Schema> = { 32 | where?: WhereNode<T>[]; 33 | orderBy?: OrderBy<T>[]; 34 | select?: SelectField<T>[]; 35 | search?: Search<T>; 36 | limit?: number; 37 | }; 38 | 39 | type QueryWhere = { 40 | valueBuf: ArrayBuffer; 41 | fieldType: FieldType; 42 | }; 43 | 44 | export function processWhere<T extends Schema>(value: T[keyof T]): QueryWhere | null { 45 | let valueBuf: ArrayBuffer; 46 | 47 | if (value === null) { 48 | return { 49 | fieldType: FieldType.Null, 50 | valueBuf: new ArrayBuffer(0), 51 | }; 52 | } else { 53 | switch (typeof value) { 54 | case "bigint": 55 | case "number": 56 | valueBuf = new ArrayBuffer(8); 57 | new DataView(valueBuf).setFloat64(0, Number(value)); 58 | return { 59 | fieldType: FieldType.Float64, 60 | valueBuf, 61 | }; 62 | case "boolean": 63 | return { 64 | fieldType: FieldType.Boolean, 65 | valueBuf: new Uint8Array([value ? 1 : 0]).buffer, 66 | }; 67 | 68 | case "string": 69 | return { 70 | fieldType: FieldType.String, 71 | valueBuf: new TextEncoder().encode(value as string).buffer, 72 | }; 73 | } 74 | } 75 | 76 | return null; 77 | } 78 | 79 | export function handleSelect<T extends Schema>(data: string, select?: (keyof T)[]) { 80 | let jData = JSON.parse(data); 81 | if (select && select.length > 0) { 82 | return select.reduce( 83 | (acc, field) => { 84 | if (field in jData) { 85 | acc[field] = jData[field]; 86 | } 87 | return acc; 88 | }, 89 | {} as Pick<T, keyof T>, 90 | ); 91 | } 92 | 93 | return jData; 94 | } 95 | 
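96 | // Illustration (values are arbitrary): processWhere(42) returns 97 | // { fieldType: FieldType.Float64, valueBuf } with valueBuf holding 42 as an 98 | // 8-byte big-endian float64, and handleSelect('{"a":1,"b":2}', ["a"]) 99 | // returns { a: 1 }. 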
-------------------------------------------------------------------------------- /src/db/query-validation.ts: -------------------------------------------------------------------------------- 1 | import { IndexHeader } from "../file/meta"; 2 | import { FieldType, fieldTypeToString } from "./database"; 3 | import { 4 | OrderBy, 5 | Schema, 6 | Query, 7 | SelectField, 8 | WhereNode, 9 | Search, 10 | } from "./query-lang"; 11 | 12 | function checkType(headerType: number[], queryType: FieldType): boolean { 13 | return headerType.includes(queryType); 14 | } 15 | 16 | function validateWhere<T extends Schema>( 17 | where: WhereNode<T>[] | undefined, 18 | headers: IndexHeader[], 19 | ): void { 20 | if (!where || !Array.isArray(where) || where.length === 0) { 21 | throw new Error("Missing 'where' clause."); 22 | } 23 | 24 | for (const whereNode of where) { 25 | if (!["<", "<=", "==", ">=", ">"].includes(whereNode.operation)) { 26 | throw new Error("Invalid operation in 'where' clause."); 27 | } 28 | 29 | if (typeof whereNode.key !== "string") { 30 | throw new Error("'key' in 'where' clause must be a string."); 31 | } 32 | 33 | const header = headers.find((h) => h.fieldName === whereNode.key); 34 | 35 | if (!header) { 36 | throw new Error( 37 | `key: ${whereNode.key} in 'where' clause does not exist in dataset.`, 38 | ); 39 | } 40 | 41 | if (typeof whereNode.value === "undefined") { 42 | throw new Error("'value' in 'where' clause is missing."); 43 | } 44 | 45 | const headerType = header.fieldTypes; 46 | 47 | if (whereNode.value === null) { 48 | if (!checkType(headerType, FieldType.Null)) { 49 | throw new Error( 50 | `null type not included in ${whereNode.key}'s header types.`, 51 | ); 52 | } 53 | } else { 54 | switch (typeof whereNode.value) { 55 | case "bigint": 56 | case "number": 57 | if ( 58 | !checkType(headerType, FieldType.Float64) && 59 | !checkType(headerType, FieldType.Uint64) && 60 | !checkType(headerType, FieldType.Int64) 61 | ) { 62 | throw new Error( 63 | `number type not included in ${whereNode.key}'s header types.`, 64 | ); 65 | } 66 | break; 67 | 68 | case "string": 69 | if (!checkType(headerType, FieldType.String)) { 70 | throw new Error( 71 | `string type not included in ${whereNode.key}'s header types`, 72 | ); 73 | } 74 | break; 75 | 76 | case "boolean": 77 | if (!checkType(headerType, FieldType.Boolean)) { 78 | throw new Error( 79 | `boolean type not included in ${whereNode.key}'s header types`, 80 | ); 81 | } 82 | break; 83 | 84 | default: 85 | throw new Error( 86 | `unrecognized type: ${typeof whereNode.value} not included in ${whereNode.key}'s header types`, 87 | ); 88 | } 89 | } 90 | } 91 | } 92 | 93 | function validateOrderBy<T extends Schema>( 94 | orderBy: OrderBy<T>[] | undefined, 95 | whereKey: string, 96 | ): void { 97 | if (orderBy) { 98 | if (!Array.isArray(orderBy) || orderBy.length === 0) { 99 | throw new Error("Invalid 'orderBy' clause."); 100 | } 101 | 102 | // Note: currently we only support one orderBy, and its key must match the 'where' clause key. When we add composite indexes and complex querying, refactor. 103 | const orderByObj = orderBy[0]; 104 | 105 | if (!["ASC", "DESC"].includes(orderByObj.direction)) { 106 | throw new Error("Invalid direction in `orderBy`."); 107 | } 108 | 109 | if (orderByObj.key !== whereKey) { 110 | throw new Error("'key' in `orderBy` must match `key` in `where` clause"); 111 | } 112 | } 113 | } 114 | 115 | function validateSelect<T extends Schema>( 116 | select: SelectField<T>[] | undefined, 117 | headers: IndexHeader[], 118 | ): void { 119 | if (select) { 120 | if (!Array.isArray(select)) { 121 | throw new Error(`select is not an array: ${select}`); 122 | } 123 | 124 | if (select.length <= 0) { 125 | throw new Error(`select clause is empty: ${select}`); 126 | } 127 | 128 | let hset = new Set<string>(); 129 | headers.map((h) => hset.add(h.fieldName)); 130 | 131 | select.map((s) => { 132 | if (!hset.has(s as string)) { 133 | throw new Error( 134 | `${s as string} is not included in the field name headers`, 135 | ); 136 | } 137 | }); 138 | } 139 | } 140 | 141 | export function validateSearch<T extends Schema>( 142 | search: Search<T>, 143 | headers: IndexHeader[], 144 | ) { 145 | if (!search.config) { 146 | search.config = { 147 | minGram: 1, 148 | maxGram: 2, 149 | }; 150 | } 151 | const { config } = search; 152 | let { minGram, maxGram } = config; 153 | 154 | const fh = headers.find((h) => h.fieldName === search.key); 155 | 156 | if (!fh) { 157 | throw new Error( 158 | `Unable to find index header for key: ${search.key as string}`, 159 | ); 160 | } 161 | 162 | let gset = new Set([FieldType.Unigram, FieldType.Bigram, FieldType.Trigram]); 163 | const { fieldTypes } = fh; 164 | fieldTypes.forEach((ft) => (gset.has(ft) ? gset.delete(ft) : {})); 165 | 166 | if (gset.size !== 0) { 167 | throw new Error( 168 | `Unable to find valid ngram field types: ${[...gset.keys()].map((f) => fieldTypeToString(f))} for index header: ${search.key as string}.`, 169 | ); 170 | } 171 | 172 | if (maxGram > 3 || minGram > 3) { 173 | throw new Error( 174 | `Invalid gram length configuration. ${config.minGram} and ${config.maxGram} cannot be greater than 3.`, 175 | ); 176 | } 177 | 178 | if (minGram < 1 || maxGram < 1) { 179 | throw new Error( 180 | `Invalid gram length configuration. ${config.minGram} and ${config.maxGram} cannot be less than 1.`, 181 | ); 182 | } 183 | 184 | if (minGram > maxGram) { 185 | throw new Error( 186 | `Invalid gram length configuration: minGram ${config.minGram} cannot be greater than maxGram ${config.maxGram}.`, 187 | ); 188 | } 189 | } 190 | 191 | export function validateQuery<T extends Schema>( 192 | query: Query<T>, 193 | headers: IndexHeader[], 194 | ): void { 195 | if (query.search) { 196 | validateSearch(query.search, headers); 197 | } 198 | 199 | if (query.where) { 200 | validateWhere(query.where, headers); 201 | validateOrderBy(query.orderBy, query.where![0].key as string); 202 | validateSelect(query.select, headers); 203 | } 204 | } 205 | 
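206 | // Example (hypothetical key "title"): validateSearch({ key: "title", like: "alp" }, headers) 207 | // fills in the default config { minGram: 1, maxGram: 2 } and throws unless the 208 | // "title" header was indexed with unigram, bigram, and trigram field types. 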
-------------------------------------------------------------------------------- /src/file/data-file.ts: -------------------------------------------------------------------------------- 1 | import { Config } from "../index"; 2 | import { requestRanges } from "../resolver/range-request"; 3 | import { RangeResolver } from "../resolver/resolver"; 4 | 5 | export class DataFile { 6 | private originalResolver?: RangeResolver; 7 | 8 | private constructor(private resolver: RangeResolver) {} 9 | 10 | static forUrl(url: string, config: Config) { 11 | return DataFile.forResolver( 12 | async (ranges) => await requestRanges(url, ranges, config), 13 | ); 14 | } 15 | 16 | static forResolver(resolver: RangeResolver) { 17 | const instance = new DataFile(async (ranges) => { 18 | return await resolver(ranges); 19 | }); 20 | instance.originalResolver = resolver; 21 | return instance; 22 | } 23 | 24 | getResolver(): RangeResolver | undefined { 25 | return this.originalResolver; 26 | } 27 | 28 | async get(start: number, end: number) { 29 | if (end <= start) { 30 | throw new Error(`Invalid range for start: ${start} and end: ${end}`); 31 | } 32 | 33 | const res = await this.resolver([{ start, end }]); 34 | return new TextDecoder().decode(res[0].data); 35 | } 36 | } 37 | 
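38 | // Usage sketch (the URL is hypothetical): 39 | // 40 | //   const df = DataFile.forUrl("https://example.com/data.jsonl", { 41 | //     useMultipartByteRanges: true, 42 | //   }); 43 | //   const head = await df.get(0, 128); // text decoded from the requested byte range 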
-------------------------------------------------------------------------------- /src/file/index-file.ts: -------------------------------------------------------------------------------- 1 | import { LinkedMetaPage, PAGE_SIZE_BYTES, ReadMultiBPTree } from "./multi"; 2 | import { RangeResolver } from "../resolver/resolver"; 3 | import { 4 | IndexHeader, 5 | IndexMeta, 6 | collectIndexMetas, 7 | readIndexMeta, 8 | readFileMeta, 9 | FileMeta, 10 | } from "./meta"; 11 | import { FieldType } from "../db/database"; 12 | import { Config } from ".."; 13 | import { requestRanges } from "../resolver/range-request"; 14 | 15 | export class IndexFile { 16 | static async forUrl<T>(url: string, config: Config) { 17 | return await IndexFile.forResolver<T>( 18 | async (ranges) => await requestRanges(url, ranges, config), 19 | ); 20 | } 21 | 22 | static async forResolver<T>( 23 | resolver: RangeResolver, 24 | ): Promise<VersionedIndexFile<T>> { 25 | return new IndexFileV1<T>(resolver); 26 | } 27 | } 28 | 29 | export interface VersionedIndexFile<T> { 30 | getResolver(): RangeResolver; 31 | 32 | tree(): Promise<LinkedMetaPage>; 33 | 34 | metadata(): Promise<FileMeta>; 35 | 36 | indexHeaders(): Promise<IndexHeader[]>; 37 | 38 | seek(header: string, fieldType: FieldType): Promise<LinkedMetaPage[]>; 39 | 40 | fetchMetaPages(): Promise<void>; 41 | } 42 | 43 | export class IndexFileV1<T> implements VersionedIndexFile<T> { 44 | private _tree?: LinkedMetaPage; 45 | 46 | private linkedMetaPages: LinkedMetaPage[] = []; 47 | 48 | constructor(private resolver: RangeResolver) {} 49 | 50 | getResolver(): RangeResolver { 51 | return this.resolver; 52 | } 53 | 54 | async tree(): Promise<LinkedMetaPage> { 55 | if (this._tree) { 56 | return this._tree; 57 | } 58 | 59 | const tree = ReadMultiBPTree(this.resolver, 0); 60 | 61 | this._tree = tree; 62 | return tree; 63 | } 64 | 65 | async metadata(): Promise<FileMeta> { 66 | const tree = await this.tree(); 67 | const buffer = await tree.metadata(); 68 | 69 | return readFileMeta(buffer); 70 | } 71 | 72 | async seek(header: string, fieldType: FieldType): Promise<LinkedMetaPage[]> { 73 | const tree = await this.tree(); 74 | let currMp = await tree.next(); 75 | 76 | if (!currMp) { 77 | throw new Error(`failed to fetch meta pages`); 78 | } 79 | 80 | let headerMps = []; 81 | 82 | while (true) { 83 | const indexMeta = readIndexMeta(await currMp.metadata()); 84 | if (indexMeta.fieldName === header) { 85 | if (fieldType === FieldType.Float64) { 86 | // if key is a number or bigint, we cast it as a float64 type 87 | if ( 88 | indexMeta.fieldType === FieldType.Float64 || 89 | indexMeta.fieldType === FieldType.Int64 || 90 | indexMeta.fieldType === FieldType.Uint64 91 | ) { 92 | headerMps.push(currMp); 93 | } 94 | } else { 95 | if (fieldType === indexMeta.fieldType) { 96 | headerMps.push(currMp); 97 | } 98 | } 99 | } 100 | 101 | const nextMp = await currMp.next(); 102 | if (!nextMp) { 103 | break; 104 | } 105 | currMp = nextMp; 106 | } 107 | 108 | return headerMps; 109 | } 110 | 111 | async fetchMetaPages(): Promise<void> { 112 | const tree = await this.tree(); 113 | let currMp = await tree.next(); 114 | 115 | if (!currMp) { 116 | throw new Error(`failed to fetch meta pages`); 117 | } 118 | 119 | while (true) { 120 | this.linkedMetaPages.push(currMp); 121 | 122 | const nextMp = await currMp.next(); 123 | if (!nextMp) { 124 | break; 125 | } 126 | currMp = nextMp; 127 | } 128 | } 129 | 130 | async indexHeaders(): Promise<IndexHeader[]> { 131 | if (this.linkedMetaPages.length === 0) { 132 | await this.fetchMetaPages(); 133 | } 134 | 135 | let indexMetas: IndexMeta[] = []; 136 | for (let idx = 0; idx <= this.linkedMetaPages.length - 1; idx++) { 137 | const currMp = this.linkedMetaPages[idx]; 138 | const im = readIndexMeta(await currMp.metadata()); 139 | indexMetas.push(im); 140 | const nextMp = await currMp.next(); 141 | if (!nextMp) { 142 | break; 143 | } 144 | } 145 | 146 | return collectIndexMetas(indexMetas); 147 | } 148 | } 149 | 
-------------------------------------------------------------------------------- /src/file/meta.ts: -------------------------------------------------------------------------------- 1 | import { decodeUvarint } from "../util/uvarint"; 2 | 3 | export enum FileFormat { 4 | JSONL = 0, 5 | CSV = 1, 6 | } 7 | 8 | export type FileMeta = { 9 | version: number; 10 | format: FileFormat; 11 | readOffset: bigint; 12 | entries: number; 13 | }; 14 | 15 | export async function readFileMeta(buffer: ArrayBuffer): Promise<FileMeta> { 16 | // unmarshall binary for FileMeta 17 | if (buffer.byteLength <= 10) { 18 | throw new Error( 19 | `incorrect byte length! Want: at least 11, got ${buffer.byteLength}`, 20 | ); 21 | } 22 | 23 | const dataView = new DataView(buffer); 24 | const version = dataView.getUint8(0); 25 | const format = dataView.getUint8(1); 26 | 27 | if (Object.values(FileFormat).indexOf(format) === -1) { 28 | throw new Error(`unexpected file format. Got: ${format}`); 29 | } 30 | 31 | const readOffset = dataView.getBigUint64(2, true); 32 | 33 | const { value: entries } = decodeUvarint(buffer.slice(10)); 34 | 35 | return { 36 | version, 37 | format, 38 | readOffset, 39 | entries, 40 | }; 41 | } 42 | 43 | export type IndexMeta = { 44 | fieldName: string; 45 | fieldType: number; 46 | width: number; 47 | totalFieldValueLength: number; 48 | }; 49 | 50 | export type IndexHeader = { 51 | fieldName: string; 52 | fieldTypes: number[]; 53 | }; 54 | 55 | export function readIndexMeta(buffer: ArrayBuffer): IndexMeta { 56 | if (buffer.byteLength < 6) { 57 | throw new Error(`invalid metadata size ${buffer.byteLength}`); 58 | } 59 | 60 | const dataView = new DataView(buffer); 61 | const fieldType = dataView.getUint16(0, true); 62 | const width = dataView.getUint16(2, true); 63 | const nameLength = dataView.getUint16(4, true); 64 | 65 | if (buffer.byteLength < 6 + nameLength) { 66 | throw new Error(`invalid metadata size ${buffer.byteLength}`); 67 | } 68 | 69 | const fieldNameBuffer = buffer.slice(6, 6 + nameLength); 70 | const fieldName = new TextDecoder("utf-8").decode(fieldNameBuffer); 71 | 72 | const { value: totalFieldValueLength } = decodeUvarint( 73 | buffer.slice(6 + nameLength), 74 | ); 75 | 76 | return { 77 | fieldName, 78 | fieldType, 79 | width, 80 | totalFieldValueLength, 81 | }; 82 | } 83 | 84 | export function collectIndexMetas(indexMetas: IndexMeta[]): IndexHeader[] { 85 | const headersMap: Map<string, number[]> = new Map(); 86 | 87 | for (const meta of indexMetas) { 88 | if (!headersMap.has(meta.fieldName)) { 89 | headersMap.set(meta.fieldName, [meta.fieldType]); 90 | } else { 91 | const updatedTypes = headersMap.get(meta.fieldName); 92 | updatedTypes?.push(meta.fieldType); 93 | headersMap.set(meta.fieldName, updatedTypes!); 94 | } 95 | } 96 | 97 | const indexHeaders: IndexHeader[] = []; 98 | headersMap.forEach((fieldTypes, fieldName) => { 99 | indexHeaders.push({ fieldName, fieldTypes }); 100 | }); 101 | 102 | return indexHeaders; 103 | } 104 | 
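105 | // For illustration, a minimal buffer readFileMeta accepts: version (u8 at 0), 106 | // format (u8 at 1), readOffset (u64 little-endian at 2), entries as a uvarint 107 | // from byte 10: 108 | // 109 | //   const buf = new ArrayBuffer(11); 110 | //   const dv = new DataView(buf); 111 | //   dv.setUint8(0, 1); // version 112 | //   dv.setUint8(1, 0); // FileFormat.JSONL 113 | //   dv.setBigUint64(2, 4096n, true); // readOffset 114 | //   new Uint8Array(buf)[10] = 34; // entries (single-byte uvarint) 115 | //   await readFileMeta(buf); // → { version: 1, format: 0, readOffset: 4096n, entries: 34 } 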
-------------------------------------------------------------------------------- /src/file/multi.ts: -------------------------------------------------------------------------------- 1 | import { RangeResolver } from "../resolver/resolver"; 2 | import { MemoryPointer } from "../bptree/node"; 3 | 4 | export const PAGE_SIZE_BYTES = 4096; 5 | export const SLOT_SIZE_BYTES = 256; 6 | export const maxUint64 = 2n ** 64n - 1n; 7 | const POINTER_BYTES = 8; 8 | const LENGTH_BYTES = 4; 9 | const COUNT_BYTE = 1; 10 | 11 | export class LinkedMetaPage { 12 | constructor( 13 | private readonly resolver: RangeResolver, 14 | private readonly offset: bigint, 15 | private readonly index: number, 16 | private metaPageDataPromise?: Promise< 17 | { data: ArrayBuffer; totalLength: number }[] 18 | >, 19 | ) {} 20 | 21 | async root(): Promise<MemoryPointer> { 22 | const pageData = await this.getMetaPage(); 23 | 24 | // we seek by 12 bytes since offset is 8 bytes, length is 4 bytes 25 | const data = pageData.slice( 26 | this.rootMemoryPointerPageOffset(), 27 | this.rootMemoryPointerPageOffset() + POINTER_BYTES + LENGTH_BYTES, 28 | ); 29 | 30 | if (data.byteLength != POINTER_BYTES + LENGTH_BYTES) { 31 | throw new Error( 32 | `failed to properly fetch root node. Got ${data.byteLength}`, 33 | ); 34 | } 35 | 36 | const view = new DataView(data); 37 | 38 | const pointerOffset = view.getBigUint64(0, true); 39 | const lengthOffset = view.getUint32(POINTER_BYTES, true); 40 | 41 | return { 42 | offset: pointerOffset, 43 | length: lengthOffset, 44 | }; 45 | } 46 | 47 | async metadata(): Promise<ArrayBuffer> { 48 | const pageData = await this.getMetaPage(); 49 | const rootPointer = POINTER_BYTES + LENGTH_BYTES; 50 | const metadata = pageData.slice( 51 | this.rootMemoryPointerPageOffset() + rootPointer, 52 | ); 53 | const metadataView = new DataView(metadata); 54 | // we need to seek past the root pointer 55 | const metadataLength = metadataView.getUint8(0); 56 | return metadataView.buffer.slice(1, 1 + metadataLength); 57 | } 58 | 59 | private async getMetaPage(): Promise<ArrayBuffer> { 60 | if (!this.metaPageDataPromise) { 61 | this.metaPageDataPromise = this.resolver([ 62 | { 63 | start: Number(this.offset), 64 | end: Number(this.offset) + PAGE_SIZE_BYTES - 1, 65 | }, 66 | ]); 67 | } 68 | 69 | const res = await this.metaPageDataPromise; 70 | const { data } = res[0]; 71 | 72 | return data; 73 | } 74 | 75 | async next() { 76 | const pageData = await this.getMetaPage(); 77 | const view = new DataView(pageData); 78 | 79 | const count = view.getUint8(POINTER_BYTES); 80 | 81 | if (this.index < count - 1) { 82 | return new LinkedMetaPage( 83 | this.resolver, 84 | this.offset, 85 | this.index + 1, 86 | this.metaPageDataPromise, 87 | ); 88 | } 89 | 90 | const nextOffset = view.getBigUint64(0, true); 91 | 92 | if (nextOffset === maxUint64) { 93 | return null; 94 | } 95 | 96 | return new LinkedMetaPage(this.resolver, nextOffset, 0); 97 | } 98 | 99 | private rootMemoryPointerPageOffset(): number { 100 | return ( 101 | POINTER_BYTES + 102 | COUNT_BYTE + 103 | this.index * (POINTER_BYTES + COUNT_BYTE + SLOT_SIZE_BYTES) 104 | ); 105 | } 106 | } 107 | 108 | export function ReadMultiBPTree( 109 | resolver: RangeResolver, 110 | idx: number, 111 | ): LinkedMetaPage { 112 | let offset = idx < 0 ? BigInt(0) : BigInt(idx + 1) * BigInt(PAGE_SIZE_BYTES); 113 | return new LinkedMetaPage(resolver, offset, 0); 114 | } 115 | 
-------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { DataFile } from "./file/data-file"; 2 | import { Database, FieldType, fieldTypeToString } from "./db/database"; 3 | import { IndexFile } from "./file/index-file"; 4 | import { RangeResolver } from "./resolver/resolver"; 5 | 6 | export type Config = { 7 | useMultipartByteRanges?: boolean; 8 | }; 9 | 10 | export async function init( 11 | dataUrl: string | RangeResolver, 12 | indexUrl: string | RangeResolver, 13 | config?: Config, 14 | ) { 15 | if (!config) { 16 | config = { useMultipartByteRanges: true }; 17 | } 18 | 19 | return Database.forDataFileAndIndexFile( 20 | typeof dataUrl === "string" 21 | ? DataFile.forUrl(dataUrl, config) 22 | : DataFile.forResolver(dataUrl), 23 | typeof indexUrl === "string" 24 | ? await IndexFile.forUrl(indexUrl, config) 25 | : await IndexFile.forResolver(indexUrl), 26 | ); 27 | } 28 | 29 | interface GlobalMap { 30 | Appendable: { 31 | init: Function; 32 | FieldType: typeof FieldType; 33 | fieldTypeToString: Function; 34 | }; 35 | } 36 | 37 | declare global { 38 | var Appendable: GlobalMap["Appendable"]; 39 | } 40 | 41 | globalThis.Appendable = { 42 | init, 43 | FieldType, 44 | fieldTypeToString, 45 | }; 46 | -------------------------------------------------------------------------------- /src/ngram/table.ts: -------------------------------------------------------------------------------- 1 | type Entry<K> = { key: K; score: number }; 2 | 3 | export class PriorityTable<K> { 4 | private map: Map<K, number> = new Map(); 5 | 6 | insert(key: K, score: number) { 7 | const prevScore = this.map.get(key) ?? 0; 8 | this.map.set(key, prevScore + score); 9 | } 10 | 11 | top(): Entry<K>[] { 12 | return Array.from(this.map, ([key, score]) => ({ key, score })).sort( 13 | (m, n) => n.score - m.score, 14 | ); 15 | } 16 | get size(): number { 17 | return this.map.size; 18 | } 19 | 20 | clear(): void { 21 | this.map.clear(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/ngram/tokenizer.ts: -------------------------------------------------------------------------------- 1 | import { FieldType } from "../db/database"; 2 | 3 | export type NgramToken = { 4 | value: string; 5 | valueBuf: ArrayBuffer; 6 | type: FieldType; 7 | }; 8 | 9 | export class NgramTokenizer { 10 | private readonly minGram: number; 11 | private readonly maxGram: number; 12 | 13 | private allGrams: Map<number, FieldType> = new Map([ 14 | [1, FieldType.Unigram], 15 | [2, FieldType.Bigram], 16 | [3, FieldType.Trigram], 17 | ]); 18 | 19 | private static encoder: TextEncoder = new TextEncoder(); 20 | 21 | constructor(minGram: number, maxGram: number) { 22 | this.maxGram = maxGram; 23 | this.minGram = minGram; 24 | } 25 | 26 | tokens(phrase: string): NgramToken[] { 27 | let ngrams: NgramToken[] = []; 28 | 29 | let wordOffsets: number[][] = []; 30 | let currentWordOffsets: number[] = []; 31 | 32 | Array.from(phrase).forEach((c, idx) => { 33 | if (/[a-zA-Z0-9]/.test(c)) { 34 | currentWordOffsets.push(idx); 35 | } else if (/\s/.test(c)) { 36 | if (currentWordOffsets.length >= this.minGram) { 37 | wordOffsets.push(currentWordOffsets); 38 | } 39 | currentWordOffsets = []; 40 | } 41 | }); 42 | 43 | if (currentWordOffsets.length >= this.minGram) { 44 | wordOffsets.push(currentWordOffsets); 45 | } 46 | 47 | for (let N = this.minGram; N <= this.maxGram; N++) { 48 | const gType = this.allGrams.get(N); 49 | 50 | if (!gType) { 51 | throw new Error(`Unrecognized gram type for gram length: ${N}`); 52 | } 53 | 54 | wordOffsets.forEach((word) => { 55 | for (let idx = 0; idx <= word.length - N; idx++) { 56 | let str = ""; 57 | 58 | for (let jdx = idx; jdx <= idx + N - 1; jdx++) { 59 | str += phrase[word[jdx]]; 60 | } 61 | 62 | let value = str.toLowerCase(); 63 | 64 | ngrams.push({ 65 | value, 66 | valueBuf: NgramTokenizer.encoder.encode(value).buffer, 67 | type: gType, 68 | }); 69 | } 70 | }); 71 | } 72 | 73 | return ngrams; 74 | } 75 | 76 | static shuffle(tokens: NgramToken[]): NgramToken[] { 77 | // https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle 78 | let soup = [...tokens]; 79 | 80 | for (let idx = tokens.length - 1; idx > 0; idx--) { 81 | const jdx = Math.floor(Math.random() * (idx + 1)); 82 | [soup[idx], soup[jdx]] = [soup[jdx], soup[idx]]; 83 | } 84 | 85 | return soup; 86 | } 87 | } 88 | 
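89 | // Example: with minGram 1 and maxGram 2, all 1-grams come first, then 2-grams: 90 | // 91 | //   const tok = new NgramTokenizer(1, 2); 92 | //   tok.tokens("Hi there").map((t) => t.value); 93 | //   // → ["h", "i", "t", "h", "e", "r", "e", "hi", "th", "he", "er", "re"] 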
-------------------------------------------------------------------------------- /src/resolver/cache.ts: -------------------------------------------------------------------------------- 1 | import { RangeResolver } from "./resolver"; 2 | 3 | export function cache(resolver: RangeResolver): RangeResolver { 4 | const cache: [ 5 | [number, number], 6 | Promise<{ data: ArrayBuffer; totalLength: number }[]>, 7 | ][] = []; 8 | 9 | return async ([{ start, end }]): Promise< 10 | { data: ArrayBuffer; totalLength: number }[] 11 | > => { 12 | // check if start-end is contained in any of the cached ranges 13 | const cached = cache.find(([[s, e]]) => s <= start && end <= e); 14 | if (cached) { 15 | return cached[1].then((cachedData) => { 16 | const data = cachedData[0].data.slice( 17 | start - cached[0][0], 18 | end - cached[0][0], 19 | ); 20 | return [ 21 | { 22 | data, 23 | totalLength: cachedData[0].totalLength, 24 | }, 25 | ]; 26 | }); 27 | } 28 | 29 | // TODO: check if start-end overlaps with any of the cached ranges 30 | 31 | const promise = resolver([{ start, end }]); 32 | cache.push([[start, end], promise]); 33 | return promise; 34 | }; 35 | } 36 | 
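37 | // Usage sketch: a repeated read that falls inside an earlier range is served 38 | // from the cached promise instead of hitting the resolver again. 39 | // 40 | //   const cached = cache(resolver); 41 | //   await cached([{ start: 0, end: 4095 }]); // resolves via the resolver 42 | //   await cached([{ start: 0, end: 4095 }]); // replayed from the cache 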
reader( 76 | ptr + length >= buf.length 77 | ? buf.subarray((ptr + length) % buf.length, ptr) 78 | : buf.subarray(ptr + length, buf.length), 79 | ); 80 | if (done) { 81 | return done; 82 | } 83 | length += value.length; 84 | return false; 85 | }; 86 | 87 | while (true) { 88 | // read boundary 89 | for (let i = 0; i < boundary.length; i++) { 90 | while (length === 0) { 91 | if (await extend()) { 92 | return; 93 | } 94 | } 95 | if (buf[ptr] !== boundary.charCodeAt(i)) { 96 | console.log("boundary.charCode", buf[ptr], boundary.charCodeAt(i), i); 97 | throw new Error("Invalid boundary"); 98 | } 99 | ptr = (ptr + 1) % buf.length; 100 | length--; 101 | } 102 | 103 | // read the boundary terminator 104 | for (const c of ["\r", "\n"]) { 105 | while (length === 0) { 106 | if (await extend()) { 107 | return; 108 | } 109 | } 110 | if (buf[ptr] === c.charCodeAt(0)) { 111 | ptr = (ptr + 1) % buf.length; 112 | length--; 113 | } else if (buf[ptr] === "-".charCodeAt(0)) { 114 | // eof 115 | return; 116 | } else { 117 | // invalid boundary 118 | throw new Error("Invalid boundary"); 119 | } 120 | } 121 | 122 | // read headers 123 | let lastByte = 0; 124 | let header: number[] = []; 125 | while (true) { 126 | while (length === 0) { 127 | if (await extend()) { 128 | return; 129 | } 130 | } 131 | const byte = buf[ptr]; 132 | ptr = (ptr + 1) % buf.length; 133 | length--; 134 | if (lastByte === "\r".charCodeAt(0) && byte === "\n".charCodeAt(0)) { 135 | // end of header 136 | if (header.length === 1 /* it's an \r */) { 137 | // end of headers 138 | break; 139 | } else { 140 | const decoded = new TextDecoder().decode(new Uint8Array(header)); 141 | const tokens = decoded.split(":", 2); 142 | if (tokens.length !== 2) { 143 | throw new Error(`Invalid header: ${decoded}`); 144 | } 145 | const [key, value] = tokens; 146 | headers[key.trim()] = value.trim(); 147 | header.length = 0; 148 | } 149 | } else { 150 | header.push(byte); 151 | } 152 | lastByte = byte; 153 | } 154 | 155 | // read body 156 | // read the Content-Range header 157 | if (!headers["Content-Range"]) { 158 | // TODO: read until the next boundary 159 | throw new Error("Missing Content-Range header"); 160 | } 161 | const [unit, start, end] = parseContentRangeHeader( 162 | headers["Content-Range"], 163 | ); 164 | if (unit !== "bytes") { 165 | throw new Error("Invalid Content-Range header"); 166 | } 167 | const contentLength = end - start + 1; 168 | const data = new Uint8Array(contentLength); 169 | for (let i = 0; i < contentLength; i++) { 170 | while (length === 0) { 171 | if (await extend()) { 172 | return; 173 | } 174 | } 175 | data[i] = buf[ptr]; 176 | ptr = (ptr + 1) % buf.length; 177 | length--; 178 | } 179 | yield { data: data.buffer, headers }; 180 | headers = {}; 181 | 182 | // read the trailing \r\n 183 | for (const c of ["\r", "\n"]) { 184 | while (length === 0) { 185 | if (await extend()) { 186 | return; 187 | } 188 | } 189 | if (buf[ptr] === c.charCodeAt(0)) { 190 | ptr = (ptr + 1) % buf.length; 191 | length--; 192 | } else { 193 | // invalid boundary 194 | throw new Error("Invalid boundary"); 195 | } 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /src/resolver/range-request.ts: -------------------------------------------------------------------------------- 1 | import { Config } from "../index"; 2 | import parseMultipartBody from "./multipart"; 3 | import { LengthIntegrityError } from "./resolver"; 4 | 5 | async function resolveIndividualPromises( 6 | url: string, 7 | ranges: { 
start: number; end: number; expectedLength?: number }[],
8 | ) {
9 |   // fallback to resolving ranges individually
10 |   const individualRangePromises = ranges.map(
11 |     async ({ start, end, expectedLength }) => {
12 |       const rangeHeader = `${start}-${end}`;
13 |       const res = await fetch(url, {
14 |         headers: { Range: `bytes=${rangeHeader}` },
15 |       });
16 | 
17 |       const totalLength = Number(
18 |         res.headers.get("Content-Range")!.split("/")[1],
19 |       );
20 |       if (expectedLength && totalLength !== expectedLength) {
21 |         throw new LengthIntegrityError();
22 |       }
23 |       return {
24 |         data: await res.arrayBuffer(),
25 |         totalLength: totalLength,
26 |       };
27 |     },
28 |   );
29 |   return await Promise.all(individualRangePromises);
30 | }
31 | 
32 | export async function requestRanges(
33 |   url: string,
34 |   ranges: { start: number; end: number; expectedLength?: number }[],
35 |   config: Config,
36 | ): Promise<{ data: ArrayBuffer; totalLength: number }[]> {
37 |   const { useMultipartByteRanges } = config;
38 |   if (useMultipartByteRanges === false) {
39 |     return await resolveIndividualPromises(url, ranges);
40 |   }
41 | 
42 |   for (const { start, end } of ranges) {
43 |     if (end - start <= 0) {
44 |       throw new Error(
45 |         `Invalid range: start (${start}) must be strictly less than end (${end}).`,
46 |       );
47 |     }
48 |   }
49 | 
50 |   const rangesHeader = ranges
51 |     .map(({ start, end }) => `${start}-${end}`)
52 |     .join(",");
53 | 
54 |   const response = await fetch(url, {
55 |     headers: {
56 |       Range: `bytes=${rangesHeader}`,
57 |       Accept: "multipart/byteranges",
58 |     },
59 |   });
60 | 
61 |   switch (response.status) {
62 |     case 200:
63 |       console.warn(
64 |         `useMultipartByteRanges is enabled but the server did not respond with a subset of bytes. Set useMultipartByteRanges: false in your Appendable config object.`,
65 |       );
66 |       return await resolveIndividualPromises(url, ranges);
67 |     case 206:
68 |       const contentType = response.headers.get("Content-Type");
69 |       if (!contentType) {
70 |         throw new Error("Missing Content-Type in response");
71 |       }
72 |       if (contentType.includes("multipart/byteranges")) {
73 |         let chunks = [];
74 | 
75 |         if (!response.body) {
76 |           throw new Error(`response body is null: ${response.body}`);
77 |         }
78 | 
79 |         for await (const chunk of parseMultipartBody(
80 |           contentType,
81 |           response.body,
82 |         )) {
83 |           chunks.push(chunk);
84 |         }
85 | 
86 |         // drop a trailing empty chunk: the final boundary marker is followed by another delimiter.
87 |         if (chunks.length > 0 && chunks[chunks.length - 1].data === undefined) {
88 |           chunks.pop();
89 |         }
90 | 
91 |         return chunks.map(({ data, headers }) => {
92 |           const totalLengthStr = (headers["Content-Range"] ?? headers["content-range"])?.split("/")[1]; // header casing follows whatever the server sent
93 |           const totalLength = totalLengthStr ?
parseInt(totalLengthStr, 10) : 0; 94 | 95 | return { data, totalLength }; 96 | }); 97 | } else if (response.headers.has("Content-Range")) { 98 | const abuf = await response.arrayBuffer(); 99 | const totalLength = Number( 100 | response.headers.get("Content-Range")!.split("/")[1], 101 | ); 102 | return [ 103 | { 104 | data: abuf, 105 | totalLength: totalLength, 106 | }, 107 | ]; 108 | } else { 109 | throw new Error(`Unexpected response format: ${contentType}`); 110 | } 111 | case 416: 112 | const requestedRange = response.headers.get("Range") || rangesHeader; 113 | throw new Error( 114 | `Resolver cannot serve the requested ranges: ${requestedRange}`, 115 | ); 116 | default: 117 | throw new Error(`Expected 206 or 200 response, got ${response.status}`); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/resolver/resolver.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * RangeResolver is a function that takes a range of bytes and returns a promise 3 | * that resolves to an ArrayBuffer containing the bytes in that range. Note that 4 | * the range is inclusive. 5 | * 6 | * Additionally, the RangeResolver must return a checksum which is computed from 7 | * the source data. This checksum is used to verify that the data has not been 8 | * changed between requests. The checksum can be any type, for example it is 9 | * valid to use the last modified timestamp of the source data or the total 10 | * length of the data. This checksum is passed to the RangeResolver on future 11 | * requests as the `checksum` argument. If it does not match the checksum when 12 | * reading the data, the RangeResolver should throw a LengthIntegrityError. 13 | * 14 | * @see LengthIntegrityError 15 | */ 16 | export type RangeResolver = ( 17 | args: { 18 | start: number; 19 | end: number; 20 | expectedLength?: number; 21 | }[], 22 | ) => Promise< 23 | { 24 | data: ArrayBuffer; 25 | totalLength: number; 26 | }[] 27 | >; 28 | 29 | /** 30 | * LengthIntegrityError is thrown by a RangeResolver when the length argument is 31 | * inconsistent with the data returned. This is used to detect when the data has 32 | * changed between requests. 33 | * 34 | * When a LengthIntegrityError is thrown, typically the cache is evicted and the 35 | * query will be tried again with the exception of the data file where the error 36 | * is ignored due to the assumed immutability of the data file. 
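 *
 * As an illustrative sketch only (`underlyingResolver` and the range values
 * below are hypothetical, not part of this package's API; `cache` is the
 * wrapper from ./cache shown earlier): a caller holding a cached resolver
 * might evict and retry once on a checksum mismatch:
 *
 *   let resolve = cache(underlyingResolver);
 *   try {
 *     await resolve([{ start: 0, end: 4095, expectedLength: 1270 }]);
 *   } catch (e) {
 *     if (!(e instanceof LengthIntegrityError)) throw e;
 *     resolve = cache(underlyingResolver); // drop the stale cache
 *     await resolve([{ start: 0, end: 4095, expectedLength: 1270 }]);
 *   }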
37 | * 38 | * @see RangeResolver 39 | */ 40 | export class LengthIntegrityError extends Error { 41 | constructor() { 42 | super("length integrity error"); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/tests/bptree.test.ts: -------------------------------------------------------------------------------- 1 | import { BPTree, MetaPage, ReferencedValue } from "../bptree/bptree"; 2 | import { MemoryPointer } from "../bptree/node"; 3 | import { FieldType } from "../db/database"; 4 | import { FileFormat } from "../file/meta"; 5 | import { RangeResolver } from "../resolver/resolver"; 6 | import { readBinaryFile } from "./test-util"; 7 | import { maxUint64 } from "../file/multi"; 8 | 9 | class testMetaPage implements MetaPage { 10 | private readonly rootMP: MemoryPointer; 11 | 12 | constructor(mp: MemoryPointer) { 13 | this.rootMP = mp; 14 | } 15 | 16 | async root(): Promise { 17 | return this.rootMP; 18 | } 19 | } 20 | 21 | describe("test BPTree", () => { 22 | let mockRangeResolver: RangeResolver; 23 | let mockDataFileResolver: RangeResolver; 24 | let btree: BPTree; 25 | 26 | beforeEach(() => { 27 | mockDataFileResolver = async ([]) => { 28 | return [ 29 | { 30 | data: new ArrayBuffer(0), 31 | totalLength: 0, 32 | }, 33 | ]; 34 | }; 35 | 36 | mockRangeResolver = async ([{ start, end }]) => { 37 | const indexFile = await readBinaryFile("btree_1.bin"); 38 | const slicedPart = indexFile.slice(start, end + 1); 39 | 40 | const arrayBuffer = slicedPart.buffer.slice( 41 | slicedPart.byteOffset, 42 | slicedPart.byteOffset + slicedPart.byteLength, 43 | ); 44 | 45 | return [ 46 | { 47 | data: arrayBuffer, 48 | totalLength: arrayBuffer.byteLength, 49 | }, 50 | ]; 51 | }; 52 | 53 | const page = new testMetaPage({ offset: 8192n, length: 88 }); 54 | btree = new BPTree( 55 | mockRangeResolver, 56 | page, 57 | mockDataFileResolver, 58 | FileFormat.CSV, 59 | FieldType.String, 60 | 6, 61 | 4, 62 | ); 63 | }); 64 | 65 | it("should read a bptree and find items", async () => { 66 | let idx = 1; 67 | for (const value of ["hello", "world", "moooo", "cooow"]) { 68 | const keyBuf = new TextEncoder().encode(value).buffer; 69 | const key = new ReferencedValue({ offset: 0n, length: 0 }, keyBuf); 70 | 71 | const [rv, mp] = await btree.find(key); 72 | 73 | expect(value).toEqual(new TextDecoder().decode(rv.value)); 74 | expect(mp.offset).toEqual(BigInt(idx)); 75 | idx += 1; 76 | } 77 | }); 78 | }); 79 | 80 | describe("test BPTree iterator count", () => { 81 | let mockRangeResolver: RangeResolver; 82 | let mockDataFileResolver: RangeResolver; 83 | let btree: BPTree; 84 | 85 | beforeEach(() => { 86 | mockDataFileResolver = async ([]) => { 87 | return [ 88 | { 89 | data: new ArrayBuffer(0), 90 | totalLength: 0, 91 | }, 92 | ]; 93 | }; 94 | 95 | mockRangeResolver = async ([{ start, end }]) => { 96 | const indexFile = await readBinaryFile("btree_1023.bin"); 97 | const slicedPart = indexFile.slice(start, end + 1); 98 | 99 | const arrayBuffer = slicedPart.buffer.slice( 100 | slicedPart.byteOffset, 101 | slicedPart.byteOffset + slicedPart.byteLength, 102 | ); 103 | 104 | return [ 105 | { 106 | data: arrayBuffer, 107 | totalLength: arrayBuffer.byteLength, 108 | }, 109 | ]; 110 | }; 111 | 112 | const page = new testMetaPage({ offset: 8192n, length: 88 }); 113 | btree = new BPTree( 114 | mockRangeResolver, 115 | page, 116 | mockDataFileResolver, 117 | FileFormat.CSV, 118 | FieldType.String, 119 | 9, 120 | 10, 121 | ); 122 | }); 123 | 124 | it("should count the value 23 10 times", 
async () => { 125 | const valueBuf = new ArrayBuffer(8); 126 | new DataView(valueBuf).setFloat64(0, Number(23)); 127 | 128 | const valueRef = new ReferencedValue({ offset: 0n, length: 0 }, valueBuf); 129 | 130 | const iter = btree.iter(valueRef); 131 | 132 | let count = 0; 133 | 134 | while (await iter.next()) { 135 | const currKey = iter.getKey(); 136 | if (ReferencedValue.compareBytes(valueBuf, currKey.value) === 0) { 137 | count++; 138 | } 139 | } 140 | 141 | expect(count).toEqual(10); 142 | }); 143 | 144 | it("should count the value 23 10 times reverse", async () => { 145 | const valueBuf = new ArrayBuffer(8); 146 | new DataView(valueBuf).setFloat64(0, Number(23)); 147 | 148 | const valueRef = new ReferencedValue( 149 | { offset: maxUint64, length: 0 }, 150 | valueBuf, 151 | ); 152 | 153 | const iter = btree.iter(valueRef); 154 | let count = 0; 155 | 156 | while (await iter.prev()) { 157 | const currKey = iter.getKey(); 158 | if (ReferencedValue.compareBytes(valueBuf, currKey.value) === 0) { 159 | count++; 160 | } 161 | } 162 | 163 | expect(count).toEqual(10); 164 | }); 165 | }); 166 | -------------------------------------------------------------------------------- /src/tests/index-file.test.ts: -------------------------------------------------------------------------------- 1 | import { FieldType } from "../db/database"; 2 | import { FileFormat, readFileMeta, readIndexMeta } from "../file/meta"; 3 | import { readBinaryFile } from "./test-util"; 4 | 5 | describe("test file parsing", () => { 6 | let fileMetaBuffer: Uint8Array; 7 | let indexMetaBuffer: Uint8Array; 8 | 9 | beforeAll(async () => { 10 | fileMetaBuffer = await readBinaryFile("filemeta.bin"); 11 | indexMetaBuffer = await readBinaryFile("indexmeta.bin"); 12 | }); 13 | 14 | it("should read the file meta", async () => { 15 | const fileMeta = await readFileMeta(fileMetaBuffer.buffer); 16 | expect(fileMeta.format).toEqual(FileFormat.CSV); 17 | expect(fileMeta.version).toEqual(1); 18 | expect(fileMeta.readOffset).toEqual(4096n); 19 | expect(fileMeta.entries).toEqual(34); 20 | }); 21 | 22 | it("should read the index meta", async () => { 23 | const indexMeta = await readIndexMeta(indexMetaBuffer.buffer); 24 | expect(indexMeta.width).toEqual(2); 25 | expect(indexMeta.fieldName).toEqual("howdydo"); 26 | expect(indexMeta.fieldType).toEqual(FieldType.Boolean); 27 | expect(indexMeta.totalFieldValueLength).toEqual(773424601); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /src/tests/mock_binaries/btree_1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/btree_1.bin -------------------------------------------------------------------------------- /src/tests/mock_binaries/btree_1023.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/btree_1023.bin -------------------------------------------------------------------------------- /src/tests/mock_binaries/btree_iterator.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/btree_iterator.bin -------------------------------------------------------------------------------- 
/src/tests/mock_binaries/filemeta.bin: -------------------------------------------------------------------------------- 1 | " -------------------------------------------------------------------------------- /src/tests/mock_binaries/filled_metadata.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/filled_metadata.bin -------------------------------------------------------------------------------- /src/tests/mock_binaries/indexmeta.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/indexmeta.bin -------------------------------------------------------------------------------- /src/tests/mock_binaries/internalnode.bin: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /src/tests/mock_binaries/leafnode.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/leafnode.bin -------------------------------------------------------------------------------- /src/tests/multi.test.ts: -------------------------------------------------------------------------------- 1 | import { RangeResolver } from "../resolver/resolver"; 2 | import { arrayBufferToString, readBinaryFile } from "./test-util"; 3 | import { ReadMultiBPTree } from "../file/multi"; 4 | 5 | describe("test metadata", () => { 6 | let mockMetadata: Uint8Array; 7 | 8 | beforeAll(async () => { 9 | mockMetadata = await readBinaryFile("filled_metadata.bin"); 10 | }); 11 | 12 | it("reads stored metadata", async () => { 13 | const mockRangeResolver: RangeResolver = async ([{ start, end }]) => { 14 | return [ 15 | { 16 | data: mockMetadata.buffer.slice(start, end), 17 | totalLength: end - start + 1, 18 | }, 19 | ]; 20 | }; 21 | 22 | const tree = ReadMultiBPTree(mockRangeResolver, 0); 23 | const metadata = await tree.metadata(); 24 | expect("hello").toEqual(arrayBufferToString(metadata)); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /src/tests/multipart.test.ts: -------------------------------------------------------------------------------- 1 | import parseMultipartBody from "../resolver/multipart"; 2 | 3 | async function collect(gen: AsyncGenerator) { 4 | const result: T[] = []; 5 | for await (const item of gen) { 6 | result.push(item); 7 | } 8 | return result; 9 | } 10 | 11 | describe("multipart", () => { 12 | it("should parse multipart with two chunks", async () => { 13 | const encoder = new TextEncoder(); 14 | const data = encoder.encode(`--3d6b6a416f9b5\r 15 | Content-Type: text/html\r 16 | Content-Range: bytes 0-50/1270\r 17 | \r 18 | 19 | 20 | 21 | Example Do\r 22 | --3d6b6a416f9b5\r 23 | Content-Type: text/html\r 24 | Content-Range: bytes 100-150/1270\r 25 | \r 26 | eta http-equiv="Content-type" content="text/html; c\r 27 | --3d6b6a416f9b5--`); 28 | const { readable, writable } = new TransformStream(); 29 | const writer = writable.getWriter(); 30 | writer.write(data); 31 | writer.close(); 32 | const multipart = await collect( 33 | parseMultipartBody( 34 | "multipart/byteranges; boundary=3d6b6a416f9b5", 35 | readable, 36 | ), 
37 | ); 38 | expect(multipart.length).toBe(2); 39 | const decoder = new TextDecoder(); 40 | expect(decoder.decode(multipart[0].data)).toBe( 41 | "<!doctype html>\n<html>\n<head>\n <title>Example Do", 42 | ); 43 | expect(decoder.decode(multipart[1].data)).toBe( 44 | 'eta http-equiv="Content-type" content="text/html; c', 45 | ); 46 | }); 47 | }); 48 | -------------------------------------------------------------------------------- /src/tests/ngramtable.test.ts: -------------------------------------------------------------------------------- 1 | import { PriorityTable } from "../ngram/table"; 2 | 3 | describe("tests ngram table", () => { 4 | it("correctly tracks the count", () => { 5 | const table = new PriorityTable<string>(); 6 | table.insert("howdy", 3); 7 | table.insert("do", 3); 8 | table.insert("howdy", 2); 9 | 10 | const pq = table.top(); 11 | expect(pq.length).toEqual(2); 12 | expect(pq[0]).toEqual({ key: "howdy", score: 5 }); 13 | expect(pq[1]).toEqual({ key: "do", score: 3 }); 14 | }); 15 | 16 | it("should return null for top", () => { 17 | const table = new PriorityTable<string>(); 18 | const pq = table.top(); 19 | expect(pq.length).toEqual(0); 20 | }); 21 | 22 | it("should correctly clear all entries", () => { 23 | const table = new PriorityTable<string>(); 24 | table.insert("wef", 4); 25 | table.insert("wef", 3); 26 | table.insert("wef", 2); 27 | table.insert("ty", 1); 28 | expect(table.size).toEqual(2); 29 | table.clear(); 30 | 31 | const pq = table.top(); 32 | expect(pq.length).toEqual(0); 33 | expect(table.size).toEqual(0); 34 | }); 35 | 36 | it("handles a large number of varied inserts", () => { 37 | const table = new PriorityTable<string>(); 38 | const entries = new Map<string, number>(); 39 | const itemCount = 1000; 40 | const possibleEntries = ["wef", "wef a", "beef", "tarikoplata", "omoplata"]; 41 | 42 | for (let idx = 0; idx < itemCount; idx++) { 43 | const randomKey = 44 | possibleEntries[Math.floor(Math.random() * possibleEntries.length)]; 45 | table.insert(randomKey, idx); 46 | entries.set(randomKey, (entries.get(randomKey) || 0) + idx); 47 | } 48 | 49 | const sorted = Array.from(entries, ([key, score]) => ({ 50 | key, 51 | score, 52 | })).sort((m, n) => n.score - m.score); 53 | let queue = table.top(); 54 | 55 | expect(sorted).toEqual(queue); 56 | }); 57 | }); 58 | -------------------------------------------------------------------------------- /src/tests/query-builder.test.ts: -------------------------------------------------------------------------------- 1 | import { Database } from "../db/database"; 2 | import { QueryBuilder } from "../db/query-builder"; 3 | import { validateQuery } from "../db/query-validation"; 4 | import { IndexHeader } from "../file/meta"; 5 | 6 | describe("test validate queries", () => { 7 | interface MockSchema { 8 | [key: string]: {}; 9 | VendorID: {}; 10 | store_and_fwd_flag: {}; 11 | fare_amount: {}; 12 | payment_type: {}; 13 | } 14 | 15 | const headers: IndexHeader[] = [ 16 | { 17 | fieldName: "VendorID", 18 | fieldTypes: [2], 19 | }, 20 | { 21 | fieldName: "store_and_fwd_flag", 22 | fieldTypes: [3], 23 | }, 24 | { 25 | fieldName: "fare_amount", 26 | fieldTypes: [2], 27 | }, 28 | { 29 | fieldName: "payment_type", 30 | fieldTypes: [3], 31 | }, 32 | ]; 33 | 34 | let database: Database<MockSchema>; 35 | 36 | it(`test query builder`, () => { 37 | let qb = new QueryBuilder(database); 38 | 39 | let qb1 = qb.where("VendorID", "<=", 2); 40 | 41 | expect(() => { 42 | validateQuery(qb1.toQuery(), headers); 43 | }).not.toThrow(); 44 | }); 45 | 46 
| it(`test basic query chain`, () => { 47 | let q = new QueryBuilder(database).where("VendorID", "<=", 2); 48 | let query = q.toQuery(); 49 | 50 | expect(query.where).not.toBeNull(); 51 | expect(query.where).toEqual([ 52 | { key: "VendorID", operation: "<=", value: 2 }, 53 | ]); 54 | 55 | expect(() => { 56 | validateQuery(query, headers); 57 | }).not.toThrow(); 58 | 59 | q = q.orderBy("VendorID", "ASC"); 60 | query = q.toQuery(); 61 | 62 | expect(query.where).not.toBeNull(); 63 | expect(query.where).toEqual([ 64 | { key: "VendorID", operation: "<=", value: 2 }, 65 | ]); 66 | expect(query.orderBy).not.toBeNull(); 67 | expect(query.orderBy).toEqual([{ key: "VendorID", direction: "ASC" }]); 68 | expect(() => { 69 | validateQuery(query, headers); 70 | }).not.toThrow(); 71 | 72 | q = q.select(["VendorID", "store_and_fwd_flag", "fare_amount"]); 73 | query = q.toQuery(); 74 | expect(query.where).not.toBeNull(); 75 | expect(query.where).toEqual([ 76 | { key: "VendorID", operation: "<=", value: 2 }, 77 | ]); 78 | expect(query.orderBy).not.toBeNull(); 79 | expect(query.orderBy).toEqual([{ key: "VendorID", direction: "ASC" }]); 80 | expect(query.select).not.toBeNull(); 81 | expect(query.select).toEqual([ 82 | "VendorID", 83 | "store_and_fwd_flag", 84 | "fare_amount", 85 | ]); 86 | }); 87 | 88 | it(`test basic derived query chain`, () => { 89 | const q0 = new QueryBuilder(database).where("fare_amount", "==", 1); 90 | let query = q0.toQuery(); 91 | 92 | expect(query.where).not.toBeNull(); 93 | expect(query.where).toEqual([ 94 | { key: "fare_amount", operation: "==", value: 1 }, 95 | ]); 96 | 97 | let q1 = q0.orderBy("fare_amount", "DESC"); 98 | query = q1.toQuery(); 99 | 100 | expect(query.where).not.toBeNull(); 101 | expect(query.where).toEqual([ 102 | { key: "fare_amount", operation: "==", value: 1 }, 103 | ]); 104 | expect(query.orderBy).not.toBeNull(); 105 | expect(query.orderBy).toEqual([{ key: "fare_amount", direction: "DESC" }]); 106 | 107 | let q2 = q1.select(["fare_amount"]); 108 | query = q2.toQuery(); 109 | expect(query.where).not.toBeNull(); 110 | expect(query.where).toEqual([ 111 | { key: "fare_amount", operation: "==", value: 1 }, 112 | ]); 113 | expect(query.orderBy).not.toBeNull(); 114 | expect(query.orderBy).toEqual([{ key: "fare_amount", direction: "DESC" }]); 115 | expect(query.select).not.toBeNull(); 116 | expect(query.select).toEqual(["fare_amount"]); 117 | }); 118 | 119 | it(`test multi derived query chain`, () => { 120 | const q0 = new QueryBuilder(database).where("fare_amount", "==", 2); 121 | let query = q0.toQuery(); 122 | 123 | expect(query.where).not.toBeNull(); 124 | expect(query.where).toEqual([ 125 | { key: "fare_amount", operation: "==", value: 2 }, 126 | ]); 127 | 128 | let q1 = q0.where("VendorID", "==", 2); 129 | query = q1.toQuery(); 130 | 131 | expect(query.where).not.toBeNull(); 132 | expect(query.where).toEqual([ 133 | { key: "fare_amount", operation: "==", value: 2 }, 134 | { key: "VendorID", operation: "==", value: 2 }, 135 | ]); 136 | }); 137 | 138 | it(`test green + red queries`, () => { 139 | const q0 = new QueryBuilder(database).where("payment_type", ">", 3); 140 | const failQuery = q0.orderBy("VendorID", "ASC"); 141 | expect(failQuery.toQuery().orderBy).toEqual([ 142 | { key: "VendorID", direction: "ASC" }, 143 | ]); 144 | 145 | const passQuery = q0.orderBy("payment_type", "DESC"); 146 | expect(passQuery.toQuery().orderBy).toEqual([ 147 | { key: "payment_type", direction: "DESC" }, 148 | ]); 149 | 150 | const failQuery2 = passQuery.select(["wef"]); 151 | 
const passQuery2 = passQuery.select([ 152 | "VendorID", 153 | "payment_type", 154 | "fare_amount", 155 | ]); 156 | 157 | // red queries 158 | [failQuery, failQuery2].forEach((query) => { 159 | expect(() => validateQuery(query.toQuery(), headers)).toThrow(); 160 | }); 161 | 162 | // green queries 163 | [passQuery, passQuery2].forEach((query) => { 164 | expect(() => validateQuery(query.toQuery(), headers)).not.toThrow(); 165 | }); 166 | }); 167 | }); 168 | -------------------------------------------------------------------------------- /src/tests/query-logic.test.ts: -------------------------------------------------------------------------------- 1 | import { FieldType } from "../db/database"; 2 | import { handleSelect, processWhere } from "../db/query-lang"; 3 | 4 | describe("query logic test", () => { 5 | it("should process the given key", () => { 6 | let floatBuf1 = new ArrayBuffer(8); 7 | new DataView(floatBuf1).setFloat64(0, 3.4, true); 8 | 9 | let floatBuf2 = new ArrayBuffer(8); 10 | new DataView(floatBuf2).setFloat64(0, Number(1n), true); 11 | 12 | const values: [ 13 | string | number | bigint | boolean | null, 14 | FieldType, 15 | ArrayBuffer, 16 | ][] = [ 17 | ["howdy", FieldType.String, new TextEncoder().encode("howdy").buffer], 18 | [3.4, FieldType.Float64, floatBuf1], 19 | [1n, FieldType.Float64, floatBuf2], 20 | [true, FieldType.Boolean, new Uint8Array([0]).buffer], 21 | [false, FieldType.Boolean, new Uint8Array([1]).buffer], 22 | [null, FieldType.Null, new ArrayBuffer(0)], 23 | ]; 24 | 25 | for (const [value, expectedType, expectedVBuf] of values) { 26 | // @ts-ignore 27 | const res = processWhere(value); 28 | 29 | if (!res) { 30 | expect(res).not.toBeNull(); 31 | return; 32 | } 33 | 34 | const { valueBuf, fieldType } = res; 35 | expect(expectedType).toEqual(fieldType); 36 | expect(valueBuf).toEqual(expectedVBuf); 37 | } 38 | }); 39 | 40 | it("should select accordingly", () => { 41 | const select = ["george strait", "alan jackson"]; 42 | 43 | const mockJson = { 44 | "george strait": "howdy", 45 | "alan jackson": true, 46 | kelp: null, 47 | wef: 30.4, 48 | }; 49 | 50 | const mockJsonStr = JSON.stringify(mockJson); 51 | const filtered = handleSelect(mockJsonStr, select); 52 | expect(filtered).toEqual({ 53 | "george strait": "howdy", 54 | "alan jackson": true, 55 | }); 56 | 57 | const pass = handleSelect(mockJsonStr); 58 | expect(pass).toEqual(mockJson); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /src/tests/query-validation.test.ts: -------------------------------------------------------------------------------- 1 | import { validateQuery } from "../db/query-validation"; 2 | import { IndexHeader } from "../file/meta"; 3 | import { Query, Search } from "../db/query-lang"; 4 | import { FieldType } from "../db/database"; 5 | 6 | describe("validate search queries", () => { 7 | interface MockSchema { 8 | [key: string]: {}; 9 | Pollo: {}; 10 | Bife: {}; 11 | Cerdo: {}; 12 | } 13 | 14 | const headers: IndexHeader[] = [ 15 | { 16 | fieldName: "Pollo", 17 | fieldTypes: [FieldType.Unigram, FieldType.Bigram, FieldType.Trigram], 18 | }, 19 | { 20 | fieldName: "Bife", 21 | fieldTypes: [FieldType.Unigram, FieldType.Bigram, FieldType.Trigram], 22 | }, 23 | { 24 | fieldName: "Cerdo", 25 | fieldTypes: [FieldType.Unigram, FieldType.Bigram, FieldType.Trigram], 26 | }, 27 | ]; 28 | 29 | it("performs a simple search query", () => { 30 | for (let minGram = 0; minGram <= 3; minGram++) { 31 | for (let maxGram = minGram; maxGram <= 3; maxGram++) 
{ 32 | const search = { 33 | key: "Pollo", 34 | like: "wefhowdy", 35 | minGram, 36 | maxGram, 37 | }; 38 | const q: Query<MockSchema> = { search }; 39 | 40 | expect(() => { 41 | validateQuery(q, headers); 42 | }).not.toThrow(); 43 | } 44 | } 45 | }); 46 | 47 | it("query a defaults to a 12gram", () => { 48 | const search = { 49 | key: "Cerdo", 50 | like: "wefhowdy", 51 | }; 52 | 53 | const q: Query<MockSchema> = { search }; 54 | 55 | expect(() => { 56 | validateQuery(q, headers); 57 | }).not.toThrow(); 58 | 59 | expect(q.search).not.toBeUndefined(); 60 | expect(q.search!.config).not.toBeUndefined(); 61 | expect(q.search!.config!.minGram).toEqual(1); 62 | expect(q.search!.config!.maxGram).toEqual(2); 63 | }); 64 | 65 | it("fails to validate query via unknown header", () => { 66 | const search = { 67 | key: "Atun", 68 | like: "bacalao", 69 | }; 70 | 71 | const q: Query<MockSchema> = { search }; 72 | 73 | expect(() => { 74 | validateQuery(q, headers); 75 | }).toThrow(); 76 | }); 77 | 78 | it("fails to validate query via invalid range", () => { 79 | const search = { 80 | key: "Pollo", 81 | like: "bacalao", 82 | config: { 83 | minGram: 2, 84 | maxGram: 1, 85 | }, 86 | }; 87 | 88 | const q: Query<MockSchema> = { search }; 89 | 90 | expect(() => { 91 | validateQuery(q, headers); 92 | }).toThrow(); 93 | }); 94 | }); 95 | 96 | describe("validate filter queries", () => { 97 | interface MockSchema { 98 | [key: string]: {}; 99 | VendorID: {}; 100 | store_and_fwd_flag: {}; 101 | fare_amount: {}; 102 | payment_type: {}; 103 | } 104 | 105 | const headers: IndexHeader[] = [ 106 | { 107 | fieldName: "VendorID", 108 | fieldTypes: [0], 109 | }, 110 | { 111 | fieldName: "store_and_fwd_flag", 112 | fieldTypes: [6], 113 | }, 114 | { 115 | fieldName: "fare_amount", 116 | fieldTypes: [3], 117 | }, 118 | { 119 | fieldName: "payment_type", 120 | fieldTypes: [3, 0], 121 | }, 122 | ]; 123 | 124 | const validQueries: Query<MockSchema>[] = [ 125 | { 126 | where: [ 127 | { 128 | operation: "==", 129 | key: "VendorID", 130 | value: "", 131 | }, 132 | ], 133 | }, 134 | { 135 | where: [ 136 | { 137 | operation: "<", 138 | key: "fare_amount", 139 | value: 10, 140 | }, 141 | ], 142 | orderBy: [ 143 | { 144 | key: "fare_amount", 145 | direction: "ASC", 146 | }, 147 | ], 148 | }, 149 | { 150 | where: [ 151 | { 152 | operation: ">=", 153 | key: "payment_type", 154 | value: 300, 155 | }, 156 | ], 157 | orderBy: [ 158 | { 159 | key: "payment_type", 160 | direction: "DESC", 161 | }, 162 | ], 163 | select: ["payment_type", "fare_amount"], 164 | }, 165 | { 166 | where: [ 167 | { 168 | operation: "==", 169 | key: "store_and_fwd_flag", 170 | value: false, 171 | }, 172 | ], 173 | select: ["fare_amount", "payment_type"], 174 | }, 175 | ]; 176 | 177 | it("test valid query", () => { 178 | validQueries.forEach((query) => { 179 | expect(() => { 180 | validateQuery(query, headers); 181 | }).not.toThrow(); 182 | }); 183 | }); 184 | 185 | const notValidQueries: Query<MockSchema>[] = [ 186 | { 187 | where: [ 188 | { 189 | operation: "<=", 190 | key: "vendorid", 191 | value: 1, 192 | }, 193 | ], 194 | }, 195 | { 196 | where: [ 197 | { 198 | operation: "==", 199 | key: "store_and_fwd_flag", 200 | value: 10, 201 | }, 202 | ], 203 | orderBy: [ 204 | { 205 | key: "store_an_flag", 206 | direction: "ASC", 207 | }, 208 | ], 209 | }, 210 | { 211 | where: [ 212 | { 213 | operation: "<", 214 | key: "payment_type", 215 | value: false, 216 | }, 217 | ], 218 | select: ["payment_type", "vendorid", "store_and_fwd_flag"], 219 | }, 220 | { 221 | where: [ 222 | { 
223 | operation: "==", 224 | key: "payment_type", 225 | value: "", 226 | }, 227 | ], 228 | select: ["paymet_type"], 229 | }, 230 | ]; 231 | 232 | notValidQueries.forEach((query, index) => { 233 | it(`test invalid query ${index}`, () => { 234 | expect(() => validateQuery(query, headers)).toThrow(); 235 | }); 236 | }); 237 | }); 238 | -------------------------------------------------------------------------------- /src/tests/test-util.ts: -------------------------------------------------------------------------------- 1 | import path from "path"; 2 | import fs from "fs/promises"; 3 | 4 | export async function readBinaryFile(filename: string): Promise<Uint8Array> { 5 | const filePath = path.join(__dirname, `mock_binaries/${filename}`); 6 | const data = await fs.readFile(filePath); 7 | return new Uint8Array(data); 8 | } 9 | 10 | export function arrayBufferToString(arrayBuffer: ArrayBuffer): string { 11 | const decoder = new TextDecoder("utf-8"); 12 | return decoder.decode(new Uint8Array(arrayBuffer)); 13 | } 14 | -------------------------------------------------------------------------------- /src/tests/tokenizer.test.ts: -------------------------------------------------------------------------------- 1 | import { NgramTokenizer } from "../ngram/tokenizer"; 2 | import { FieldType } from "../db/database"; 3 | 4 | describe("builds 12grams", () => { 5 | let tok: NgramTokenizer; 6 | let textEncoder: TextEncoder; 7 | 8 | beforeAll(() => { 9 | textEncoder = new TextEncoder(); 10 | }); 11 | 12 | beforeEach(() => { 13 | tok = new NgramTokenizer(1, 2); 14 | }); 15 | 16 | it("builds a basic 12gram", () => { 17 | const phrase = "wakemeup"; 18 | const expected = [ 19 | "w", 20 | "a", 21 | "k", 22 | "e", 23 | "m", 24 | "e", 25 | "u", 26 | "p", 27 | "wa", 28 | "ak", 29 | "ke", 30 | "em", 31 | "me", 32 | "eu", 33 | "up", 34 | ].map((s) => ({ 35 | value: s, 36 | valueBuf: textEncoder.encode(s).buffer, 37 | type: s.length === 1 ? FieldType.Unigram : FieldType.Bigram, 38 | })); 39 | 40 | const trigrams = tok.tokens(phrase); 41 | expect(trigrams).toEqual(expected); 42 | }); 43 | 44 | it("builds a complex 12 gram", () => { 45 | const phrase = "I can't wake up"; 46 | const expected = [ 47 | "i", 48 | "c", 49 | "a", 50 | "n", 51 | "t", 52 | "w", 53 | "a", 54 | "k", 55 | "e", 56 | "u", 57 | "p", 58 | "ca", 59 | "an", 60 | "nt", 61 | "wa", 62 | "ak", 63 | "ke", 64 | "up", 65 | ].map((s) => ({ 66 | value: s, 67 | valueBuf: textEncoder.encode(s).buffer, 68 | type: s.length === 1 ? 
FieldType.Unigram : FieldType.Bigram, 69 | })); 70 | 71 | const trigrams = tok.tokens(phrase); 72 | expect(trigrams).toEqual(expected); 73 | }); 74 | }); 75 | 76 | describe("builds trigrams", () => { 77 | let tok: NgramTokenizer; 78 | let textEncoder: TextEncoder; 79 | 80 | beforeAll(() => { 81 | textEncoder = new TextEncoder(); 82 | }); 83 | 84 | beforeEach(() => { 85 | tok = new NgramTokenizer(3, 3); 86 | }); 87 | 88 | it("builds a basic trigram", () => { 89 | const phrase = "wakemeup"; 90 | const expected = ["wak", "ake", "kem", "eme", "meu", "eup"].map((s) => ({ 91 | value: s, 92 | valueBuf: textEncoder.encode(s).buffer, 93 | type: FieldType.Trigram, 94 | })); 95 | 96 | const trigrams = tok.tokens(phrase); 97 | expect(trigrams).toEqual(expected); 98 | }); 99 | 100 | it("builds a complex trigram", () => { 101 | const phrase = "I can't wake up"; 102 | const expected = ["can", "ant", "wak", "ake"].map((s) => ({ 103 | value: s, 104 | valueBuf: textEncoder.encode(s).buffer, 105 | type: FieldType.Trigram, 106 | })); 107 | 108 | const trigrams = tok.tokens(phrase); 109 | expect(trigrams).toEqual(expected); 110 | }); 111 | }); 112 | 113 | describe("fuzz shuffle", () => { 114 | let tok: NgramTokenizer; 115 | 116 | beforeEach(() => { 117 | tok = new NgramTokenizer(3, 3); 118 | }); 119 | const generateRandomString = (length: number) => { 120 | const alpha = 121 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 "; 122 | let result = ""; 123 | for (let i = 0; i < length; i++) { 124 | result += alpha.charAt(Math.floor(Math.random() * alpha.length)); 125 | } 126 | return result; 127 | }; 128 | 129 | it("shuffles randomly", () => { 130 | for (let i = 0; i < 100; i++) { 131 | const phrase = generateRandomString(Math.floor(Math.random() * 50)); 132 | const trigrams = tok.tokens(phrase); 133 | const shuffled = NgramTokenizer.shuffle(trigrams); 134 | 135 | expect(shuffled.length).toBe(trigrams.length); 136 | expect(new Set(shuffled)).toEqual(new Set(trigrams)); 137 | } 138 | }); 139 | }); 140 | -------------------------------------------------------------------------------- /src/tests/varint.ts: -------------------------------------------------------------------------------- 1 | import { decodeUvarint, encodeUvarint } from "../util/uvarint"; 2 | 3 | describe("test varint codec", () => { 4 | it("should round trip correctly", () => { 5 | let values = [ 6 | 0, 7 | 1, 8 | 2, 9 | 10, 10 | 20, 11 | 63, 12 | 64, 13 | 65, 14 | 127, 15 | 128, 16 | 129, 17 | 255, 18 | 256, 19 | 257, 20 | 1 << (63 - 1), 21 | ]; 22 | 23 | values.forEach((v) => { 24 | const b = encodeUvarint(v); 25 | const w = decodeUvarint(b); 26 | expect(v).toEqual(w); 27 | }); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /src/util/uvarint.ts: -------------------------------------------------------------------------------- 1 | export type UvarintResponse = { 2 | value: number; 3 | bytesRead: number; 4 | }; 5 | 6 | const MAX_VARINT_64 = 10; 7 | 8 | export function encodeUvarint(n: number): ArrayBuffer { 9 | let i = 0; 10 | 11 | let ibuf = new Uint8Array(MAX_VARINT_64); 12 | 13 | while (n >= 0x80) { 14 | ibuf[i++] = (n & 0xff) | 0x80; 15 | n >>= 7; 16 | } 17 | 18 | ibuf[i] = n & 0xff; 19 | 20 | return ibuf.buffer.slice(0, i + 1); 21 | } 22 | 23 | export function decodeUvarint(buf: ArrayBuffer): UvarintResponse { 24 | let x: number = 0; 25 | let s: number = 0; 26 | 27 | const view = new Uint8Array(buf); 28 | 29 | for (let idx = 0; idx <= view.length - 1; idx++) { 30 | let b = 
view[idx]; 31 | 32 | if (idx === MAX_VARINT_64) { 33 | return { value: 0, bytesRead: -(idx + 1) }; 34 | } 35 | 36 | if (b < 0x80) { 37 | if (idx === MAX_VARINT_64 - 1 && b > 1) { 38 | return { value: 0, bytesRead: -(idx + 1) }; 39 | } 40 | 41 | let value = x | (b << s); 42 | return { value, bytesRead: idx + 1 }; 43 | } 44 | 45 | x |= (b & 0x7f) << s; 46 | s += 7; 47 | } 48 | 49 | return { value: 0, bytesRead: 0 }; 50 | } 51 | --------------------------------------------------------------------------------
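As a closing illustration of the codec above (a sketch; the import path assumes the snippet sits next to src/util/uvarint.ts): encoding emits the low seven bits of the value first with the continuation bit set, then the remaining high bits.

import { decodeUvarint, encodeUvarint } from "./uvarint";

// 300 = 0b1_0010_1100 -> [0xac, 0x02]: 0xac carries the low seven bits
// (0b0101100) plus the continuation bit, 0x02 carries the remaining 0b10.
const buf = encodeUvarint(300);
console.log(new Uint8Array(buf)); // Uint8Array(2) [ 172, 2 ]

const { value, bytesRead } = decodeUvarint(buf);
console.log(value, bytesRead); // 300 2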