├── .github
│   └── workflows
│       ├── benchmark.yml
│       ├── example-client.yml
│       ├── format.yml
│       └── test.yml
├── .gitignore
├── .prettierignore
├── .python-version
├── Cargo.toml
├── README.md
├── cmd
│   └── main.go
├── examples
│   ├── README.md
│   ├── benchmarkdiff
│   │   └── main.go
│   ├── client
│   │   ├── editor.js
│   │   ├── index.html
│   │   ├── search.html
│   │   ├── server.go
│   │   └── styles.css
│   ├── visualizer
│   │   └── main.go
│   └── workspace
│       ├── fetch_csv.py
│       ├── fetch_jsonl.py
│       ├── palo-alto.jsonl
│       └── requirements.txt
├── go.mod
├── go.sum
├── jest.config.ts
├── package-lock.json
├── package.json
├── pkg
│   ├── appendable
│   │   ├── appendable.go
│   │   ├── appendable_test.go
│   │   ├── index_file.go
│   │   ├── index_file_test.go
│   │   └── typescript.go
│   ├── bptree
│   │   ├── README.md
│   │   ├── bptree.go
│   │   ├── bptree_test.go
│   │   ├── node.go
│   │   └── node_test.go
│   ├── btree
│   │   ├── btree.go
│   │   ├── btree_test.go
│   │   ├── node.go
│   │   └── node_test.go
│   ├── buftest
│   │   ├── buffer.go
│   │   └── buffer_test.go
│   ├── encoding
│   │   ├── sizeVarint.go
│   │   └── sizeVarint_test.go
│   ├── handlers
│   │   ├── csv.go
│   │   ├── csv_test.go
│   │   ├── equality_test.go
│   │   ├── jsonl.go
│   │   └── jsonl_test.go
│   ├── hnsw
│   │   ├── friends.go
│   │   ├── friends_test.go
│   │   ├── heap.go
│   │   ├── heap_test.go
│   │   ├── hnsw.go
│   │   └── hnsw_test.go
│   ├── linkedpage
│   │   ├── linkedpage.go
│   │   └── linkedpage_test.go
│   ├── metapage
│   │   └── metapage.go
│   ├── mmap
│   │   ├── mmap.go
│   │   ├── mmap_test.go
│   │   ├── mremap_darwin.go
│   │   └── mremap_linux.go
│   ├── mocks
│   │   ├── btree.go
│   │   ├── main.go
│   │   ├── meta_page.go
│   │   ├── metadata.go
│   │   └── node.go
│   ├── ngram
│   │   ├── tokenizer.go
│   │   └── tokenizer_test.go
│   ├── pagefile
│   │   ├── pagefile.go
│   │   ├── pagefile_debug.go
│   │   ├── pagefile_debug_test.go
│   │   └── pagefile_test.go
│   ├── pointer
│   │   ├── pointer.go
│   │   └── referenced_value.go
│   └── vectorpage
│       ├── manager.go
│       └── manager_test.go
├── scripts
│   └── jsonl2json
│       ├── Cargo.toml
│       └── src
│           └── main.rs
├── src
│   ├── bptree
│   │   ├── bptree.ts
│   │   ├── node.ts
│   │   └── traversal.ts
│   ├── db
│   │   ├── database.ts
│   │   ├── query-builder.ts
│   │   ├── query-lang.ts
│   │   └── query-validation.ts
│   ├── file
│   │   ├── data-file.ts
│   │   ├── index-file.ts
│   │   ├── meta.ts
│   │   └── multi.ts
│   ├── index.ts
│   ├── ngram
│   │   ├── table.ts
│   │   └── tokenizer.ts
│   ├── resolver
│   │   ├── cache.ts
│   │   ├── multipart.ts
│   │   ├── range-request.ts
│   │   └── resolver.ts
│   ├── tests
│   │   ├── bptree.test.ts
│   │   ├── index-file.test.ts
│   │   ├── mock_binaries
│   │   │   ├── btree_1.bin
│   │   │   ├── btree_1023.bin
│   │   │   ├── btree_iterator.bin
│   │   │   ├── filemeta.bin
│   │   │   ├── filled_metadata.bin
│   │   │   ├── indexmeta.bin
│   │   │   ├── internalnode.bin
│   │   │   └── leafnode.bin
│   │   ├── multi.test.ts
│   │   ├── multipart.test.ts
│   │   ├── ngramtable.test.ts
│   │   ├── node.test.ts
│   │   ├── query-builder.test.ts
│   │   ├── query-logic.test.ts
│   │   ├── query-validation.test.ts
│   │   ├── test-util.ts
│   │   ├── tokenizer.test.ts
│   │   └── varint.ts
│   └── util
│       └── uvarint.ts
└── tsconfig.json
/.github/workflows/benchmark.yml:
--------------------------------------------------------------------------------
1 | name: Benchmark
2 | on:
3 | pull_request:
4 |
5 | jobs:
6 | baseline:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | with:
11 | path: "comparison"
12 | ref: ${{ github.event.pull_request.head.ref }}
13 | - uses: actions/checkout@v4
14 | with:
15 | path: "baseline"
16 | ref: ${{ github.event.pull_request.base.ref }}
17 |
18 | - uses: actions/setup-python@v4.7.1
19 | - run: |
20 | # Fetch the data in workspace
21 | cd examples/workspace
22 | python3 -m pip install -r requirements.txt
23 | python3 fetch_jsonl.py
24 | working-directory: comparison
25 | - run: |
26 | # Fetch the data in workspace
27 | cd examples/workspace
28 | python3 -m pip install -r requirements.txt
29 | python3 fetch_jsonl.py
30 | working-directory: baseline
31 |
32 | - uses: actions/setup-go@v4
33 | with:
34 | go-version: "1.21"
35 |
36 | - run: |
37 | go run cmd/main.go \
38 | -i output.index \
39 | -jsonl \
40 | -b benchmark.txt \
41 | examples/workspace/green_tripdata_2023-01.jsonl
42 | working-directory: comparison
43 |
44 | - run: |
45 | go run cmd/main.go \
46 | -i output.index \
47 | -jsonl \
48 | -b benchmark.txt \
49 | examples/workspace/green_tripdata_2023-01.jsonl
50 | working-directory: baseline
51 |
52 | - run: go run examples/benchmarkdiff/main.go ../baseline/benchmark.txt benchmark.txt
53 | working-directory: comparison
54 |
55 | - uses: actions/upload-artifact@v4
56 | id: upload-artifact
57 | with:
58 | name: benchmark-diff
59 | path: comparison/output.html
60 |
61 | # post a comment to the PR
62 | - name: Post comment
63 | uses: mshick/add-pr-comment@v2
64 | with:
65 | message: |
66 | Benchmark results: ${{ steps.upload-artifact.outputs.artifact-url }}
67 |
--------------------------------------------------------------------------------
/.github/workflows/example-client.yml:
--------------------------------------------------------------------------------
1 | name: Deploy GitHub Pages Example
2 |
3 | on:
4 | push:
5 | branches: ["main"]
6 | workflow_dispatch:
7 |
8 | permissions:
9 | contents: read
10 | pages: write
11 | id-token: write
12 |
13 | concurrency:
14 | group: "pages"
15 | cancel-in-progress: false
16 |
17 | jobs:
18 | build:
19 | runs-on: ubuntu-latest
20 | steps:
21 | - uses: actions/checkout@v3
22 | - uses: actions/configure-pages@v3
23 | - uses: actions/setup-go@v4.1.0
24 | with:
25 | go-version-file: go.mod
26 | - uses: actions/setup-node@v4.0.0
27 | - uses: actions/setup-python@v4.7.1
28 | - run: |
29 | # Fetch the data in workspace
30 | cd examples/workspace
31 | python3 -m pip install -r requirements.txt
32 | python3 fetch_jsonl.py
33 | cd -
34 |
35 | # Build the index
36 | go run cmd/main.go -i examples/client/green_tripdata_2023-01.index -jsonl examples/workspace/green_tripdata_2023-01.jsonl
37 |
38 | # Copy to client
39 | cp examples/workspace/green_tripdata_2023-01.jsonl examples/client
40 |
41 | # Build the index
42 | go run cmd/main.go -i examples/client/palo-alto.index -s description -jsonl examples/workspace/palo-alto.jsonl
43 |
44 | # Copy to client
45 | cp examples/workspace/palo-alto.jsonl examples/client
46 |
47 | # Build the js lib
48 | npm ci
49 | npm run build
50 |
51 | # Copy the js lib
52 | cp dist/appendable.min.js examples/client
53 | cp dist/appendable.min.js.map examples/client
54 | - uses: actions/upload-pages-artifact@v2
55 | with:
56 | path: examples/client
57 | deploy:
58 | environment:
59 | name: github-pages
60 | url: ${{ steps.deployment.outputs.page_url }}
61 | runs-on: ubuntu-latest
62 | needs: build
63 | steps:
64 | - name: Deploy to GitHub Pages
65 | id: deployment
66 | uses: actions/deploy-pages@v2
67 |
--------------------------------------------------------------------------------
/.github/workflows/format.yml:
--------------------------------------------------------------------------------
1 | name: Format
2 | on:
3 | pull_request:
4 |
5 | jobs:
6 | go-fmt:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - uses: actions/setup-go@v4
11 | with:
12 | go-version: "1.21"
13 | - run: if [ "$(gofmt -s -l . | wc -l)" -gt 0 ]; then exit 1; fi
14 |
15 | go-mod-tidy:
16 | runs-on: ubuntu-latest
17 | steps:
18 | - uses: actions/checkout@v4
19 | - uses: actions/setup-go@v4
20 | with:
21 | go-version: "1.21"
22 | - run: |
23 | go mod tidy
24 | STATUS=$(git status --porcelain go.mod go.sum)
25 | if [ ! -z "$STATUS" ]; then
26 | echo "Running go mod tidy modified go.mod and/or go.sum" >> $GITHUB_STEP_SUMMARY
27 | exit 1
28 | fi
29 | exit 0
30 |
31 | prettier:
32 | runs-on: ubuntu-latest
33 | steps:
34 | - uses: actions/checkout@v4
35 | - name: Use Node.js 18
36 | uses: actions/setup-node@v3
37 | with:
38 | node-version: "18"
39 | - run: npx prettier --check "**/*.{js,jsx,ts,tsx,css,scss,md,json}" 2> $GITHUB_STEP_SUMMARY
40 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 | on:
3 | pull_request:
4 |
5 | jobs:
6 | go-test:
7 | strategy:
8 | matrix:
9 | os: [ubuntu-latest, macos-latest]
10 | runs-on: ${{ matrix.os }}
11 | steps:
12 | - uses: actions/checkout@v3
13 | - uses: actions/setup-go@v4
14 | with:
15 | go-version: "1.21"
16 | - run: go test -v ./...
17 | - run: go vet -v ./...
18 |
19 | node-test:
20 | runs-on: ubuntu-latest
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Use Node.js 18
24 | uses: actions/setup-node@v3
25 | with:
26 | node-version: "18"
27 | - run: npm ci
28 | - run: npm run build
29 | - run: npm test
30 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist
2 | node_modules
3 | Cargo.lock
4 |
5 | .DS_Store
6 | pprof.out
7 |
8 | # Ignore files
9 | *.index
10 | *.jsonl
11 | *.csv
12 | examples/**/appendable.min.js
13 | examples/**/appendable.min.js.map
14 |
15 | # But include the palo-alto dataset
16 | !examples/workspace/palo-alto.jsonl
17 |
18 | # But include these files in src/tests/mock_binaries
19 | !src/tests/mock_binaries/*.jsonl
20 | !src/tests/mock_binaries/*.csv
21 |
--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | package-lock.json
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.5
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | members = ["scripts/jsonl2json"]
3 |
4 | resolver = "2"
--------------------------------------------------------------------------------
/cmd/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "fmt"
6 | "log/slog"
7 | "os"
8 | "runtime/pprof"
9 | "time"
10 |
11 | "github.com/kevmo314/appendable/pkg/appendable"
12 | "github.com/kevmo314/appendable/pkg/handlers"
13 | "github.com/kevmo314/appendable/pkg/mmap"
14 | )
15 |
16 | type StringSlice []string
17 |
18 | func (s *StringSlice) String() string {
19 | return fmt.Sprintf("%v", *s)
20 | }
21 |
22 | func (s *StringSlice) Set(value string) error {
23 | *s = append(*s, value)
24 | return nil
25 | }
26 |
27 | func main() {
28 | var debugFlag, jsonlFlag, csvFlag, showTimings bool
29 | var indexFilename, pprofFilename, benchmarkFilename string
30 | var searchHeaders StringSlice
31 |
32 | flag.BoolVar(&debugFlag, "debug", false, "Use logger that prints at the debug-level")
33 | flag.BoolVar(&jsonlFlag, "jsonl", false, "Use JSONL handler")
34 | flag.BoolVar(&csvFlag, "csv", false, "Use CSV handler")
35 | flag.BoolVar(&showTimings, "t", false, "Show time-related metrics")
36 | flag.StringVar(&indexFilename, "i", "", "Specify the index file to create or open")
37 | flag.StringVar(&pprofFilename, "pprof", "", "Specify the file to write the pprof data to")
38 | flag.StringVar(&benchmarkFilename, "b", "", "Specify the file to write the benchmark data to")
39 | flag.Var(&searchHeaders, "s", "Specify the headers you want to search")
40 |
41 | flag.Usage = func() {
   | fmt.Printf("Usage: %s [-jsonl|-csv] [-t] [-i index] filename\n", os.Args[0])
   | flag.PrintDefaults()
   | os.Exit(1)
   | }
   |
   | flag.Parse()
42 |
43 | logLevel := &slog.LevelVar{}
44 |
45 | if debugFlag {
46 | logLevel.Set(slog.LevelDebug)
47 | }
48 |
49 | if pprofFilename != "" {
50 | f, err := os.Create(pprofFilename)
51 | if err != nil {
52 | panic(err)
53 | }
54 | defer f.Close() // error handling omitted for example
55 | if err := pprof.StartCPUProfile(f); err != nil {
56 | panic(err)
57 | }
58 | defer pprof.StopCPUProfile()
59 | }
60 |
61 | logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: logLevel}))
62 | slog.SetDefault(logger)
63 |
64 | var totalStart, readStart, writeStart time.Time
65 | if showTimings {
66 | totalStart = time.Now()
67 | }
68 |
75 | args := flag.Args()
76 |
77 | if len(args) != 1 {
78 | flag.Usage()
79 | }
80 |
81 | // Open the data file
82 | df, err := mmap.OpenFile(args[0], os.O_RDONLY, 0)
83 | if err != nil {
84 | panic(err)
85 | }
86 | defer df.Close()
87 |
88 | var dataHandler appendable.DataHandler
89 |
90 | switch {
91 | case jsonlFlag:
92 | dataHandler = handlers.JSONLHandler{}
93 | case csvFlag:
94 | dataHandler = handlers.CSVHandler{}
95 | default:
96 | logger.Error("Please specify the file type with -jsonl or -csv.")
97 | os.Exit(1)
98 | }
99 | if showTimings {
100 | readStart = time.Now()
101 | }
102 | mmpif, err := mmap.OpenFile(indexFilename, os.O_RDWR|os.O_CREATE, 0666)
103 | if err != nil {
104 | panic(err)
105 | }
106 | defer mmpif.Close()
107 |
108 | // Open the index file
109 | i, err := appendable.NewIndexFile(mmpif, dataHandler, searchHeaders)
110 | if err != nil {
111 | panic(err)
112 | }
113 |
114 | if benchmarkFilename != "" {
115 | f, err := os.Create(benchmarkFilename)
116 | if err != nil {
117 | panic(err)
118 | }
119 | defer f.Close() // error handling omitted for example
120 | i.SetBenchmarkFile(f)
121 | }
122 |
123 | if err := i.Synchronize(df.Bytes()); err != nil {
124 | panic(err)
125 | }
126 |
127 | if showTimings {
128 | readDuration := time.Since(readStart)
129 | logger.Info("Opening + synchronizing index file took", slog.Duration("duration", readDuration))
130 | }
131 |
132 | // Write the index file
133 | if showTimings {
134 | writeStart = time.Now()
135 | }
136 |
137 | if err := mmpif.Close(); err != nil {
138 | panic(err)
139 | }
140 |
141 | if showTimings {
142 | writeDuration := time.Since(writeStart)
143 | logger.Info("Writing index file took", slog.Duration("duration", writeDuration))
144 |
145 | totalDuration := time.Since(totalStart)
146 | logger.Info("Total execution time", slog.Duration("duration", totalDuration))
147 | }
148 |
149 | logger.Info("Done!")
150 | }
151 |
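152 | // Example invocation, mirroring .github/workflows/benchmark.yml:
153 | //
154 | //	go run cmd/main.go -i output.index -jsonl -b benchmark.txt examples/workspace/green_tripdata_2023-01.jsonl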
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # kevmo314/appendable/examples
2 |
3 | These examples are hosted on this repository's GitHub pages. The yellow tripdata set, for example, can be fetched and converted to JSONL manually:
4 |
5 | ```sh
6 | # yellow tripdata
7 | wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
8 |
9 | python3 -c "import pandas; pandas.read_parquet('yellow_tripdata_2023-01.parquet').to_json('yellow_tripdata_2023-01.jsonl', orient='records', lines=True)"
10 | ```
11 |
12 | To build the example locally, download the green tripdata set and convert it to JSONL:
13 |
14 | ```sh
15 | cd workspace
16 |
17 | # green tripdata
18 | python3 -m pip install -r requirements.txt
19 |
20 | # fetch data with .jsonl format
21 | python3 fetch_jsonl.py
22 | ```
23 |
24 | Then run the indexing process:
25 |
26 | ```sh
27 | # for jsonl:
28 | npm run build-index-jsonl
29 | ```
30 |
31 | Copy the `.jsonl` file to `/client`:
32 |
33 | ```sh
34 | cp green_tripdata_2023-01.jsonl ../client
35 | ```
36 |
37 | Build the AppendableDB client library:
38 |
39 | ```sh
40 | npm run build
41 | ```
42 |
43 | Copy the Appendable library to `/client`:
44 |
45 | ```sh
46 | cp ../../dist/appendable.min.js ../client
47 | cp ../../dist/appendable.min.js.map ../client
48 | ```
49 |
50 | Then run the development server:
51 |
52 | ```sh
53 | npm run client
54 | ```
55 |
56 | You should see the example running at http://localhost:3001
57 |
--------------------------------------------------------------------------------
/examples/benchmarkdiff/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bufio"
5 | "io"
6 | "os"
7 | "strconv"
8 | "strings"
9 |
10 | "github.com/go-echarts/go-echarts/v2/charts"
11 | "github.com/go-echarts/go-echarts/v2/components"
12 | "github.com/go-echarts/go-echarts/v2/opts"
13 | )
14 |
15 | type record struct {
16 | timestamp int
17 | n int
18 | size int
19 | }
20 |
21 | func readFile(f *os.File) ([]record, error) {
22 | // read the file and parse the (timestamp,n,size) tuples
23 | s := bufio.NewScanner(f)
24 | var records []record
25 | for s.Scan() {
26 | // parse the line
27 | line := s.Text()
28 | // split the line
29 | tokens := strings.Split(line, ",")
30 | // convert the tokens to integers
31 | timestamp, err := strconv.Atoi(tokens[0])
32 | if err != nil {
33 | return nil, err
34 | }
35 | n, err := strconv.Atoi(tokens[1])
36 | if err != nil {
37 | return nil, err
38 | }
39 | size, err := strconv.Atoi(tokens[2])
40 | if err != nil {
41 | return nil, err
42 | }
43 | records = append(records, record{timestamp, n, size})
44 | }
45 | return records, s.Err()
46 | }
47 |
48 | func generateXAxis(records []record) []int {
49 | var xAxis []int
50 | for _, r := range records {
51 | xAxis = append(xAxis, r.n)
52 | }
53 | return xAxis
54 | }
55 |
56 | func generateTimestampYAxis(records []record) []opts.LineData {
57 | var yAxis []opts.LineData
58 | for _, r := range records {
59 | yAxis = append(yAxis, opts.LineData{Value: r.timestamp})
60 | }
61 | return yAxis
62 | }
63 |
64 | func generateSizeYAxis(records []record) []opts.LineData {
65 | var yAxis []opts.LineData
66 | for _, r := range records {
67 | yAxis = append(yAxis, opts.LineData{Value: r.size})
68 | }
69 | return yAxis
70 | }
71 |
72 | func generateTimestampDeltaYAxis(r1, r2 []record) []opts.LineData {
73 | var yAxis []opts.LineData
74 | for i := range r1 {
75 | yAxis = append(yAxis, opts.LineData{Value: r2[i].timestamp - r1[i].timestamp})
76 | }
77 | return yAxis
78 | }
79 |
80 | func generateSizeDeltaYAxis(r1, r2 []record) []opts.LineData {
81 | var yAxis []opts.LineData
82 | for i := range r1 {
83 | yAxis = append(yAxis, opts.LineData{Value: r2[i].size - r1[i].size})
84 | }
85 | return yAxis
86 | }
87 |
88 | func main() {
89 | // read two arguments as files and parse the (timestamp,n,size) tuples
90 | f1, err := os.Open(os.Args[1])
91 | if err != nil {
92 | panic(err)
93 | }
94 | defer f1.Close()
95 | records1, err := readFile(f1)
96 | if err != nil {
97 | panic(err)
98 | }
99 | f2, err := os.Open(os.Args[2])
100 | if err != nil {
101 | panic(err)
102 | }
103 | defer f2.Close()
104 | records2, err := readFile(f2)
105 | if err != nil {
106 | panic(err)
107 | }
108 |
109 | // generate four charts:
110 | // 1. timestamp vs n
111 | // 2. pagefile size vs n
112 | // 3. timestamp delta vs n
113 | // 4. pagefile size delta vs n
114 |
115 | line1 := charts.NewLine()
116 | line1.SetGlobalOptions(
117 | charts.WithTooltipOpts(opts.Tooltip{Show: true, Trigger: "axis"}),
118 | charts.WithYAxisOpts(opts.YAxis{
119 | Name: "Time (μs)",
120 | }),
121 | charts.WithXAxisOpts(opts.XAxis{
122 | Name: "Bytes read",
123 | }))
124 | line1.SetXAxis(generateXAxis(records1)).
125 | AddSeries("Run 1", generateTimestampYAxis(records1)).
126 | AddSeries("Run 2", generateTimestampYAxis(records2))
127 |
128 | line2 := charts.NewLine()
129 | line2.SetGlobalOptions(
130 | charts.WithTooltipOpts(opts.Tooltip{Show: true, Trigger: "axis"}),
131 | charts.WithYAxisOpts(opts.YAxis{
132 | Name: "Size (pages)",
133 | }),
134 | charts.WithXAxisOpts(opts.XAxis{
135 | Name: "Bytes read",
136 | }))
137 | line2.SetXAxis(generateXAxis(records1)).
138 | AddSeries("Run 1", generateSizeYAxis(records1)).
139 | AddSeries("Run 2", generateSizeYAxis(records2))
140 |
141 | line3 := charts.NewLine()
142 | line3.SetGlobalOptions(
143 | charts.WithYAxisOpts(opts.YAxis{
144 | Name: "Time delta (μs)",
145 | }),
146 | charts.WithXAxisOpts(opts.XAxis{
147 | Name: "Bytes read",
148 | }))
149 | line3.SetXAxis(generateXAxis(records1)).
150 | AddSeries("Time delta", generateTimestampDeltaYAxis(records1, records2))
151 |
152 | line4 := charts.NewLine()
153 | line4.SetGlobalOptions(
154 | charts.WithYAxisOpts(opts.YAxis{
155 | Name: "Size delta (pages)",
156 | }),
157 | charts.WithXAxisOpts(opts.XAxis{
158 | Name: "Bytes read",
159 | }))
160 | line4.SetXAxis(generateXAxis(records1)).
161 | AddSeries("Size delta", generateSizeDeltaYAxis(records1, records2))
162 |
163 | page := components.NewPage()
164 | page.PageTitle = "Benchmark diff"
165 | page.AddCharts(
166 | line1,
167 | line2,
168 | line3,
169 | line4,
170 | )
171 | f, err := os.Create("output.html")
172 | if err != nil {
173 | panic(err)
174 | }
175 | if err := page.Render(io.MultiWriter(f)); err != nil {
    | panic(err)
    | }
176 | }
177 |
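178 | // Note: each input file is a benchmark log written by
179 | // IndexFile.SetBenchmarkFile (pkg/appendable/index_file.go): one
180 | // "timestamp,n,size" line per sample, i.e. elapsed microseconds,
181 | // records indexed, and total page count.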
--------------------------------------------------------------------------------
/examples/client/editor.js:
--------------------------------------------------------------------------------
1 | let activeEditor = "json";
2 |
3 | var editor = ace.edit("editor");
4 | editor.setTheme("ace/theme/chrome");
5 |
6 | var jsonSession = ace.createEditSession(
7 | JSON.stringify(
8 | {
9 | where: [
10 | {
11 | operation: ">=",
12 | key: "trip_distance",
13 | value: 10,
14 | },
15 | ],
16 | orderBy: [
17 | {
18 | key: "trip_distance",
19 | direction: "ASC",
20 | },
21 | ],
22 | select: [
23 | "trip_distance",
24 | "VendorID",
25 | "passenger_count",
26 | "fare_amount",
27 | "tip_amount",
28 | "mta_tax",
29 | ],
30 | },
31 | null,
32 | 2,
33 | ),
34 | "ace/mode/json",
35 | );
36 |
37 | var jsCode =
38 | "db\n" +
39 | " .where('trip_distance', '>=', 10)\n" +
40 | " .orderBy('trip_distance', 'ASC')\n" +
41 | " .select([\n" +
42 | " 'trip_distance',\n" +
43 | " 'VendorID',\n" +
44 | " 'passenger_count',\n" +
45 | " 'fare_amount',\n" +
46 | " 'tip_amount',\n" +
47 | " 'mta_tax'\n" +
48 | " ])\n" +
49 | " .get();";
50 |
51 | var jsSession = ace.createEditSession(jsCode, "ace/mode/javascript");
52 |
53 | editor.setSession(jsonSession);
54 |
55 | var jsonTab = document.getElementById("jsonTab");
56 | var jsTab = document.getElementById("jsTab");
57 |
58 | jsonTab.addEventListener("click", function () {
59 | editor.setSession(jsonSession);
60 | attachJsonEditorUX();
61 | activeEditor = "json";
62 | window.activeEditor = activeEditor;
63 | });
64 |
65 | jsTab.addEventListener("click", function () {
66 | editor.setSession(jsSession);
67 | activeEditor = "javascript";
68 | window.activeEditor = activeEditor;
69 | });
70 |
71 | function attachJsonEditorUX() {
72 | // NOTE: when composite indexes get supported, remove this UX feature
73 | // <---- start of UX feature ---->
74 | let isProgramChange = false;
75 | let lastEdited = "none";
76 | let prevWhereKey = "trip_distance";
77 | let prevOrderByKey = "trip_distance";
78 |
79 | function updateKey(editorContent) {
80 | try {
81 | let query = JSON.parse(editorContent);
82 | if (query.where && query.orderBy) {
83 | const whereKey = query.where[0].key;
84 | const orderByKey = query.orderBy[0].key;
85 |
86 | if (lastEdited === "where") {
87 | query.orderBy[0].key = whereKey;
88 | } else if (lastEdited === "orderBy") {
89 | query.where[0].key = orderByKey;
90 | }
91 |
92 | prevWhereKey = whereKey;
93 | prevOrderByKey = orderByKey;
94 |
95 | return JSON.stringify(query, null, 2);
96 | }
97 | } catch (e) {
98 | console.log("Error parsing JSON:", e.message);
99 | console.log("Incomplete string content:", editorContent);
100 | }
101 | return editorContent;
102 | }
103 |
104 | editor.getSession().on("change", function (e) {
105 | if (isProgramChange) {
106 | isProgramChange = false;
107 | return;
108 | }
109 |
110 | const cursorPosition = editor.getCursorPosition();
111 | const editorContent = editor.getSession().getValue();
112 |
113 | let query;
114 | try {
115 | query = JSON.parse(editorContent);
116 | } catch (e) {
117 | return;
118 | }
119 |
120 | const currentWhereKey = query.where ? query.where[0].key : "";
121 | const currentOrderByKey = query.orderBy ? query.orderBy[0].key : "";
122 |
123 | if (currentWhereKey !== prevWhereKey) {
124 | lastEdited = "where";
125 | } else if (currentOrderByKey !== prevOrderByKey) {
126 | lastEdited = "orderBy";
127 | }
128 |
129 | const updatedContent = updateKey(editorContent);
130 |
131 | if (updatedContent !== editorContent) {
132 | isProgramChange = true;
133 |
134 | const doc = editor.getSession().getDocument();
135 | doc.setValue(updatedContent);
136 |
137 | editor.moveCursorToPosition(cursorPosition);
138 | editor.clearSelection();
139 | }
140 | });
141 |
142 | // <---- end of UX feature ---->
143 | }
144 |
145 | attachJsonEditorUX();
146 | window.activeEditor = activeEditor;
147 |
--------------------------------------------------------------------------------
/examples/client/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
38 |
39 |
40 |
41 |
42 |
43 | Appendable - NYC
44 | Green Cab Trip Data in 01/2023
45 |
46 |
54 |
55 | Appendable is querying the JSONL and index files that GitHub pages hosts
56 | directly . There is no server involved here!
57 |
58 |
59 | Keep in mind that while the query syntax supports a lot of different
60 | operations, Appendable doesn't support composite indexes yet. Therefore,
61 | only one field at a time can be filtered on and that field must be used
62 | for sorting.
63 |
64 |
65 |
66 |
70 |
71 |
Query
72 |
73 |
json
74 |
javascript
75 |
76 |
Results -
77 |
Execute
78 |
Fetch more
79 |
80 |
81 |
82 |
87 |
88 |
89 |
198 |
199 |
200 |
201 |
--------------------------------------------------------------------------------
/examples/client/server.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "log"
5 | "net/http"
6 | )
7 |
8 | func main() {
9 | // Set the directory to serve
10 | fs := http.FileServer(http.Dir("./"))
11 |
12 | // Handle all requests by serving a file of the same name
13 | http.Handle("/", fs)
14 |
15 | http.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
16 | http.ServeFile(w, r, "./search.html")
17 | })
18 |
19 | // Define the port to listen on
20 | port := "3001"
21 | log.Printf("Listening on http://localhost:%s/", port)
22 |
23 | // Start the server
24 | err := http.ListenAndServe(":"+port, nil)
25 | if err != nil {
26 | log.Fatal(err)
27 | }
28 | }
29 |
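30 | // Run with `go run server.go` from examples/client (or `npm run client`
31 | // from the repository root), then open http://localhost:3001/.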
--------------------------------------------------------------------------------
/examples/client/styles.css:
--------------------------------------------------------------------------------
1 | body,
2 | html {
3 | margin: 0;
4 | padding: 0px 0px 4px 4px;
5 | }
6 | .flex-1 {
7 | flex: 1;
8 | display: flex;
9 | gap: 0 30px;
10 | height: 100vh;
11 | width: 100vw;
12 | }
13 | .result-row {
14 | cursor: pointer;
15 | }
16 | .result-row:hover {
17 | background-color: yellow;
18 | }
19 | #fields {
20 | max-height: calc(100vh - 50px);
21 | overflow-y: auto;
22 | }
23 | #results {
24 | overflow-y: auto;
25 | max-height: calc(100vh - 670px);
26 | }
27 | #results-header {
28 | width: max-content;
29 | }
30 | .header-item,
31 | .result-cell {
32 | padding: 4px;
33 | text-align: left;
34 | min-width: 200px;
35 | }
36 | .header-item {
37 | background-color: #f0f0f0;
38 | font-weight: bold;
39 | }
40 |
--------------------------------------------------------------------------------
/examples/visualizer/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/binary"
5 | "fmt"
6 | "io"
7 | "os"
8 | "slices"
9 |
10 | "github.com/kevmo314/appendable/pkg/bptree"
11 | "github.com/kevmo314/appendable/pkg/handlers"
12 | "github.com/kevmo314/appendable/pkg/mmap"
13 | "github.com/kevmo314/appendable/pkg/pagefile"
14 | "golang.org/x/sys/unix"
15 | )
16 |
17 | func main() {
18 | f, err := os.Open(os.Args[1])
19 | if err != nil {
20 | panic(err)
21 | }
22 | defer f.Close()
23 |
24 | df, err := os.Open(os.Args[2])
25 | if err != nil {
26 | panic(err)
27 | }
28 | defer df.Close()
29 |
30 | mmdf, err := mmap.NewMemoryMappedFile(df, unix.PROT_READ)
31 | if err != nil {
32 | panic(err)
33 | }
34 |
35 | // create a new pagefile
36 | pf, err := pagefile.NewPageFile(f)
37 | if err != nil {
38 | panic(err)
39 | }
40 |
41 | lmps := []int64{4096} // store a list of the linked meta pages.
42 | fps := []int64{} // store a list of the free pages.
43 |
44 | fmt.Printf("\n\n\nAppendable Visualizer \n\n\n\n")
45 |
46 | // read the free page index
47 | fmt.Printf("Free Page Index ", 0)
48 | if _, err := f.Seek(0, io.SeekStart); err != nil {
49 | panic(err)
50 | }
51 | buf := make([]byte, pf.PageSize())
52 | if _, err := f.Read(buf); err != nil {
53 | panic(err)
54 | }
55 | for j := 0; j < pf.PageSize()/8; j++ {
56 | val := binary.LittleEndian.Uint64(buf[j*8 : j*8+8])
57 | if val == 0 {
58 | break
59 | }
60 | fmt.Printf("%d
", val)
61 | fps = append(fps, int64(val))
62 | }
63 | fmt.Printf(" ")
64 |
65 | slices.Sort(fps)
66 |
67 | for i := int64(0); i < pf.PageCount(); i++ {
68 | offset, err := pf.Page(int(i))
69 | if err != nil {
70 | panic(err)
71 | }
72 | // read the page
73 | if _, err := f.Seek(offset, io.SeekStart); err != nil {
74 | panic(err)
75 | }
76 | if len(fps) > 0 && i == fps[0] {
77 | // this is a free page
78 | fps = fps[1:]
79 | fmt.Printf("Free Page ", offset)
80 | fmt.Printf(" ")
81 | } else if len(lmps) > 0 && offset == lmps[0] {
82 | // this is a linked meta page
83 | lmps = lmps[1:]
84 |
85 | // metaPage, err := linkedpage.NewMultiBPTree(pf, int(i))
86 | // if err != nil {
87 | // panic(err)
88 | // }
89 | fmt.Printf("Linked Meta Page (TODO) ", offset)
90 |
91 | // root, err := metaPage.Root()
92 | // if err != nil {
93 | // panic(err)
94 | // }
95 | // next, err := metaPage.Next()
96 | // if err != nil {
97 | // panic(err)
98 | // }
99 | // exists, err := next.Exists()
100 | // if err != nil {
101 | // panic(err)
102 | // }
103 | // if exists {
104 | // fmt.Printf("Root (%x) - Next (%x)
", root.Offset, root.Offset, next.MemoryPointer().Offset, next.MemoryPointer().Offset)
105 | // lmps = append(lmps, int64(next.MemoryPointer().Offset))
106 | // } else {
107 | // fmt.Printf("Root (%x) - Next (nil)
", root.Offset, root.Offset)
108 | // }
109 | // fmt.Printf("Metadata
")
110 | // md, err := metaPage.Metadata()
111 | // if err != nil {
112 | // panic(err)
113 | // }
114 | // fmt.Printf("%x ", md)
115 | // fmt.Printf("")
116 | } else {
117 | // try to read the page as a bptree node
118 | node := &bptree.BPTreeNode{}
119 | node.Data = mmdf.Bytes()
120 | node.DataParser = &handlers.JSONLHandler{}
121 |
122 | if _, err := f.Seek(offset, io.SeekStart); err != nil {
123 | panic(err)
124 | }
125 | buf := make([]byte, pf.PageSize())
126 | if _, err := f.Read(buf); err != nil {
127 | panic(err)
128 | }
129 | if err := node.UnmarshalBinary(buf); err != nil {
130 | if err == io.EOF {
131 | break
132 | }
133 | panic(err)
134 | }
135 |
136 | if node.Leaf() {
137 | fmt.Printf("B+ Tree Leaf Node ", offset)
138 | } else {
139 | fmt.Printf("B+ Tree Node ", offset)
140 | }
141 | fmt.Printf("Keys
")
142 | for _, k := range node.Keys {
143 | fmt.Printf("%x ", k.Value)
144 | }
145 | fmt.Printf("Pointers
")
146 | for j := 0; j < node.NumPointers(); j++ {
147 | if node.Leaf() {
148 | fmt.Printf("[%x:%x]
", node.Pointer(j).Offset, node.Pointer(j).Offset+uint64(node.Pointer(j).Length))
149 | } else {
150 | fmt.Printf("%x
", node.Pointer(j).Offset, node.Pointer(j).Offset)
151 | }
152 | }
153 | fmt.Printf(" ")
154 | }
155 | }
156 | fmt.Printf(" \n\n\n")
157 |
158 | }
159 |
--------------------------------------------------------------------------------
/examples/workspace/fetch_csv.py:
--------------------------------------------------------------------------------
1 | # Data taken from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
2 |
3 | import io
4 |
5 | import pandas as pd
6 | import requests
7 |
8 | response = requests.get('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet')
9 |
10 | df = pd.read_parquet(io.BytesIO(response.content))
11 | df.to_csv('green_tripdata_2023-01.csv', index=False)
--------------------------------------------------------------------------------
/examples/workspace/fetch_jsonl.py:
--------------------------------------------------------------------------------
1 | # Data taken from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
2 |
3 | import io
4 |
5 | import pandas as pd
6 | import requests
7 |
8 | response = requests.get('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet')
9 |
10 | pd.read_parquet(io.BytesIO(response.content)).to_json('green_tripdata_2023-01.jsonl', orient='records', lines=True)
11 |
--------------------------------------------------------------------------------
/examples/workspace/requirements.txt:
--------------------------------------------------------------------------------
1 | pyarrow
2 | pandas
3 | requests
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/kevmo314/appendable
2 |
3 | go 1.22.0
4 |
5 | require (
6 | github.com/go-echarts/go-echarts/v2 v2.3.3
7 | golang.org/x/sys v0.16.0
8 | golang.org/x/text v0.14.0
9 | )
10 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
3 | github.com/go-echarts/go-echarts/v2 v2.3.3 h1:uImZAk6qLkC6F9ju6mZ5SPBqTyK8xjZKwSmwnCg4bxg=
4 | github.com/go-echarts/go-echarts/v2 v2.3.3/go.mod h1:56YlvzhW/a+du15f3S2qUGNDfKnFOeJSThBIrVFHDtI=
5 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
7 | github.com/stretchr/testify v1.6.0 h1:jlIyCplCJFULU/01vCkhKuTyc3OorI3bJFuw6obfgho=
8 | github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
9 | golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU=
10 | golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
11 | golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
12 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
13 | gopkg.in/yaml.v3 v3.0.0 h1:hjy8E9ON/egN1tAYqKb61G10WtihqetD4sz2H+8nIeA=
14 | gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
15 |
--------------------------------------------------------------------------------
/jest.config.ts:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | preset: "ts-jest",
3 | testEnvironment: "node",
4 | };
5 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "appendable",
3 | "version": "1.0.0",
4 | "description": "AppendableDB is an append-only\\*, schemaless, service-less, client-facing database.",
5 | "main": "index.js",
6 | "scripts": {
7 | "build": "esbuild src/index.ts --bundle --minify --sourcemap --outfile=dist/appendable.min.js",
8 | "warp": "rm -rf dist examples/client/appendable.min.js examples/client/appendable.min.js.map && esbuild src/index.ts --bundle --minify --sourcemap --outfile=dist/appendable.min.js",
9 | "client": "cd examples/client && go run server.go",
10 | "test": "jest"
11 | },
12 | "repository": {
13 | "type": "git",
14 | "url": "git+https://github.com/kevmo314/appendable.git"
15 | },
16 | "author": "Kevin Wang ",
17 | "license": "ISC",
18 | "bugs": {
19 | "url": "https://github.com/kevmo314/appendable/issues"
20 | },
21 | "homepage": "https://github.com/kevmo314/appendable#readme",
22 | "dependencies": {
23 | "esbuild": "^0.19.7"
24 | },
25 | "devDependencies": {
26 | "@types/jest": "^29.5.11",
27 | "prettier": "^3.2.1",
28 | "ts-jest": "^29.1.1",
29 | "ts-node": "^10.9.2"
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/pkg/appendable/appendable.go:
--------------------------------------------------------------------------------
1 | package appendable
2 |
3 | import (
4 | "encoding/binary"
5 | "fmt"
6 | "github.com/kevmo314/appendable/pkg/encoding"
7 | "strings"
8 | )
9 |
10 | /**
11 | * The structure of an index file is characterized by some pages that point
12 | * to other pages. Each box below represents a (typically 4kB) page and
13 | * the arrows indicate that there is a pointer to the next page.
14 | *
15 | * +-----------+-----------+ +-------------+ +-------------+ +-------------+
16 | * | Page GC | File Meta | -> | Index Meta | -> | Index Meta | -> | Index Meta |
17 | * +-----------+-----------+ +-------------+ +-------------+ +-------------+
18 | * | | |
19 | * v v v
20 | * +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+
21 | * | B+ Tree | | B+ Tree | | B+ Tree |
22 | * +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+
23 | *
24 | * Note: By convention, the first FileMeta does not have a pointer to the
25 | * B+ tree. Instead, the first FileMeta is used to store metadata about the
26 | * file itself and only contains a next pointer.
27 | *
28 | * Additionally, the Page GC page is used by the page file to store free page
29 | * indexes for garbage collection.
30 | *
31 | * Consequently, the index file cannot be smaller than two pages (typically 8kB).
32 | */
33 |
34 | type Version byte
35 |
36 | type Format byte
37 |
38 | const (
39 | FormatJSONL Format = iota
40 | FormatCSV
41 | )
42 |
43 | // FieldType represents the type of data stored in the field, which follows
44 | // JSON types excluding Object and null. Object is broken down into subfields
45 | // and null is not stored.
46 | type FieldType byte
47 |
48 | const (
49 | FieldTypeString FieldType = iota
50 | FieldTypeInt64
51 | FieldTypeUint64
52 | FieldTypeFloat64
53 | FieldTypeObject
54 | FieldTypeArray
55 | FieldTypeBoolean
56 | FieldTypeNull
57 |
58 | FieldTypeTrigram
59 | FieldTypeBigram
60 |
61 | FieldTypeUnigram
62 |
63 | FieldTypeVector
64 | )
65 |
66 | func (t FieldType) TypescriptType() string {
67 | components := []string{}
68 | if t&FieldTypeString != 0 || t&FieldTypeTrigram != 0 || t&FieldTypeBigram != 0 || t&FieldTypeUnigram != 0 {
69 | components = append(components, "string")
70 | }
71 | if t&FieldTypeInt64 != 0 || t&FieldTypeFloat64 != 0 {
72 | components = append(components, "number")
73 | }
74 | if t&FieldTypeObject != 0 {
75 | components = append(components, "Record")
76 | }
77 | if t&FieldTypeArray != 0 {
78 | components = append(components, "any[]")
79 | }
80 | if t&FieldTypeBoolean != 0 {
81 | components = append(components, "boolean")
82 | }
83 | if t&FieldTypeNull != 0 {
84 | components = append(components, "null")
85 | }
86 |
87 | if t&FieldTypeVector != 0 {
88 | components = append(components, "number[]")
89 | }
90 |
91 | if len(components) == 0 {
92 | return "unknown"
93 | }
94 | return strings.Join(components, " | ")
95 | }
96 |
97 | type FileMeta struct {
98 | Version
99 | Format
100 | // An offset to indicate how much data is contained within
101 | // this index. Note that this is implementation-dependent,
102 | // so it is not guaranteed to have any uniform interpretation.
103 | // For example, in JSONL, this is the number of bytes read
104 | // and indexed so far.
105 | ReadOffset uint64
106 | Entries uint64
107 | }
108 |
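    | // Binary layout (little-endian): Version (1 byte), Format (1 byte),
    | // ReadOffset (8 bytes), then Entries encoded as a uvarint.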
109 | func (m *FileMeta) MarshalBinary() ([]byte, error) {
110 | buf := make([]byte, 10+encoding.SizeVarint(m.Entries))
111 | buf[0] = byte(m.Version)
112 | buf[1] = byte(m.Format)
113 | binary.LittleEndian.PutUint64(buf[2:], m.ReadOffset)
114 | binary.PutUvarint(buf[10:], m.Entries)
115 | return buf, nil
116 | }
117 |
118 | func (m *FileMeta) UnmarshalBinary(buf []byte) error {
119 | if len(buf) < 10 {
120 | return fmt.Errorf("invalid metadata size: %d", len(buf))
121 | }
122 | m.Version = Version(buf[0])
123 |
124 | fileFormat := buf[1]
125 |
126 | switch fileFormat {
127 | case byte(0):
128 | m.Format = FormatJSONL
129 | case byte(1):
130 | m.Format = FormatCSV
131 | default:
132 | return fmt.Errorf("unrecognized file format: %v", buf[1])
133 | }
134 |
135 | m.ReadOffset = binary.LittleEndian.Uint64(buf[2:])
136 |
137 | e, _ := binary.Uvarint(buf[10:])
138 | m.Entries = e
139 |
140 | return nil
141 | }
142 |
143 | type IndexMeta struct {
144 | FieldName string
145 | FieldType FieldType
146 | Width uint16
147 |
148 | // TotalFieldValueLength represents the cumulative sum of the lengths of the entries within this index, used for computing the average length.
149 | TotalFieldValueLength uint64
150 | }
151 |
152 | func (m *IndexMeta) MarshalBinary() ([]byte, error) {
153 | buf := make([]byte, 2+2+len(m.FieldName)+2+encoding.SizeVarint(m.TotalFieldValueLength))
154 | binary.LittleEndian.PutUint16(buf[0:], uint16(m.FieldType))
155 | binary.LittleEndian.PutUint16(buf[2:], m.Width)
156 | binary.LittleEndian.PutUint16(buf[4:], uint16(len(m.FieldName)))
157 | copy(buf[6:], m.FieldName)
158 | binary.PutUvarint(buf[6+len(m.FieldName):], m.TotalFieldValueLength)
159 | return buf, nil
160 | }
161 |
162 | func (m *IndexMeta) UnmarshalBinary(buf []byte) error {
163 | if len(buf) < 6 {
164 | return fmt.Errorf("invalid metadata size: %d", len(buf))
165 | }
166 | m.FieldType = FieldType(binary.LittleEndian.Uint16(buf[0:]))
167 | m.Width = binary.LittleEndian.Uint16(buf[2:])
168 | nameLength := binary.LittleEndian.Uint16(buf[4:])
169 | if len(buf) < 6+int(nameLength) {
170 | return fmt.Errorf("invalid metadata size: %d", len(buf))
171 | }
172 | m.FieldName = string(buf[6 : 6+nameLength])
173 | tl, _ := binary.Uvarint(buf[6+nameLength:])
174 | m.TotalFieldValueLength = tl
175 | return nil
176 | }
177 |
178 | func DetermineType(ft FieldType) uint16 {
179 | shift := 1 // we'll dedicate 0 to be variable width, everything else is the fixed width + shift
180 | width := uint16(0)
181 | switch ft {
182 | case FieldTypeBoolean:
183 | width = uint16(shift + 1)
184 | case FieldTypeNull:
185 | width = uint16(shift + 0)
186 | case FieldTypeFloat64, FieldTypeInt64, FieldTypeUint64:
187 | width = uint16(shift + 8)
188 | case FieldTypeTrigram:
189 | width = uint16(shift + 3)
190 | case FieldTypeBigram:
191 | width = uint16(shift + 2)
192 | case FieldTypeUnigram:
193 | width = uint16(shift + 1)
194 | }
195 |
196 | return width
197 | }
198 |
--------------------------------------------------------------------------------
/pkg/appendable/appendable_test.go:
--------------------------------------------------------------------------------
1 | package appendable
2 |
3 | import (
4 | "github.com/kevmo314/appendable/pkg/buftest"
5 | "github.com/kevmo314/appendable/pkg/linkedpage"
6 | "github.com/kevmo314/appendable/pkg/pagefile"
7 | "reflect"
8 | "testing"
9 | )
10 |
11 | func TestMarshalMetadata(t *testing.T) {
12 | t.Run("file meta", func(t *testing.T) {
13 | b := buftest.NewSeekableBuffer()
14 | p, err := pagefile.NewPageFile(b)
15 | if err != nil {
16 | t.Fatal(err)
17 | }
18 |
19 | tree, err := linkedpage.NewMultiBPTree(p, 0)
20 | if err != nil {
21 | t.Fatal(err)
22 | }
23 |
24 | page, err := tree.AddNext()
25 | if err != nil {
26 | t.Fatal(err)
27 | }
28 |
29 | fm := &FileMeta{
30 | Version: 1,
31 | Format: 1,
32 | ReadOffset: 69,
33 | Entries: 38,
34 | }
35 |
36 | buf, err := fm.MarshalBinary()
37 | if err != nil {
38 | t.Fatalf("Failed to marshal binary: %v", err)
39 | }
40 |
41 | if err := page.SetMetadata(buf); err != nil {
42 | t.Fatal(err)
43 | }
44 |
45 | // finished marshaling
46 | // <-------->
47 | // start unmarshal
48 |
49 | buf, err = page.Metadata()
50 | if err != nil {
51 | t.Fatal(err)
52 | }
53 |
54 | fm2 := &FileMeta{}
55 |
56 | if err := fm2.UnmarshalBinary(buf); err != nil {
57 | t.Fatal(err)
58 | }
59 |
60 | if !reflect.DeepEqual(fm, fm2) {
61 | t.Fatal("not equal")
62 | }
63 | })
64 |
65 | t.Run("file meta", func(t *testing.T) {
66 | b := buftest.NewSeekableBuffer()
67 | p, err := pagefile.NewPageFile(b)
68 | if err != nil {
69 | t.Fatal(err)
70 | }
71 |
72 | tree, err := linkedpage.NewMultiBPTree(p, 0)
73 | if err != nil {
74 | t.Fatal(err)
75 | }
76 |
77 | page, err := tree.AddNext()
78 | if err != nil {
79 | t.Fatal(err)
80 | }
81 |
82 | im := &IndexMeta{
83 | FieldName: "scarface",
84 | FieldType: FieldTypeString,
85 | Width: 0,
86 | TotalFieldValueLength: 938,
87 | }
88 | buf, err := im.MarshalBinary()
89 | if err != nil {
90 | t.Fatalf("Failed to marshal binary: %v", err)
91 | }
92 |
93 | if err := page.SetMetadata(buf); err != nil {
94 | t.Fatal(err)
95 | }
96 |
97 | // finished marshaling
98 | // <-------->
99 | // start unmarshal
100 |
101 | buf, err = page.Metadata()
102 | if err != nil {
103 | t.Fatal(err)
104 | }
105 |
106 | im2 := &IndexMeta{}
107 |
108 | if err := im2.UnmarshalBinary(buf); err != nil {
109 | t.Fatal(err)
110 | }
111 |
112 | if !reflect.DeepEqual(im, im2) {
113 | t.Fatal("not equal")
114 | }
115 | })
116 |
117 | }
118 |
--------------------------------------------------------------------------------
/pkg/appendable/index_file.go:
--------------------------------------------------------------------------------
1 | package appendable
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "github.com/kevmo314/appendable/pkg/linkedpage"
7 | "io"
8 | "time"
9 |
10 | "github.com/kevmo314/appendable/pkg/bptree"
11 | "github.com/kevmo314/appendable/pkg/pagefile"
12 | )
13 |
14 | const CurrentVersion = 1
15 |
16 | type DataHandler interface {
17 | bptree.DataParser
18 | Synchronize(f *IndexFile, df []byte) error
19 | Format() Format
20 | }
21 |
22 | // IndexFile is a representation of the entire index file.
23 | type IndexFile struct {
24 | tree *linkedpage.LinkedPage
25 | dataHandler DataHandler
26 |
27 | pf *pagefile.PageFile
28 | BenchmarkCallback func(int)
29 |
30 | searchHeaders []string
31 | }
32 |
33 | func NewIndexFile(f io.ReadWriteSeeker, dataHandler DataHandler, searchHeaders []string) (*IndexFile, error) {
34 | pf, err := pagefile.NewPageFile(f)
35 | if err != nil {
36 | return nil, fmt.Errorf("failed to create page file: %w", err)
37 | }
38 |
39 | tree, err := linkedpage.NewMultiBPTree(pf, 0)
40 | if err != nil {
41 | return nil, fmt.Errorf("failed to create multi b+ tree: %w", err)
42 | }
43 | // ensure the first page is written.
44 | node, err := tree.Next()
45 | if err != nil && !errors.Is(err, io.EOF) {
46 | return nil, fmt.Errorf("failed to get next meta page: %w", err)
47 | }
48 | if errors.Is(err, io.EOF) {
49 | // the page doesn't exist, so we need to create it
50 | created, err := tree.AddNext()
51 | if err != nil {
52 | return nil, fmt.Errorf("failed to add next meta page: %w", err)
53 | }
54 | metadata := &FileMeta{
55 | Version: CurrentVersion,
56 | Format: dataHandler.Format(),
57 | }
58 | buf, err := metadata.MarshalBinary()
59 | if err != nil {
60 | return nil, fmt.Errorf("failed to marshal metadata: %w", err)
61 | }
62 | if err := created.SetMetadata(buf); err != nil {
63 | return nil, fmt.Errorf("failed to set metadata: %w", err)
64 | }
65 | return &IndexFile{tree: created, dataHandler: dataHandler, pf: pf, searchHeaders: searchHeaders}, nil
66 | } else {
67 | // validate the metadata
68 | buf, err := node.Metadata()
69 | if err != nil {
70 | return nil, fmt.Errorf("failed to read metadata: %w", err)
71 | }
72 | metadata := &FileMeta{}
73 | if err := metadata.UnmarshalBinary(buf); err != nil {
74 | return nil, fmt.Errorf("failed to unmarshal metadata: %w", err)
75 | }
76 | if metadata.Version != CurrentVersion {
77 | return nil, fmt.Errorf("unsupported version: %d", metadata.Version)
78 | }
79 | if metadata.Format != dataHandler.Format() {
80 | return nil, fmt.Errorf("unsupported format: %x", metadata.Format)
81 | }
82 | return &IndexFile{tree: node, dataHandler: dataHandler, pf: pf, searchHeaders: searchHeaders}, nil
83 | }
84 | }
85 |
86 | func (i *IndexFile) Metadata() (*FileMeta, error) {
87 | // the first page consists of associated metadata for the tree
88 | buf, err := i.tree.Metadata()
89 | if err != nil {
90 | return nil, fmt.Errorf("failed to read metadata: %w", err)
91 | }
92 | metadata := &FileMeta{}
93 | return metadata, metadata.UnmarshalBinary(buf)
94 | }
95 |
96 | func (i *IndexFile) SetMetadata(metadata *FileMeta) error {
97 | buf, err := metadata.MarshalBinary()
98 | if err != nil {
99 | return fmt.Errorf("failed to marshal metadata: %w", err)
100 | }
101 | return i.tree.SetMetadata(buf)
102 | }
103 |
104 | func (i *IndexFile) Indexes() (*linkedpage.LinkedPage, error) {
105 | return i.tree.Next()
106 | }
107 |
108 | func (i *IndexFile) IsEmpty() (bool, error) {
109 | n, err := i.tree.Next()
110 | if err != nil && !errors.Is(err, io.EOF) {
111 | return false, fmt.Errorf("failed to get next meta page: %w", err)
112 | }
113 | return n == nil, nil
114 | }
115 |
116 | func (i *IndexFile) IndexFieldNames() ([]string, error) {
117 | var fieldNames []string
118 | uniqueFieldNames := make(map[string]bool)
119 |
120 | mp := i.tree
121 |
122 | for {
123 | next, err := mp.Next()
124 | if err != nil {
125 | if errors.Is(err, io.EOF) {
126 | break
127 | }
128 | return nil, fmt.Errorf("failed to get next meta page: %w", err)
129 | }
130 | buf, err := next.Metadata()
131 | if err != nil {
132 | return nil, fmt.Errorf("failed to read metadata: %w", err)
133 | }
134 | metadata := &IndexMeta{}
135 | if err := metadata.UnmarshalBinary(buf); err != nil {
136 | return nil, fmt.Errorf("failed to unmarshal metadata: %w", err)
137 | }
138 |
139 | if _, ok := uniqueFieldNames[metadata.FieldName]; !ok {
140 | uniqueFieldNames[metadata.FieldName] = true
141 | fieldNames = append(fieldNames, metadata.FieldName)
142 | }
143 | mp = next
144 | }
145 |
146 | return fieldNames, nil
147 | }
148 |
149 | func (i *IndexFile) FindOrCreateIndex(name string, fieldType FieldType) (*linkedpage.LinkedPage, *IndexMeta, error) {
150 | mp := i.tree
151 | for {
152 | next, err := mp.Next()
153 | if err != nil {
154 | if errors.Is(err, io.EOF) {
155 | break
156 | }
157 | return nil, nil, fmt.Errorf("failed to get next meta page: %w", err)
158 | }
159 | buf, err := next.Metadata()
160 | if err != nil {
161 | return nil, nil, fmt.Errorf("failed to read metadata: %w", err)
162 | }
163 | metadata := &IndexMeta{}
164 | if err := metadata.UnmarshalBinary(buf); err != nil {
165 | return nil, nil, fmt.Errorf("failed to unmarshal metadata: %w", err)
166 | }
167 | if metadata.FieldName == name && metadata.FieldType == fieldType {
168 | return next, metadata, nil
169 | }
170 | mp = next
171 | }
172 | // we haven't found the index, so we need to create it
173 | next, err := mp.AddNext()
174 | if err != nil {
175 | return nil, nil, fmt.Errorf("failed to add next meta page: %w", err)
176 | }
177 | metadata := &IndexMeta{}
178 | metadata.FieldName = name
179 | metadata.FieldType = fieldType
180 | metadata.Width = DetermineType(fieldType)
181 | metadata.TotalFieldValueLength = uint64(0)
182 | buf, err := metadata.MarshalBinary()
183 | if err != nil {
184 | return nil, nil, fmt.Errorf("failed to marshal metadata: %w", err)
185 | }
186 | return next, metadata, next.SetMetadata(buf)
187 | }
188 |
189 | // Synchronize will synchronize the index file with the data file.
190 | // This is a convenience method and is equivalent to calling
191 | // Synchronize() on the data handler itself.
192 | func (i *IndexFile) Synchronize(df []byte) error {
193 | return i.dataHandler.Synchronize(i, df)
194 | }
195 |
196 | func (i *IndexFile) SetBenchmarkFile(f io.Writer) {
197 | t0 := time.Now()
198 | i.BenchmarkCallback = func(n int) {
199 | // write timestamp, number of records, and number of pages
200 | dt := time.Since(t0)
201 | fmt.Fprintf(f, "%d,%d,%d\n", dt.Microseconds(), n, i.pf.PageCount())
202 | }
203 | }
204 |
205 | func (i *IndexFile) IsSearch(fieldName string) bool {
206 | for _, sh := range i.searchHeaders {
207 | if fieldName == sh {
208 | return true
209 | }
210 | }
211 |
212 | return false
213 | }
214 |
--------------------------------------------------------------------------------
/pkg/appendable/index_file_test.go:
--------------------------------------------------------------------------------
1 | package appendable
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/kevmo314/appendable/pkg/buftest"
7 | )
8 |
9 | type FormatHandler struct{ ReturnsFormat Format }
10 |
11 | func (f FormatHandler) Format() Format {
12 | return f.ReturnsFormat
13 | }
14 |
15 | func (f FormatHandler) Synchronize(f1 *IndexFile, df []byte) error {
16 | return nil
17 | }
18 |
19 | func (f FormatHandler) Parse(data []byte) []byte {
20 | return nil
21 | }
22 |
23 | func TestIndexFile(t *testing.T) {
24 | t.Run("validate metadata throws error if format doesn't match on second read", func(t *testing.T) {
25 | f := buftest.NewSeekableBuffer()
26 |
27 | var em []string
28 |
29 | if _, err := NewIndexFile(
30 | f,
31 | &FormatHandler{ReturnsFormat: Format(1)},
32 | em,
33 | ); err != nil {
34 | t.Fatal(err)
35 | }
36 |
37 | // try creating a new index file with a different format
38 | if _, err := NewIndexFile(f, &FormatHandler{ReturnsFormat: Format(2)}, em); err == nil {
39 | t.Fatal("expected error")
40 | }
41 | })
42 | }
43 |
44 | func TestWidthAllocation(t *testing.T) {
45 |
46 | type Truth struct {
47 | Type FieldType
48 | Width uint16
49 | }
50 |
51 | t.Run("should correctly allocate the fixed width or else for a given type", func(t *testing.T) {
52 |
53 | ws := [8]Truth{
54 | {FieldTypeArray, 0},
55 | {FieldTypeBoolean, 2},
56 | {FieldTypeNull, 1},
57 | {FieldTypeFloat64, 9},
58 | {FieldTypeInt64, 9},
59 | {FieldTypeObject, 0},
60 | {FieldTypeString, 0},
61 | {FieldTypeUint64, 9},
62 | }
63 |
64 | for _, w := range ws {
65 | expected := w.Width
66 | input := DetermineType(w.Type)
67 |
68 | if expected != input {
69 | t.Errorf("For type: %v, expected: %v, got: %v", w.Type, expected, input)
70 | }
71 | }
72 | })
73 | }
74 |
--------------------------------------------------------------------------------
/pkg/appendable/typescript.go:
--------------------------------------------------------------------------------
1 | package appendable
2 |
3 | // func (f *IndexFile) WriteTypescriptDefinitions(w io.Writer) error {
4 | // _, err := w.Write([]byte(`// This file was generated by github.com/kevmo314/appendable/pkg/appendable/typescript.go`))
5 | // if err != nil {
6 | // return err
7 | // }
8 | // if _, err := w.Write([]byte("\n\nexport type Record = {\n")); err != nil {
9 | // return err
10 | // }
11 | // // iterate over each field in the index header and generate a field for it
12 | // for _, index := range f.Indexes {
13 | // _, err := w.Write([]byte("\t\"" + index.FieldName + "\": " + index.FieldType.TypescriptType() + ";\n"))
14 | // if err != nil {
15 | // return err
16 | // }
17 | // }
18 | // if _, err := w.Write([]byte("}\n")); err != nil {
19 | // return err
20 | // }
21 |
22 | // return nil
23 | // }
24 |
--------------------------------------------------------------------------------
/pkg/bptree/README.md:
--------------------------------------------------------------------------------
1 | # kevmo314/appendable/bptree
2 |
3 | This package implements an on-disk B+ tree, taking some inspiration from
4 | https://github.com/spy16/kiwi/tree/master/index/bptree.
5 |
6 | ## On the significance of the 4kB page size
7 |
8 | The B+ tree is designed to be stored on disk, and as such, it is designed to
9 | take advantage of the 4kB page size of most disks. However, in practice we
10 | don't see a material impact on performance when using alternative sizes. So
11 | why do we choose to use 4kB pages?
12 |
13 | In order to garbage collect old B+ tree nodes, we want pointers to freed
14 | pages so that they can be deallocated wholesale. If we stored nodes
15 | contiguously instead of in fixed-size pages, it would be difficult to
16 | reclaim exactly the bytes that were freed, and we would end up with
17 | fragmentation. With fixed-size pages, we can simply keep a list of freed
18 | pages, and any freed page is guaranteed to hold a new node (see the sketch below).
19 |
20 | Therefore, we must choose a page size that is large enough to store a node.
21 | In practice, the choice of 4kB specifically is arbitrary, but it is a nice way
22 | to align with the page size of most disks.
23 |
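24 | Below is a minimal sketch of the free-page list idea (the names are
25 | illustrative and are not this package's API): an allocator reuses freed
26 | pages before growing the file, and since every node occupies exactly one
27 | fixed-size page, any freed page can hold a new node.
28 |
29 | ```go
30 | package sketch
31 |
32 | const pageSize = 4096
33 |
34 | // allocator hands out page offsets, preferring previously freed pages.
35 | type allocator struct {
36 | 	free []int64 // offsets of freed pages
37 | 	end  int64   // current end of the file
38 | }
39 |
40 | // alloc returns a page offset to write a new node into.
41 | func (a *allocator) alloc() int64 {
42 | 	if n := len(a.free); n > 0 {
43 | 		off := a.free[n-1] // pop a freed page and reuse it
44 | 		a.free = a.free[:n-1]
45 | 		return off
46 | 	}
47 | 	off := a.end // no freed pages: append a fresh page
48 | 	a.end += pageSize
49 | 	return off
50 | }
51 |
52 | // freePage records a garbage-collected node's page as reusable.
53 | func (a *allocator) freePage(off int64) {
54 | 	a.free = append(a.free, off)
55 | }
56 | ```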
--------------------------------------------------------------------------------
/pkg/bptree/node.go:
--------------------------------------------------------------------------------
1 | package bptree
2 |
3 | import (
4 | "encoding/binary"
5 | "fmt"
6 | "github.com/kevmo314/appendable/pkg/encoding"
7 | "github.com/kevmo314/appendable/pkg/pointer"
8 | "io"
9 | )
10 |
11 | type DataParser interface {
12 | Parse([]byte) []byte
13 | }
14 |
15 | type BPTreeNode struct {
16 | Data []byte
17 | DataParser DataParser
18 | // contains the offset of the child node or the offset of the record for leaf
19 | // if the node is a leaf, the last pointer is the offset of the next leaf
20 | LeafPointers []pointer.MemoryPointer
21 | InternalPointers []uint64
22 | Keys []pointer.ReferencedValue
23 |
24 | // the expected width for the BPTree's type
25 | Width uint16
26 | }
27 |
28 | func (n *BPTreeNode) Leaf() bool {
29 | return len(n.LeafPointers) > 0
30 | }
31 |
32 | func (n *BPTreeNode) Pointer(i int) pointer.MemoryPointer {
33 | if n.Leaf() {
34 | return n.LeafPointers[i]
35 | }
36 | return pointer.MemoryPointer{Offset: n.InternalPointers[i]}
37 | }
38 |
39 | func (n *BPTreeNode) NumPointers() int {
40 | return len(n.InternalPointers) + len(n.LeafPointers)
41 | }
42 |
43 | func (n *BPTreeNode) Size() int64 {
44 | size := 4 // number of keys
45 | for _, k := range n.Keys {
46 | o := encoding.SizeVarint(uint64(k.DataPointer.Offset))
47 | l := encoding.SizeVarint(uint64(k.DataPointer.Length))
48 | size += l + o
49 |
50 | if n.Width != uint16(0) {
51 | size += len(k.Value)
52 | }
53 | }
54 | for _, n := range n.LeafPointers {
55 | o := encoding.SizeVarint(uint64(n.Offset))
56 | l := encoding.SizeVarint(uint64(n.Length))
57 | size += o + l
58 | }
59 | for _, n := range n.InternalPointers {
60 | o := len(binary.AppendUvarint([]byte{}, n))
61 | size += o
62 | }
63 | return int64(size)
64 | }
65 |
66 | func (n *BPTreeNode) MarshalBinary() ([]byte, error) {
67 | size := int32(len(n.Keys))
68 |
69 | if size == 0 {
70 | panic("writing empty node")
71 | }
72 | buf := make([]byte, n.Size())
73 | // set the first bit to 1 if it's a leaf
74 | if n.Leaf() {
75 | binary.LittleEndian.PutUint32(buf[:4], uint32(-size))
76 | } else {
77 | binary.LittleEndian.PutUint32(buf[:4], uint32(size))
78 | }
79 | ct := 4
80 | for _, k := range n.Keys {
81 | on := binary.PutUvarint(buf[ct:], k.DataPointer.Offset)
82 | ln := binary.PutUvarint(buf[ct+on:], uint64(k.DataPointer.Length))
83 | ct += on + ln
84 | if n.Width != uint16(0) {
85 | m := copy(buf[ct:ct+len(k.Value)], k.Value)
86 | if m != len(k.Value) {
87 | return nil, fmt.Errorf("failed to copy key: %w", io.ErrShortWrite)
88 | }
89 | ct += m
90 | }
91 | }
92 | for _, p := range n.LeafPointers {
93 | on := binary.PutUvarint(buf[ct:], p.Offset)
94 | ln := binary.PutUvarint(buf[ct+on:], uint64(p.Length))
95 |
96 | ct += on + ln
97 | }
98 | for _, p := range n.InternalPointers {
99 | on := binary.PutUvarint(buf[ct:], p)
100 | ct += on
101 | }
102 | if ct != int(n.Size()) {
103 | panic("size mismatch")
104 | }
105 | return buf, nil
106 | }
107 |
108 | func (n *BPTreeNode) WriteTo(w io.Writer) (int64, error) {
109 | buf, err := n.MarshalBinary()
110 | if err != nil {
111 | return 0, err
112 | }
113 | m, err := w.Write(buf)
114 | return int64(m), err
115 | }
116 |
117 | func (n *BPTreeNode) UnmarshalBinary(buf []byte) error {
118 | size := int32(binary.LittleEndian.Uint32(buf[:4]))
119 | leaf := size < 0
120 | if leaf {
121 | n.LeafPointers = make([]pointer.MemoryPointer, -size)
122 | n.Keys = make([]pointer.ReferencedValue, -size)
123 | } else {
124 | n.InternalPointers = make([]uint64, size+1)
125 | n.Keys = make([]pointer.ReferencedValue, size)
126 | }
127 | if size == 0 {
128 | panic("empty node")
129 | }
130 |
131 | m := 4
132 | for i := range n.Keys {
133 | o, on := binary.Uvarint(buf[m:])
134 | l, ln := binary.Uvarint(buf[m+on:])
135 |
136 | n.Keys[i].DataPointer.Offset = o
137 | n.Keys[i].DataPointer.Length = uint32(l)
138 |
139 | m += on + ln
140 |
141 | if n.Width == uint16(0) {
142 | // read the key out of the memory pointer stored at this position
143 | dp := n.Keys[i].DataPointer
144 | n.Keys[i].Value = n.DataParser.Parse(n.Data[dp.Offset : dp.Offset+uint64(dp.Length)]) // resolve the key's value from the data file
145 | } else {
146 | n.Keys[i].Value = buf[m : m+int(n.Width-1)]
147 | m += int(n.Width - 1)
148 | }
149 | }
150 | for i := range n.LeafPointers {
151 |
152 | o, on := binary.Uvarint(buf[m:])
153 | l, ln := binary.Uvarint(buf[m+on:])
154 |
155 | n.LeafPointers[i].Offset = o
156 | n.LeafPointers[i].Length = uint32(l)
157 | m += on + ln
158 | }
159 | for i := range n.InternalPointers {
160 | o, on := binary.Uvarint(buf[m:])
161 | n.InternalPointers[i] = o
162 | m += on
163 | }
164 | return nil
165 | }
166 |
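As a worked example of the encoding above (a sketch mirroring Size(), not additional package code): a Width=4 leaf with three 3-byte keys and three leaf pointers, like the one in node_test.go below, occupies a 4-byte count header, 5 bytes per key (two 1-byte uvarints plus the 3 value bytes), and 2 bytes per leaf pointer, for 25 bytes in total.

```go
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	size := 4 // count header: int32, stored negated for leaf nodes

	// each key: uvarint offset + uvarint length of its DataPointer,
	// plus the raw value bytes when Width != 0
	for i := 0; i < 3; i++ {
		size += len(binary.AppendUvarint(nil, 0)) // offset 0 -> 1 byte
		size += len(binary.AppendUvarint(nil, 0)) // length 0 -> 1 byte
		size += 3                                 // value bytes (Width - 1)
	}

	// each leaf pointer: uvarint offset + uvarint length
	for _, off := range []uint64{0, 3, 6} {
		size += len(binary.AppendUvarint(nil, off)) // 1 byte each
		size += len(binary.AppendUvarint(nil, 3))   // 1 byte each
	}

	fmt.Println(size) // 25
}
```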
--------------------------------------------------------------------------------
/pkg/bptree/node_test.go:
--------------------------------------------------------------------------------
1 | package bptree
2 |
3 | import (
4 | "bytes"
5 | "github.com/kevmo314/appendable/pkg/pointer"
6 | "reflect"
7 | "testing"
8 | )
9 |
10 | func TestBPTreeNode_ReadWriteLeaf(t *testing.T) {
11 | // Create a test BPTreeNode
12 | node1 := &BPTreeNode{
13 | LeafPointers: []pointer.MemoryPointer{
14 | {Offset: 0, Length: 3},
15 | {Offset: 3, Length: 3},
16 | {Offset: 6, Length: 3},
17 | },
18 | Keys: []pointer.ReferencedValue{
19 | {Value: []byte{0, 1, 2}},
20 | {Value: []byte{1, 2, 3}},
21 | {Value: []byte{3, 4, 5}},
22 | },
23 | Width: uint16(4),
24 | }
25 |
26 | buf := &bytes.Buffer{}
27 | if _, err := node1.WriteTo(buf); err != nil {
28 | t.Fatal(err)
29 | }
30 |
31 | node2 := &BPTreeNode{Width: uint16(4)}
32 | if err := node2.UnmarshalBinary(buf.Bytes()); err != nil {
33 | t.Fatal(err)
34 | }
35 |
36 | if !node2.Leaf() {
37 | t.Fatal("expected leaf node")
38 | }
39 |
40 | if !reflect.DeepEqual(node1, node2) {
41 | t.Fatalf("expected %#v\ngot %#v", node1, node2)
42 | }
43 | }
44 |
45 | func TestBPTreeNode_ReadWriteIntermediate(t *testing.T) {
46 | // Create a test BPTreeNode
47 | node1 := &BPTreeNode{
48 | InternalPointers: []uint64{0, 1, 2, 3},
49 | Keys: []pointer.ReferencedValue{
50 | {Value: []byte{0, 1}},
51 | {Value: []byte{1, 2}},
52 | {Value: []byte{3, 4}},
53 | },
54 | Width: uint16(3),
55 | }
56 |
57 | buf := &bytes.Buffer{}
58 | if _, err := node1.WriteTo(buf); err != nil {
59 | t.Fatal(err)
60 | }
61 |
62 | node2 := &BPTreeNode{Width: uint16(3)}
63 | if err := node2.UnmarshalBinary(buf.Bytes()); err != nil {
64 | t.Fatal(err)
65 | }
66 |
67 | if node2.Leaf() {
68 | t.Fatal("expected intermediate node")
69 | }
70 |
71 | if !reflect.DeepEqual(node1, node2) {
72 | t.Fatalf("expected %#v, got %#v", node1, node2)
73 | }
74 | }
75 |
76 | func TestBPTreeNode_CompareReferencedValues(t *testing.T) {
77 | rv := []pointer.ReferencedValue{
78 | {
79 | Value: []byte{0},
80 | },
81 | {
82 | Value: []byte{1},
83 | DataPointer: pointer.MemoryPointer{Offset: 0},
84 | }, {
85 | Value: []byte{1},
86 | DataPointer: pointer.MemoryPointer{Offset: 1},
87 | }, {
88 | Value: []byte{1},
89 | DataPointer: pointer.MemoryPointer{Offset: 1, Length: 1},
90 | },
91 | }
92 | for i := 0; i < len(rv); i++ {
93 | for j := 0; j < len(rv); j++ {
94 | cmp := pointer.CompareReferencedValues(rv[i], rv[j])
95 | if i < j && cmp >= 0 {
96 | t.Fatalf("expected %d < %d", i, j)
97 | }
98 | if i > j && cmp <= 0 {
99 | t.Fatalf("expected %d > %d", i, j)
100 | }
101 | if i == j && cmp != 0 {
102 | t.Fatalf("expected %d == %d", i, j)
103 | }
104 | }
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/pkg/btree/node.go:
--------------------------------------------------------------------------------
1 | package btree
2 |
3 | import (
4 | "encoding/binary"
5 | "fmt"
6 | "github.com/kevmo314/appendable/pkg/encoding"
7 | "github.com/kevmo314/appendable/pkg/hnsw"
8 | "github.com/kevmo314/appendable/pkg/pointer"
9 | "io"
10 | "math"
11 | )
12 |
13 | type BTreeNode struct {
14 | Ids []pointer.ReferencedId
15 | Vectors []hnsw.Point
16 |
17 | Offsets []uint64
18 | Width uint16
19 | VectorDim uint64
20 | }
21 |
22 | func (n *BTreeNode) Size() int64 {
23 | size := 4
24 |
25 | for _, k := range n.Ids {
26 | size += encoding.SizeVarint(k.DataPointer.Offset)
27 | size += encoding.SizeVarint(uint64(k.DataPointer.Length))
28 | size += encoding.SizeVarint(uint64(k.Value))
29 | }
30 |
31 | for _, n := range n.Offsets {
32 | size += encoding.SizeVarint(n)
33 | }
34 |
35 | if n.VectorDim == 0 {
36 | panic("VectorDim cannot be zero")
37 | }
38 |
39 | size += encoding.SizeVarint(n.VectorDim)
40 | size += len(n.Vectors) * (4 * int(n.VectorDim))
41 |
42 | return int64(size)
43 | }
44 |
45 | func (n *BTreeNode) Leaf() bool {
46 | return len(n.Offsets) == 0
47 | }
48 |
49 | func (n *BTreeNode) MarshalBinary() ([]byte, error) {
50 | size := int32(len(n.Ids))
51 |
52 | if size == 0 {
53 | panic("writing empty node, no ids found!")
54 | }
55 |
56 | buf := make([]byte, n.Size())
57 |
58 | if n.Leaf() {
59 | binary.LittleEndian.PutUint32(buf[:4], uint32(-size))
60 | } else {
61 | binary.LittleEndian.PutUint32(buf[:4], uint32(size))
62 | }
63 |
64 | ct := 4
65 | for _, k := range n.Ids {
66 | on := binary.PutUvarint(buf[ct:], k.DataPointer.Offset)
67 | ln := binary.PutUvarint(buf[ct+on:], uint64(k.DataPointer.Length))
68 | vn := binary.PutUvarint(buf[ct+on+ln:], uint64(k.Value))
69 | ct += on + ln + vn
70 | }
71 |
72 | for _, n := range n.Offsets {
73 | on := binary.PutUvarint(buf[ct:], n)
74 | ct += on
75 | }
76 |
77 | vdn := binary.PutUvarint(buf[ct:], n.VectorDim)
78 | ct += vdn
79 |
80 | for _, v := range n.Vectors {
81 | for _, elem := range v {
82 | binary.LittleEndian.PutUint32(buf[ct:], math.Float32bits(elem))
83 | ct += 4
84 | }
85 | }
86 |
87 | if ct != int(n.Size()) {
88 | panic(fmt.Sprintf("size mismatch. ct: %v, size: %v", ct, n.Size()))
89 | }
90 |
91 | return buf, nil
92 | }
93 |
94 | func (n *BTreeNode) UnmarshalBinary(buf []byte) error {
95 | size := int32(binary.LittleEndian.Uint32(buf[:4]))
96 | leaf := size < 0
97 |
98 | if leaf {
99 | n.Ids = make([]pointer.ReferencedId, -size)
100 | n.Vectors = make([]hnsw.Point, -size)
101 | n.Offsets = make([]uint64, 0)
102 | } else {
103 | n.Ids = make([]pointer.ReferencedId, size)
104 | n.Vectors = make([]hnsw.Point, size)
105 | n.Offsets = make([]uint64, size+1)
106 | }
107 |
108 | if size == 0 {
109 | panic("empty node")
110 | }
111 |
112 | m := 4
113 | for i := range n.Ids {
114 | o, on := binary.Uvarint(buf[m:])
115 | l, ln := binary.Uvarint(buf[m+on:])
116 |
117 | n.Ids[i].DataPointer.Offset = o
118 | n.Ids[i].DataPointer.Length = uint32(l)
119 |
120 | m += on + ln
121 |
122 | v, vn := binary.Uvarint(buf[m:])
123 | n.Ids[i].Value = hnsw.Id(v)
124 |
125 | m += vn
126 | }
127 |
128 | if !leaf {
129 | for i := range n.Offsets {
130 | o, on := binary.Uvarint(buf[m:])
131 | n.Offsets[i] = o
132 | m += on
133 | }
134 | }
135 |
136 | vecdim, vdn := binary.Uvarint(buf[m:])
137 | n.VectorDim = vecdim
138 | m += vdn
139 |
140 | for i := range n.Vectors {
141 | vector := make(hnsw.Point, vecdim)
142 |
143 | for vi := range vector {
144 | vector[vi] = math.Float32frombits(binary.LittleEndian.Uint32(buf[m:])) // MarshalBinary wrote Float32bits, so decode the bit pattern rather than converting the integer
145 | m += 4
146 | }
147 |
148 | n.Vectors[i] = vector
149 | }
150 |
151 | return nil
152 | }
153 |
154 | func (n *BTreeNode) WriteTo(w io.Writer) (int64, error) {
155 | buf, err := n.MarshalBinary()
156 | if err != nil {
157 | return 0, err
158 | }
159 | m, err := w.Write(buf)
160 | return int64(m), err
161 | }
162 |
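The vector node's byte budget works the same way; as a sketch (not package code), here is the arithmetic behind the 38-byte expectation in TestBTreeNode_Size below: a 4-byte count header, three ids at three 1-byte uvarints each, one uvarint byte for the dimension, and 4 bytes per float32 element.

```go
package main

import "fmt"

func main() {
	header := 4            // int32 count, negated for leaves
	ids := 3 * (1 + 1 + 1) // per id: uvarint offset, length, and value, 1 byte each here
	dim := 1               // uvarint(2)
	vectors := 3 * 2 * 4   // 3 points x 2 dims x 4 bytes per float32

	fmt.Println(header + ids + dim + vectors) // 38
}
```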
--------------------------------------------------------------------------------
/pkg/btree/node_test.go:
--------------------------------------------------------------------------------
1 | package btree
2 |
3 | import (
4 | "bytes"
5 | "github.com/kevmo314/appendable/pkg/hnsw"
6 | "github.com/kevmo314/appendable/pkg/pointer"
7 | "reflect"
8 | "testing"
9 | )
10 |
11 | func TestBTreeNode_Size(t *testing.T) {
12 | t.Run("node size", func(t *testing.T) {
13 | n := &BTreeNode{ // 4 bytes: size header
14 | Ids: []pointer.ReferencedId{{Value: 1}, {Value: 2}, {Value: 3}}, // 3 ids * 3 varint bytes = 9
15 | Vectors: []hnsw.Point{{1, 1}, {2, 2}, {3, 3}}, // 3 vectors * 2 dims * 4 bytes = 24
16 | Offsets: make([]uint64, 0),
17 | VectorDim: 2, // 1 varint byte
18 | }
19 |
20 | if n.Size() != 38 {
21 | t.Fatalf("wrong size: %d", n.Size())
22 | }
23 | })
24 | }
25 |
26 | func TestBTreeNode_MarshalBinary(t *testing.T) {
27 | t.Run("leaf node", func(t *testing.T) {
28 | n := &BTreeNode{
29 | Ids: []pointer.ReferencedId{
30 | {Value: 1},
31 | {Value: 2},
32 | {Value: 3},
33 | },
34 | Vectors: []hnsw.Point{{0, 0}, {0, 0}, {0, 0}},
35 | Offsets: make([]uint64, 0),
36 | VectorDim: 2,
37 | }
38 |
39 | buf := &bytes.Buffer{}
40 | if _, err := n.WriteTo(buf); err != nil {
41 | t.Fatal(err)
42 | }
43 |
44 | m := &BTreeNode{}
45 | if err := m.UnmarshalBinary(buf.Bytes()); err != nil {
46 | t.Fatal(err)
47 | }
48 |
49 | if !m.Leaf() {
50 | t.Fatalf("expected leaf node, but got %v offsets", len(m.Offsets))
51 | }
52 |
53 | if !reflect.DeepEqual(n, m) {
54 | t.Fatalf("encoded\n%#v\ndecoded\n%#v", n, m)
55 | }
56 | })
57 |
58 | t.Run("intermediate node", func(t *testing.T) {
59 | n := &BTreeNode{
60 | Ids: []pointer.ReferencedId{
61 | {Value: 1},
62 | {Value: 2},
63 | {Value: 3},
64 | },
65 | Vectors: []hnsw.Point{{0, 0}, {0, 0}, {0, 0}},
66 | Offsets: []uint64{0, 4096, 8192, 6969},
67 | VectorDim: 2,
68 | }
69 |
70 | buf := &bytes.Buffer{}
71 | if _, err := n.WriteTo(buf); err != nil {
72 | t.Fatal(err)
73 | }
74 |
75 | m := &BTreeNode{}
76 | if err := m.UnmarshalBinary(buf.Bytes()); err != nil {
77 | t.Fatal(err)
78 | }
79 |
80 | if m.Leaf() {
81 | t.Fatal("expected intermediate node")
82 | }
83 |
84 | if !reflect.DeepEqual(n, m) {
85 | t.Fatalf("encoded\n%#v\ndecoded\n%#v", n, m)
86 | }
87 | })
88 | }
89 |
--------------------------------------------------------------------------------
/pkg/buftest/buffer.go:
--------------------------------------------------------------------------------
1 | package buftest
2 |
3 | import (
4 | "io"
5 | "os"
6 | )
7 |
8 | // SeekableBuffer is an in-memory buffer that supports seeking.
9 | // It replicates the behavior of a file on disk without actually writing
10 | // to disk, which is useful for testing.
11 | type SeekableBuffer struct {
12 | buf []byte
13 | pos int
14 | }
15 |
16 | func NewSeekableBuffer() *SeekableBuffer {
17 | return &SeekableBuffer{}
18 | }
19 |
20 | func (b *SeekableBuffer) Bytes() []byte {
21 | return b.buf
22 | }
23 |
24 | func (b *SeekableBuffer) Write(p []byte) (int, error) {
25 | n := copy(b.buf[b.pos:], p)
26 | if n < len(p) {
27 | b.buf = append(b.buf, p[n:]...)
28 | }
29 | b.pos += len(p)
30 | return len(p), nil
31 | }
32 |
33 | func (b *SeekableBuffer) Seek(offset int64, whence int) (int64, error) {
34 | switch whence {
35 | case io.SeekStart:
36 | b.pos = int(offset)
37 | case io.SeekCurrent:
38 | b.pos += int(offset)
39 | case io.SeekEnd:
40 | b.pos = len(b.buf) + int(offset)
41 | }
42 | if b.pos < 0 {
43 | b.pos = 0
44 | }
45 | if b.pos > len(b.buf) {
46 | b.pos = len(b.buf)
47 | }
48 | return int64(b.pos), nil
49 | }
50 |
51 | func (b *SeekableBuffer) Read(p []byte) (int, error) {
52 | if b.pos >= len(b.buf) {
53 | return 0, io.EOF
54 | }
55 | n := copy(p, b.buf[b.pos:])
56 | b.pos += n
57 | return n, nil
58 | }
59 |
60 | func (b *SeekableBuffer) Truncate(size int64) error {
61 | if size < 0 {
62 | return io.ErrShortBuffer
63 | }
64 | if size > int64(len(b.buf)) {
65 | return io.ErrShortWrite
66 | }
67 | b.buf = b.buf[:size]
68 | return nil
69 | }
70 |
71 | func (b *SeekableBuffer) WriteAt(p []byte, off int64) (int, error) {
72 | if off < 0 {
73 | return 0, io.ErrShortBuffer
74 | }
75 | if off > int64(len(b.buf)) {
76 | return 0, io.ErrShortWrite
77 | }
78 | n := copy(b.buf[off:], p)
79 | if n < len(p) {
80 | b.buf = append(b.buf, p[n:]...)
81 | }
82 | return len(p), nil
83 | }
84 |
85 | func (b *SeekableBuffer) ReadAt(p []byte, off int64) (int, error) {
86 | if off < 0 {
87 | return 0, io.ErrShortBuffer
88 | }
89 | if off > int64(len(b.buf)) {
90 | return 0, io.EOF
91 | }
92 | n := copy(p, b.buf[off:])
93 | return n, nil
94 | }
95 |
96 | func (b *SeekableBuffer) WriteToDisk(filename string) error {
97 | return os.WriteFile(filename, b.buf, 0644)
98 | }
99 |
100 | var _ io.ReadWriteSeeker = &SeekableBuffer{}
101 | var _ io.ReaderAt = &SeekableBuffer{}
102 | var _ io.WriterAt = &SeekableBuffer{}
103 |
--------------------------------------------------------------------------------
/pkg/buftest/buffer_test.go:
--------------------------------------------------------------------------------
1 | package buftest
2 |
3 | import (
4 | "io"
5 | "testing"
6 | )
7 |
8 | func TestSeekableBuffer(t *testing.T) {
9 | t.Run("Write", func(t *testing.T) {
10 | b := NewSeekableBuffer()
11 | n, err := b.Write([]byte("hello"))
12 | if err != nil {
13 | t.Fatal(err)
14 | }
15 | if n != 5 {
16 | t.Fatalf("expected to write 5 bytes, wrote %d", n)
17 | }
18 | if string(b.buf) != "hello" {
19 | t.Fatalf("expected to write 'hello', wrote %s", string(b.buf))
20 | }
21 | })
22 |
23 | t.Run("write to end", func(t *testing.T) {
24 | b := NewSeekableBuffer()
25 | if _, err := b.Write([]byte("hello")); err != nil {
26 | t.Fatal(err)
27 | }
28 | if _, err := b.Seek(-2, io.SeekEnd); err != nil {
29 | t.Fatal(err)
30 | }
31 | if _, err := b.Write([]byte("world")); err != nil {
32 | t.Fatal(err)
33 | }
34 | if string(b.buf) != "helworld" {
35 | t.Fatalf("expected to write 'helworld', wrote %s", string(b.buf))
36 | }
37 | })
38 |
39 | t.Run("Seek", func(t *testing.T) {
40 | b := NewSeekableBuffer()
41 | if _, err := b.Write([]byte("helloo")); err != nil {
42 | t.Fatal(err)
43 | }
44 | if _, err := b.Seek(0, io.SeekStart); err != nil {
45 | t.Fatal(err)
46 | }
47 | if _, err := b.Write([]byte("world")); err != nil {
48 | t.Fatal(err)
49 | }
50 | if string(b.buf) != "worldo" {
51 | t.Fatalf("expected to write 'worldo', wrote %s", string(b.buf))
52 | }
53 | })
54 |
55 | t.Run("Read", func(t *testing.T) {
56 | b := NewSeekableBuffer()
57 | if _, err := b.Write([]byte("hello")); err != nil {
58 | t.Fatal(err)
59 | }
60 | if _, err := b.Seek(0, io.SeekStart); err != nil {
61 | t.Fatal(err)
62 | }
63 | buf := make([]byte, 5)
64 | n, err := b.Read(buf)
65 | if err != nil {
66 | t.Fatal(err)
67 | }
68 | if n != 5 {
69 | t.Fatalf("expected to read 5 bytes, read %d", n)
70 | }
71 | if string(buf) != "hello" {
72 | t.Fatalf("expected to read 'hello', read %s", string(buf))
73 | }
74 | })
75 |
76 | t.Run("read from middle", func(t *testing.T) {
77 | b := NewSeekableBuffer()
78 | if _, err := b.Write([]byte("hello")); err != nil {
79 | t.Fatal(err)
80 | }
81 | if _, err := b.Seek(2, io.SeekStart); err != nil {
82 | t.Fatal(err)
83 | }
84 | buf := make([]byte, 3)
85 | n, err := b.Read(buf)
86 | if err != nil {
87 | t.Fatal(err)
88 | }
89 | if n != 3 {
90 | t.Fatalf("expected to read 3 bytes, read %d", n)
91 | }
92 | if string(buf) != "llo" {
93 | t.Fatalf("expected to read 'llo', read %s", string(buf))
94 | }
95 | })
96 |
97 | t.Run("truncate", func(t *testing.T) {
98 | b := NewSeekableBuffer()
99 | if _, err := b.Write([]byte("hello")); err != nil {
100 | t.Fatal(err)
101 | }
102 | if err := b.Truncate(3); err != nil {
103 | t.Fatal(err)
104 | }
105 | if string(b.buf) != "hel" {
106 | t.Fatalf("expected to truncate to 'hel', truncated to %s", string(b.buf))
107 | }
108 | })
109 | }
110 |
--------------------------------------------------------------------------------
/pkg/encoding/sizeVarint.go:
--------------------------------------------------------------------------------
1 | package encoding
2 |
3 | import "math/bits"
4 |
5 | func SizeVarint(v uint64) int {
6 | return int(9*uint32(bits.Len64(v))+64) / 64
7 | }
8 |
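The expression above is a branchless size computation: a uvarint carries 7 payload bits per byte, so an n-bit value needs ceil(n/7) bytes (minimum 1), and int(9n+64)/64 agrees with that for every n in [0, 64] because 9/64 is just below 1/7. A quick cross-check against the obvious loop (a sketch; the package's own check is sizeVarint_test.go):

```go
package main

import (
	"fmt"
	"math/bits"
)

// sizeVarint is the branchless formula from the package above.
func sizeVarint(v uint64) int {
	return int(9*uint32(bits.Len64(v))+64) / 64
}

// naiveSize counts encoded bytes the obvious way: 7 payload bits per byte.
func naiveSize(v uint64) int {
	n := 1
	for v >= 0x80 {
		v >>= 7
		n++
	}
	return n
}

func main() {
	for _, v := range []uint64{0, 1, 127, 128, 16383, 16384, 1 << 62, ^uint64(0)} {
		fmt.Println(v, sizeVarint(v), naiveSize(v)) // the two sizes agree
	}
}
```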
--------------------------------------------------------------------------------
/pkg/encoding/sizeVarint_test.go:
--------------------------------------------------------------------------------
1 | package encoding
2 |
3 | import (
4 | "encoding/binary"
5 | "math/rand"
6 | "testing"
7 | "time"
8 | )
9 |
10 | func TestSizeVarint(t *testing.T) {
11 | rand.Seed(time.Now().UnixNano())
12 |
13 | const iterations = 1000
14 |
15 | for i := 0; i < iterations; i++ {
16 | randomNumber := rand.Uint64()
17 |
18 | x := len(binary.AppendUvarint([]byte{}, randomNumber))
19 | y := SizeVarint(randomNumber)
20 |
21 | if x != y {
22 | t.Fatalf("Mismatch for %d: binary.AppendUvarint size = %d, SizeVarint size = %d", randomNumber, x, y)
23 | }
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/pkg/handlers/csv.go:
--------------------------------------------------------------------------------
1 | package handlers
2 |
3 | import (
4 | "bytes"
5 | "encoding/binary"
6 | "encoding/csv"
7 | "fmt"
8 | "github.com/kevmo314/appendable/pkg/pointer"
9 | "io"
10 | "log/slog"
11 | "math"
12 | "strconv"
13 | "strings"
14 |
15 | "github.com/kevmo314/appendable/pkg/appendable"
16 | "github.com/kevmo314/appendable/pkg/bptree"
17 | )
18 |
19 | type CSVHandler struct {
20 | io.ReadSeeker
21 | }
22 |
23 | var _ appendable.DataHandler = (*CSVHandler)(nil)
24 |
25 | func (c CSVHandler) Format() appendable.Format {
26 | return appendable.FormatCSV
27 | }
28 |
29 | func (c CSVHandler) Synchronize(f *appendable.IndexFile, df []byte) error {
30 | slog.Debug("Starting CSV synchronization")
31 |
32 | var headers []string
33 | var err error
34 |
35 | metadata, err := f.Metadata()
36 | if err != nil {
37 | return fmt.Errorf("failed to read metadata: %w", err)
38 | }
39 |
40 | fieldNames, err := f.IndexFieldNames()
41 | if err != nil {
42 | return fmt.Errorf("failed to retrieve index field names: %w", err)
43 | }
44 | headers = fieldNames
45 |
46 | for {
47 | i := bytes.IndexByte(df[metadata.ReadOffset:], '\n')
48 | if i == -1 {
49 | break
50 | }
51 |
52 | if len(headers) == 0 {
53 | slog.Info("Parsing CSV headers")
54 | dec := csv.NewReader(bytes.NewReader(df[metadata.ReadOffset : metadata.ReadOffset+uint64(i)]))
55 | headers, err = dec.Read()
56 | if err != nil {
57 | slog.Error("failed to parse CSV header", "error", err)
58 | return fmt.Errorf("failed to parse CSV header: %w", err)
59 | }
60 | metadata.ReadOffset += uint64(i) + 1
61 | continue
62 | }
63 |
64 | dec := csv.NewReader(bytes.NewReader(df[metadata.ReadOffset : metadata.ReadOffset+uint64(i)]))
65 |
66 | if err := c.handleCSVLine(f, df, dec, headers, []string{}, pointer.MemoryPointer{
67 | Offset: metadata.ReadOffset,
68 | Length: uint32(i),
69 | }); err != nil {
70 | return fmt.Errorf("failed to handle object: %w", err)
71 | }
72 |
73 | metadata.ReadOffset += uint64(i) + 1 // include the newline
74 | }
75 |
76 | // update the metadata
77 | if err := f.SetMetadata(metadata); err != nil {
78 | return fmt.Errorf("failed to set metadata: %w", err)
79 | }
80 |
81 | slog.Debug("indexes", slog.Any("", f.Indexes))
82 | slog.Debug("Ending CSV synchronization")
83 | slog.Debug("=========")
84 | return nil
85 | }
86 |
87 | func fieldRankCsvField(fieldValue any) int {
88 | slog.Debug("serialize", slog.Any("fieldValue", fieldValue))
89 | switch fieldValue.(type) {
90 | case nil:
91 | slog.Debug("nil", slog.Any("fieldValue", fieldValue))
92 | return 1
93 | case bool:
94 | slog.Debug("bool", slog.Any("fieldValue", fieldValue))
95 | return 2
96 | case int, int8, int16, int32, int64, float32, float64:
97 | slog.Debug("number", slog.Any("fieldValue", fieldValue))
98 | return 3
99 | case string:
100 | slog.Debug("string", slog.Any("fieldValue", fieldValue))
101 | return 4
102 | default:
103 | panic("unknown type")
104 | }
105 | }
106 |
107 | func InferCSVField(fieldValue string) (interface{}, appendable.FieldType) {
108 | if fieldValue == "" {
109 | return nil, appendable.FieldTypeNull
110 | }
111 |
112 | if i, err := strconv.Atoi(fieldValue); err == nil {
113 | // integers are normalized to float64 so that ints and floats share one index
114 | return float64(i), appendable.FieldTypeFloat64
115 | }
116 |
117 | if f, err := strconv.ParseFloat(fieldValue, 64); err == nil {
118 | // ParseFloat already returns a float64, so no conversion is needed
119 | return f, appendable.FieldTypeFloat64
120 | }
121 |
122 | if b, err := strconv.ParseBool(fieldValue); err == nil {
123 | return b, appendable.FieldTypeBoolean
124 | }
125 |
126 | return fieldValue, appendable.FieldTypeString
127 | }
128 |
129 | func (c CSVHandler) Parse(value []byte) []byte {
130 | parsed, fieldType := InferCSVField(string(value))
131 |
132 | switch fieldType {
133 | case appendable.FieldTypeFloat64:
134 | buf := make([]byte, 8)
135 | binary.BigEndian.PutUint64(buf, math.Float64bits(parsed.(float64)))
136 | return buf
137 | case appendable.FieldTypeBoolean:
138 | if parsed.(bool) {
139 | return []byte{1}
140 | } else {
141 | return []byte{0}
142 | }
143 | case appendable.FieldTypeString:
144 | return []byte(parsed.(string))
145 | case appendable.FieldTypeNull:
146 | // nil values are a bit of a degenerate case, we are essentially using the bptree
147 | // as a set. we store the value as an empty byte slice.
148 | return []byte{}
149 | }
150 | panic("unknown type")
151 | }
152 |
153 | func (c CSVHandler) handleCSVLine(f *appendable.IndexFile, df []byte, dec *csv.Reader, headers []string, path []string, data pointer.MemoryPointer) error {
154 | record, err := dec.Read()
155 | if err != nil {
156 | slog.Error("Failed to read CSV record at index", "error", err)
157 | return fmt.Errorf("failed to read CSV record: %w", err)
158 | }
159 |
160 | cumulativeLength := uint64(0)
161 |
162 | for fieldIndex, fieldValue := range record {
163 | if fieldIndex >= len(headers) {
164 | slog.Error("Field index is out of bounds with headers", "fieldIndex", fieldIndex, "headers", slog.Any("headers", headers))
165 | return fmt.Errorf("field index %d is out of bounds with header", fieldIndex)
166 | }
167 |
168 | fieldName := headers[fieldIndex]
169 |
170 | name := strings.Join(append(path, fieldName), ".")
171 |
172 | fieldOffset := data.Offset + cumulativeLength
173 | fieldLength := uint32(len(fieldValue))
174 |
175 | _, fieldType := InferCSVField(fieldValue)
176 | page, _, err := f.FindOrCreateIndex(name, fieldType)
177 |
178 | if err != nil {
179 | return fmt.Errorf("failed to find or create index: %w", err)
180 | }
181 |
182 | mp := pointer.MemoryPointer{
183 | Offset: fieldOffset,
184 | Length: fieldLength,
185 | }
186 |
187 | if err := page.BPTree(&bptree.BPTree{Data: df, DataParser: CSVHandler{}, Width: uint16(0)}).Insert(pointer.ReferencedValue{Value: c.Parse([]byte(fieldValue)), DataPointer: mp}, data); err != nil {
188 | return fmt.Errorf("failed to insert into b+tree: %w", err)
189 | }
190 |
191 | cumulativeLength += uint64(fieldLength + 1)
192 | }
193 |
194 | return nil
195 | }
196 |
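For reference, the inference rules in InferCSVField apply in order: an empty string is null, anything Atoi or ParseFloat accepts becomes float64 (so integers are widened, and numeric-looking booleans like "1" become numbers before ParseBool ever runs), ParseBool literals become bool, and everything else stays a string. A small illustrative driver (hypothetical, not part of the package):

```go
package main

import (
	"fmt"

	"github.com/kevmo314/appendable/pkg/handlers"
)

func main() {
	for _, raw := range []string{"", "42", "3.14", "true", "hello"} {
		v, t := handlers.InferCSVField(raw)
		// "" -> <nil> (null), "42" -> 42 (float64, integers are widened),
		// "3.14" -> 3.14 (float64), "true" -> true (boolean),
		// "hello" -> hello (string)
		fmt.Printf("%q => value %v, field type %v\n", raw, v, t)
	}
}
```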
--------------------------------------------------------------------------------
/pkg/hnsw/friends.go:
--------------------------------------------------------------------------------
1 | package hnsw
2 |
3 | import (
4 | "encoding/binary"
5 | "errors"
6 | "fmt"
7 | "math"
8 | )
9 |
10 | type Point []float32
11 |
12 | type Friends struct {
13 | friends []*DistHeap
14 | maxLevels map[Id]int
15 | }
16 |
17 | // NewFriends creates a new friends list; note that topLevel is inclusive.
18 | func NewFriends(topLevel int) *Friends {
19 | friends := make([]*DistHeap, topLevel+1)
20 |
21 | for i := 0; i <= topLevel; i++ {
22 | friends[i] = NewDistHeap()
23 | }
24 |
25 | return &Friends{
26 | friends: friends,
27 | maxLevels: make(map[Id]int),
28 | }
29 | }
30 |
31 | func (v *Friends) NumLevels() int {
32 | return len(v.friends)
33 | }
34 |
35 | func (v *Friends) TopLevel() int {
36 | return len(v.friends) - 1
37 | }
38 |
39 | func (v *Friends) HasLevel(level int) bool {
40 | if level < 0 {
41 | panic("level must be nonzero positive integer")
42 | }
43 |
44 | return level <= v.TopLevel()
45 | }
46 |
47 | // InsertFriendsAtLevel requires that level be zero-indexed and that friendId be valid at this level.
48 | func (v *Friends) InsertFriendsAtLevel(level int, friendId Id, dist float32) {
49 | if !v.HasLevel(level) {
50 | panic("failed to insert friends at level, as level is not valId")
51 | }
52 |
53 | for i := 0; i <= level; i++ {
54 | v.friends[i].Insert(friendId, dist)
55 | }
56 |
57 | v.maxLevels[friendId] = level
58 | }
59 |
60 | func (v *Friends) GetFriendsAtLevel(level int) (*DistHeap, error) {
61 | if !v.HasLevel(level) {
62 | return nil, errors.New("failed to get friends at level")
63 | }
64 |
65 | return v.friends[level], nil
66 | }
67 |
68 | func (v *Friends) Flush(numNeighbors int) ([]byte, error) {
69 | if len(v.friends) == 0 {
70 | panic("no levels to be found")
71 | }
72 |
73 | // for every neighbor we serialize a fixed 5-byte slot:
74 | // +-------+-----------------+
75 | // | level | Id (uint32, BE) |
76 | // +-------+-----------------+
77 | buf := make([]byte, (4+1)*numNeighbors)
78 |
79 | level0 := v.friends[0]
80 | copyLevel0 := level0.Clone()
81 |
82 | for i := 0; i < numNeighbors; i++ {
83 | if copyLevel0.IsEmpty() {
84 | // TODO: write out max sentinel values; for now the zeroed slot is left as padding
85 | continue
86 | }
87 |
88 | closestItem, err := copyLevel0.PopMinItem()
89 | if err != nil {
90 | return []byte{}, fmt.Errorf("failed to find closest item in friends: %v", err)
91 | }
92 |
93 | closestId := closestItem.id
94 | closestIdMaxLevel, ok := v.maxLevels[closestId]
95 |
96 | if !ok {
97 | return []byte{}, fmt.Errorf("failed to find id %v in maxLevels map", closestId)
98 | }
99 |
100 | buf[i*(1+4)] = byte(closestIdMaxLevel)
101 | binary.BigEndian.PutUint32(buf[i*(1+4)+1:], uint32(closestId))
102 | }
103 |
104 | return buf, nil
105 | }
106 |
107 | func EuclidDistance(p0, p1 Point) float32 {
108 | var sum float32
109 |
110 | for i := range p0 {
111 | delta := p0[i] - p1[i]
112 | sum += delta * delta
113 | }
114 |
115 | return float32(math.Sqrt(float64(sum)))
116 | }
117 |
118 | // NearlyEqual is sourced from scalar package written by gonum
119 | // https://pkg.go.dev/gonum.org/v1/gonum/floats/scalar#EqualWithinAbsOrRel
120 | func NearlyEqual(a, b float32) bool {
121 | return EqualWithinAbs(float64(a), float64(b)) || EqualWithinRel(float64(a), float64(b))
122 | }
123 |
124 | // EqualWithinAbs returns true when a and b have an absolute difference
125 | // not greater than tol.
126 | func EqualWithinAbs(a, b float64) bool {
127 | return a == b || math.Abs(a-b) <= 1e-6
128 | }
129 |
130 | // minNormalFloat64 is the smallest normal number. For 64 bit IEEE-754
131 | // floats this is 2^{-1022}.
132 | const minNormalFloat64 = 0x1p-1022
133 |
134 | // EqualWithinRel returns true when the difference between a and b
135 | // is not greater than tol times the greater absolute value of a and b,
136 | //
137 | // abs(a-b) <= tol * max(abs(a), abs(b)).
138 | func EqualWithinRel(a, b float64) bool {
139 | if a == b {
140 | return true
141 | }
142 | delta := math.Abs(a - b)
143 | if delta <= minNormalFloat64 {
144 | return delta <= 1e-6*minNormalFloat64
145 | }
146 | // We depend on the division in this relationship to Identify
147 | // infinities (we rely on the NaN to fail the test) otherwise
148 | // we compare Infs of the same sign and evaluate Infs as equal
149 | // independent of sign.
150 | return delta/math.Max(math.Abs(a), math.Abs(b)) <= 1e-6
151 | }
152 |
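Flush above packs the numNeighbors closest level-0 friends into fixed 5-byte slots: one byte for the friend's top level followed by its Id as a big-endian uint32. A sketch of the matching reader (illustrative; decodeFriends is not part of the package):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// decodeFriends reads the fixed 5-byte slots produced by Friends.Flush:
// byte 0 is the neighbor's top level, bytes 1..4 are its id (big-endian).
func decodeFriends(buf []byte) (levels []int, ids []uint32) {
	for i := 0; i+5 <= len(buf); i += 5 {
		levels = append(levels, int(buf[i]))
		ids = append(ids, binary.BigEndian.Uint32(buf[i+1:i+5]))
	}
	return levels, ids
}

func main() {
	// the single-friend example from friends_test.go: level 2, id 1
	levels, ids := decodeFriends([]byte{2, 0, 0, 0, 1})
	fmt.Println(levels, ids) // [2] [1]
}
```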
--------------------------------------------------------------------------------
/pkg/hnsw/friends_test.go:
--------------------------------------------------------------------------------
1 | package hnsw
2 |
3 | import (
4 | "math"
5 | "reflect"
6 | "testing"
7 | )
8 |
9 | func TestVector_LevelManagement(t *testing.T) {
10 |
11 | /*
12 | hex has 7 levels, [0..6]
13 | oct has 9 levels, [0..8]
14 | */
15 | t.Run("check levels for oct and hex vectors", func(t *testing.T) {
16 | hexId := Id(1)
17 | hex := []float32{9, 2.0, 30}
18 |
19 | hexFriends := NewFriends(6)
20 |
21 | if hexFriends.TopLevel() != 6 {
22 | t.Fatalf("since 0-indexed, the max level is 5, got: %v", hexFriends.TopLevel())
23 | }
24 |
25 | if hexFriends.NumLevels() != 7 {
26 | t.Fatalf("since 0-indexed, the number of levels is 6, got: %v", hexFriends.NumLevels())
27 | }
28 |
29 | octId := Id(2)
30 | oct := []float32{0, 2, 3}
31 | octFriends := NewFriends(8)
32 |
33 | if octFriends.TopLevel() != 8 {
34 | t.Fatalf("since 0-indexed, the max level is 7, got: %v", octFriends.TopLevel())
35 | }
36 |
37 | if octFriends.NumLevels() != 9 {
38 | t.Fatalf("since 0-indexed, the number of levels is 8, got: %v", octFriends.NumLevels())
39 | }
40 |
41 | for i := 0; i <= 6; i++ {
42 | if !hexFriends.HasLevel(i) {
43 | t.Fatalf("since 0-indexed, the level #%v is missing", i)
44 | }
45 | }
46 |
47 | for i := 7; i <= 8; i++ {
48 | if hexFriends.HasLevel(i) {
49 | t.Fatalf("since 0-indexed, expected the level #%v to be missing", i)
50 | }
51 | }
52 |
53 | hexOctDist := EuclidDistance(oct, hex)
54 |
55 | hexFriends.InsertFriendsAtLevel(5, octId, hexOctDist)
56 | octFriends.InsertFriendsAtLevel(5, hexId, hexOctDist)
57 |
58 | for i := 0; i <= 5; i++ {
59 | hexFriends, err := hexFriends.GetFriendsAtLevel(i)
60 | if err != nil {
61 | t.Fatal(err)
62 | }
63 |
64 | octFriends, err := octFriends.GetFriendsAtLevel(i)
65 | if err != nil {
66 | t.Fatal(err)
67 | }
68 |
69 | if hexFriends.Len() != 1 || octFriends.Len() != 1 {
70 | t.Fatalf("expected hex and oct friends list at level %v to be 1, got: %v || %v", i, hexFriends.Len(), octFriends.Len())
71 | }
72 |
73 | top, err := hexFriends.PeekMinItem()
74 | if err != nil {
75 | t.Fatal(err)
76 | }
77 | if top.id != octId {
78 | t.Fatalf("expected %v, got %v", octId, top.id)
79 | }
80 |
81 | top, err = octFriends.PeekMinItem()
82 | if err != nil {
83 | t.Fatal(err)
84 | }
85 | if top.id != hexId {
86 | t.Fatalf("expected %v, got %v", hexId, top.id)
87 | }
88 | }
89 | })
90 |
91 | }
92 |
93 | func TestVector_EuclidDistance(t *testing.T) {
94 |
95 | type vectorPair struct {
96 | v0, v1 Point
97 | expected float32
98 | }
99 |
100 | basic := []vectorPair{
101 | {
102 | v0: Point{5, 3, 0},
103 | v1: Point{2, -2, float32(math.Sqrt(2))},
104 | expected: 6,
105 | },
106 | {
107 | v0: Point{1, 0, -5},
108 | v1: Point{-3, 2, -1},
109 | expected: 6,
110 | },
111 | {
112 | v0: Point{1, 3},
113 | v1: Point{5, 2},
114 | expected: float32(math.Sqrt(17)),
115 | },
116 | {
117 | v0: Point{0, 1, 4},
118 | v1: Point{2, 9, 1},
119 | expected: float32(math.Sqrt(77)),
120 | },
121 | {
122 | v0: Point{0},
123 | v1: Point{0},
124 | expected: 0,
125 | },
126 | {
127 | v0: Point{10, 20, 30, 40},
128 | v1: Point{10, 20, 30, 40},
129 | expected: 0,
130 | },
131 | }
132 |
133 | t.Run("correctly computes the distance of two vectors", func(t *testing.T) {
134 | for i, pair := range basic {
135 | dist := EuclidDistance(pair.v1, pair.v0)
136 |
137 | if !NearlyEqual(dist, pair.expected) {
138 | t.Fatalf("iter i: %v, expected %v and %v to be equal", i, dist, pair.expected)
139 | }
140 | }
141 | })
142 | }
143 |
144 | func TestFriends_Flush(t *testing.T) {
145 | t.Run("flush single friend", func(t *testing.T) {
146 | f := NewFriends(3)
147 |
148 | f.InsertFriendsAtLevel(2, 1, 4)
149 |
150 | buf, err := f.Flush(1)
151 | if err != nil {
152 | t.Fatal(err)
153 | }
154 | if !reflect.DeepEqual(buf, []byte{2, 0, 0, 0, 1}) {
155 | t.Fatalf("expected %v, got %v", []byte{2, 0, 0, 0, 1}, buf)
156 | }
157 | })
158 |
159 | t.Run("flushes 8 friends exactly", func(t *testing.T) {
160 | f := NewFriends(4)
161 | f.InsertFriendsAtLevel(2, 1, 1)
162 | f.InsertFriendsAtLevel(3, 2, 2)
163 | f.InsertFriendsAtLevel(1, 3, 3)
164 | f.InsertFriendsAtLevel(0, 4, 4)
165 | f.InsertFriendsAtLevel(4, 5, 5)
166 | f.InsertFriendsAtLevel(2, 6, 6)
167 | f.InsertFriendsAtLevel(0, 7, 7)
168 | f.InsertFriendsAtLevel(2, 8, 8)
169 |
170 | buf, err := f.Flush(8)
171 | if err != nil {
172 | t.Fatal(err)
173 | }
174 |
175 | if !reflect.DeepEqual(buf, []byte{2, 0, 0, 0, 1, 3, 0, 0, 0, 2, 1, 0, 0, 0, 3, 0, 0, 0, 0, 4, 4, 0, 0, 0, 5, 2, 0, 0, 0, 6, 0, 0, 0, 0, 7, 2, 0, 0, 0, 8}) {
176 | t.Fatalf("expected %v, got %v", []byte{2, 0, 0, 0, 1, 3, 0, 0, 0, 2, 1, 0, 0, 0, 3, 0, 0, 0, 0, 4, 4, 0, 0, 0, 5, 2, 0, 0, 0, 6, 0, 0, 0, 0, 7, 2, 0, 0, 0, 8}, buf)
177 | }
178 | })
179 | }
180 |
--------------------------------------------------------------------------------
/pkg/hnsw/heap.go:
--------------------------------------------------------------------------------
1 | package hnsw
2 |
3 | import (
4 | "fmt"
5 | "maps"
6 | "math/bits"
7 | )
8 |
9 | type Item struct {
10 | id Id
11 | dist float32
12 | }
13 |
14 | var EmptyHeapError = fmt.Errorf("Empty Heap")
15 |
16 | type DistHeap struct {
17 | items []*Item
18 | visited map[Id]int
19 | }
20 |
21 | func level(i int) int {
22 | // floor(log2(i + 1))
23 | return bits.Len(uint(i)+1) - 1
24 | }
25 |
26 | func isMinLevel(i int) bool {
27 | return level(i)%2 == 0
28 | }
29 |
30 | func lchild(i int) int {
31 | return i*2 + 1
32 | }
33 |
34 | func rchild(i int) int {
35 | return i*2 + 2
36 | }
37 |
38 | func parent(i int) int {
39 | return (i - 1) / 2
40 | }
41 |
42 | func hasParent(i int) bool {
43 | return i > 0
44 | }
45 |
46 | func hasGrandparent(i int) bool {
47 | return i > 2
48 | }
49 |
50 | func grandparent(i int) int {
51 | return parent(parent(i))
52 | }
53 |
54 | func (d *DistHeap) down(i, n int) bool {
55 | min := isMinLevel(i)
56 | i0 := i
57 | for {
58 | m := i
59 |
60 | l := lchild(i)
61 | if l >= n || l < 0 /* overflow */ {
62 | break
63 | }
64 | if d.Less(l, m) == min {
65 | m = l
66 | }
67 |
68 | r := rchild(i)
69 | if r < n && d.Less(r, m) == min {
70 | m = r
71 | }
72 |
73 | // grandchildren are contiguous i*4+3+{0,1,2,3}
74 | for g := lchild(l); g < n && g <= rchild(r); g++ {
75 | if d.Less(g, m) == min {
76 | m = g
77 | }
78 | }
79 |
80 | if m == i {
81 | break
82 | }
83 |
84 | d.Swap(i, m)
85 |
86 | if m == l || m == r {
87 | break
88 | }
89 |
90 | // m is grandchild
91 | p := parent(m)
92 | if d.Less(p, m) == min {
93 | d.Swap(m, p)
94 | }
95 | i = m
96 | }
97 | return i > i0
98 | }
99 |
100 | func (d *DistHeap) up(i int) {
101 | min := isMinLevel(i)
102 |
103 | if hasParent(i) {
104 | p := parent(i)
105 | if d.Less(p, i) == min {
106 | d.Swap(i, p)
107 | min = !min
108 | i = p
109 | }
110 | }
111 |
112 | for hasGrandparent(i) {
113 | g := grandparent(i)
114 | if d.Less(i, g) != min {
115 | return
116 | }
117 |
118 | d.Swap(i, g)
119 | i = g
120 | }
121 | }
122 |
123 | func NewDistHeap() *DistHeap {
124 | d := &DistHeap{
125 | items: make([]*Item, 0),
126 | visited: make(map[Id]int),
127 | }
128 | return d
129 | }
130 |
131 | func (d *DistHeap) Clone() *DistHeap {
132 | n := &DistHeap{
133 | items: make([]*Item, len(d.items)),
134 | visited: make(map[Id]int, len(d.visited)),
135 | }
136 |
137 | copy(n.items, d.items)
138 | maps.Copy(n.visited, d.visited)
139 |
140 | return n
141 | }
142 |
143 | func (d *DistHeap) PeekMinItem() (*Item, error) {
144 | if d.IsEmpty() {
145 | return nil, EmptyHeapError
146 | }
147 |
148 | return d.items[0], nil
149 | }
150 | func (d *DistHeap) PeekMaxItem() (*Item, error) {
151 | if d.Len() == 0 {
152 | return nil, EmptyHeapError
153 | }
154 |
155 | // Find the maximum element without removing it
156 | n := d.Len()
157 |
158 | i := 0
159 | l := lchild(0)
160 | if l < n && !d.Less(l, i) {
161 | i = l
162 | }
163 |
164 | r := rchild(0)
165 | if r < n && !d.Less(r, i) {
166 | i = r
167 | }
168 |
169 | return d.items[i], nil
170 | }
171 | func (d *DistHeap) PopMinItem() (*Item, error) {
172 | if d.IsEmpty() {
173 | return nil, EmptyHeapError
174 | }
175 |
176 | n := d.Len() - 1
177 | d.Swap(0, n)
178 | d.down(0, n)
179 | return d.Pop(), nil
180 | }
181 | func (d *DistHeap) PopMaxItem() (*Item, error) {
182 | if d.IsEmpty() {
183 | return nil, EmptyHeapError
184 | }
185 |
186 | n := d.Len()
187 | i := 0
188 | l := lchild(0)
189 |
190 | if l < n && !d.Less(l, i) {
191 | i = l
192 | }
193 |
194 | r := rchild(0)
195 | if r < n && !d.Less(r, i) {
196 | i = r
197 | }
198 |
199 | d.Swap(i, n-1)
200 | d.down(i, n-1)
201 |
202 | return d.Pop(), nil
203 | }
204 | func (d *DistHeap) Insert(id Id, dist float32) {
205 | index, ok := d.visited[id]
206 |
207 | if !ok {
208 | d.Push(&Item{id: id, dist: dist})
209 | d.visited[id] = d.Len() - 1
210 | d.up(d.Len() - 1)
211 | return
212 | }
213 |
214 | d.items[index].dist = dist
215 | d.Fix(index)
216 | }
217 |
218 | func (d *DistHeap) Fix(i int) {
219 | if !d.down(i, d.Len()) {
220 | d.up(i)
221 | }
222 | }
223 |
224 | func (d DistHeap) IsEmpty() bool { return len(d.items) == 0 }
225 | func (d DistHeap) Len() int { return len(d.items) }
226 | func (d DistHeap) Less(i, j int) bool { return d.items[i].dist < d.items[j].dist }
227 | func (d DistHeap) Swap(i, j int) {
228 | d.visited[d.items[i].id], d.visited[d.items[j].id] = j, i
229 | d.items[i], d.items[j] = d.items[j], d.items[i]
230 | }
231 | func (d *DistHeap) Push(x *Item) {
232 | (*d).items = append((*d).items, x)
233 | }
234 | func (d *DistHeap) Pop() *Item {
235 | old := (*d).items
236 | n := len(old)
237 | x := old[n-1]
238 | (*d).items = old[0 : n-1]
239 | delete(d.visited, x.id)
240 | return x
241 | }
242 |
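DistHeap is a min-max heap: even depths hold minima and odd depths hold maxima, so the closest and farthest items are both peekable in O(1) and poppable in O(log n), and the visited map lets Insert update an existing id's distance in place. A usage sketch against the exported API:

```go
package main

import (
	"fmt"

	"github.com/kevmo314/appendable/pkg/hnsw"
)

func main() {
	h := hnsw.NewDistHeap()
	h.Insert(hnsw.Id(1), 0.5)
	h.Insert(hnsw.Id(2), 2.0)
	h.Insert(hnsw.Id(3), 1.0)

	// Inserting an existing id updates its distance in place; the visited
	// map records where each id currently sits in the backing array.
	h.Insert(hnsw.Id(2), 0.1)

	min, _ := h.PopMinItem() // id 2, distance 0.1: now the closest
	max, _ := h.PopMaxItem() // id 3, distance 1.0: the farthest remaining
	fmt.Println(min, max, h.Len())
}
```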
--------------------------------------------------------------------------------
/pkg/hnsw/heap_test.go:
--------------------------------------------------------------------------------
1 | package hnsw
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 | )
7 |
8 | func TestHeap(t *testing.T) {
9 |
10 | t.Run("basic min max properties", func(t *testing.T) {
11 | h := NewDistHeap()
12 |
13 | for i := 10; i > 0; i-- {
14 | h.Insert(Id(i), float32(10-i))
15 | }
16 |
17 | if h.Len() != 10 {
18 | t.Fatalf("heap length should be 10, got %v", h.Len())
19 | }
20 |
21 | expectedId := Id(10)
22 | for !h.IsEmpty() {
23 | peekMinItem, err := h.PeekMinItem()
24 | if err != nil {
25 | t.Fatalf("failed to peek min item: %v", err)
26 | }
27 |
28 | minItem, err := h.PopMinItem()
29 | if err != nil {
30 | t.Fatalf("failed to pop min item, err: %v", err)
31 | }
32 |
33 | if peekMinItem.id != minItem.id {
34 | t.Fatalf("mismatched item id, expected %v, got %v", expectedId, peekMinItem.id)
35 | }
36 |
37 | if minItem.id != expectedId {
38 | t.Fatalf("mismatched ids, expected %v, got: %v", expectedId, minItem.id)
39 | }
40 |
41 | expectedId -= 1
42 | }
43 | })
44 |
45 | t.Run("basic min max properties 2", func(t *testing.T) {
46 | h := NewDistHeap()
47 |
48 | for i := 0; i <= 10; i++ {
49 | h.Insert(Id(i), float32(10-i))
50 | }
51 |
52 | maxExpectedId := Id(0)
53 | minExpectedId := Id(10)
54 |
55 | for !h.IsEmpty() {
56 | peekMaxItem, err := h.PeekMaxItem()
57 |
58 | if err != nil {
59 | t.Fatalf("failed to peek max item, err: %v", err)
60 | }
61 |
62 | maxItem, err := h.PopMaxItem()
63 |
64 | if err != nil {
65 | t.Fatalf("failed to pop max item, err: %v", err)
66 | }
67 |
68 | if peekMaxItem.id != maxItem.id {
69 | t.Fatalf("mismatched max ids, expected %v, got: %v", maxItem.id, peekMaxItem.id)
70 | }
71 |
72 | if maxItem.id != maxExpectedId {
73 | t.Fatalf("expected id to be %v, got %v", maxExpectedId, maxItem.id)
74 | }
75 |
76 | if h.IsEmpty() {
77 | continue
78 | }
79 |
80 | peekMinItem, err := h.PeekMinItem()
81 | if err != nil {
82 | t.Fatalf("failed to peek min item, err: %v", err)
83 | }
84 |
85 | minItem, err := h.PopMinItem()
86 |
87 | if err != nil {
88 | t.Fatalf("failed to pop min item, err: %v", err)
89 | }
90 |
91 | if peekMinItem.id != minItem.id {
92 | t.Fatalf("mismatched min ids, expected %v, got: %v", maxItem.id, peekMaxItem.id)
93 | }
94 |
95 | if minItem.id != minExpectedId {
96 | t.Fatalf("expected id to be %v, got %v", minExpectedId, minItem.id)
97 | }
98 |
99 | minExpectedId -= 1
100 | maxExpectedId += 1
101 | }
102 | })
103 |
104 | t.Run("bricks and ladders || min heap", func(t *testing.T) {
105 | type Case struct {
106 | heights []int
107 | bricks int
108 | ladders int
109 | expected int
110 | }
111 |
112 | cases := [3]Case{
113 | {
114 | heights: []int{4, 2, 7, 6, 9, 14, 12},
115 | bricks: 5,
116 | ladders: 1,
117 | expected: 4,
118 | },
119 | {
120 | heights: []int{4, 12, 2, 7, 3, 18, 20, 3, 19},
121 | bricks: 10,
122 | ladders: 2,
123 | expected: 7,
124 | },
125 | {
126 | heights: []int{14, 3, 19, 3},
127 | bricks: 17,
128 | ladders: 0,
129 | expected: 3,
130 | },
131 | }
132 |
133 | for _, c := range cases {
134 | res, err := furthestBuildings(c.heights, c.bricks, c.ladders)
135 | if err != nil {
136 | t.Fatal(err)
137 | }
138 |
139 | if res != c.expected {
140 | t.Errorf("got %d, want %d", res, c.expected)
141 | }
142 | }
143 | })
144 |
145 | t.Run("copy", func(t *testing.T) {
146 | m := NewDistHeap()
147 |
148 | for i := 0; i <= 10; i++ {
149 | m.Insert(Id(i), float32(10-i))
150 | }
151 |
152 | n := m.Clone()
153 |
154 | if !reflect.DeepEqual(m.items, n.items) { t.Fatal("cloned items differ from original") }
155 | if !reflect.DeepEqual(m.visited, n.visited) { t.Fatal("cloned visited map differs from original") }
156 |
157 | expectedId := Id(10)
158 |
159 | for !n.IsEmpty() {
160 | item, err := n.PopMinItem()
161 | if err != nil {
162 | t.Fatal(err)
163 | }
164 |
165 | if item.id != expectedId {
166 | t.Fatalf("expected id to be %v, got %v", expectedId, item.id)
167 | }
168 |
169 | expectedId -= 1
170 | }
171 | })
172 | }
173 |
174 | func furthestBuildings(heights []int, bricks, ladders int) (int, error) {
175 |
176 | ladderJumps := NewDistHeap()
177 |
178 | for idx := 0; idx < len(heights)-1; idx++ {
179 | height := heights[idx]
180 | nextHeight := heights[idx+1]
181 |
182 | if height >= nextHeight {
183 | continue
184 | }
185 |
186 | jump := nextHeight - height
187 |
188 | ladderJumps.Insert(Id(idx), float32(jump))
189 |
190 | if ladderJumps.Len() > ladders {
191 | minLadderJump, err := ladderJumps.PopMinItem()
192 | if err != nil {
193 | return -1, err
194 | }
195 |
196 | if bricks-int(minLadderJump.dist) < 0 {
197 | return idx, nil
198 | }
199 |
200 | bricks -= int(minLadderJump.dist)
201 | }
202 | }
203 |
204 | return len(heights) - 1, nil
205 | }
206 |
--------------------------------------------------------------------------------
/pkg/metapage/metapage.go:
--------------------------------------------------------------------------------
1 | package metapage
2 |
3 | import (
4 | "github.com/kevmo314/appendable/pkg/pointer"
5 | "io"
6 | )
7 |
8 | // MetaPage is an abstract interface over the root page of a bptree.
9 | // It allows the caller to control the memory location of the meta
10 | // pointer.
11 | type MetaPage interface {
12 | Root() (pointer.MemoryPointer, error)
13 | SetRoot(pointer.MemoryPointer) error
14 | }
15 |
16 | type NodeSerializable interface {
17 | Size() int64
18 | NumPointers() int
19 | MarshalBinary() ([]byte, error)
20 | UnmarshalBinary([]byte) error
21 | WriteTo(w io.Writer) (int64, error)
22 | }
23 |
--------------------------------------------------------------------------------
/pkg/mmap/mmap.go:
--------------------------------------------------------------------------------
1 | // mmap contains utilities to memory map a file while still exposing file append operations.
2 | package mmap
3 |
4 | import (
5 | "fmt"
6 | "io"
7 | "os"
8 |
9 | "golang.org/x/sys/unix"
10 | )
11 |
12 | type MemoryMappedFile struct {
13 | file *os.File
14 | bytes []byte
15 | seek int64
16 |
17 | // parameters used for remapping.
18 | prot, flags int
19 | }
20 |
21 | var _ io.ReadWriteSeeker = &MemoryMappedFile{}
22 | var _ io.Closer = &MemoryMappedFile{}
23 | var _ io.ReaderAt = &MemoryMappedFile{}
24 | var _ io.WriterAt = &MemoryMappedFile{}
25 |
26 | func toProt(flag int) int {
27 | prot := unix.PROT_READ
28 | if flag&os.O_RDWR != 0 {
29 | prot |= unix.PROT_WRITE
30 | }
31 | return prot
32 | }
33 |
34 | func NewMemoryMappedFile(f *os.File, prot int) (*MemoryMappedFile, error) {
35 | fd := uintptr(f.Fd())
36 | fi, err := f.Stat()
37 | if err != nil {
38 | return nil, fmt.Errorf("stat: %v", err)
39 | }
40 | if fi.Size() == 0 {
41 | return &MemoryMappedFile{file: f, bytes: nil, seek: 0, prot: prot, flags: unix.MAP_SHARED}, nil
42 | }
43 | b, err := unix.Mmap(int(fd), 0, int(fi.Size()), prot, unix.MAP_SHARED)
44 | if err != nil {
45 | return nil, fmt.Errorf("mmap: %v", err)
46 | }
47 | return &MemoryMappedFile{file: f, bytes: b, seek: 0, prot: prot, flags: unix.MAP_SHARED}, nil
48 | }
49 |
50 | // Open is a convenience function to open a file and memory map it.
51 | func Open(path string) (*MemoryMappedFile, error) {
52 | return OpenFile(path, os.O_RDWR, 0)
53 | }
54 |
55 | // OpenFile is a convenience function to open a file with the given flags and memory map it.
56 | func OpenFile(path string, flag int, perm os.FileMode) (*MemoryMappedFile, error) {
57 | f, err := os.OpenFile(path, flag, perm)
58 | if err != nil {
59 | return nil, fmt.Errorf("open: %v", err)
60 | }
61 | return NewMemoryMappedFile(f, toProt(flag))
62 | }
63 |
64 | func (m *MemoryMappedFile) File() *os.File {
65 | return m.file
66 | }
67 |
68 | func (m *MemoryMappedFile) Bytes() []byte {
69 | return m.bytes
70 | }
71 |
72 | // Close closes the file and unmaps the memory.
73 | func (m *MemoryMappedFile) Close() error {
74 | if m.bytes == nil {
75 | return m.file.Close()
76 | }
77 | if err := unix.Munmap(m.bytes); err != nil {
78 | return fmt.Errorf("munmap: %v", err)
79 | }
80 | return m.file.Close()
81 | }
82 |
83 | // Seek sets the offset for the next Read or Write on file to offset.
84 | func (m *MemoryMappedFile) Seek(offset int64, whence int) (int64, error) {
85 | var abs int64
86 | switch whence {
87 | case io.SeekStart:
88 | abs = offset
89 | case io.SeekCurrent:
90 | abs = m.seek + offset
91 | case io.SeekEnd:
92 | abs = int64(len(m.bytes)) + offset
93 | default:
94 | return 0, fmt.Errorf("mmap: invalid whence")
95 | }
96 | if abs < 0 {
97 | return 0, fmt.Errorf("mmap: negative position")
98 | } else if abs > int64(len(m.bytes)) {
99 | return 0, fmt.Errorf("mmap: position out of bounds")
100 | }
101 | m.seek = abs
102 | return abs, nil
103 | }
104 |
105 | // Read reads up to len(b) bytes from the file.
106 | func (m *MemoryMappedFile) Read(b []byte) (int, error) {
107 | n := copy(b, m.bytes[m.seek:])
108 | m.seek += int64(n)
109 | if n < len(b) {
110 | return n, io.EOF
111 | }
112 | return n, nil
113 | }
114 |
115 | // ReadAt reads len(b) bytes from the file starting at byte offset off.
116 | func (m *MemoryMappedFile) ReadAt(b []byte, off int64) (int, error) {
117 | n := copy(b, m.bytes[off:])
118 | if n < len(b) {
119 | return n, io.EOF
120 | }
121 | return n, nil
122 | }
123 |
124 | // Write writes len(b) bytes to the file, appending to the file and remapping if necessary.
125 | func (m *MemoryMappedFile) Write(b []byte) (int, error) {
126 | n, err := m.WriteAt(b, m.seek)
127 | if err != nil {
128 | return 0, err
129 | }
130 | m.seek += int64(n)
131 | return n, nil
132 | }
133 |
134 | // WriteAt writes len(b) bytes to the file starting at byte offset off.
135 | func (m *MemoryMappedFile) WriteAt(b []byte, off int64) (int, error) {
136 | // check if the file needs to be remapped
137 | if off+int64(len(b)) > int64(len(m.bytes)) {
138 | // write the data and remap the file
139 | if _, err := m.file.WriteAt(b, off); err != nil {
140 | return 0, err
141 | }
142 | fi, err := m.file.Stat()
143 | if err != nil {
144 | return 0, err
145 | }
146 | if m.bytes == nil {
147 | m.bytes, err = unix.Mmap(int(m.file.Fd()), 0, int(fi.Size()), m.prot, m.flags)
148 | if err != nil {
149 | return 0, fmt.Errorf("mmap: %v", err)
150 | }
151 | return len(b), nil
152 | }
153 | remapped, err := mremap(m.bytes, int(m.file.Fd()), int(fi.Size()), m.prot, m.flags)
154 | if err != nil {
155 | return 0, fmt.Errorf("mmap: %v", err)
156 | }
157 | m.bytes = remapped
158 | return len(b), nil // report the bytes written, not the size of the new mapping
159 | }
160 | // write the data
161 | n := copy(m.bytes[off:], b)
162 | return n, nil
163 | }
164 |
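Putting the pieces together: writes that extend past the current mapping go through the underlying file and the mapping is regrown (mremap on linux, munmap plus mmap on darwin), so Bytes() always reflects the full file. A usage sketch, assuming a throwaway file path:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/kevmo314/appendable/pkg/mmap"
)

func main() {
	path := filepath.Join(os.TempDir(), "mmap-demo.bin")
	defer os.Remove(path)

	f, err := mmap.OpenFile(path, os.O_RDWR|os.O_CREATE, 0644)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// The file starts empty, so this write extends it and creates the mapping.
	if _, err := f.Write([]byte("hello")); err != nil {
		panic(err)
	}

	// Appending past the current mapping triggers a remap under the hood.
	if _, err := f.WriteAt([]byte(" world"), 5); err != nil {
		panic(err)
	}

	fmt.Println(string(f.Bytes())) // "hello world"
}
```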
--------------------------------------------------------------------------------
/pkg/mmap/mremap_darwin.go:
--------------------------------------------------------------------------------
1 | package mmap
2 |
3 | import "golang.org/x/sys/unix"
4 |
5 | func mremap(oldAddress []byte, fd, newSize, prot, flags int) ([]byte, error) {
6 | // darwin doesn't have mremap, so we have to munmap and mmap the new size
7 |
8 | // unmap the old address
9 | if err := unix.Munmap(oldAddress); err != nil {
10 | return nil, err
11 | }
12 | return unix.Mmap(fd, 0, newSize, prot, flags)
13 | }
14 |
--------------------------------------------------------------------------------
/pkg/mmap/mremap_linux.go:
--------------------------------------------------------------------------------
1 | package mmap
2 |
3 | import "golang.org/x/sys/unix"
4 |
5 | func mremap(oldAddress []byte, fd, newSize, prot, flags int) ([]byte, error) {
6 | return unix.Mremap(oldAddress, newSize, unix.MREMAP_MAYMOVE) // fd, prot, and flags are unused on linux; kept for signature parity with the darwin shim
7 | }
8 |
--------------------------------------------------------------------------------
/pkg/mocks/btree.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/binary"
5 | "github.com/kevmo314/appendable/pkg/bptree"
6 | "github.com/kevmo314/appendable/pkg/buftest"
7 | "github.com/kevmo314/appendable/pkg/pagefile"
8 | "github.com/kevmo314/appendable/pkg/pointer"
9 | "log"
10 | "math"
11 | )
12 |
13 | func generateBasicBtree() {
14 | b := buftest.NewSeekableBuffer()
15 | p, err := pagefile.NewPageFile(b)
16 | if err != nil {
17 | log.Fatalf("%v", err)
18 | }
19 | mp, err := newTestMetaPage(p)
20 |
21 | if err != nil {
22 | log.Fatalf("%v", err)
23 | }
24 |
25 | tree := &bptree.BPTree{PageFile: p, MetaPage: mp, Width: uint16(6)}
26 | if err := tree.Insert(pointer.ReferencedValue{Value: []byte("hello")}, pointer.MemoryPointer{Offset: 1, Length: 5}); err != nil {
27 | log.Fatalf("%v", err)
28 | }
29 | if err := tree.Insert(pointer.ReferencedValue{Value: []byte("world")}, pointer.MemoryPointer{Offset: 2, Length: 5}); err != nil {
30 | log.Fatalf("%v", err)
31 | }
32 | if err := tree.Insert(pointer.ReferencedValue{Value: []byte("moooo")}, pointer.MemoryPointer{Offset: 3, Length: 5}); err != nil {
33 | log.Fatalf("%v", err)
34 | }
35 | if err := tree.Insert(pointer.ReferencedValue{Value: []byte("cooow")}, pointer.MemoryPointer{Offset: 4, Length: 5}); err != nil {
36 | log.Fatalf("%v", err)
37 | }
38 |
39 | if err := b.WriteToDisk("BPTree_1.bin"); err != nil {
40 | log.Fatalf("%v", err)
41 | }
42 | }
43 |
44 | type StubDataParser struct{}
45 |
46 | func (s *StubDataParser) Parse(value []byte) []byte {
47 | return []byte{1, 2, 3, 4, 5, 6, 7, 8}
48 | }
49 |
50 | func generateBtreeIterator() {
51 |
52 | b := buftest.NewSeekableBuffer()
53 | p, err := pagefile.NewPageFile(b)
54 | if err != nil {
55 | log.Fatalf("%v", err)
56 | }
57 |
58 | mp, err := newTestMetaPage(p)
59 |
60 | if err != nil {
61 | log.Fatalf("%v", err)
62 | }
63 | tree := &bptree.BPTree{PageFile: p, MetaPage: mp, Data: make([]byte, 16384*4+8), DataParser: &StubDataParser{}, Width: uint16(0)}
64 | for i := 0; i < 16384*4; i++ {
65 | if err := tree.Insert(pointer.ReferencedValue{
66 | Value: []byte{1, 2, 3, 4, 5, 6, 7, 8},
67 | // DataPointer is used as a disambiguator.
68 | DataPointer: pointer.MemoryPointer{Offset: uint64(i), Length: 8},
69 | }, pointer.MemoryPointer{Offset: uint64(i)}); err != nil {
70 | log.Fatalf("%v", err)
71 | }
72 | }
73 |
74 | b.WriteToDisk("btree_iterator.bin")
75 | }
76 |
77 | func generate1023Btree() {
78 | b := buftest.NewSeekableBuffer()
79 | p, err := pagefile.NewPageFile(b)
80 | if err != nil {
81 | log.Fatalf("%v", err)
82 | }
83 |
84 | mp, err := newTestMetaPage(p)
85 |
86 | if err != nil {
87 | log.Fatalf("%v", err)
88 | }
89 | tree := &bptree.BPTree{PageFile: p, MetaPage: mp, Width: uint16(9)}
90 | count := 10
91 |
92 | for i := 0; i < count; i++ {
93 | buf := make([]byte, 8)
94 | binary.BigEndian.PutUint64(buf, math.Float64bits(23))
95 |
96 | if err := tree.Insert(pointer.ReferencedValue{Value: buf, DataPointer: pointer.MemoryPointer{Offset: uint64(i)}}, pointer.MemoryPointer{Offset: uint64(i), Length: uint32(len(buf))}); err != nil {
97 | log.Fatal(err)
98 | }
99 | }
100 |
101 | b.WriteToDisk("BPTree_1023.bin")
102 | }
103 |
--------------------------------------------------------------------------------
/pkg/mocks/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | func main() {
4 | // generateUVariantTestCases()
5 | //generateFilledMetadata()
6 | //generateBasicBtree()
7 | //generateInternalNode()
8 | //generateLeafNode()
9 | //generateBtreeIterator()
10 | // generateFileMeta()
11 | //generateIndexMeta()
12 | //generate1023Btree()
13 | }
14 |
--------------------------------------------------------------------------------
/pkg/mocks/meta_page.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/binary"
5 | "fmt"
6 | "github.com/kevmo314/appendable/pkg/pagefile"
7 | "github.com/kevmo314/appendable/pkg/pointer"
8 | "io"
9 | )
10 |
11 | type testMetaPage struct {
12 | pf *pagefile.PageFile
13 | root pointer.MemoryPointer
14 | }
15 |
16 | func (m *testMetaPage) SetRoot(mp pointer.MemoryPointer) error {
17 | m.root = mp
18 | return m.write()
19 | }
20 |
21 | func (m *testMetaPage) Root() (pointer.MemoryPointer, error) {
22 | return m.root, nil
23 | }
24 |
25 | func (m *testMetaPage) write() error {
26 | buf := make([]byte, 8)
27 | binary.LittleEndian.PutUint64(buf, m.root.Offset)
28 | if _, err := m.pf.Seek(4096, io.SeekStart); err != nil {
29 | return err
30 | }
31 | if _, err := m.pf.Write(buf); err != nil {
32 | return err
33 | }
34 | return nil
35 | }
36 |
37 | func newTestMetaPage(pf *pagefile.PageFile) (*testMetaPage, error) {
38 | meta := &testMetaPage{pf: pf}
39 | offset, err := pf.NewPage([]byte{0, 0, 0, 0, 0, 0, 0, 0})
40 | if err != nil {
41 | return nil, fmt.Errorf("%v", err)
42 | }
43 | // the first page is reserved for garbage collection, so the meta page lands at offset 4096
44 | if offset != 4096 {
45 | return nil, fmt.Errorf("expected offset 4096, got %d", offset)
46 | }
47 | return meta, nil
48 | }
49 |
--------------------------------------------------------------------------------
/pkg/mocks/metadata.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "log"
5 | "os"
6 |
7 | "github.com/kevmo314/appendable/pkg/appendable"
8 | "github.com/kevmo314/appendable/pkg/buftest"
9 | "github.com/kevmo314/appendable/pkg/linkedpage"
10 | "github.com/kevmo314/appendable/pkg/pagefile"
11 | )
12 |
13 | func generateFilledMetadata() {
14 | b := buftest.NewSeekableBuffer()
15 | p, err := pagefile.NewPageFile(b)
16 | if err != nil {
17 | log.Fatalf("%v", err)
18 | }
19 | tree, err := linkedpage.NewMultiBPTree(p, 0)
20 | if err != nil {
21 | log.Fatalf("%v", err)
22 | }
23 | node, err := tree.AddNext()
24 | if err != nil {
25 | log.Fatalf("%v", err)
26 | }
27 | if err := node.SetMetadata([]byte("hello")); err != nil {
28 | log.Fatalf("%v", err)
29 | }
30 |
31 | b.WriteToDisk("filled_metadata.bin")
32 | }
33 |
34 | func writeByteToFile(data []byte, filename string) error {
35 | if err := os.WriteFile(filename, data, 0644); err != nil {
36 | return err
37 | }
38 | return nil
39 | }
40 |
41 | func generateFileMeta() {
42 | fm := appendable.FileMeta{}
43 | fm.Format = 1
44 | fm.Version = 1
45 | fm.ReadOffset = 4096
46 | fm.Entries = 34
47 |
48 | b, err := fm.MarshalBinary()
49 | if err != nil {
50 | log.Fatalf("failed to write file meta to disk")
51 | }
52 |
53 | if err := writeByteToFile(b, "filemeta.bin"); err != nil {
54 | log.Fatalf("failed to write bytes to disk")
55 | }
56 | }
57 |
58 | func generateIndexMeta() {
59 | im := appendable.IndexMeta{}
60 | im.FieldName = "howdydo"
61 | im.FieldType = appendable.FieldTypeBoolean
62 | im.Width = appendable.DetermineType(appendable.FieldTypeBoolean)
63 | im.TotalFieldValueLength = 773424601
64 |
65 | b, err := im.MarshalBinary()
66 | if err != nil {
67 | log.Fatal("failed to write index meta to disk")
68 | }
69 |
70 | if err := writeByteToFile(b, "indexmeta.bin"); err != nil {
71 | log.Fatalf("failed to write bytes to disk")
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/pkg/mocks/node.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "encoding/binary"
6 | "fmt"
7 | "github.com/kevmo314/appendable/pkg/bptree"
8 | "github.com/kevmo314/appendable/pkg/pointer"
9 | "log"
10 | "os"
11 | )
12 |
13 | func writeBufferToFile(buf *bytes.Buffer, filename string) error {
14 | if err := os.WriteFile(filename, buf.Bytes(), 0644); err != nil {
15 | return err
16 | }
17 | return nil
18 | }
19 |
20 | func generateLeafNode() {
21 | // Create a test BPTreeNode
22 | node1 := &bptree.BPTreeNode{
23 | LeafPointers: []pointer.MemoryPointer{
24 | {Offset: 0, Length: 3},
25 | {Offset: 3, Length: 3},
26 | {Offset: 6, Length: 3},
27 | },
28 | Keys: []pointer.ReferencedValue{
29 | {Value: []byte{0, 1, 2}},
30 | {Value: []byte{1, 2, 3}},
31 | {Value: []byte{3, 4, 5}},
32 | },
33 | Width: uint16(4),
34 | }
35 |
36 | buf := &bytes.Buffer{}
37 | if _, err := node1.WriteTo(buf); err != nil {
38 | log.Fatal(err)
39 | }
40 |
41 | writeBufferToFile(buf, "leafnode.bin")
42 | }
43 |
44 | func generateInternalNode() {
45 | // Create a test BPTreeNode
46 | node1 := &bptree.BPTreeNode{
47 | InternalPointers: []uint64{0, 1, 2, 3},
48 | Keys: []pointer.ReferencedValue{
49 | {Value: []byte{0, 1}},
50 | {Value: []byte{1, 2}},
51 | {Value: []byte{3, 4}},
52 | },
53 | Width: uint16(3),
54 | }
55 |
56 | buf := &bytes.Buffer{}
57 | if _, err := node1.WriteTo(buf); err != nil {
58 | log.Fatal(err)
59 | }
60 |
61 | writeBufferToFile(buf, "internalnode.bin")
62 |
63 | }
64 |
65 | func generateUvarintTestCases() {
66 | var tests = []uint64{
67 | 0,
68 | 1,
69 | 2,
70 | 10,
71 | 20,
72 | 63,
73 | 64,
74 | 65,
75 | 127,
76 | 128,
77 | 129,
78 | 255,
79 | 256,
80 | 257,
81 | 1<<63 - 1,
82 | }
83 |
84 | for _, x := range tests {
85 | buf := make([]byte, binary.MaxVarintLen64)
86 | n := binary.PutUvarint(buf, x)
87 | y, m := binary.Uvarint(buf[0:n])
88 |
89 | fmt.Printf("Test case - Value: %d, Encoded Bytes: %d\n", x, n)
90 | fmt.Printf("Decoded Value: %d, Bytes Read: %d\n", y, m)
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/pkg/ngram/tokenizer.go:
--------------------------------------------------------------------------------
1 | package ngram
2 |
3 | import (
4 | "golang.org/x/text/unicode/norm"
5 | "hash/fnv"
6 | "math/rand"
7 | "strings"
8 | "unicode"
9 | "unicode/utf8"
10 | )
11 |
12 | // NgramTokenizer generates tokens of lengths 1, 2, and 3
13 | // to support two searching modes:
14 | // the default "12gram" mode uses min-gram: 1 and max-gram: 2;
15 | // the trigram mode uses min-gram: 3 and max-gram: 3.
16 |
17 | type Token struct {
18 | Word string
19 | Offset uint64
20 | Length uint32
21 | }
22 |
23 | // BuildNgram makes two passes:
24 | //
25 | // 1 - splits by whitespace and keeps track of the positions
26 | // 2 - performs a sliding window over each word and builds ngrams
27 |
28 | func normalizeToAscii(s string) (string, map[int]int) {
29 | ogOffsets := make(map[int]int)
30 |
31 | var b strings.Builder
32 | 	decomposed := norm.NFKD.String(s)
33 |
34 | additionalOffsets := 0
35 |
36 | newIndex := 0
37 |
38 | 	for i, r := range decomposed {
39 | if utf8.RuneLen(r) > 1 {
40 | additionalOffsets += utf8.RuneLen(r) - 1
41 | }
42 |
43 | if r <= unicode.MaxASCII {
44 | b.WriteRune(r)
45 | ogOffsets[newIndex] = i - additionalOffsets
46 | newIndex++
47 | }
48 |
49 | }
50 | return b.String(), ogOffsets
51 | }
52 |
53 | func combineHashes(tokens []Token) int64 {
54 | h := fnv.New32a()
55 | for _, t := range tokens {
56 | h.Write([]byte(t.Word))
57 | }
58 | return int64(h.Sum32())
59 | }
60 |
61 | func Shuffle(tokens []Token) []Token {
62 | soup := make([]Token, len(tokens))
63 | copy(soup, tokens)
64 |
65 | 	seed := combineHashes(tokens)
66 | 	rng := rand.New(rand.NewSource(seed))
67 | 	for i := len(tokens) - 1; i > 0; i-- {
68 | 		j := rng.Intn(i + 1)
69 | soup[i], soup[j] = soup[j], soup[i]
70 | }
71 |
72 | return soup
73 | }
74 |
75 | func BuildNgram(phrase string, gl int) []Token {
76 | var ngramTokens []Token
77 |
78 | var words [][]int
79 | var currWord []int
80 |
81 | clean, ogOffsets := normalizeToAscii(phrase)
82 |
83 | runes := []rune(clean)
84 | for i := 0; i < len(runes); i++ {
85 | r := runes[i]
86 |
87 | if unicode.IsLetter(r) || unicode.IsDigit(r) {
88 | currWord = append(currWord, i)
89 | } else if unicode.IsSpace(r) {
90 | if len(currWord) >= gl {
91 | words = append(words, currWord)
92 | }
93 | currWord = []int{}
94 | }
95 | }
96 |
97 | if len(currWord) >= gl {
98 | words = append(words, currWord)
99 | }
100 |
101 | for _, wOffsets := range words {
102 | for i := 0; i <= len(wOffsets)-gl; i++ {
103 |
104 | var str string
105 |
106 | p := 0
107 | for j := i; j < i+gl; j++ {
108 | str += string(runes[wOffsets[j]])
109 | p = j
110 | }
111 |
112 | q := ogOffsets[wOffsets[i]]
113 | ngramTokens = append(ngramTokens, Token{
114 | Word: strings.ToLower(str),
115 | Offset: uint64(q),
116 | Length: uint32(ogOffsets[wOffsets[p]] - q + 1),
117 | })
118 |
119 | }
120 | }
121 |
122 | return ngramTokens
123 | }
124 |
--------------------------------------------------------------------------------
/pkg/pagefile/pagefile.go:
--------------------------------------------------------------------------------
1 | package pagefile
2 |
3 | import (
4 | "encoding/binary"
5 | "errors"
6 | "io"
7 | )
8 |
9 | type ReadWriteSeekPager interface {
10 | io.ReadWriteSeeker
11 |
12 | Page(int) (int64, error)
13 | NewPage([]byte) (int64, error)
14 | FreePage(int64) error
15 |
16 | LastPage() int64
17 |
18 | PageSize() int
19 | SlotSize() int
20 |
21 | PageCount() int64
22 | }
23 |
24 | type PageFile struct {
25 | io.ReadWriteSeeker
26 | pageSize int
27 | slotSize int
28 |
29 | // local cache of free pages to avoid reading from disk too often.
30 | freePageIndexes [512]int64
31 | freePageHead, freePageCount int
32 |
33 | lastPage int64
34 | }
35 |
36 | var _ ReadWriteSeekPager = &PageFile{}
37 |
38 | // const maxFreePageIndices = 512
39 | const pageSizeBytes = 4096 // 4kB by default.
40 | const slotSizeBytes = 256
41 |
42 | func NewPageFile(rws io.ReadWriteSeeker) (*PageFile, error) {
43 | // check if the rws is empty. if it is, allocate one page for the free page indexes
44 | // if it is not, read the free page indexes from the last page
45 | if _, err := rws.Seek(0, io.SeekStart); err != nil {
46 | return nil, err
47 | }
48 | buf := make([]byte, pageSizeBytes)
49 | _, err := rws.Read(buf)
50 | if err != nil && err != io.EOF {
51 | return nil, err
52 | }
53 | pf := &PageFile{
54 | ReadWriteSeeker: rws,
55 | pageSize: pageSizeBytes,
56 | slotSize: slotSizeBytes,
57 | }
58 | if err == io.EOF {
59 | // allocate one page for the free page indexes
60 | if _, err := rws.Write(buf); err != nil {
61 | return nil, err
62 | }
63 | } else {
64 | for i := 0; i < len(pf.freePageIndexes); i++ {
65 | offset := int64(binary.LittleEndian.Uint64(buf[i*8 : (i+1)*8]))
66 | if offset != 0 {
67 | pf.freePageIndexes[pf.freePageHead] = offset
68 | pf.freePageHead = (pf.freePageHead + 1) % len(pf.freePageIndexes)
69 | pf.freePageCount++
70 | } else {
71 | break
72 | }
73 | }
74 | }
75 | // figure out what the last page is
76 | n, err := rws.Seek(0, io.SeekEnd)
77 | if err != nil {
78 | return nil, err
79 | }
80 | if n%int64(pf.pageSize) != 0 {
81 | return nil, errors.New("file size is not a multiple of the page size")
82 | }
83 | pf.lastPage = n / int64(pf.pageSize)
84 | return pf, nil
85 | }
86 |
87 | func (pf *PageFile) LastPage() int64 {
88 | return pf.lastPage
89 | }
90 |
91 | func (pf *PageFile) Page(i int) (int64, error) {
92 | if i < 0 {
93 | return 0, errors.New("page index cannot be negative")
94 | }
95 | // i + 1 because the first page is reserved for the free page indexes
96 | return int64(i+1) * int64(pf.pageSize), nil
97 | }
98 |
99 | func (pf *PageFile) writeFreePageIndices() error {
100 | buf := make([]byte, len(pf.freePageIndexes)*8)
101 | tail := (pf.freePageHead - pf.freePageCount + len(pf.freePageIndexes)) % len(pf.freePageIndexes)
102 | for i := 0; i < pf.freePageCount; i++ {
103 | 		offset := pf.freePageIndexes[(tail+i)%len(pf.freePageIndexes)]
104 | binary.LittleEndian.PutUint64(buf[i*8:(i+1)*8], uint64(offset))
105 | }
106 | if _, err := pf.ReadWriteSeeker.Seek(0, io.SeekStart); err != nil {
107 | return err
108 | }
109 | if _, err := pf.ReadWriteSeeker.Write(buf); err != nil {
110 | return err
111 | }
112 | return nil
113 | }
114 |
115 | func (pf *PageFile) FreePageIndex() (int64, error) {
116 | // find the first free page index and return it
117 | if pf.freePageCount == 0 {
118 | return -1, nil
119 | }
120 | // pop from the tail
121 | tail := (pf.freePageHead - pf.freePageCount + len(pf.freePageIndexes)) % len(pf.freePageIndexes)
122 | offset := pf.freePageIndexes[tail]
123 | pf.freePageIndexes[tail] = 0
124 | pf.freePageCount--
125 |
126 | if err := pf.writeFreePageIndices(); err != nil {
127 | return 0, err
128 | }
129 |
130 | return offset, nil
131 | }
132 |
133 | func (pf *PageFile) NewPage(buf []byte) (int64, error) {
134 | if buf != nil && len(buf) > pf.pageSize {
135 | return 0, errors.New("buffer is too large")
136 | }
137 |
138 | // if there are free pages, return the first one
139 | offset, err := pf.FreePageIndex()
140 | if err != nil {
141 | return 0, err
142 | }
143 | if offset != -1 {
144 | // seek to the free page
145 | if _, err := pf.ReadWriteSeeker.Seek(offset, io.SeekStart); err != nil {
146 | return 0, err
147 | }
148 | } else {
149 | n, err := pf.ReadWriteSeeker.Seek(0, io.SeekEnd)
150 | if err != nil {
151 | return 0, err
152 | }
153 | offset = n
154 | pf.lastPage++
155 | }
156 |
157 | // if the offset is not a multiple of the page size, we need to pad the file
158 | // with zeros to the next page boundary.
159 | var pad int64
160 | if pf.pageSize > 0 && offset%int64(pf.pageSize) != 0 {
161 | // Calculate the number of bytes to pad
162 | pad = int64(pf.pageSize) - (offset % int64(pf.pageSize))
163 | // Write the padding
164 | if _, err := pf.Write(make([]byte, pad)); err != nil {
165 | return 0, err
166 | }
167 | }
168 | page := make([]byte, pf.pageSize)
169 | if buf != nil {
170 | copy(page, buf)
171 | }
172 | if _, err := pf.ReadWriteSeeker.Write(page); err != nil {
173 | return 0, err
174 | }
175 | if _, err := pf.ReadWriteSeeker.Seek(offset, io.SeekStart); err != nil {
176 | return 0, err
177 | }
178 | return offset + pad, nil
179 | }
180 |
181 | func (pf *PageFile) FreePage(offset int64) error {
182 | if offset%int64(pf.pageSize) != 0 {
183 | return errors.New("offset is not a multiple of the page size")
184 | }
185 | if pf.freePageCount == len(pf.freePageIndexes) {
186 | return errors.New("free page index is full")
187 | }
188 |
189 | for i := range pf.freePageIndexes {
190 | if pf.freePageIndexes[i] == offset {
191 | return errors.New("offset already exists")
192 | }
193 | }
194 |
195 | // push to the head
196 | pf.freePageIndexes[pf.freePageHead] = offset
197 | pf.freePageHead = (pf.freePageHead + 1) % len(pf.freePageIndexes)
198 | pf.freePageCount++
199 |
200 | return pf.writeFreePageIndices()
201 | }
202 |
203 | func (pf *PageFile) PageSize() int {
204 | return pf.pageSize
205 | }
206 |
207 | func (pf *PageFile) SlotSize() int {
208 | return slotSizeBytes
209 | }
210 |
211 | func (pf *PageFile) PageCount() int64 {
212 | return pf.lastPage
213 | }
214 |
--------------------------------------------------------------------------------
/pkg/pagefile/pagefile_debug.go:
--------------------------------------------------------------------------------
1 | //go:build !release
2 |
3 | package pagefile
4 |
5 | import "io"
6 |
7 | func (pf *PageFile) Write(buf []byte) (int, error) {
8 | n, err := pf.ReadWriteSeeker.Seek(0, io.SeekCurrent)
9 | if err != nil {
10 | return 0, err
11 | }
12 | if n%int64(pf.pageSize)+int64(len(buf)) > int64(pf.pageSize) {
13 | panic("writing across page boundary not allowed")
14 | }
15 | return pf.ReadWriteSeeker.Write(buf)
16 | }
17 |
18 | func (pf *PageFile) Read(buf []byte) (int, error) {
19 | n, err := pf.ReadWriteSeeker.Seek(0, io.SeekCurrent)
20 | if err != nil {
21 | return 0, err
22 | }
23 | if n%int64(pf.pageSize)+int64(len(buf)) > int64(pf.pageSize) {
24 | panic("reading across page boundary not allowed")
25 | }
26 | return pf.ReadWriteSeeker.Read(buf)
27 | }
28 |
--------------------------------------------------------------------------------
/pkg/pagefile/pagefile_debug_test.go:
--------------------------------------------------------------------------------
1 | package pagefile
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/kevmo314/appendable/pkg/buftest"
7 | )
8 |
9 | func TestWriteAcrossBoundaryPanicsInDebug(t *testing.T) {
10 | defer func() {
11 | if r := recover(); r == nil {
12 | t.Errorf("The code did not panic")
13 | }
14 | }()
15 |
16 | buf := buftest.NewSeekableBuffer()
17 | pf, err := NewPageFile(buf)
18 | if err != nil {
19 | t.Fatal(err)
20 | }
21 | if _, err := pf.Write(make([]byte, pf.PageSize()+1)); err != nil {
22 | t.Fatal(err)
23 | }
24 | }
25 |
26 | func TestReadAcrossBoundaryPanicsInDebug(t *testing.T) {
27 | defer func() {
28 | if r := recover(); r == nil {
29 | t.Errorf("The code did not panic")
30 | }
31 | }()
32 |
33 | buf := buftest.NewSeekableBuffer()
34 | pf, err := NewPageFile(buf)
35 | if err != nil {
36 | t.Fatal(err)
37 | }
38 | if _, err := pf.Read(make([]byte, pf.PageSize()+1)); err != nil {
39 | t.Fatal(err)
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/pkg/pagefile/pagefile_test.go:
--------------------------------------------------------------------------------
1 | package pagefile
2 |
3 | import (
4 | "io"
5 | "testing"
6 |
7 | "github.com/kevmo314/appendable/pkg/buftest"
8 | )
9 |
10 | func TestPageFile(t *testing.T) {
11 | t.Run("allocates first page", func(t *testing.T) {
12 | buf := buftest.NewSeekableBuffer()
13 | pf, err := NewPageFile(buf)
14 | if err != nil {
15 | t.Fatal(err)
16 | }
17 | offset, err := pf.NewPage(nil)
18 | if err != nil {
19 | t.Fatal(err)
20 | }
21 | if offset != pageSizeBytes {
22 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset)
23 | }
24 | })
25 |
26 | t.Run("page size allocates pages", func(t *testing.T) {
27 | buf := buftest.NewSeekableBuffer()
28 | pf, err := NewPageFile(buf)
29 | if err != nil {
30 | t.Fatal(err)
31 | }
32 | offset1, err := pf.NewPage(nil)
33 | if err != nil {
34 | t.Fatal(err)
35 | }
36 | if offset1 != pageSizeBytes {
37 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset1)
38 | }
39 | // check the seek location
40 | n, err := buf.Seek(0, io.SeekCurrent)
41 | if err != nil {
42 | t.Fatal(err)
43 | }
44 | if n != pageSizeBytes {
45 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, n)
46 | }
47 | offset2, err := pf.NewPage(nil)
48 | if err != nil {
49 | t.Fatal(err)
50 | }
51 | if offset2 != pageSizeBytes*2 {
52 | t.Fatalf("expected offset %d, got %d", pageSizeBytes*2, offset2)
53 | }
54 | m, err := buf.Seek(0, io.SeekCurrent)
55 | if err != nil {
56 | t.Fatal(err)
57 | }
58 | if m != pageSizeBytes*2 {
59 | t.Fatalf("expected offset %d, got %d", pageSizeBytes*2, m)
60 | }
61 | })
62 |
63 | t.Run("page size allocates page with data", func(t *testing.T) {
64 | buf := buftest.NewSeekableBuffer()
65 | pf, err := NewPageFile(buf)
66 | if err != nil {
67 | t.Fatal(err)
68 | }
69 | data := []byte("hello")
70 | offset1, err := pf.NewPage(data)
71 | if err != nil {
72 | t.Fatal(err)
73 | }
74 | if offset1 != pageSizeBytes {
75 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset1)
76 | }
77 | if _, err := pf.Seek(offset1, io.SeekStart); err != nil {
78 | t.Fatal(err)
79 | }
80 | buf2 := make([]byte, len(data))
81 | if _, err := pf.Read(buf2); err != nil {
82 | t.Fatal(err)
83 | }
84 | if string(buf2) != string(data) {
85 | t.Fatalf("expected %s, got %s", string(data), string(buf2))
86 | }
87 | })
88 |
89 | t.Run("new page seeks to page", func(t *testing.T) {
90 | buf := buftest.NewSeekableBuffer()
91 | pf, err := NewPageFile(buf)
92 | if err != nil {
93 | t.Fatal(err)
94 | }
95 | offset1, err := pf.NewPage(nil)
96 | if err != nil {
97 | t.Fatal(err)
98 | }
99 | offset2, err := pf.Seek(0, io.SeekCurrent)
100 | if err != nil {
101 | t.Fatal(err)
102 | }
103 | if offset1 != offset2 {
104 | t.Fatalf("expected offset %d, got %d", offset1, offset2)
105 | }
106 | })
107 |
108 | t.Run("free page reuses page", func(t *testing.T) {
109 | buf := buftest.NewSeekableBuffer()
110 | pf, err := NewPageFile(buf)
111 | if err != nil {
112 | t.Fatal(err)
113 | }
114 | offset1, err := pf.NewPage(nil)
115 | if err != nil {
116 | t.Fatal(err)
117 | }
118 | if offset1 != pageSizeBytes {
119 | t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset1)
120 | }
121 | // need to write at least one byte to trigger a new page.
122 | if _, err := pf.Write(make([]byte, 1)); err != nil {
123 | t.Fatal(err)
124 | }
125 | offset2, err := pf.NewPage(nil)
126 | if err != nil {
127 | t.Fatal(err)
128 | }
129 | if offset2 != pageSizeBytes*2 {
130 | t.Fatalf("expected offset %d, got %d", 2*pageSizeBytes, offset2)
131 | }
132 |
133 | if err := pf.FreePage(offset1); err != nil {
134 | t.Fatal(err)
135 | }
136 | offset3, err := pf.NewPage(nil)
137 | if err != nil {
138 | t.Fatal(err)
139 | }
140 | if offset3 != offset1 {
141 | 			t.Fatalf("expected offset %d, got %d", offset1, offset3)
142 | }
143 | })
144 |
145 | t.Run("free page behaves like a circular buffer", func(t *testing.T) {
146 | buf := buftest.NewSeekableBuffer()
147 | pf, err := NewPageFile(buf)
148 | if err != nil {
149 | t.Fatal(err)
150 | }
151 | offsets := make([]int64, 0, 10)
152 | for i := 0; i < 10; i++ {
153 | offset, err := pf.NewPage(nil)
154 | if err != nil {
155 | t.Fatal(err)
156 | }
157 | if i > 0 && offset != offsets[i-1]+pageSizeBytes {
158 | t.Fatalf("expected offset %d, got %d", offsets[i-1]+pageSizeBytes, offset)
159 | }
160 | offsets = append(offsets, offset)
161 | }
162 | for i := 0; i < 10; i++ {
163 | if err := pf.FreePage(offsets[i]); err != nil {
164 | t.Fatal(err)
165 | }
166 | }
167 | for i := 0; i < 10; i++ {
168 | offset, err := pf.NewPage(nil)
169 | if err != nil {
170 | t.Fatal(err)
171 | }
172 | if offset != offsets[i] {
173 | t.Fatalf("expected offset %d, got %d", offsets[i], offset)
174 | }
175 | }
176 | })
177 |
178 | t.Run("cannot double free a page", func(t *testing.T) {
179 | buf := buftest.NewSeekableBuffer()
180 | pf, err := NewPageFile(buf)
181 | if err != nil {
182 | t.Fatal(err)
183 | }
184 | offset, err := pf.NewPage(nil)
185 | if err != nil {
186 | t.Fatal(err)
187 | }
188 | if err := pf.FreePage(offset); err != nil {
189 | t.Fatal(err)
190 | }
191 | if err := pf.FreePage(offset); err == nil {
192 | t.Fatal("expected error")
193 | }
194 | })
195 |
196 | t.Run("track number of pages", func(t *testing.T) {
197 | buf := buftest.NewSeekableBuffer()
198 | pf, err := NewPageFile(buf)
199 | if err != nil {
200 | t.Fatal(err)
201 | }
202 | if pf.PageCount() != 1 {
203 | t.Fatalf("expected 1, got %d", pf.PageCount())
204 | }
205 | offset, err := pf.NewPage(nil)
206 | if err != nil {
207 | t.Fatal(err)
208 | }
209 | if pf.PageCount() != 2 {
210 | t.Fatalf("expected 2, got %d", pf.PageCount())
211 | }
212 | if err := pf.FreePage(offset); err != nil {
213 | t.Fatal(err)
214 | }
215 | if pf.PageCount() != 2 {
216 | t.Fatalf("expected 2, got %d", pf.PageCount())
217 | }
218 | if _, err := pf.NewPage(nil); err != nil {
219 | t.Fatal(err)
220 | }
221 | if pf.PageCount() != 2 {
222 | t.Fatalf("expected 2, got %d", pf.PageCount())
223 | }
224 | if _, err := pf.NewPage(nil); err != nil {
225 | t.Fatal(err)
226 | }
227 | if pf.PageCount() != 3 {
228 | t.Fatalf("expected 3, got %d", pf.PageCount())
229 | }
230 | })
231 | }
232 |
--------------------------------------------------------------------------------
/pkg/pointer/pointer.go:
--------------------------------------------------------------------------------
1 | package pointer
2 |
3 | import "fmt"
4 |
5 | // MemoryPointer is a uint64 offset and uint32 length
6 | type MemoryPointer struct {
7 | Offset uint64
8 | Length uint32
9 | }
10 |
11 | func (mp MemoryPointer) String() string {
12 | return fmt.Sprintf("Pointer[%08x:%08x]", mp.Offset, mp.Offset+uint64(mp.Length))
13 | }
14 |
--------------------------------------------------------------------------------
/pkg/pointer/referenced_value.go:
--------------------------------------------------------------------------------
1 | package pointer
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "github.com/kevmo314/appendable/pkg/hnsw"
7 | )
8 |
9 | type ReferencedValue struct {
10 | // it is generally optional to set the DataPointer. if it is not set, the
11 | // value is taken to be unreferenced and is stored directly in the node.
12 | // if it is set, the value is used for comparison but the value is stored
13 | // as a reference to the DataPointer.
14 | //
15 | // caveat: DataPointer is used as a disambiguator for the value. the b+ tree
16 | // implementation does not support duplicate keys and uses the DataPointer
17 | // to disambiguate between keys that compare as equal.
18 | DataPointer MemoryPointer
19 | Value []byte
20 | }
21 |
22 | type ReferencedId struct {
23 | DataPointer MemoryPointer
24 | Value hnsw.Id
25 | }
26 |
27 | func (rv ReferencedValue) String() string {
28 | return fmt.Sprintf("ReferencedValue@%s{%s}", rv.DataPointer, rv.Value)
29 | }
30 |
31 | func (rv ReferencedId) String() string {
32 | return fmt.Sprintf("ReferencedId@%s{%d}", rv.DataPointer, rv.Value)
33 | }
34 |
35 | func CompareReferencedValues(a, b ReferencedValue) int {
36 | if cmp := bytes.Compare(a.Value, b.Value); cmp != 0 {
37 | return cmp
38 | } else if a.DataPointer.Offset < b.DataPointer.Offset {
39 | return -1
40 | } else if a.DataPointer.Offset > b.DataPointer.Offset {
41 | return 1
42 | } else if a.DataPointer.Length < b.DataPointer.Length {
43 | return -1
44 | } else if a.DataPointer.Length > b.DataPointer.Length {
45 | return 1
46 | }
47 | return 0
48 | }
49 |
50 | func CompareReferencedIds(a, b ReferencedId) int {
51 | if a.Value > b.Value {
52 | return 1
53 | } else if a.Value < b.Value {
54 | return -1
55 | }
56 |
57 | return 0
58 | }
59 |
--------------------------------------------------------------------------------
/pkg/vectorpage/manager.go:
--------------------------------------------------------------------------------
1 | package vectorpage
2 |
3 | import (
4 | "fmt"
5 | "github.com/kevmo314/appendable/pkg/bptree"
6 | "github.com/kevmo314/appendable/pkg/btree"
7 | "github.com/kevmo314/appendable/pkg/hnsw"
8 | "github.com/kevmo314/appendable/pkg/pointer"
9 | )
10 |
11 | type HNSWAdjacencyPage [16][8]uint32
12 |
13 | type VectorPageManager struct {
14 | btree *btree.BTree
15 | // vectors []*hnsw.Point
16 |
17 | bptree *bptree.BPTree
18 | // neighborhood map[hnsw.Id]*hnsw.Friends
19 |
20 | hnsw *hnsw.Hnsw
21 | }
22 |
23 | func NewVectorPageManager(btree *btree.BTree, bptree *bptree.BPTree, hnsw *hnsw.Hnsw) *VectorPageManager {
24 | if btree == nil || bptree == nil {
25 | panic("btree and bptree must not be nil")
26 | }
27 |
28 | return &VectorPageManager{
29 | btree: btree,
30 | bptree: bptree,
31 | hnsw: hnsw,
32 | }
33 | }
34 |
35 | func (vp *VectorPageManager) AddNode(x hnsw.Point) error {
36 | xId, err := vp.hnsw.InsertVector(x)
37 | if err != nil {
38 | return err
39 | }
40 |
41 | // write point to btree
42 | if err := vp.btree.Insert(pointer.ReferencedId{Value: xId}, x); err != nil {
43 | return err
44 | }
45 |
46 | // write friends to bptree
47 | xFriends, err := vp.hnsw.Neighborhood(xId)
48 | if err != nil {
49 | 		return fmt.Errorf("vector id %v not found in hnsw neighborhood", xId)
50 | }
51 | xfriendsBuf, err := xFriends.Flush(8)
52 | if err != nil {
53 | return err
54 | }
55 |
56 | if err := vp.bptree.Insert(pointer.ReferencedValue{Value: xfriendsBuf}, pointer.MemoryPointer{}); err != nil {
57 | return fmt.Errorf("failed to insert buf: %v", err)
58 | }
59 |
60 | return nil
61 | }
62 |
--------------------------------------------------------------------------------
/pkg/vectorpage/manager_test.go:
--------------------------------------------------------------------------------
1 | package vectorpage
2 |
3 | import (
4 | "github.com/kevmo314/appendable/pkg/hnsw"
5 | "testing"
6 | )
7 |
8 | func TestNewVectorPageManager(t *testing.T) {
9 |
10 | t.Run("", func(t *testing.T) {
11 | p0 := hnsw.Point{3, 3}
12 |
13 | h := hnsw.NewHnsw(2, 10, 8, p0)
14 |
15 | for i := 0; i < 100; i++ {
16 | id, err := h.InsertVector(hnsw.Point{float32(i), float32(i)})
17 | if err != nil {
18 | t.Fatal(err)
19 | }
20 |
21 | if id != hnsw.Id(i+1) {
22 | 				t.Fatalf("expected id %d, got %d", i+1, id)
23 | }
24 | }
25 | })
26 | }
27 |
--------------------------------------------------------------------------------
/scripts/jsonl2json/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "jsonl2json"
3 | version = "0.1.0"
4 | edition = "2021"
5 |
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 |
8 | [dependencies]
9 | anyhow = "1.0.82"
10 | clap = { version = "4.5.4", features = ["derive"] }
11 | serde = { version = "1.0.198", features = ["derive"] }
12 | serde_json = "1.0.116"
13 |
--------------------------------------------------------------------------------
/scripts/jsonl2json/src/main.rs:
--------------------------------------------------------------------------------
1 | use std::fs::File;
2 | use std::io::{BufRead, BufReader};
3 | use clap::Parser;
4 | use anyhow::{Context, Result};
5 | use serde_json::Value;
6 |
7 | #[derive(Parser, Debug)]
8 | struct Args {
9 | #[arg(short)]
10 | file_path: String
11 | }
12 |
13 | fn main() -> Result<()> {
14 | let args = Args::parse();
15 | let file_path = args.file_path;
16 |
17 | let jsonl_file = File::open(&file_path)?;
18 | let reader = BufReader::new(jsonl_file);
19 |
20 |     let mut array: Vec<Value> = vec![];
21 |
22 | for line in reader.lines() {
23 | let line = line?;
24 | let json: Value = serde_json::from_str(&line)?;
25 | array.push(json);
26 | }
27 |
28 |     let output_path = file_path.replace(".jsonl", ".json");
29 | let json_string = serde_json::to_string_pretty(&array)
30 | .with_context(|| "Failed to serialize JSON data")?;
31 |
32 | std::fs::write(&output_path, json_string.as_bytes())
33 | .with_context(|| format!("Failed to write to file: {}", output_path))?;
34 |
35 |     Ok(())
36 | }
37 |
--------------------------------------------------------------------------------
/src/bptree/traversal.ts:
--------------------------------------------------------------------------------
1 | import { BPTree, ReferencedValue } from "./bptree";
2 | import { BPTreeNode, MemoryPointer } from "./node";
3 |
4 | export type TraversalRecord = {
5 | node: BPTreeNode;
6 | index: number;
7 | pointer: MemoryPointer;
8 | };
9 |
10 | export class TraversalIterator {
11 | private tree: BPTree;
12 | private readonly key: ReferencedValue;
13 | private records: TraversalRecord[];
14 |
15 | constructor(tree: BPTree, key: ReferencedValue) {
16 | this.tree = tree;
17 | this.key = key;
18 | this.records = []; // note this works iff all records are non-empty
19 | }
20 |
21 |   async init(): Promise<boolean> {
22 | const rootResponse = await this.tree.root();
23 |
24 | if (rootResponse.rootNode === null) {
25 | return false;
26 | }
27 |
28 | const root = rootResponse.rootNode;
29 | const offset = rootResponse.pointer;
30 | this.records = await this.tree.traverse(this.key, root, offset);
31 |
32 | return true;
33 | }
34 |
35 | getKey(): ReferencedValue {
36 | return this.records[0].node.keys[this.records[0].index];
37 | }
38 |
39 | getPointer(): MemoryPointer {
40 | return this.records[0].node.pointer(this.records[0].index);
41 | }
42 |
43 |   async increment(i: number, delta: number): Promise<boolean> {
44 | if (i === this.records.length) {
45 | return false;
46 | }
47 |
48 | this.records[i].index += delta;
49 | const rolloverLeft = this.records[i].index < 0;
50 | const rolloverRight =
51 | this.records[i].index >= this.records[i].node.numPointers();
52 |
53 | if (rolloverLeft || rolloverRight) {
54 | if (!(await this.increment(i + 1, delta))) {
55 | return false;
56 | }
57 |
58 | if (!this.records[i + 1]) {
59 | return false;
60 | }
61 | // propagate the rollover
62 | this.records[i].node = await this.records[i + 1].node.child(
63 | this.records[i + 1].index,
64 | );
65 |
66 | if (rolloverLeft) {
67 | this.records[i].index = this.records[i].node.numPointers() - 1;
68 | } else {
69 | this.records[i].index = 0;
70 | }
71 | }
72 |
73 | return true;
74 | }
75 |
76 |   async next(): Promise<boolean> {
77 | if (this.records.length === 0) {
78 | const res = await this.init();
79 |
80 | return (
81 | res && this.records[0].index !== this.records[0].node.numPointers()
82 | );
83 | }
84 |
85 | return this.increment(0, 1);
86 | }
87 |
88 |   async prev(): Promise<boolean> {
89 | if (this.records.length === 0) {
90 | const res = await this.init();
91 | if (!res) {
92 | return false;
93 | }
94 | }
95 |
96 | return this.increment(0, -1);
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
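A minimal usage sketch for the iterator above, assuming an already-constructed `tree` and search `key` (both are stand-ins here): `next()` performs the lazy `init()` on its first call, so the loop needs no separate setup.

import { BPTree, ReferencedValue } from "./bptree";
import { TraversalIterator } from "./traversal";

async function scanFrom(tree: BPTree, key: ReferencedValue) {
  const iter = new TraversalIterator(tree, key);
  // the first next() initializes the traversal; later calls advance by one
  while (await iter.next()) {
    console.log(iter.getKey(), iter.getPointer());
  }
}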
/src/db/query-builder.ts:
--------------------------------------------------------------------------------
1 | import { Database } from "./database";
2 | import { OrderBy, Query, Schema, WhereNode } from "./query-lang";
3 | /**
4 | * A class for building and executing database queries in a flexible API style.
5 | * Allows chaining methods for 'where', 'orderBy', 'select', and 'limit' clauses.
6 | */
7 | export class QueryBuilder<T extends Schema> {
8 |   private queryObject: Query<T> = {
9 | where: [],
10 | orderBy: undefined,
11 | select: undefined,
12 | limit: undefined,
13 | };
14 |
15 |   constructor(private database: Database<T>) {}
16 |
17 |   toQuery(): Query<T> {
18 | return {
19 | where: this.queryObject.where ? [...this.queryObject.where] : [],
20 | orderBy: this.queryObject.orderBy
21 | ? [...this.queryObject.orderBy]
22 | : undefined,
23 | select: this.queryObject.select
24 | ? [...this.queryObject.select]
25 | : undefined,
26 | limit: this.queryObject.limit,
27 | };
28 | }
29 |
30 | /**
31 | * Executes the constructed query
32 | */
33 | get() {
34 | return this.database.query(this.queryObject);
35 | }
36 |
37 |   where(
38 |     key: keyof T,
39 |     operation: WhereNode<T>["operation"],
40 |     value: T[keyof T],
41 |   ): QueryBuilder<T> {
42 |     const newQuery = new QueryBuilder<T>(this.database);
43 | newQuery.queryObject = {
44 | ...this.queryObject,
45 | where: [...(this.queryObject.where || []), { key, operation, value }],
46 | };
47 | return newQuery;
48 | }
49 |
50 |   orderBy(key: keyof T, direction: OrderBy<T>["direction"]): QueryBuilder<T> {
51 |     const newQuery = new QueryBuilder<T>(this.database);
52 | newQuery.queryObject = {
53 | ...this.queryObject,
54 | orderBy: [...(this.queryObject.orderBy || []), { key, direction }],
55 | };
56 | return newQuery;
57 | }
58 |
59 |   select(keys: (keyof T)[]): QueryBuilder<T> {
60 |     const newQuery = new QueryBuilder<T>(this.database);
61 | newQuery.queryObject = {
62 | ...this.queryObject,
63 | select: keys,
64 | };
65 | return newQuery;
66 | }
67 |
68 |   limit(limit: number): QueryBuilder<T> {
69 |     const newQuery = new QueryBuilder<T>(this.database);
70 | newQuery.queryObject = {
71 | ...this.queryObject,
72 | limit: limit,
73 | };
74 | return newQuery;
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
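A sketch of the immutable chaining this class provides; the `Post` schema and the `db` instance are assumptions for illustration. Each method returns a fresh builder, so `base` below is never mutated:

import { Database } from "./database";
import { QueryBuilder } from "./query-builder";

type Post = { title: string; score: number };

function buildTopPosts(db: Database<Post>) {
  const base = new QueryBuilder<Post>(db).where("score", ">=", 10);
  return base
    .orderBy("score", "DESC") // must match the where key
    .select(["title", "score"])
    .limit(5)
    .toQuery(); // or call .get() on the final builder to execute
}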
/src/db/query-lang.ts:
--------------------------------------------------------------------------------
1 | import { FieldType } from "./database";
2 |
3 | export type Schema = {
4 | [key: string]: {};
5 | };
6 |
7 | export type WhereNode<T extends Schema, K extends keyof T = keyof T> = {
8 | operation: "<" | "<=" | "==" | ">=" | ">";
9 | key: keyof T;
10 | value: T[K];
11 | };
12 |
13 | export type SearchConfig = {
14 | minGram: number;
15 | maxGram: number;
16 | };
17 |
18 | export type Search<T extends Schema> = {
19 | key: keyof T;
20 | like: string;
21 | config?: SearchConfig;
22 | };
23 |
24 | export type OrderBy<T extends Schema> = {
25 | key: keyof T;
26 | direction: "ASC" | "DESC";
27 | };
28 |
29 | export type SelectField<T extends Schema> = keyof T;
30 |
31 | export type Query<T extends Schema> = {
32 |   where?: WhereNode<T>[];
33 |   orderBy?: OrderBy<T>[];
34 |   select?: SelectField<T>[];
35 |   search?: Search<T>;
36 | limit?: number;
37 | };
38 |
39 | type QueryWhere = {
40 | valueBuf: ArrayBuffer;
41 | fieldType: FieldType;
42 | };
43 |
44 | export function processWhere<T extends Schema>(value: T[keyof T]): QueryWhere | null {
45 | let valueBuf: ArrayBuffer;
46 |
47 | if (value === null) {
48 | return {
49 | fieldType: FieldType.Null,
50 | valueBuf: new ArrayBuffer(0),
51 | };
52 | } else {
53 | switch (typeof value) {
54 | case "bigint":
55 | case "number":
56 | valueBuf = new ArrayBuffer(8);
57 | new DataView(valueBuf).setFloat64(0, Number(value));
58 | return {
59 | fieldType: FieldType.Float64,
60 | valueBuf,
61 | };
62 | case "boolean":
63 | return {
64 | fieldType: FieldType.Boolean,
65 | valueBuf: new Uint8Array([value ? 1 : 0]).buffer,
66 | };
67 |
68 | case "string":
69 | return {
70 | fieldType: FieldType.String,
71 | valueBuf: new TextEncoder().encode(value as string).buffer,
72 | };
73 | }
74 | }
75 |
76 | return null;
77 | }
78 |
79 | export function handleSelect<T extends Schema>(data: string, select?: (keyof T)[]) {
80 | let jData = JSON.parse(data);
81 | if (select && select.length > 0) {
82 | return select.reduce(
83 | (acc, field) => {
84 | if (field in jData) {
85 | acc[field] = jData[field];
86 | }
87 | return acc;
88 | },
89 |       {} as Pick<T, keyof T>,
90 | );
91 | }
92 |
93 | return jData;
94 | }
95 |
--------------------------------------------------------------------------------
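A sketch of what processWhere produces for a numeric value; note that numbers are packed as big-endian float64 because the littleEndian flag of setFloat64 is left unset. The `Row` schema is an assumption:

import { processWhere } from "./query-lang";

type Row = { score: number; name: string };

const w = processWhere<Row>(10);
if (w) {
  // fieldType === FieldType.Float64; valueBuf holds 8 big-endian bytes
  console.log(new DataView(w.valueBuf).getFloat64(0)); // 10
}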
/src/db/query-validation.ts:
--------------------------------------------------------------------------------
1 | import { IndexHeader } from "../file/meta";
2 | import { FieldType, fieldTypeToString } from "./database";
3 | import {
4 | OrderBy,
5 | Schema,
6 | Query,
7 | SelectField,
8 | WhereNode,
9 | Search,
10 | } from "./query-lang";
11 |
12 | function checkType(headerType: number[], queryType: FieldType): boolean {
13 | return headerType.includes(queryType);
14 | }
15 |
16 | function validateWhere<T extends Schema>(
17 |   where: WhereNode<T>[] | undefined,
18 | headers: IndexHeader[],
19 | ): void {
20 | if (!where || !Array.isArray(where) || where.length === 0) {
21 | throw new Error("Missing 'where' clause.");
22 | }
23 |
24 | for (const whereNode of where) {
25 | if (!["<", "<=", "==", ">=", ">"].includes(whereNode.operation)) {
26 | throw new Error("Invalid operation in 'where' clause.");
27 | }
28 |
29 | if (typeof whereNode.key !== "string") {
30 | throw new Error("'key' in 'where' clause must be a string.");
31 | }
32 |
33 | const header = headers.find((h) => h.fieldName === whereNode.key);
34 |
35 | if (!header) {
36 | throw new Error(
37 | `key: ${whereNode.key} in 'where' clause does not exist in dataset.`,
38 | );
39 | }
40 |
41 | if (typeof whereNode.value === "undefined") {
42 | throw new Error("'value' in 'where' clause is missing.");
43 | }
44 |
45 | const headerType = header.fieldTypes;
46 |
47 | if (whereNode.value === null) {
48 | if (!checkType(headerType, FieldType.Null)) {
49 | throw new Error(
50 | `null type not included in ${whereNode.key}'s header types.`,
51 | );
52 | }
53 | } else {
54 | switch (typeof whereNode.value) {
55 | case "bigint":
56 | case "number":
57 | if (
58 | !checkType(headerType, FieldType.Float64) &&
59 | !checkType(headerType, FieldType.Uint64) &&
60 | !checkType(headerType, FieldType.Int64)
61 | ) {
62 | throw new Error(
63 | `number type not included in ${whereNode.key}'s header types.`,
64 | );
65 | }
66 | break;
67 |
68 | case "string":
69 | if (!checkType(headerType, FieldType.String)) {
70 | throw new Error(
71 | `string type not included in ${whereNode.key}'s header types`,
72 | );
73 | }
74 | break;
75 |
76 | case "boolean":
77 | if (!checkType(headerType, FieldType.Boolean)) {
78 | throw new Error(
79 | `boolean type not included in ${whereNode.key}'s header types`,
80 | );
81 | }
82 | break;
83 |
84 | default:
85 | throw new Error(
86 | `unrecognized type: ${typeof whereNode.value} not included in ${whereNode.key}'s header types`,
87 | );
88 | }
89 | }
90 | }
91 | }
92 |
93 | function validateOrderBy<T extends Schema>(
94 |   orderBy: OrderBy<T>[] | undefined,
95 | whereKey: string,
96 | ): void {
97 | if (orderBy) {
98 | if (!Array.isArray(orderBy) || orderBy.length === 0) {
99 | throw new Error("Invalid 'orderBy' clause.");
100 | }
101 |
102 | // Note: currently we only support one orderBy and it must be the where clause. When we add composite indexes and complex querying, refactor.
103 | const orderByObj = orderBy[0];
104 |
105 | if (!["ASC", "DESC"].includes(orderByObj.direction)) {
106 | throw new Error("Invalid direction in `orderBy`.");
107 | }
108 |
109 | if (orderByObj.key !== whereKey) {
110 | throw new Error("'key' in `orderBy` must match `key` in `where` clause");
111 | }
112 | }
113 | }
114 |
115 | function validateSelect<T extends Schema>(
116 |   select: SelectField<T>[] | undefined,
117 | headers: IndexHeader[],
118 | ): void {
119 | if (select) {
120 | if (!Array.isArray(select)) {
121 | throw new Error(`select is not an array: ${select}`);
122 | }
123 |
124 | if (select.length <= 0) {
125 | throw new Error(`select clause is empty: ${select}`);
126 | }
127 |
128 |     const hset = new Set<string>();
129 |     headers.forEach((h) => hset.add(h.fieldName));
130 |
131 |     select.forEach((s) => {
132 | if (!hset.has(s as string)) {
133 | throw new Error(
134 | `${s as string} is not included in the field name headers`,
135 | );
136 | }
137 | });
138 | }
139 | }
140 |
141 | export function validateSearch<T extends Schema>(
142 |   search: Search<T>,
143 | headers: IndexHeader[],
144 | ) {
145 | if (!search.config) {
146 | search.config = {
147 | minGram: 1,
148 | maxGram: 2,
149 | };
150 | }
151 | const { config } = search;
152 | let { minGram, maxGram } = config;
153 |
154 | const fh = headers.find((h) => h.fieldName === search.key);
155 |
156 | if (!fh) {
157 | throw new Error(
158 | `Unable to find index header for key: ${search.key as string}`,
159 | );
160 | }
161 |
162 | let gset = new Set([FieldType.Unigram, FieldType.Bigram, FieldType.Trigram]);
163 | const { fieldTypes } = fh;
164 | fieldTypes.forEach((ft) => (gset.has(ft) ? gset.delete(ft) : {}));
165 |
166 |   if (gset.size !== 0) {
167 | throw new Error(
168 | `Unable to find valid ngram field types: ${[...gset.keys()].map((f) => fieldTypeToString(f))} for index header: ${search.key as string}.`,
169 | );
170 | }
171 |
172 | if (maxGram > 3 || minGram > 3) {
173 | throw new Error(
174 | `Invalid gram length configuration. ${config.minGram} and ${config.maxGram} cannot be greater than 3.`,
175 | );
176 | }
177 |
178 | if (minGram < 1 || maxGram < 1) {
179 | throw new Error(
180 |       `Invalid gram length configuration. ${config.minGram} and ${config.maxGram} cannot be less than 1.`,
181 | );
182 | }
183 |
184 | if (minGram > maxGram) {
185 | throw new Error(
186 | `Invalid gram length configuration: minGram ${config.minGram} cannot be greater than maxGram ${config.maxGram}.`,
187 | );
188 | }
189 | }
190 |
191 | export function validateQuery<T extends Schema>(
192 |   query: Query<T>,
193 | headers: IndexHeader[],
194 | ): void {
195 | if (query.search) {
196 | validateSearch(query.search, headers);
197 | }
198 |
199 | if (query.where) {
200 | validateWhere(query.where, headers);
201 | validateOrderBy(query.orderBy, query.where![0].key as string);
202 | validateSelect(query.select, headers);
203 | }
204 | }
205 |
--------------------------------------------------------------------------------
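A sketch of these rules in action against a hand-built header list (the shape indexHeaders() returns); the field names are assumptions:

import { FieldType } from "./database";
import { IndexHeader } from "../file/meta";
import { validateQuery } from "./query-validation";

const headers: IndexHeader[] = [
  { fieldName: "score", fieldTypes: [FieldType.Float64] },
];

// passes: the key is indexed, the value type matches, and orderBy mirrors the where key
validateQuery<{ score: number }>(
  {
    where: [{ key: "score", operation: ">=", value: 10 }],
    orderBy: [{ key: "score", direction: "DESC" }],
  },
  headers,
);

// throws: "title" does not exist in the header list
try {
  validateQuery<{ title: string }>(
    { where: [{ key: "title", operation: "==", value: "x" }] },
    headers,
  );
} catch (e) {
  console.log((e as Error).message);
}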
/src/file/data-file.ts:
--------------------------------------------------------------------------------
1 | import { Config } from "../index";
2 | import { requestRanges } from "../resolver/range-request";
3 | import { RangeResolver } from "../resolver/resolver";
4 |
5 | export class DataFile {
6 | private originalResolver?: RangeResolver;
7 |
8 | private constructor(private resolver: RangeResolver) {}
9 |
10 | static forUrl(url: string, config: Config) {
11 | return DataFile.forResolver(
12 | async (ranges) => await requestRanges(url, ranges, config),
13 | );
14 | }
15 |
16 | static forResolver(resolver: RangeResolver) {
17 | const instance = new DataFile(async (ranges) => {
18 | return await resolver(ranges);
19 | });
20 | instance.originalResolver = resolver;
21 | return instance;
22 | }
23 |
24 | getResolver(): RangeResolver | undefined {
25 | return this.originalResolver;
26 | }
27 |
28 | async get(start: number, end: number) {
29 | if (end <= start) {
30 | throw new Error(`Invalid range for start: ${start} and end: ${end}`);
31 | }
32 |
33 | const res = await this.resolver([{ start, end }]);
34 | return new TextDecoder().decode(res[0].data);
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
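A short sketch of fetching a slice of the data file as text; the URL is a placeholder for any server that honors HTTP Range requests:

import { DataFile } from "./data-file";

async function peek() {
  const dataFile = DataFile.forUrl("https://example.com/data.jsonl", {
    useMultipartByteRanges: true,
  });
  // resolves roughly the first 256 bytes and decodes them as UTF-8
  console.log(await dataFile.get(0, 256));
}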
/src/file/index-file.ts:
--------------------------------------------------------------------------------
1 | import { LinkedMetaPage, PAGE_SIZE_BYTES, ReadMultiBPTree } from "./multi";
2 | import { RangeResolver } from "../resolver/resolver";
3 | import {
4 | IndexHeader,
5 | IndexMeta,
6 | collectIndexMetas,
7 | readIndexMeta,
8 | readFileMeta,
9 | FileMeta,
10 | } from "./meta";
11 | import { FieldType } from "../db/database";
12 | import { Config } from "..";
13 | import { requestRanges } from "../resolver/range-request";
14 |
15 | export class IndexFile {
16 | static async forUrl(url: string, config: Config) {
17 | return await IndexFile.forResolver(
18 | async (ranges) => await requestRanges(url, ranges, config),
19 | );
20 | }
21 |
22 | static async forResolver(
23 | resolver: RangeResolver,
24 |   ): Promise<VersionedIndexFile> {
25 | return new IndexFileV1(resolver);
26 | }
27 | }
28 |
29 | export interface VersionedIndexFile {
30 |   getResolver(): RangeResolver;
31 |
32 |   tree(): Promise<LinkedMetaPage>;
33 |
34 |   metadata(): Promise<FileMeta>;
35 |
36 |   indexHeaders(): Promise<IndexHeader[]>;
37 |
38 |   seek(header: string, fieldType: FieldType): Promise<LinkedMetaPage[]>;
39 |
40 |   fetchMetaPages(): Promise<void>;
41 | }
42 |
43 | export class IndexFileV1 implements VersionedIndexFile {
44 | private _tree?: LinkedMetaPage;
45 |
46 | private linkedMetaPages: LinkedMetaPage[] = [];
47 |
48 | constructor(private resolver: RangeResolver) {}
49 |
50 | getResolver(): RangeResolver {
51 | return this.resolver;
52 | }
53 |
54 |   async tree(): Promise<LinkedMetaPage> {
55 | if (this._tree) {
56 | return this._tree;
57 | }
58 |
59 | const tree = ReadMultiBPTree(this.resolver, 0);
60 |
61 | this._tree = tree;
62 | return tree;
63 | }
64 |
65 |   async metadata(): Promise<FileMeta> {
66 | const tree = await this.tree();
67 | const buffer = await tree.metadata();
68 |
69 | return readFileMeta(buffer);
70 | }
71 |
72 |   async seek(header: string, fieldType: FieldType): Promise<LinkedMetaPage[]> {
73 | const tree = await this.tree();
74 | let currMp = await tree.next();
75 |
76 | if (!currMp) {
77 | throw new Error(`failed to fetch meta pages`);
78 | }
79 |
80 |     let headerMps: LinkedMetaPage[] = [];
81 |
82 | while (true) {
83 | const indexMeta = readIndexMeta(await currMp.metadata());
84 | if (indexMeta.fieldName === header) {
85 | if (fieldType === FieldType.Float64) {
86 | // if key is a number or bigint, we cast it as a float64 type
87 | if (
88 | indexMeta.fieldType === FieldType.Float64 ||
89 | indexMeta.fieldType === FieldType.Int64 ||
90 | indexMeta.fieldType === FieldType.Uint64
91 | ) {
92 | headerMps.push(currMp);
93 | }
94 | } else {
95 | if (fieldType === indexMeta.fieldType) {
96 | headerMps.push(currMp);
97 | }
98 | }
99 | }
100 |
101 | const nextMp = await currMp.next();
102 | if (!nextMp) {
103 | break;
104 | }
105 | currMp = nextMp;
106 | }
107 |
108 | return headerMps;
109 | }
110 |
111 |   async fetchMetaPages(): Promise<void> {
112 | const tree = await this.tree();
113 | let currMp = await tree.next();
114 |
115 | if (!currMp) {
116 | throw new Error(`failed to fetch meta pages`);
117 | }
118 |
119 | while (true) {
120 | this.linkedMetaPages.push(currMp);
121 |
122 | const nextMp = await currMp.next();
123 | if (!nextMp) {
124 | break;
125 | }
126 | currMp = nextMp;
127 | }
128 | }
129 |
130 |   async indexHeaders(): Promise<IndexHeader[]> {
131 | if (this.linkedMetaPages.length === 0) {
132 | await this.fetchMetaPages();
133 | }
134 |
135 | let indexMetas: IndexMeta[] = [];
136 |     for (let idx = 0; idx < this.linkedMetaPages.length; idx++) {
137 | const currMp = this.linkedMetaPages[idx];
138 | const im = readIndexMeta(await currMp.metadata());
139 | indexMetas.push(im);
140 | const nextMp = await currMp.next();
141 | if (!nextMp) {
142 | break;
143 | }
144 | }
145 |
146 | return collectIndexMetas(indexMetas);
147 | }
148 | }
149 |
--------------------------------------------------------------------------------
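A sketch of enumerating the indexed fields; the config value is an assumption and the URL is a placeholder:

import { IndexFile } from "./index-file";

async function listIndexedFields(url: string) {
  const indexFile = await IndexFile.forUrl(url, {
    useMultipartByteRanges: true,
  });
  // one IndexHeader per field, with every field type observed for it
  for (const header of await indexFile.indexHeaders()) {
    console.log(header.fieldName, header.fieldTypes);
  }
}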
/src/file/meta.ts:
--------------------------------------------------------------------------------
1 | import { decodeUvarint } from "../util/uvarint";
2 |
3 | export enum FileFormat {
4 | JSONL = 0,
5 | CSV = 1,
6 | }
7 |
8 | export type FileMeta = {
9 | version: number;
10 | format: FileFormat;
11 | readOffset: bigint;
12 | entries: number;
13 | };
14 |
15 | export async function readFileMeta(buffer: ArrayBuffer): Promise<FileMeta> {
16 | // unmarshall binary for FileMeta
17 | if (buffer.byteLength <= 10) {
18 | throw new Error(
19 |       `incorrect byte length! Want: at least 11, got ${buffer.byteLength}`,
20 | );
21 | }
22 |
23 | const dataView = new DataView(buffer);
24 | const version = dataView.getUint8(0);
25 | const format = dataView.getUint8(1);
26 |
27 | if (Object.values(FileFormat).indexOf(format) === -1) {
28 | throw new Error(`unexpected file format. Got: ${format}`);
29 | }
30 |
31 | const readOffset = dataView.getBigUint64(2, true);
32 |
33 | const { value: entries } = decodeUvarint(buffer.slice(10));
34 |
35 | return {
36 | version,
37 | format,
38 | readOffset,
39 | entries,
40 | };
41 | }
42 |
43 | export type IndexMeta = {
44 | fieldName: string;
45 | fieldType: number;
46 | width: number;
47 | totalFieldValueLength: number;
48 | };
49 |
50 | export type IndexHeader = {
51 | fieldName: string;
52 | fieldTypes: number[];
53 | };
54 |
55 | export function readIndexMeta(buffer: ArrayBuffer): IndexMeta {
56 | if (buffer.byteLength < 6) {
57 | throw new Error(`invalid metadata size ${buffer.byteLength}`);
58 | }
59 |
60 | const dataView = new DataView(buffer);
61 | const fieldType = dataView.getUint16(0, true);
62 | const width = dataView.getUint16(2, true);
63 | const nameLength = dataView.getUint16(4, true);
64 |
65 | if (buffer.byteLength < 6 + nameLength) {
66 | throw new Error(`invalid metadata size ${buffer.byteLength}`);
67 | }
68 |
69 | const fieldNameBuffer = buffer.slice(6, 6 + nameLength);
70 | const fieldName = new TextDecoder("utf-8").decode(fieldNameBuffer);
71 |
72 | const { value: totalFieldValueLength } = decodeUvarint(
73 | buffer.slice(6 + nameLength),
74 | );
75 |
76 | return {
77 | fieldName,
78 | fieldType,
79 | width,
80 | totalFieldValueLength,
81 | };
82 | }
83 |
84 | export function collectIndexMetas(indexMetas: IndexMeta[]): IndexHeader[] {
85 |   const headersMap: Map<string, number[]> = new Map();
86 |
87 | for (const meta of indexMetas) {
88 | if (!headersMap.has(meta.fieldName)) {
89 | headersMap.set(meta.fieldName, [meta.fieldType]);
90 | } else {
91 | const updatedTypes = headersMap.get(meta.fieldName);
92 | updatedTypes?.push(meta.fieldType);
93 |       headersMap.set(meta.fieldName, updatedTypes!);
94 | }
95 | }
96 |
97 | const indexHeaders: IndexHeader[] = [];
98 | headersMap.forEach((fieldTypes, fieldName) => {
99 | indexHeaders.push({ fieldName, fieldTypes });
100 | });
101 |
102 | return indexHeaders;
103 | }
104 |
--------------------------------------------------------------------------------
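A round-trip sketch of the FileMeta layout readFileMeta expects: one version byte, one format byte, a little-endian uint64 readOffset at byte 2, and a uvarint entry count from byte 10 (a single byte suffices for values under 128):

import { FileFormat, readFileMeta } from "./meta";

async function demo() {
  const buf = new ArrayBuffer(11);
  const view = new DataView(buf);
  view.setUint8(0, 1); // version
  view.setUint8(1, FileFormat.JSONL); // format
  view.setBigUint64(2, 4096n, true); // readOffset, little-endian
  view.setUint8(10, 34); // entries: 34 fits in a one-byte uvarint

  console.log(await readFileMeta(buf));
  // -> { version: 1, format: 0, readOffset: 4096n, entries: 34 }
}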
/src/file/multi.ts:
--------------------------------------------------------------------------------
1 | import { RangeResolver } from "../resolver/resolver";
2 | import { MemoryPointer } from "../bptree/node";
3 |
4 | export const PAGE_SIZE_BYTES = 4096;
5 | export const SLOT_SIZE_BYTES = 256;
6 | export const maxUint64 = 2n ** 64n - 1n;
7 | const POINTER_BYTES = 8;
8 | const LENGTH_BYTES = 4;
9 | const COUNT_BYTE = 1;
10 |
11 | export class LinkedMetaPage {
12 | constructor(
13 | private readonly resolver: RangeResolver,
14 | private readonly offset: bigint,
15 | private readonly index: number,
16 | private metaPageDataPromise?: Promise<
17 | { data: ArrayBuffer; totalLength: number }[]
18 | >,
19 | ) {}
20 |
21 |   async root(): Promise<MemoryPointer> {
22 | const pageData = await this.getMetaPage();
23 |
24 | // we seek by 12 bytes since offset is 8 bytes, length is 4 bytes
25 | const data = pageData.slice(
26 | this.rootMemoryPointerPageOffset(),
27 | this.rootMemoryPointerPageOffset() + POINTER_BYTES + LENGTH_BYTES,
28 | );
29 |
30 | if (data.byteLength != POINTER_BYTES + LENGTH_BYTES) {
31 | throw new Error(
32 | `failed to properly fetch root node. Got ${data.byteLength}`,
33 | );
34 | }
35 |
36 | const view = new DataView(data);
37 |
38 | const pointerOffset = view.getBigUint64(0, true);
39 | const lengthOffset = view.getUint32(POINTER_BYTES, true);
40 |
41 | return {
42 | offset: pointerOffset,
43 | length: lengthOffset,
44 | };
45 | }
46 |
47 |   async metadata(): Promise<ArrayBuffer> {
48 | const pageData = await this.getMetaPage();
49 | const rootPointer = POINTER_BYTES + LENGTH_BYTES;
50 | const metadata = pageData.slice(
51 | this.rootMemoryPointerPageOffset() + rootPointer,
52 | );
53 | const metadataView = new DataView(metadata);
54 | // we need to seek past the root pointer
55 | const metadataLength = metadataView.getUint8(0);
56 | return metadataView.buffer.slice(1, 1 + metadataLength);
57 | }
58 |
59 |   private async getMetaPage(): Promise<ArrayBuffer> {
60 | if (!this.metaPageDataPromise) {
61 | this.metaPageDataPromise = this.resolver([
62 | {
63 | start: Number(this.offset),
64 | end: Number(this.offset) + PAGE_SIZE_BYTES - 1,
65 | },
66 | ]);
67 | }
68 |
69 | const res = await this.metaPageDataPromise;
70 | const { data } = res[0];
71 |
72 | return data;
73 | }
74 |
75 | async next() {
76 | const pageData = await this.getMetaPage();
77 | const view = new DataView(pageData);
78 |
79 | const count = view.getUint8(POINTER_BYTES);
80 |
81 | if (this.index < count - 1) {
82 | return new LinkedMetaPage(
83 | this.resolver,
84 | this.offset,
85 | this.index + 1,
86 | this.metaPageDataPromise,
87 | );
88 | }
89 |
90 | const nextOffset = view.getBigUint64(0, true);
91 |
92 | if (nextOffset === maxUint64) {
93 | return null;
94 | }
95 |
96 | return new LinkedMetaPage(this.resolver, nextOffset, 0);
97 | }
98 |
99 | private rootMemoryPointerPageOffset(): number {
100 | return (
101 | POINTER_BYTES +
102 | COUNT_BYTE +
103 | this.index * (POINTER_BYTES + COUNT_BYTE + SLOT_SIZE_BYTES)
104 | );
105 | }
106 | }
107 |
108 | export function ReadMultiBPTree(
109 | resolver: RangeResolver,
110 | idx: number,
111 | ): LinkedMetaPage {
112 | let offset = idx < 0 ? BigInt(0) : BigInt(idx + 1) * BigInt(PAGE_SIZE_BYTES);
113 | return new LinkedMetaPage(resolver, offset, 0);
114 | }
115 |
--------------------------------------------------------------------------------
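A sketch of wiring ReadMultiBPTree to an in-memory resolver, assuming the RangeResolver shape used elsewhere in this package; ranges are treated as inclusive on both ends, matching the [offset, offset + PAGE_SIZE_BYTES - 1] requests issued above. Page 0 is reserved for the free-page index, so index 0 maps to the page at byte 4096:

import { RangeResolver } from "../resolver/resolver";
import { PAGE_SIZE_BYTES, ReadMultiBPTree } from "./multi";

function inMemoryResolver(file: Uint8Array): RangeResolver {
  return async (ranges) =>
    ranges.map(({ start, end }) => ({
      data: file.buffer.slice(start, end + 1), // inclusive end
      totalLength: file.byteLength,
    }));
}

const file = new Uint8Array(PAGE_SIZE_BYTES * 2); // placeholder bytes
const tree = ReadMultiBPTree(inMemoryResolver(file), 0);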
/src/index.ts:
--------------------------------------------------------------------------------
1 | import { DataFile } from "./file/data-file";
2 | import { Database, FieldType, fieldTypeToString } from "./db/database";
3 | import { IndexFile } from "./file/index-file";
4 | import { RangeResolver } from "./resolver/resolver";
5 |
6 | export type Config = {
7 | useMultipartByteRanges?: boolean;
8 | };
9 |
10 | export async function init(
11 | dataUrl: string | RangeResolver,
12 | indexUrl: string | RangeResolver,
13 | config?: Config,
14 | ) {
15 | if (!config) {
16 | config = { useMultipartByteRanges: true };
17 | }
18 |
19 | return Database.forDataFileAndIndexFile(
20 | typeof dataUrl === "string"
21 | ? DataFile.forUrl(dataUrl, config)
22 | : DataFile.forResolver(dataUrl),
23 | typeof indexUrl === "string"
24 | ? await IndexFile.forUrl(indexUrl, config)
25 | : await IndexFile.forResolver(indexUrl),
26 | );
27 | }
28 |
29 | interface GlobalMap {
30 | Appendable: {
31 |     init: typeof init;
32 |     FieldType: typeof FieldType;
33 |     fieldTypeToString: typeof fieldTypeToString;
34 | };
35 | }
36 |
37 | declare global {
38 | var Appendable: GlobalMap["Appendable"];
39 | }
40 |
41 | globalThis.Appendable = {
42 | init,
43 | FieldType,
44 | fieldTypeToString,
45 | };
46 |
--------------------------------------------------------------------------------
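A usage sketch; both URLs are placeholders, and omitting the config falls back to { useMultipartByteRanges: true } as shown above:

import { init } from "./index";

async function setup() {
  return await init(
    "https://example.com/data.jsonl", // placeholder data file URL
    "https://example.com/data.index", // placeholder index file URL
  );
}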
/src/ngram/table.ts:
--------------------------------------------------------------------------------
1 | type Entry<K> = { key: K; score: number };
2 |
3 | export class PriorityTable<K> {
4 |   private map: Map<K, number> = new Map();
5 |
6 | insert(key: K, score: number) {
7 | const prevScore = this.map.get(key) ?? 0;
8 | this.map.set(key, prevScore + score);
9 | }
10 |
11 |   top(): Entry<K>[] {
12 | return Array.from(this.map, ([key, score]) => ({ key, score })).sort(
13 | (m, n) => n.score - m.score,
14 | );
15 | }
16 | get size(): number {
17 | return this.map.size;
18 | }
19 |
20 | clear(): void {
21 | this.map.clear();
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
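A small sketch of the accumulate-then-rank behavior: repeated inserts add to an existing key's score, and top() returns entries in descending score order:

import { PriorityTable } from "./table";

const table = new PriorityTable<string>();
table.insert("how", 1);
table.insert("how", 2); // "how" accumulates to 3
table.insert("dy", 1);

console.log(table.top());
// -> [ { key: "how", score: 3 }, { key: "dy", score: 1 } ]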
/src/ngram/tokenizer.ts:
--------------------------------------------------------------------------------
1 | import { FieldType } from "../db/database";
2 |
3 | export type NgramToken = {
4 | value: string;
5 | valueBuf: ArrayBuffer;
6 | type: FieldType;
7 | };
8 |
9 | export class NgramTokenizer {
10 | private readonly minGram: number;
11 | private readonly maxGram: number;
12 |
13 |   private allGrams: Map<number, FieldType> = new Map([
14 | [1, FieldType.Unigram],
15 | [2, FieldType.Bigram],
16 | [3, FieldType.Trigram],
17 | ]);
18 |
19 | private static encoder: TextEncoder = new TextEncoder();
20 |
21 | constructor(minGram: number, maxGram: number) {
22 | this.maxGram = maxGram;
23 | this.minGram = minGram;
24 | }
25 |
26 | tokens(phrase: string): NgramToken[] {
27 | let ngrams: NgramToken[] = [];
28 |
29 | let wordOffsets: number[][] = [];
30 | let currentWordOffsets: number[] = [];
31 |
32 | Array.from(phrase).forEach((c, idx) => {
33 | if (/[a-zA-Z0-9]/.test(c)) {
34 | currentWordOffsets.push(idx);
35 | } else if (/\s/.test(c)) {
36 | if (currentWordOffsets.length >= this.minGram) {
37 | wordOffsets.push(currentWordOffsets);
38 | }
39 | currentWordOffsets = [];
40 | }
41 | });
42 |
43 | if (currentWordOffsets.length >= this.minGram) {
44 | wordOffsets.push(currentWordOffsets);
45 | }
46 |
47 | for (let N = this.minGram; N <= this.maxGram; N++) {
48 | const gType = this.allGrams.get(N);
49 |
50 | if (!gType) {
51 | throw new Error(`Unrecognized gram type for gram length: ${N}`);
52 | }
53 |
54 | wordOffsets.forEach((word) => {
55 | for (let idx = 0; idx <= word.length - N; idx++) {
56 | let str = "";
57 |
58 | for (let jdx = idx; jdx <= idx + N - 1; jdx++) {
59 | str += phrase[word[jdx]];
60 | }
61 |
62 | let value = str.toLowerCase();
63 |
64 | ngrams.push({
65 | value,
66 | valueBuf: NgramTokenizer.encoder.encode(value).buffer,
67 | type: gType,
68 | });
69 | }
70 | });
71 | }
72 |
73 | return ngrams;
74 | }
75 |
76 | static shuffle(tokens: NgramToken[]): NgramToken[] {
77 | // https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
78 | let soup = [...tokens];
79 |
80 | for (let idx = tokens.length - 1; idx > 0; idx--) {
81 | const jdx = Math.floor(Math.random() * (idx + 1));
82 | [soup[idx], soup[jdx]] = [soup[jdx], soup[idx]];
83 | }
84 |
85 | return soup;
86 | }
87 | }
88 |
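For illustration, a 1-2 gram tokenizer splits on whitespace, keeps only alphanumeric characters, and lowercases every gram, emitting all unigrams before any bigrams:

const tok = new NgramTokenizer(1, 2);
tok.tokens("Hi yo").map((t) => t.value);
// ["h", "i", "y", "o", "hi", "yo"]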
--------------------------------------------------------------------------------
/src/resolver/cache.ts:
--------------------------------------------------------------------------------
1 | import { RangeResolver } from "./resolver";
2 |
3 | export function cache(resolver: RangeResolver): RangeResolver {
4 | const cache: [
5 | [number, number],
6 | Promise<{ data: ArrayBuffer; totalLength: number }[]>,
7 | ][] = [];
8 |
9 | return async ([{ start, end }]): Promise<
10 | { data: ArrayBuffer; totalLength: number }[]
11 | > => {
12 | // check if start-end is contained in any of the cached ranges
13 | const cached = cache.find(([[s, e]]) => s <= start && end <= e);
14 | if (cached) {
15 | return cached[1].then((cachedData) => {
16 | const data = cachedData[0].data.slice(
17 | start - cached[0][0],
18 | end - cached[0][0] + 1, // ranges are inclusive; slice's end is exclusive
19 | );
20 | return [
21 | {
22 | data,
23 | totalLength: cachedData[0].totalLength,
24 | },
25 | ];
26 | });
27 | }
28 |
29 | // TODO: check if start-end overlaps with any of the cached ranges
30 |
31 | const promise = resolver([{ start, end }]);
32 | cache.push([[start, end], promise]);
33 | return promise;
34 | };
35 | }
36 |
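A sketch of the wrapper in use, with a hypothetical upstream resolver; the second request is a sub-range of the first, so it is sliced out of the cached promise instead of hitting the upstream again.

// Hypothetical upstream resolver, for illustration only.
const upstream: RangeResolver = async (ranges) =>
  ranges.map(({ start, end }) => ({
    data: new ArrayBuffer(end - start + 1),
    totalLength: 1024,
  }));

const cachedResolver = cache(upstream);
await cachedResolver([{ start: 0, end: 99 }]); // fetches and caches [0, 99]
await cachedResolver([{ start: 10, end: 20 }]); // served from the cached range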
--------------------------------------------------------------------------------
/src/resolver/multipart.ts:
--------------------------------------------------------------------------------
1 | function getReader(stream: ReadableStream<Uint8Array>) {
2 | let residual: Uint8Array | null = null;
3 | let readDone = false;
4 | let reader: ReadableStreamDefaultReader<Uint8Array> | ReadableStreamBYOBReader = stream.getReader();
5 | return async (
6 | buf: Uint8Array,
7 | ): Promise<ReadableStreamReadResult<Uint8Array>> => {
8 | if (reader instanceof ReadableStreamBYOBReader) {
9 | return await reader.read(buf);
10 | } else {
11 | while (true) {
12 | if (residual) {
13 | const n = Math.min(residual.length, buf.length);
14 | buf.set(residual.subarray(0, n));
15 | residual = residual.subarray(n);
16 | if (residual.length === 0) {
17 | residual = null;
18 | }
19 | return {
20 | done: readDone && residual === null,
21 | value: buf.subarray(0, n),
22 | };
23 | }
24 | const result = await reader.read();
25 | if (result.value) {
26 | residual = result.value;
27 | }
28 | readDone ||= result.done;
29 | }
30 | }
31 | };
32 | }
33 |
34 | function parseContentRangeHeader(
35 | header: string,
36 | ): [string, number, number, number] {
37 | // parse bytes a-b/c
38 | const tokens = header.split(" ");
39 | if (tokens.length !== 2) {
40 | throw new Error("Invalid Content-Range header");
41 | }
42 | const [range, total] = tokens[1].split("/");
43 | const [start, end] = range.split("-");
44 | return [tokens[0], Number(start), Number(end), Number(total)];
45 | }
46 |
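For example, the header value "bytes 0-50/1270" parses into the unit, the inclusive start and end offsets, and the total resource length:

const [unit, start, end, total] = parseContentRangeHeader("bytes 0-50/1270");
// unit === "bytes", start === 0, end === 50, total === 1270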
47 | export default async function* parseMultipartBody(
48 | contentType: string,
49 | stream: ReadableStream,
50 | ): AsyncGenerator<{ data: ArrayBuffer; headers: Record<string, string> }> {
51 | const reader = getReader(stream);
52 | const tokens = contentType.split(";");
53 | if (tokens[0] !== "multipart/byteranges") {
54 | throw new Error("Not a multipart/byteranges body");
55 | }
56 | const boundaryToken = tokens
57 | .map((s) => s.trim())
58 | .find((s) => s.startsWith("boundary="))
59 | ?.split("=", 2)?.[1];
60 | if (!boundaryToken) {
61 | throw new Error("No boundary found");
62 | }
63 | const boundary = `--${boundaryToken}`;
64 |
65 | let headers: Record<string, string> = {};
66 |
67 | const buf = new Uint8Array(4096);
68 | let ptr = 0;
69 | let length = 0;
70 |
71 | const extend = async () => {
72 | if (length === buf.byteLength) {
73 | throw new Error("no buffer space left");
74 | }
75 | const { done, value } = await reader(
76 | ptr + length >= buf.length
77 | ? buf.subarray((ptr + length) % buf.length, ptr)
78 | : buf.subarray(ptr + length, buf.length),
79 | );
80 | if (done) {
81 | return done;
82 | }
83 | length += value.length;
84 | return false;
85 | };
86 |
87 | while (true) {
88 | // read boundary
89 | for (let i = 0; i < boundary.length; i++) {
90 | while (length === 0) {
91 | if (await extend()) {
92 | return;
93 | }
94 | }
95 | if (buf[ptr] !== boundary.charCodeAt(i)) {
96 | console.log("boundary.charCode", buf[ptr], boundary.charCodeAt(i), i);
97 | throw new Error("Invalid boundary");
98 | }
99 | ptr = (ptr + 1) % buf.length;
100 | length--;
101 | }
102 |
103 | // read the boundary terminator
104 | for (const c of ["\r", "\n"]) {
105 | while (length === 0) {
106 | if (await extend()) {
107 | return;
108 | }
109 | }
110 | if (buf[ptr] === c.charCodeAt(0)) {
111 | ptr = (ptr + 1) % buf.length;
112 | length--;
113 | } else if (buf[ptr] === "-".charCodeAt(0)) {
114 | // eof
115 | return;
116 | } else {
117 | // invalid boundary
118 | throw new Error("Invalid boundary");
119 | }
120 | }
121 |
122 | // read headers
123 | let lastByte = 0;
124 | let header: number[] = [];
125 | while (true) {
126 | while (length === 0) {
127 | if (await extend()) {
128 | return;
129 | }
130 | }
131 | const byte = buf[ptr];
132 | ptr = (ptr + 1) % buf.length;
133 | length--;
134 | if (lastByte === "\r".charCodeAt(0) && byte === "\n".charCodeAt(0)) {
135 | // end of header
136 | if (header.length === 1 /* it's an \r */) {
137 | // end of headers
138 | break;
139 | } else {
140 | const decoded = new TextDecoder().decode(new Uint8Array(header));
141 | const tokens = decoded.split(":", 2);
142 | if (tokens.length !== 2) {
143 | throw new Error(`Invalid header: ${decoded}`);
144 | }
145 | const [key, value] = tokens;
146 | headers[key.trim()] = value.trim();
147 | header.length = 0;
148 | }
149 | } else {
150 | header.push(byte);
151 | }
152 | lastByte = byte;
153 | }
154 |
155 | // read body
156 | // read the Content-Range header
157 | if (!headers["Content-Range"]) {
158 | // TODO: read until the next boundary
159 | throw new Error("Missing Content-Range header");
160 | }
161 | const [unit, start, end] = parseContentRangeHeader(
162 | headers["Content-Range"],
163 | );
164 | if (unit !== "bytes") {
165 | throw new Error("Invalid Content-Range header");
166 | }
167 | const contentLength = end - start + 1;
168 | const data = new Uint8Array(contentLength);
169 | for (let i = 0; i < contentLength; i++) {
170 | while (length === 0) {
171 | if (await extend()) {
172 | return;
173 | }
174 | }
175 | data[i] = buf[ptr];
176 | ptr = (ptr + 1) % buf.length;
177 | length--;
178 | }
179 | yield { data: data.buffer, headers };
180 | headers = {};
181 |
182 | // read the trailing \r\n
183 | for (const c of ["\r", "\n"]) {
184 | while (length === 0) {
185 | if (await extend()) {
186 | return;
187 | }
188 | }
189 | if (buf[ptr] === c.charCodeAt(0)) {
190 | ptr = (ptr + 1) % buf.length;
191 | length--;
192 | } else {
193 | // invalid boundary
194 | throw new Error("Invalid boundary");
195 | }
196 | }
197 | }
198 | }
199 |
--------------------------------------------------------------------------------
/src/resolver/range-request.ts:
--------------------------------------------------------------------------------
1 | import { Config } from "../index";
2 | import parseMultipartBody from "./multipart";
3 | import { LengthIntegrityError } from "./resolver";
4 |
5 | async function resolveIndividualPromises(
6 | url: string,
7 | ranges: { start: number; end: number; expectedLength?: number }[],
8 | ) {
9 | // fallback to resolving ranges individually
10 | const individualRangePromises = ranges.map(
11 | async ({ start, end, expectedLength }) => {
12 | const rangeHeader = `${start}-${end}`;
13 | const res = await fetch(url, {
14 | headers: { Range: `bytes=${rangeHeader}` },
15 | });
16 |
17 | const totalLength = Number(
18 | res.headers.get("Content-Range")!.split("/")[1],
19 | );
20 | if (expectedLength && totalLength !== expectedLength) {
21 | throw new LengthIntegrityError();
22 | }
23 | return {
24 | data: await res.arrayBuffer(),
25 | totalLength: totalLength,
26 | };
27 | },
28 | );
29 | return await Promise.all(individualRangePromises);
30 | }
31 |
32 | export async function requestRanges(
33 | url: string,
34 | ranges: { start: number; end: number; expectedLength?: number }[],
35 | config: Config,
36 | ): Promise<{ data: ArrayBuffer; totalLength: number }[]> {
37 | const { useMultipartByteRanges } = config;
38 | if (useMultipartByteRanges === false) {
39 | return await resolveIndividualPromises(url, ranges);
40 | }
41 |
42 | for (const { start, end } of ranges) {
43 | if (end - start <= 0) {
44 | throw new Error(
45 | `Invalid range: end (${end}) must be greater than start (${start}); zero-length and negative ranges are not allowed.`,
46 | );
47 | }
48 | }
49 |
50 | const rangesHeader = ranges
51 | .map(({ start, end }) => `${start}-${end}`)
52 | .join(",");
53 |
54 | const response = await fetch(url, {
55 | headers: {
56 | Range: `bytes=${rangesHeader}`,
57 | Accept: "multipart/byteranges",
58 | },
59 | });
60 |
61 | switch (response.status) {
62 | case 200:
63 | console.warn(
64 | `useMultipartByteRanges is enabled but the server did not respond with a subset of bytes. Set useMultipartByteRanges: false in your Appendable config object.`,
65 | );
66 | return await resolveIndividualPromises(url, ranges);
67 | case 206:
68 | const contentType = response.headers.get("Content-Type");
69 | if (!contentType) {
70 | throw new Error("Missing Content-Type in response");
71 | }
72 | if (contentType.includes("multipart/byteranges")) {
73 | let chunks = [];
74 |
75 | if (!response.body) {
76 | throw new Error(`response body is null: ${response.body}`);
77 | }
78 |
79 | for await (const chunk of parseMultipartBody(
80 | contentType,
81 | response.body,
82 | )) {
83 | chunks.push(chunk);
84 | }
85 |
86 | // the last element is null since the final boundary marker is followed by another delim.
87 | if (chunks[chunks.length - 1].data === undefined) {
88 | chunks.pop();
89 | }
90 |
91 | return chunks.map(({ data, headers }) => {
92 | const totalLengthStr = headers["Content-Range"]?.split("/")[1];
93 | const totalLength = totalLengthStr ? parseInt(totalLengthStr, 10) : 0;
94 |
95 | return { data, totalLength };
96 | });
97 | } else if (response.headers.has("Content-Range")) {
98 | const abuf = await response.arrayBuffer();
99 | const totalLength = Number(
100 | response.headers.get("Content-Range")!.split("/")[1],
101 | );
102 | return [
103 | {
104 | data: abuf,
105 | totalLength: totalLength,
106 | },
107 | ];
108 | } else {
109 | throw new Error(`Unexpected response format: ${contentType}`);
110 | }
111 | case 416:
112 | const requestedRange = response.headers.get("Range") || rangesHeader;
113 | throw new Error(
114 | `Resolver cannot serve the requested ranges: ${requestedRange}`,
115 | );
116 | default:
117 | throw new Error(`Expected 206 or 200 response, got ${response.status}`);
118 | }
119 | }
120 |
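As a concrete example of the request this function issues (the URL is a placeholder): multiple ranges collapse into a single Range header, and a 206 multipart/byteranges response is split back into per-range buffers.

// Sends a single header: Range: bytes=0-99,4096-4195
const parts = await requestRanges(
  "https://example.com/index.dat",
  [
    { start: 0, end: 99 },
    { start: 4096, end: 4195 },
  ],
  { useMultipartByteRanges: true },
);
// parts[0].data holds bytes 0-99; parts[1].data holds bytes 4096-4195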
--------------------------------------------------------------------------------
/src/resolver/resolver.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * RangeResolver is a function that takes a range of bytes and returns a promise
3 | * that resolves to an ArrayBuffer containing the bytes in that range. Note that
4 | * the range is inclusive.
5 | *
6 | * Additionally, the RangeResolver must return a checksum which is computed from
7 | * the source data. This checksum is used to verify that the data has not been
8 | * changed between requests. The checksum can be any type, for example it is
9 | * valid to use the last modified timestamp of the source data or the total
10 | * length of the data. This checksum is passed to the RangeResolver on future
11 | * requests as the `checksum` argument. If it does not match the checksum when
12 | * reading the data, the RangeResolver should throw a LengthIntegrityError.
13 | *
14 | * @see LengthIntegrityError
15 | */
16 | export type RangeResolver = (
17 | args: {
18 | start: number;
19 | end: number;
20 | expectedLength?: number;
21 | }[],
22 | ) => Promise<
23 | {
24 | data: ArrayBuffer;
25 | totalLength: number;
26 | }[]
27 | >;
28 |
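A minimal conforming resolver sketch, assuming a single hypothetical URL and using the total length reported by Content-Range as the integrity checksum; unlike requestRanges, it naively issues one fetch per range.

const simpleResolver: RangeResolver = async (ranges) =>
  Promise.all(
    ranges.map(async ({ start, end, expectedLength }) => {
      // ranges are inclusive, matching the Range header's semantics
      const res = await fetch("https://example.com/data.jsonl", {
        headers: { Range: `bytes=${start}-${end}` },
      });
      const totalLength = Number(
        res.headers.get("Content-Range")?.split("/")[1],
      );
      if (expectedLength && totalLength !== expectedLength) {
        throw new LengthIntegrityError();
      }
      return { data: await res.arrayBuffer(), totalLength };
    }),
  );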
29 | /**
30 | * LengthIntegrityError is thrown by a RangeResolver when the length argument is
31 | * inconsistent with the data returned. This is used to detect when the data has
32 | * changed between requests.
33 | *
34 |  * When a LengthIntegrityError is thrown, the cache is typically evicted and
35 |  * the query is retried. The exception is the data file, where the error is
36 |  * ignored because the data file is assumed to be immutable.
37 | *
38 | * @see RangeResolver
39 | */
40 | export class LengthIntegrityError extends Error {
41 | constructor() {
42 | super("length integrity error");
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/tests/bptree.test.ts:
--------------------------------------------------------------------------------
1 | import { BPTree, MetaPage, ReferencedValue } from "../bptree/bptree";
2 | import { MemoryPointer } from "../bptree/node";
3 | import { FieldType } from "../db/database";
4 | import { FileFormat } from "../file/meta";
5 | import { RangeResolver } from "../resolver/resolver";
6 | import { readBinaryFile } from "./test-util";
7 | import { maxUint64 } from "../file/multi";
8 |
9 | class testMetaPage implements MetaPage {
10 | private readonly rootMP: MemoryPointer;
11 |
12 | constructor(mp: MemoryPointer) {
13 | this.rootMP = mp;
14 | }
15 |
16 | async root(): Promise<MemoryPointer> {
17 | return this.rootMP;
18 | }
19 | }
20 |
21 | describe("test BPTree", () => {
22 | let mockRangeResolver: RangeResolver;
23 | let mockDataFileResolver: RangeResolver;
24 | let btree: BPTree;
25 |
26 | beforeEach(() => {
27 | mockDataFileResolver = async ([]) => {
28 | return [
29 | {
30 | data: new ArrayBuffer(0),
31 | totalLength: 0,
32 | },
33 | ];
34 | };
35 |
36 | mockRangeResolver = async ([{ start, end }]) => {
37 | const indexFile = await readBinaryFile("btree_1.bin");
38 | const slicedPart = indexFile.slice(start, end + 1);
39 |
40 | const arrayBuffer = slicedPart.buffer.slice(
41 | slicedPart.byteOffset,
42 | slicedPart.byteOffset + slicedPart.byteLength,
43 | );
44 |
45 | return [
46 | {
47 | data: arrayBuffer,
48 | totalLength: arrayBuffer.byteLength,
49 | },
50 | ];
51 | };
52 |
53 | const page = new testMetaPage({ offset: 8192n, length: 88 });
54 | btree = new BPTree(
55 | mockRangeResolver,
56 | page,
57 | mockDataFileResolver,
58 | FileFormat.CSV,
59 | FieldType.String,
60 | 6,
61 | 4,
62 | );
63 | });
64 |
65 | it("should read a bptree and find items", async () => {
66 | let idx = 1;
67 | for (const value of ["hello", "world", "moooo", "cooow"]) {
68 | const keyBuf = new TextEncoder().encode(value).buffer;
69 | const key = new ReferencedValue({ offset: 0n, length: 0 }, keyBuf);
70 |
71 | const [rv, mp] = await btree.find(key);
72 |
73 | expect(value).toEqual(new TextDecoder().decode(rv.value));
74 | expect(mp.offset).toEqual(BigInt(idx));
75 | idx += 1;
76 | }
77 | });
78 | });
79 |
80 | describe("test BPTree iterator count", () => {
81 | let mockRangeResolver: RangeResolver;
82 | let mockDataFileResolver: RangeResolver;
83 | let btree: BPTree;
84 |
85 | beforeEach(() => {
86 | mockDataFileResolver = async ([]) => {
87 | return [
88 | {
89 | data: new ArrayBuffer(0),
90 | totalLength: 0,
91 | },
92 | ];
93 | };
94 |
95 | mockRangeResolver = async ([{ start, end }]) => {
96 | const indexFile = await readBinaryFile("btree_1023.bin");
97 | const slicedPart = indexFile.slice(start, end + 1);
98 |
99 | const arrayBuffer = slicedPart.buffer.slice(
100 | slicedPart.byteOffset,
101 | slicedPart.byteOffset + slicedPart.byteLength,
102 | );
103 |
104 | return [
105 | {
106 | data: arrayBuffer,
107 | totalLength: arrayBuffer.byteLength,
108 | },
109 | ];
110 | };
111 |
112 | const page = new testMetaPage({ offset: 8192n, length: 88 });
113 | btree = new BPTree(
114 | mockRangeResolver,
115 | page,
116 | mockDataFileResolver,
117 | FileFormat.CSV,
118 | FieldType.String,
119 | 9,
120 | 10,
121 | );
122 | });
123 |
124 | it("should count the value 23 10 times", async () => {
125 | const valueBuf = new ArrayBuffer(8);
126 | new DataView(valueBuf).setFloat64(0, Number(23));
127 |
128 | const valueRef = new ReferencedValue({ offset: 0n, length: 0 }, valueBuf);
129 |
130 | const iter = btree.iter(valueRef);
131 |
132 | let count = 0;
133 |
134 | while (await iter.next()) {
135 | const currKey = iter.getKey();
136 | if (ReferencedValue.compareBytes(valueBuf, currKey.value) === 0) {
137 | count++;
138 | }
139 | }
140 |
141 | expect(count).toEqual(10);
142 | });
143 |
144 | it("should count the value 23 10 times reverse", async () => {
145 | const valueBuf = new ArrayBuffer(8);
146 | new DataView(valueBuf).setFloat64(0, Number(23));
147 |
148 | const valueRef = new ReferencedValue(
149 | { offset: maxUint64, length: 0 },
150 | valueBuf,
151 | );
152 |
153 | const iter = btree.iter(valueRef);
154 | let count = 0;
155 |
156 | while (await iter.prev()) {
157 | const currKey = iter.getKey();
158 | if (ReferencedValue.compareBytes(valueBuf, currKey.value) === 0) {
159 | count++;
160 | }
161 | }
162 |
163 | expect(count).toEqual(10);
164 | });
165 | });
166 |
--------------------------------------------------------------------------------
/src/tests/index-file.test.ts:
--------------------------------------------------------------------------------
1 | import { FieldType } from "../db/database";
2 | import { FileFormat, readFileMeta, readIndexMeta } from "../file/meta";
3 | import { readBinaryFile } from "./test-util";
4 |
5 | describe("test file parsing", () => {
6 | let fileMetaBuffer: Uint8Array;
7 | let indexMetaBuffer: Uint8Array;
8 |
9 | beforeAll(async () => {
10 | fileMetaBuffer = await readBinaryFile("filemeta.bin");
11 | indexMetaBuffer = await readBinaryFile("indexmeta.bin");
12 | });
13 |
14 | it("should read the file meta", async () => {
15 | const fileMeta = await readFileMeta(fileMetaBuffer.buffer);
16 | expect(fileMeta.format).toEqual(FileFormat.CSV);
17 | expect(fileMeta.version).toEqual(1);
18 | expect(fileMeta.readOffset).toEqual(4096n);
19 | expect(fileMeta.entries).toEqual(34);
20 | });
21 |
22 | it("should read the index meta", async () => {
23 | const indexMeta = await readIndexMeta(indexMetaBuffer.buffer);
24 | expect(indexMeta.width).toEqual(2);
25 | expect(indexMeta.fieldName).toEqual("howdydo");
26 | expect(indexMeta.fieldType).toEqual(FieldType.Boolean);
27 | expect(indexMeta.totalFieldValueLength).toEqual(773424601);
28 | });
29 | });
30 |
--------------------------------------------------------------------------------
/src/tests/mock_binaries/btree_1.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/btree_1.bin
--------------------------------------------------------------------------------
/src/tests/mock_binaries/btree_1023.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/btree_1023.bin
--------------------------------------------------------------------------------
/src/tests/mock_binaries/btree_iterator.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/btree_iterator.bin
--------------------------------------------------------------------------------
/src/tests/mock_binaries/filemeta.bin:
--------------------------------------------------------------------------------
1 | "
--------------------------------------------------------------------------------
/src/tests/mock_binaries/filled_metadata.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/filled_metadata.bin
--------------------------------------------------------------------------------
/src/tests/mock_binaries/indexmeta.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/indexmeta.bin
--------------------------------------------------------------------------------
/src/tests/mock_binaries/internalnode.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/internalnode.bin
--------------------------------------------------------------------------------
/src/tests/mock_binaries/leafnode.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kevmo314/appendable/e51125a5b8b4f6a9e7940515d1920b267bb3d510/src/tests/mock_binaries/leafnode.bin
--------------------------------------------------------------------------------
/src/tests/multi.test.ts:
--------------------------------------------------------------------------------
1 | import { RangeResolver } from "../resolver/resolver";
2 | import { arrayBufferToString, readBinaryFile } from "./test-util";
3 | import { ReadMultiBPTree } from "../file/multi";
4 |
5 | describe("test metadata", () => {
6 | let mockMetadata: Uint8Array;
7 |
8 | beforeAll(async () => {
9 | mockMetadata = await readBinaryFile("filled_metadata.bin");
10 | });
11 |
12 | it("reads stored metadata", async () => {
13 | const mockRangeResolver: RangeResolver = async ([{ start, end }]) => {
14 | return [
15 | {
16 | data: mockMetadata.buffer.slice(start, end + 1),
17 | totalLength: end - start + 1,
18 | },
19 | ];
20 | };
21 |
22 | const tree = ReadMultiBPTree(mockRangeResolver, 0);
23 | const metadata = await tree.metadata();
24 | expect("hello").toEqual(arrayBufferToString(metadata));
25 | });
26 | });
27 |
--------------------------------------------------------------------------------
/src/tests/multipart.test.ts:
--------------------------------------------------------------------------------
1 | import parseMultipartBody from "../resolver/multipart";
2 |
3 | async function collect<T>(gen: AsyncGenerator<T>) {
4 | const result: T[] = [];
5 | for await (const item of gen) {
6 | result.push(item);
7 | }
8 | return result;
9 | }
10 |
11 | describe("multipart", () => {
12 | it("should parse multipart with two chunks", async () => {
13 | const encoder = new TextEncoder();
14 | const data = encoder.encode(`--3d6b6a416f9b5\r
15 | Content-Type: text/html\r
16 | Content-Range: bytes 0-50/1270\r
17 | \r
18 |
19 |
20 |
21 | Example Do\r
22 | --3d6b6a416f9b5\r
23 | Content-Type: text/html\r
24 | Content-Range: bytes 100-150/1270\r
25 | \r
26 | eta http-equiv="Content-type" content="text/html; c\r
27 | --3d6b6a416f9b5--`);
28 | const { readable, writable } = new TransformStream();
29 | const writer = writable.getWriter();
30 | writer.write(data);
31 | writer.close();
32 | const multipart = await collect(
33 | parseMultipartBody(
34 | "multipart/byteranges; boundary=3d6b6a416f9b5",
35 | readable,
36 | ),
37 | );
38 | expect(multipart.length).toBe(2);
39 | const decoder = new TextDecoder();
40 | expect(decoder.decode(multipart[0].data)).toBe(
41 | "\n\n\n Example Do",
42 | );
43 | expect(decoder.decode(multipart[1].data)).toBe(
44 | 'eta http-equiv="Content-type" content="text/html; c',
45 | );
46 | });
47 | });
48 |
--------------------------------------------------------------------------------
/src/tests/ngramtable.test.ts:
--------------------------------------------------------------------------------
1 | import { PriorityTable } from "../ngram/table";
2 |
3 | describe("tests ngram table", () => {
4 | it("correctly tracks the count", () => {
5 | const table = new PriorityTable<string>();
6 | table.insert("howdy", 3);
7 | table.insert("do", 3);
8 | table.insert("howdy", 2);
9 |
10 | const pq = table.top();
11 | expect(pq.length).toEqual(2);
12 | expect(pq[0]).toEqual({ key: "howdy", score: 5 });
13 | expect(pq[1]).toEqual({ key: "do", score: 3 });
14 | });
15 |
16 | it("should return null for top", () => {
17 | const table = new PriorityTable();
18 | const pq = table.top();
19 | expect(pq.length).toEqual(0);
20 | });
21 |
22 | it("should correctly clear all entries", () => {
23 | const table = new PriorityTable<string>();
24 | table.insert("wef", 4);
25 | table.insert("wef", 3);
26 | table.insert("wef", 2);
27 | table.insert("ty", 1);
28 | expect(table.size).toEqual(2);
29 | table.clear();
30 |
31 | const pq = table.top();
32 | expect(pq.length).toEqual(0);
33 | expect(table.size).toEqual(0);
34 | });
35 |
36 | it("handles a large number of varied inserts", () => {
37 | const table = new PriorityTable<string>();
38 | const entries = new Map<string, number>();
39 | const itemCount = 1000;
40 | const possibleEntries = ["wef", "wef a", "beef", "tarikoplata", "omoplata"];
41 |
42 | for (let idx = 0; idx < itemCount; idx++) {
43 | const randomKey =
44 | possibleEntries[Math.floor(Math.random() * possibleEntries.length)];
45 | table.insert(randomKey, idx);
46 | entries.set(randomKey, (entries.get(randomKey) || 0) + idx);
47 | }
48 |
49 | const sorted = Array.from(entries, ([key, score]) => ({
50 | key,
51 | score,
52 | })).sort((m, n) => n.score - m.score);
53 | let queue = table.top();
54 |
55 | expect(sorted).toEqual(queue);
56 | });
57 | });
58 |
--------------------------------------------------------------------------------
/src/tests/query-builder.test.ts:
--------------------------------------------------------------------------------
1 | import { Database } from "../db/database";
2 | import { QueryBuilder } from "../db/query-builder";
3 | import { validateQuery } from "../db/query-validation";
4 | import { IndexHeader } from "../file/meta";
5 |
6 | describe("test validate queries", () => {
7 | interface MockSchema {
8 | [key: string]: {};
9 | VendorID: {};
10 | store_and_fwd_flag: {};
11 | fare_amount: {};
12 | payment_type: {};
13 | }
14 |
15 | const headers: IndexHeader[] = [
16 | {
17 | fieldName: "VendorID",
18 | fieldTypes: [2],
19 | },
20 | {
21 | fieldName: "store_and_fwd_flag",
22 | fieldTypes: [3],
23 | },
24 | {
25 | fieldName: "fare_amount",
26 | fieldTypes: [2],
27 | },
28 | {
29 | fieldName: "payment_type",
30 | fieldTypes: [3],
31 | },
32 | ];
33 |
34 | let database: Database;
35 |
36 | it(`test query builder`, () => {
37 | let qb = new QueryBuilder(database);
38 |
39 | let qb1 = qb.where("VendorID", "<=", 2);
40 |
41 | expect(() => {
42 | validateQuery(qb1.toQuery(), headers);
43 | }).not.toThrow();
44 | });
45 |
46 | it(`test basic query chain`, () => {
47 | let q = new QueryBuilder(database).where("VendorID", "<=", 2);
48 | let query = q.toQuery();
49 |
50 | expect(query.where).not.toBeNull();
51 | expect(query.where).toEqual([
52 | { key: "VendorID", operation: "<=", value: 2 },
53 | ]);
54 |
55 | expect(() => {
56 | validateQuery(query, headers);
57 | }).not.toThrow();
58 |
59 | q = q.orderBy("VendorID", "ASC");
60 | query = q.toQuery();
61 |
62 | expect(query.where).not.toBeNull();
63 | expect(query.where).toEqual([
64 | { key: "VendorID", operation: "<=", value: 2 },
65 | ]);
66 | expect(query.orderBy).not.toBeNull();
67 | expect(query.orderBy).toEqual([{ key: "VendorID", direction: "ASC" }]);
68 | expect(() => {
69 | validateQuery(query, headers);
70 | }).not.toThrow();
71 |
72 | q = q.select(["VendorID", "store_and_fwd_flag", "fare_amount"]);
73 | query = q.toQuery();
74 | expect(query.where).not.toBeNull();
75 | expect(query.where).toEqual([
76 | { key: "VendorID", operation: "<=", value: 2 },
77 | ]);
78 | expect(query.orderBy).not.toBeNull();
79 | expect(query.orderBy).toEqual([{ key: "VendorID", direction: "ASC" }]);
80 | expect(query.select).not.toBeNull();
81 | expect(query.select).toEqual([
82 | "VendorID",
83 | "store_and_fwd_flag",
84 | "fare_amount",
85 | ]);
86 | });
87 |
88 | it(`test basic derived query chain`, () => {
89 | const q0 = new QueryBuilder(database).where("fare_amount", "==", 1);
90 | let query = q0.toQuery();
91 |
92 | expect(query.where).not.toBeNull();
93 | expect(query.where).toEqual([
94 | { key: "fare_amount", operation: "==", value: 1 },
95 | ]);
96 |
97 | let q1 = q0.orderBy("fare_amount", "DESC");
98 | query = q1.toQuery();
99 |
100 | expect(query.where).not.toBeNull();
101 | expect(query.where).toEqual([
102 | { key: "fare_amount", operation: "==", value: 1 },
103 | ]);
104 | expect(query.orderBy).not.toBeNull();
105 | expect(query.orderBy).toEqual([{ key: "fare_amount", direction: "DESC" }]);
106 |
107 | let q2 = q1.select(["fare_amount"]);
108 | query = q2.toQuery();
109 | expect(query.where).not.toBeNull();
110 | expect(query.where).toEqual([
111 | { key: "fare_amount", operation: "==", value: 1 },
112 | ]);
113 | expect(query.orderBy).not.toBeNull();
114 | expect(query.orderBy).toEqual([{ key: "fare_amount", direction: "DESC" }]);
115 | expect(query.select).not.toBeNull();
116 | expect(query.select).toEqual(["fare_amount"]);
117 | });
118 |
119 | it(`test multi derived query chain`, () => {
120 | const q0 = new QueryBuilder(database).where("fare_amount", "==", 2);
121 | let query = q0.toQuery();
122 |
123 | expect(query.where).not.toBeNull();
124 | expect(query.where).toEqual([
125 | { key: "fare_amount", operation: "==", value: 2 },
126 | ]);
127 |
128 | let q1 = q0.where("VendorID", "==", 2);
129 | query = q1.toQuery();
130 |
131 | expect(query.where).not.toBeNull();
132 | expect(query.where).toEqual([
133 | { key: "fare_amount", operation: "==", value: 2 },
134 | { key: "VendorID", operation: "==", value: 2 },
135 | ]);
136 | });
137 |
138 | it(`test green + red queries`, () => {
139 | const q0 = new QueryBuilder(database).where("payment_type", ">", 3);
140 | const failQuery = q0.orderBy("VendorID", "ASC");
141 | expect(failQuery.toQuery().orderBy).toEqual([
142 | { key: "VendorID", direction: "ASC" },
143 | ]);
144 |
145 | const passQuery = q0.orderBy("payment_type", "DESC");
146 | expect(passQuery.toQuery().orderBy).toEqual([
147 | { key: "payment_type", direction: "DESC" },
148 | ]);
149 |
150 | const failQuery2 = passQuery.select(["wef"]);
151 | const passQuery2 = passQuery.select([
152 | "VendorID",
153 | "payment_type",
154 | "fare_amount",
155 | ]);
156 |
157 | // red queries
158 | [failQuery, failQuery2].forEach((query) => {
159 | expect(() => validateQuery(query.toQuery(), headers)).toThrow();
160 | });
161 |
162 | // green queries
163 | [passQuery, passQuery2].forEach((query) => {
164 | expect(() => validateQuery(query.toQuery(), headers)).not.toThrow();
165 | });
166 | });
167 | });
168 |
--------------------------------------------------------------------------------
/src/tests/query-logic.test.ts:
--------------------------------------------------------------------------------
1 | import { FieldType } from "../db/database";
2 | import { handleSelect, processWhere } from "../db/query-lang";
3 |
4 | describe("query logic test", () => {
5 | it("should process the given key", () => {
6 | let floatBuf1 = new ArrayBuffer(8);
7 | new DataView(floatBuf1).setFloat64(0, 3.4, true);
8 |
9 | let floatBuf2 = new ArrayBuffer(8);
10 | new DataView(floatBuf2).setFloat64(0, Number(1n), true);
11 |
12 | const values: [
13 | string | number | bigint | boolean | null,
14 | FieldType,
15 | ArrayBuffer,
16 | ][] = [
17 | ["howdy", FieldType.String, new TextEncoder().encode("howdy").buffer],
18 | [3.4, FieldType.Float64, floatBuf1],
19 | [1n, FieldType.Float64, floatBuf2],
20 | [true, FieldType.Boolean, new Uint8Array([0]).buffer],
21 | [false, FieldType.Boolean, new Uint8Array([1]).buffer],
22 | [null, FieldType.Null, new ArrayBuffer(0)],
23 | ];
24 |
25 | for (const [value, expectedType, expectedVBuf] of values) {
26 | // @ts-ignore
27 | const res = processWhere(value);
28 |
29 | if (!res) {
30 | expect(res).not.toBeNull();
31 | return;
32 | }
33 |
34 | const { valueBuf, fieldType } = res;
35 | expect(expectedType).toEqual(fieldType);
36 | expect(valueBuf).toEqual(expectedVBuf);
37 | }
38 | });
39 |
40 | it("should select accordingly", () => {
41 | const select = ["george strait", "alan jackson"];
42 |
43 | const mockJson = {
44 | "george strait": "howdy",
45 | "alan jackson": true,
46 | kelp: null,
47 | wef: 30.4,
48 | };
49 |
50 | const mockJsonStr = JSON.stringify(mockJson);
51 | const filtered = handleSelect(mockJsonStr, select);
52 | expect(filtered).toEqual({
53 | "george strait": "howdy",
54 | "alan jackson": true,
55 | });
56 |
57 | const pass = handleSelect(mockJsonStr);
58 | expect(pass).toEqual(mockJson);
59 | });
60 | });
61 |
--------------------------------------------------------------------------------
/src/tests/query-validation.test.ts:
--------------------------------------------------------------------------------
1 | import { validateQuery } from "../db/query-validation";
2 | import { IndexHeader } from "../file/meta";
3 | import { Query, Search } from "../db/query-lang";
4 | import { FieldType } from "../db/database";
5 |
6 | describe("validate search queries", () => {
7 | interface MockSchema {
8 | [key: string]: {};
9 | Pollo: {};
10 | Bife: {};
11 | Cerdo: {};
12 | }
13 |
14 | const headers: IndexHeader[] = [
15 | {
16 | fieldName: "Pollo",
17 | fieldTypes: [FieldType.Unigram, FieldType.Bigram, FieldType.Trigram],
18 | },
19 | {
20 | fieldName: "Bife",
21 | fieldTypes: [FieldType.Unigram, FieldType.Bigram, FieldType.Trigram],
22 | },
23 | {
24 | fieldName: "Cerdo",
25 | fieldTypes: [FieldType.Unigram, FieldType.Bigram, FieldType.Trigram],
26 | },
27 | ];
28 |
29 | it("performs a simple search query", () => {
30 | for (let minGram = 0; minGram <= 3; minGram++) {
31 | for (let maxGram = minGram; maxGram <= 3; maxGram++) {
32 | const search = {
33 | key: "Pollo",
34 | like: "wefhowdy",
35 | minGram,
36 | maxGram,
37 | };
38 | const q: Query = { search };
39 |
40 | expect(() => {
41 | validateQuery(q, headers);
42 | }).not.toThrow();
43 | }
44 | }
45 | });
46 |
47 | it("query a defaults to a 12gram", () => {
48 | const search = {
49 | key: "Cerdo",
50 | like: "wefhowdy",
51 | };
52 |
53 | const q: Query = { search };
54 |
55 | expect(() => {
56 | validateQuery(q, headers);
57 | }).not.toThrow();
58 |
59 | expect(q.search).not.toBeUndefined();
60 | expect(q.search!.config).not.toBeUndefined();
61 | expect(q.search!.config!.minGram).toEqual(1);
62 | expect(q.search!.config!.maxGram).toEqual(2);
63 | });
64 |
65 | it("fails to validate query via unknown header", () => {
66 | const search = {
67 | key: "Atun",
68 | like: "bacalao",
69 | };
70 |
71 | const q: Query = { search };
72 |
73 | expect(() => {
74 | validateQuery(q, headers);
75 | }).toThrow();
76 | });
77 |
78 | it("fails to validate query via invalid range", () => {
79 | const search = {
80 | key: "Pollo",
81 | like: "bacalao",
82 | config: {
83 | minGram: 2,
84 | maxGram: 1,
85 | },
86 | };
87 |
88 | const q: Query = { search };
89 |
90 | expect(() => {
91 | validateQuery(q, headers);
92 | }).toThrow();
93 | });
94 | });
95 |
96 | describe("validate filter queries", () => {
97 | interface MockSchema {
98 | [key: string]: {};
99 | VendorID: {};
100 | store_and_fwd_flag: {};
101 | fare_amount: {};
102 | payment_type: {};
103 | }
104 |
105 | const headers: IndexHeader[] = [
106 | {
107 | fieldName: "VendorID",
108 | fieldTypes: [0],
109 | },
110 | {
111 | fieldName: "store_and_fwd_flag",
112 | fieldTypes: [6],
113 | },
114 | {
115 | fieldName: "fare_amount",
116 | fieldTypes: [3],
117 | },
118 | {
119 | fieldName: "payment_type",
120 | fieldTypes: [3, 0],
121 | },
122 | ];
123 |
124 | const validQueries: Query[] = [
125 | {
126 | where: [
127 | {
128 | operation: "==",
129 | key: "VendorID",
130 | value: "",
131 | },
132 | ],
133 | },
134 | {
135 | where: [
136 | {
137 | operation: "<",
138 | key: "fare_amount",
139 | value: 10,
140 | },
141 | ],
142 | orderBy: [
143 | {
144 | key: "fare_amount",
145 | direction: "ASC",
146 | },
147 | ],
148 | },
149 | {
150 | where: [
151 | {
152 | operation: ">=",
153 | key: "payment_type",
154 | value: 300,
155 | },
156 | ],
157 | orderBy: [
158 | {
159 | key: "payment_type",
160 | direction: "DESC",
161 | },
162 | ],
163 | select: ["payment_type", "fare_amount"],
164 | },
165 | {
166 | where: [
167 | {
168 | operation: "==",
169 | key: "store_and_fwd_flag",
170 | value: false,
171 | },
172 | ],
173 | select: ["fare_amount", "payment_type"],
174 | },
175 | ];
176 |
177 | it("test valid query", () => {
178 | validQueries.forEach((query) => {
179 | expect(() => {
180 | validateQuery(query, headers);
181 | }).not.toThrow();
182 | });
183 | });
184 |
185 | const notValidQueries: Query[] = [
186 | {
187 | where: [
188 | {
189 | operation: "<=",
190 | key: "vendorid",
191 | value: 1,
192 | },
193 | ],
194 | },
195 | {
196 | where: [
197 | {
198 | operation: "==",
199 | key: "store_and_fwd_flag",
200 | value: 10,
201 | },
202 | ],
203 | orderBy: [
204 | {
205 | key: "store_an_flag",
206 | direction: "ASC",
207 | },
208 | ],
209 | },
210 | {
211 | where: [
212 | {
213 | operation: "<",
214 | key: "payment_type",
215 | value: false,
216 | },
217 | ],
218 | select: ["payment_type", "vendorid", "store_and_fwd_flag"],
219 | },
220 | {
221 | where: [
222 | {
223 | operation: "==",
224 | key: "payment_type",
225 | value: "",
226 | },
227 | ],
228 | select: ["paymet_type"],
229 | },
230 | ];
231 |
232 | notValidQueries.forEach((query, index) => {
233 | it(`test invalid query ${index}`, () => {
234 | expect(() => validateQuery(query, headers)).toThrow();
235 | });
236 | });
237 | });
238 |
--------------------------------------------------------------------------------
/src/tests/test-util.ts:
--------------------------------------------------------------------------------
1 | import path from "path";
2 | import fs from "fs/promises";
3 |
4 | export async function readBinaryFile(filename: string): Promise<Uint8Array> {
5 | const filePath = path.join(__dirname, `mock_binaries/${filename}`);
6 | const data = await fs.readFile(filePath);
7 | return new Uint8Array(data);
8 | }
9 |
10 | export function arrayBufferToString(arrayBuffer: ArrayBuffer): string {
11 | const decoder = new TextDecoder("utf-8");
12 | return decoder.decode(new Uint8Array(arrayBuffer));
13 | }
14 |
--------------------------------------------------------------------------------
/src/tests/tokenizer.test.ts:
--------------------------------------------------------------------------------
1 | import { NgramTokenizer } from "../ngram/tokenizer";
2 | import { FieldType } from "../db/database";
3 |
4 | describe("builds 12grams", () => {
5 | let tok: NgramTokenizer;
6 | let textEncoder: TextEncoder;
7 |
8 | beforeAll(() => {
9 | textEncoder = new TextEncoder();
10 | });
11 |
12 | beforeEach(() => {
13 | tok = new NgramTokenizer(1, 2);
14 | });
15 |
16 | it("builds a basic 12gram", () => {
17 | const phrase = "wakemeup";
18 | const expected = [
19 | "w",
20 | "a",
21 | "k",
22 | "e",
23 | "m",
24 | "e",
25 | "u",
26 | "p",
27 | "wa",
28 | "ak",
29 | "ke",
30 | "em",
31 | "me",
32 | "eu",
33 | "up",
34 | ].map((s) => ({
35 | value: s,
36 | valueBuf: textEncoder.encode(s).buffer,
37 | type: s.length === 1 ? FieldType.Unigram : FieldType.Bigram,
38 | }));
39 |
40 | const trigrams = tok.tokens(phrase);
41 | expect(trigrams).toEqual(expected);
42 | });
43 |
44 | it("builds a complex 12 gram", () => {
45 | const phrase = "I can't wake up";
46 | const expected = [
47 | "i",
48 | "c",
49 | "a",
50 | "n",
51 | "t",
52 | "w",
53 | "a",
54 | "k",
55 | "e",
56 | "u",
57 | "p",
58 | "ca",
59 | "an",
60 | "nt",
61 | "wa",
62 | "ak",
63 | "ke",
64 | "up",
65 | ].map((s) => ({
66 | value: s,
67 | valueBuf: textEncoder.encode(s).buffer,
68 | type: s.length === 1 ? FieldType.Unigram : FieldType.Bigram,
69 | }));
70 |
71 | const trigrams = tok.tokens(phrase);
72 | expect(trigrams).toEqual(expected);
73 | });
74 | });
75 |
76 | describe("builds trigrams", () => {
77 | let tok: NgramTokenizer;
78 | let textEncoder: TextEncoder;
79 |
80 | beforeAll(() => {
81 | textEncoder = new TextEncoder();
82 | });
83 |
84 | beforeEach(() => {
85 | tok = new NgramTokenizer(3, 3);
86 | });
87 |
88 | it("builds a basic trigram", () => {
89 | const phrase = "wakemeup";
90 | const expected = ["wak", "ake", "kem", "eme", "meu", "eup"].map((s) => ({
91 | value: s,
92 | valueBuf: textEncoder.encode(s).buffer,
93 | type: FieldType.Trigram,
94 | }));
95 |
96 | const trigrams = tok.tokens(phrase);
97 | expect(trigrams).toEqual(expected);
98 | });
99 |
100 | it("builds a complex trigram", () => {
101 | const phrase = "I can't wake up";
102 | const expected = ["can", "ant", "wak", "ake"].map((s) => ({
103 | value: s,
104 | valueBuf: textEncoder.encode(s).buffer,
105 | type: FieldType.Trigram,
106 | }));
107 |
108 | const trigrams = tok.tokens(phrase);
109 | expect(trigrams).toEqual(expected);
110 | });
111 | });
112 |
113 | describe("fuzz shuffle", () => {
114 | let tok: NgramTokenizer;
115 |
116 | beforeEach(() => {
117 | tok = new NgramTokenizer(3, 3);
118 | });
119 | const generateRandomString = (length: number) => {
120 | const alpha =
121 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ";
122 | let result = "";
123 | for (let i = 0; i < length; i++) {
124 | result += alpha.charAt(Math.floor(Math.random() * alpha.length));
125 | }
126 | return result;
127 | };
128 |
129 | it("shuffles randomly", () => {
130 | for (let i = 0; i < 100; i++) {
131 | const phrase = generateRandomString(Math.floor(Math.random() * 50));
132 | const trigrams = tok.tokens(phrase);
133 | const shuffled = NgramTokenizer.shuffle(trigrams);
134 |
135 | expect(shuffled.length).toBe(trigrams.length);
136 | expect(new Set(shuffled)).toEqual(new Set(trigrams));
137 | }
138 | });
139 | });
140 |
--------------------------------------------------------------------------------
/src/tests/varint.ts:
--------------------------------------------------------------------------------
1 | import { decodeUvarint, encodeUvarint } from "../util/uvarint";
2 |
3 | describe("test varint codec", () => {
4 | it("should round trip correctly", () => {
5 | let values = [
6 | 0,
7 | 1,
8 | 2,
9 | 10,
10 | 20,
11 | 63,
12 | 64,
13 | 65,
14 | 127,
15 | 128,
16 | 129,
17 | 255,
18 | 256,
19 | 257,
20 | 2 ** 30, // JS bitwise shifts are 32-bit, so keep test values below 2^31
21 | ];
22 |
23 | values.forEach((v) => {
24 | const b = encodeUvarint(v);
25 | const { value } = decodeUvarint(b);
26 | expect(value).toEqual(v);
27 | });
28 | });
29 | });
30 |
--------------------------------------------------------------------------------
/src/util/uvarint.ts:
--------------------------------------------------------------------------------
1 | export type UvarintResponse = {
2 | value: number;
3 | bytesRead: number;
4 | };
5 |
6 | const MAX_VARINT_64 = 10;
7 |
8 | export function encodeUvarint(n: number): ArrayBuffer {
9 | let i = 0;
10 |
11 | let ibuf = new Uint8Array(MAX_VARINT_64);
12 |
13 | while (n >= 0x80) {
14 | ibuf[i++] = (n & 0xff) | 0x80;
15 | n >>= 7;
16 | }
17 |
18 | ibuf[i] = n & 0xff;
19 |
20 | return ibuf.buffer.slice(0, i + 1);
21 | }
22 |
23 | export function decodeUvarint(buf: ArrayBuffer): UvarintResponse {
24 | let x: number = 0;
25 | let s: number = 0;
26 |
27 | const view = new Uint8Array(buf);
28 |
29 | for (let idx = 0; idx <= view.length - 1; idx++) {
30 | let b = view[idx];
31 |
32 | if (idx === MAX_VARINT_64) {
33 | return { value: 0, bytesRead: -(idx + 1) };
34 | }
35 |
36 | if (b < 0x80) {
37 | if (idx === MAX_VARINT_64 - 1 && b > 1) {
38 | return { value: 0, bytesRead: -(idx + 1) };
39 | }
40 |
41 | let value = x | (b << s);
42 | return { value, bytesRead: idx + 1 };
43 | }
44 |
45 | x |= (b & 0x7f) << s;
46 | s += 7;
47 | }
48 |
49 | return { value: 0, bytesRead: 0 };
50 | }
51 |
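A worked example of the codec: 300 is 0b10_0101100, so the low seven bits go out first with the continuation bit set (0x2c | 0x80 = 0xac), followed by 0x02. Note that the shifts above are 32-bit in JavaScript, so the codec only round-trips values below 2^31.

const buf = encodeUvarint(300); // bytes: [0xac, 0x02]
const { value, bytesRead } = decodeUvarint(buf);
// value === 300, bytesRead === 2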
--------------------------------------------------------------------------------