├── media └── image.png ├── .gitattributes ├── go.mod ├── .gitignore ├── LICENSE ├── .github └── workflows │ └── static.yml ├── go.sum ├── Makefile ├── query.go ├── index_test.go ├── query_test.go ├── index.go ├── analyzer.go ├── skiplist_test.go ├── serialization.go ├── search.go └── skiplist.go /media/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wizenheimer/blaze/HEAD/media/image.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Exclude YAML and HTML files from language statistics 2 | *.yaml linguist-vendored 3 | *.yml linguist-vendored 4 | *.html linguist-vendored 5 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/wizenheimer/blaze 2 | 3 | go 1.24.2 4 | 5 | require ( 6 | github.com/RoaringBitmap/roaring v1.9.4 7 | github.com/kljensen/snowball v0.10.0 8 | ) 9 | 10 | require ( 11 | github.com/bits-and-blooms/bitset v1.12.0 // indirect 12 | github.com/mschoch/smat v0.2.0 // indirect 13 | ) 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | bin/ 8 | dist/ 9 | 10 | # Test binary, built with `go test -c` 11 | *.test 12 | 13 | # Output of the go coverage tool 14 | *.out 15 | coverage.html 16 | coverage.out 17 | 18 | # Go workspace file 19 | go.work 20 | go.work.sum 21 | 22 | # Dependency directories 23 | vendor/ 24 | 25 | # Go build cache 26 | .cache/ 27 | 28 | # IDEs and editors 29 | .vscode/ 30 | .idea/ 31 | *.swp 32 | *.swo 33 | *~ 34 | .DS_Store 35 | 36 | # Environment variables 37 | 
.env 38 | .env.local 39 | .env.*.local 40 | 41 | # Temporary files 42 | tmp/ 43 | temp/ 44 | *.tmp 45 | 46 | # Logs 47 | *.log 48 | 49 | # OS generated files 50 | Thumbs.db 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 wizenheimer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/static.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy static content to Pages 3 | on: 4 | # Runs on pushes targeting the default branch 5 | push: 6 | branches: ["main"] 7 | # Allows you to run this workflow manually from the Actions tab 8 | workflow_dispatch: 9 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 15 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 16 | concurrency: 17 | group: "pages" 18 | cancel-in-progress: false 19 | jobs: 20 | # Single deploy job since we're just deploying 21 | deploy: 22 | environment: 23 | name: github-pages 24 | url: ${{ steps.deployment.outputs.page_url }} 25 | runs-on: ubuntu-latest 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | - name: Setup Pages 30 | uses: actions/configure-pages@v5 31 | - name: Upload artifact 32 | uses: actions/upload-pages-artifact@v3 33 | with: 34 | # Upload only the public directory 35 | path: './public' 36 | - name: Deploy to GitHub Pages 37 | id: deployment 38 | uses: actions/deploy-pages@v4 39 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ= 2 | github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= 3 | github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= 4 | github.com/bits-and-blooms/bitset 
v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= 5 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 6 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 7 | github.com/kljensen/snowball v0.10.0 h1:8qgaBLraSuUVHtGH5tJ+VdGpqgfcaE2WkswL/C3nVhY= 8 | github.com/kljensen/snowball v0.10.0/go.mod h1:bJcxtur1W5Qw4fVj9tk5W88zyRcGQQjqahFErdcDTHk= 9 | github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= 10 | github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= 11 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 12 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 13 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 14 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 15 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 16 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 17 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= 18 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Color definitions 2 | RED := \033[0;31m 3 | GREEN := \033[0;32m 4 | YELLOW := \033[0;33m 5 | BLUE := \033[0;34m 6 | MAGENTA := \033[0;35m 7 | CYAN := \033[0;36m 8 | RESET := \033[0m 9 | 10 | # Project configuration 11 | PKG_NAME := blaze 12 | GO_FILES := $(shell find . 
-type f -name '*.go' -not -path "./vendor/*") 13 | 14 | .PHONY: all test clean fmt lint vet help deps tidy bench test-coverage check 15 | 16 | # Default target 17 | all: help 18 | 19 | # Run tests 20 | test: 21 | @echo "$(CYAN)Running tests...$(RESET)" 22 | @go test -v -race -coverprofile=coverage.out ./... 23 | @echo "$(GREEN)✓ Tests complete$(RESET)" 24 | 25 | # Run tests with coverage report 26 | test-coverage: test 27 | @echo "$(CYAN)Generating coverage report...$(RESET)" 28 | @go tool cover -html=coverage.out -o coverage.html 29 | @echo "$(GREEN)✓ Coverage report: coverage.html$(RESET)" 30 | 31 | # Run benchmarks 32 | bench: 33 | @echo "$(CYAN)Running benchmarks...$(RESET)" 34 | @go test -bench=. -benchmem ./... 35 | @echo "$(GREEN)✓ Benchmarks complete$(RESET)" 36 | 37 | # Format code 38 | fmt: 39 | @echo "$(CYAN)Formatting code...$(RESET)" 40 | @gofmt -s -w $(GO_FILES) 41 | @echo "$(GREEN)✓ Code formatted$(RESET)" 42 | 43 | # Lint code 44 | lint: 45 | @echo "$(CYAN)Running linter...$(RESET)" 46 | @if command -v golangci-lint >/dev/null 2>&1; then \ 47 | golangci-lint run ./...; \ 48 | echo "$(GREEN)✓ Linting complete$(RESET)"; \ 49 | else \ 50 | echo "$(YELLOW)⚠ golangci-lint not installed. Run: brew install golangci-lint$(RESET)"; \ 51 | fi 52 | 53 | # Run go vet 54 | vet: 55 | @echo "$(CYAN)Running go vet...$(RESET)" 56 | @go vet ./... 
57 | @echo "$(GREEN)✓ Vet complete$(RESET)" 58 | 59 | # Check for common issues 60 | check: fmt vet lint test 61 | @echo "$(GREEN)✓ All checks passed$(RESET)" 62 | 63 | # Download dependencies 64 | deps: 65 | @echo "$(CYAN)Downloading dependencies...$(RESET)" 66 | @go mod download 67 | @go mod verify 68 | @echo "$(GREEN)✓ Dependencies downloaded$(RESET)" 69 | 70 | # Tidy dependencies 71 | tidy: 72 | @echo "$(CYAN)Tidying dependencies...$(RESET)" 73 | @go mod tidy 74 | @echo "$(GREEN)✓ Dependencies tidied$(RESET)" 75 | 76 | # Clean generated files and build artifacts 77 | clean: 78 | @echo "$(CYAN)Cleaning...$(RESET)" 79 | @rm -f coverage.out coverage.html 80 | @echo "$(GREEN)✓ Clean complete$(RESET)" 81 | 82 | # Display help 83 | help: 84 | @echo "$(CYAN)" 85 | @echo "╔╗ ╦ ╔═╗╔═╗╔═╗" 86 | @echo "╠╩╗║ ╠═╣╔═╝║╣ " 87 | @echo "╚═╝╩═╝╩ ╩╚═╝╚═╝" 88 | @echo "$(RESET)" 89 | @echo "$(MAGENTA)Fast Search Index Library$(RESET)" 90 | @echo "" 91 | @echo "$(MAGENTA)═══════════════════════════════════════════════$(RESET)" 92 | @echo "" 93 | @echo "$(YELLOW)Development Commands:$(RESET)" 94 | @echo " $(GREEN)make fmt$(RESET) - Format Go code" 95 | @echo " $(GREEN)make vet$(RESET) - Run go vet" 96 | @echo " $(GREEN)make lint$(RESET) - Run golangci-lint" 97 | @echo " $(GREEN)make check$(RESET) - Run fmt, vet, lint, and test" 98 | @echo "" 99 | @echo "$(YELLOW)Testing Commands:$(RESET)" 100 | @echo " $(GREEN)make test$(RESET) - Run tests with race detector" 101 | @echo " $(GREEN)make test-coverage$(RESET) - Run tests and generate coverage report" 102 | @echo " $(GREEN)make bench$(RESET) - Run benchmarks" 103 | @echo "" 104 | @echo "$(YELLOW)Dependency Commands:$(RESET)" 105 | @echo " $(GREEN)make deps$(RESET) - Download dependencies" 106 | @echo " $(GREEN)make tidy$(RESET) - Tidy dependencies" 107 | @echo "" 108 | @echo "$(YELLOW)Utility Commands:$(RESET)" 109 | @echo " $(GREEN)make clean$(RESET) - Remove generated files and artifacts" 110 | @echo " $(GREEN)make help$(RESET) - 
Display this help message" 111 | @echo "" 112 | @echo "$(MAGENTA)═══════════════════════════════════════════════$(RESET)" 113 | 114 | -------------------------------------------------------------------------------- /query.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "github.com/RoaringBitmap/roaring" 5 | ) 6 | 7 | // ═══════════════════════════════════════════════════════════════════════════════ 8 | // QUERY BUILDER: Type-Safe Boolean Queries with Roaring Bitmaps 9 | // ═══════════════════════════════════════════════════════════════════════════════ 10 | // Instead of parsing strings like "machine AND learning", use a fluent API: 11 | // 12 | // EXAMPLE USAGE: 13 | // -------------- 14 | // Query: Find documents with "machine" AND "learning" 15 | // 16 | // results := NewQueryBuilder(index). 17 | // Term("machine"). 18 | // And(). 19 | // Term("learning"). 20 | // Execute() 21 | // 22 | // Query: Find documents with ("cat" OR "dog") but NOT "snake" 23 | // 24 | // results := NewQueryBuilder(index). 25 | // Group(func(q *QueryBuilder) { 26 | // q.Term("cat").Or().Term("dog") 27 | // }). 28 | // And().Not().Term("snake"). 29 | // Execute() 30 | // 31 | // WHY BUILDER PATTERN? 
32 | // -------------------- 33 | // ✓ Type-safe: Compiler catches errors 34 | // ✓ IDE-friendly: Auto-completion works 35 | // ✓ Fluent: Reads like natural language 36 | // ✓ Fast: Direct bitmap operations (no parsing overhead) 37 | // ✓ Composable: Easy to build complex queries programmatically 38 | // ═══════════════════════════════════════════════════════════════════════════════ 39 | 40 | // QueryBuilder provides a fluent interface for building boolean queries 41 | type QueryBuilder struct { 42 | index *InvertedIndex 43 | stack []*roaring.Bitmap // Stack of intermediate results 44 | ops []QueryOp // Stack of pending operations 45 | negate bool // Whether next term should be negated 46 | terms []string // Track terms for BM25 scoring 47 | } 48 | 49 | // QueryOp represents a pending boolean operation 50 | type QueryOp int 51 | 52 | const ( 53 | OpNone QueryOp = iota 54 | OpAnd 55 | OpOr 56 | ) 57 | 58 | // NewQueryBuilder creates a new query builder 59 | // 60 | // EXAMPLE: 61 | // -------- 62 | // 63 | // qb := NewQueryBuilder(index) 64 | // results := qb.Term("machine").And().Term("learning").Execute() 65 | func NewQueryBuilder(index *InvertedIndex) *QueryBuilder { 66 | return &QueryBuilder{ 67 | index: index, 68 | stack: make([]*roaring.Bitmap, 0), 69 | ops: make([]QueryOp, 0), 70 | negate: false, 71 | terms: make([]string, 0), 72 | } 73 | } 74 | 75 | // Term adds a term to the query 76 | // 77 | // WHAT IT DOES: 78 | // ------------- 79 | // 1. Gets the roaring bitmap for the term (instant document lookup) 80 | // 2. Applies any pending NOT operation 81 | // 3. Combines with previous results using AND/OR 82 | // 83 | // EXAMPLE: 84 | // -------- 85 | // 86 | // qb.Term("machine") // Find all docs with "machine" 87 | // 88 | // PERFORMANCE: 89 | // ------------ 90 | // O(1) bitmap lookup - no skip list traversal needed! 91 | func (qb *QueryBuilder) Term(term string) *QueryBuilder { 92 | // Analyze the term (lowercase, stem, etc.) 
93 | tokens := Analyze(term) 94 | if len(tokens) == 0 { 95 | // Empty term - push empty bitmap 96 | qb.pushBitmap(roaring.NewBitmap()) 97 | return qb 98 | } 99 | 100 | // Track term for BM25 scoring (if not negated) 101 | analyzedTerm := tokens[0] 102 | if !qb.negate { 103 | qb.terms = append(qb.terms, analyzedTerm) 104 | } 105 | 106 | // Get bitmap for the analyzed term 107 | bitmap := qb.getTermBitmap(analyzedTerm) 108 | 109 | // Apply negation if needed 110 | if qb.negate { 111 | bitmap = qb.negateBitmap(bitmap) 112 | qb.negate = false 113 | } 114 | 115 | qb.pushBitmap(bitmap) 116 | return qb 117 | } 118 | 119 | // Phrase adds a phrase query (exact sequence of words) 120 | // 121 | // WHAT IT DOES: 122 | // ------------- 123 | // 1. Analyzes the phrase (just like during indexing) 124 | // 2. Uses skip lists to find exact phrase matches 125 | // 3. Converts results to a bitmap for boolean operations 126 | // 127 | // EXAMPLE: 128 | // -------- 129 | // 130 | // qb.Phrase("machine learning") // Find exact phrase 131 | // 132 | // NOTE: Phrase queries need position information, so we use skip lists 133 | func (qb *QueryBuilder) Phrase(phrase string) *QueryBuilder { 134 | // Analyze the phrase to match what was indexed 135 | // This converts "Machine Learning" to "machin learn" etc. 136 | tokens := Analyze(phrase) 137 | if len(tokens) == 0 { 138 | qb.pushBitmap(roaring.NewBitmap()) 139 | return qb 140 | } 141 | 142 | // Track terms for BM25 scoring (if not negated) 143 | if !qb.negate { 144 | qb.terms = append(qb.terms, tokens...) 
145 | } 146 | 147 | // Reconstruct the analyzed phrase 148 | analyzedPhrase := "" 149 | for i, token := range tokens { 150 | if i > 0 { 151 | analyzedPhrase += " " 152 | } 153 | analyzedPhrase += token 154 | } 155 | 156 | // Use existing phrase search from skip lists 157 | matches := qb.index.FindAllPhrases(analyzedPhrase, BOFDocument) 158 | 159 | // Convert to bitmap 160 | bitmap := roaring.NewBitmap() 161 | for _, match := range matches { 162 | if !match[0].IsEnd() { 163 | bitmap.Add(uint32(match[0].GetDocumentID())) 164 | } 165 | } 166 | 167 | // Apply negation if needed 168 | if qb.negate { 169 | bitmap = qb.negateBitmap(bitmap) 170 | qb.negate = false 171 | } 172 | 173 | qb.pushBitmap(bitmap) 174 | return qb 175 | } 176 | 177 | // And adds an AND operation 178 | // 179 | // EXAMPLE: 180 | // -------- 181 | // 182 | // qb.Term("machine").And().Term("learning") 183 | // // Returns docs with BOTH "machine" AND "learning" 184 | // 185 | // PERFORMANCE: 186 | // ------------ 187 | // Roaring bitmap intersection: O(1) for compressed chunks 188 | func (qb *QueryBuilder) And() *QueryBuilder { 189 | qb.ops = append(qb.ops, OpAnd) 190 | return qb 191 | } 192 | 193 | // Or adds an OR operation 194 | // 195 | // EXAMPLE: 196 | // -------- 197 | // 198 | // qb.Term("cat").Or().Term("dog") 199 | // // Returns docs with "cat" OR "dog" (or both) 200 | // 201 | // PERFORMANCE: 202 | // ------------ 203 | // Roaring bitmap union: O(1) for compressed chunks 204 | func (qb *QueryBuilder) Or() *QueryBuilder { 205 | qb.ops = append(qb.ops, OpOr) 206 | return qb 207 | } 208 | 209 | // Not negates the next term 210 | // 211 | // EXAMPLE: 212 | // -------- 213 | // 214 | // qb.Term("python").And().Not().Term("snake") 215 | // // Returns docs with "python" but NOT "snake" 216 | // 217 | // PERFORMANCE: 218 | // ------------ 219 | // Roaring bitmap difference: O(1) for compressed chunks 220 | func (qb *QueryBuilder) Not() *QueryBuilder { 221 | qb.negate = true 222 | return qb 223 | } 
224 | 225 | // Group creates a sub-query with its own scope 226 | // 227 | // EXAMPLE: 228 | // -------- 229 | // 230 | // qb.Group(func(q *QueryBuilder) { 231 | // q.Term("cat").Or().Term("dog") 232 | // }).And().Term("pet") 233 | // // Returns: (cat OR dog) AND pet 234 | // 235 | // USE CASE: Control operator precedence 236 | func (qb *QueryBuilder) Group(fn func(*QueryBuilder)) *QueryBuilder { 237 | // Create a new sub-query 238 | subQuery := NewQueryBuilder(qb.index) 239 | 240 | // Execute the group function 241 | fn(subQuery) 242 | 243 | // Get the result from the sub-query 244 | result := subQuery.Execute() 245 | 246 | // Apply negation if needed 247 | if qb.negate { 248 | result = qb.negateBitmap(result) 249 | qb.negate = false 250 | } 251 | 252 | qb.pushBitmap(result) 253 | return qb 254 | } 255 | 256 | // Execute runs the query and returns matching document IDs as a bitmap 257 | // 258 | // ALGORITHM: 259 | // ---------- 260 | // 1. Process all terms and operations in order 261 | // 2. Apply AND/OR operations using roaring bitmap operations 262 | // 3. Return final bitmap of matching documents 263 | // 264 | // EXAMPLE: 265 | // -------- 266 | // 267 | // qb := NewQueryBuilder(index) 268 | // results := qb.Term("machine").And().Term("learning").Execute() 269 | // // results is a roaring.Bitmap with doc IDs 270 | // 271 | // PERFORMANCE: 272 | // ------------ 273 | // All operations use optimized roaring bitmap operations: 274 | // - AND: bitmap intersection (fast!) 275 | // - OR: bitmap union (fast!) 276 | // - NOT: bitmap difference (fast!) 
277 | func (qb *QueryBuilder) Execute() *roaring.Bitmap { 278 | if len(qb.stack) == 0 { 279 | return roaring.NewBitmap() 280 | } 281 | 282 | // Process the stack with operations 283 | result := qb.stack[0] 284 | for i := 1; i < len(qb.stack); i++ { 285 | if i-1 < len(qb.ops) { 286 | op := qb.ops[i-1] 287 | switch op { 288 | case OpAnd: 289 | // Intersection: docs in BOTH bitmaps 290 | result = roaring.And(result, qb.stack[i]) 291 | case OpOr: 292 | // Union: docs in EITHER bitmap 293 | result = roaring.Or(result, qb.stack[i]) 294 | } 295 | } 296 | } 297 | 298 | return result 299 | } 300 | 301 | // ExecuteWithBM25 runs the query and returns ranked results using BM25 302 | // 303 | // ALGORITHM: 304 | // ---------- 305 | // 1. Execute boolean query → Get bitmap of matching docs 306 | // 2. Extract terms from the query 307 | // 3. Calculate BM25 score for each matching document 308 | // 4. Sort by score and return top K 309 | // 310 | // EXAMPLE: 311 | // -------- 312 | // 313 | // qb := NewQueryBuilder(index) 314 | // matches := qb.Term("machine").And().Term("learning"). 
315 | // ExecuteWithBM25(10) 316 | // // Returns top 10 matches sorted by BM25 score 317 | func (qb *QueryBuilder) ExecuteWithBM25(maxResults int) []Match { 318 | // Execute boolean query 319 | resultBitmap := qb.Execute() 320 | 321 | // Extract terms for BM25 scoring 322 | terms := qb.extractTerms() 323 | 324 | // Score each matching document 325 | var results []Match 326 | iter := resultBitmap.Iterator() 327 | for iter.HasNext() { 328 | docID := int(iter.Next()) 329 | score := qb.index.calculateBM25Score(docID, terms) 330 | 331 | if score > 0 { 332 | results = append(results, Match{ 333 | DocID: docID, 334 | Score: score, 335 | }) 336 | } 337 | } 338 | 339 | // Sort by score (descending) 340 | qb.index.sortMatchesByScore(results) 341 | 342 | // Return top K 343 | return limitResults(results, maxResults) 344 | } 345 | 346 | // ═══════════════════════════════════════════════════════════════════════════════ 347 | // INTERNAL HELPER METHODS 348 | // ═══════════════════════════════════════════════════════════════════════════════ 349 | 350 | // getTermBitmap retrieves the roaring bitmap for a term 351 | func (qb *QueryBuilder) getTermBitmap(term string) *roaring.Bitmap { 352 | if bitmap, exists := qb.index.DocBitmaps[term]; exists { 353 | return bitmap.Clone() // Clone to avoid modifying original 354 | } 355 | return roaring.NewBitmap() // Empty bitmap if term not found 356 | } 357 | 358 | // negateBitmap returns all documents EXCEPT those in the bitmap 359 | func (qb *QueryBuilder) negateBitmap(bitmap *roaring.Bitmap) *roaring.Bitmap { 360 | // Create bitmap of all documents 361 | allDocs := roaring.NewBitmap() 362 | for docID := range qb.index.DocStats { 363 | allDocs.Add(uint32(docID)) 364 | } 365 | 366 | // Return difference: all docs - bitmap 367 | return roaring.AndNot(allDocs, bitmap) 368 | } 369 | 370 | // pushBitmap pushes a bitmap onto the stack 371 | func (qb *QueryBuilder) pushBitmap(bitmap *roaring.Bitmap) { 372 | qb.stack = append(qb.stack, bitmap) 373 | 
} 374 | 375 | // extractTerms extracts all terms used in the query for BM25 scoring 376 | func (qb *QueryBuilder) extractTerms() []string { 377 | return qb.terms 378 | } 379 | 380 | // ═══════════════════════════════════════════════════════════════════════════════ 381 | // CONVENIENCE METHODS FOR COMMON PATTERNS 382 | // ═══════════════════════════════════════════════════════════════════════════════ 383 | 384 | // AllOf finds documents containing ALL of the given terms (AND) 385 | // 386 | // EXAMPLE: 387 | // -------- 388 | // 389 | // results := AllOf(index, "machine", "learning", "python") 390 | // // Same as: Term("machine").And().Term("learning").And().Term("python") 391 | func AllOf(index *InvertedIndex, terms ...string) *roaring.Bitmap { 392 | if len(terms) == 0 { 393 | return roaring.NewBitmap() 394 | } 395 | 396 | qb := NewQueryBuilder(index).Term(terms[0]) 397 | for i := 1; i < len(terms); i++ { 398 | qb.And().Term(terms[i]) 399 | } 400 | return qb.Execute() 401 | } 402 | 403 | // AnyOf finds documents containing ANY of the given terms (OR) 404 | // 405 | // EXAMPLE: 406 | // -------- 407 | // 408 | // results := AnyOf(index, "cat", "dog", "bird") 409 | // // Same as: Term("cat").Or().Term("dog").Or().Term("bird") 410 | func AnyOf(index *InvertedIndex, terms ...string) *roaring.Bitmap { 411 | if len(terms) == 0 { 412 | return roaring.NewBitmap() 413 | } 414 | 415 | qb := NewQueryBuilder(index).Term(terms[0]) 416 | for i := 1; i < len(terms); i++ { 417 | qb.Or().Term(terms[i]) 418 | } 419 | return qb.Execute() 420 | } 421 | 422 | // TermExcluding finds documents with a term but excluding another 423 | // 424 | // EXAMPLE: 425 | // -------- 426 | // 427 | // results := TermExcluding(index, "python", "snake") 428 | // // Same as: Term("python").And().Not().Term("snake") 429 | func TermExcluding(index *InvertedIndex, include, exclude string) *roaring.Bitmap { 430 | return NewQueryBuilder(index). 431 | Term(include). 432 | And().Not().Term(exclude). 
433 | Execute() 434 | } 435 | -------------------------------------------------------------------------------- /index_test.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | // ═══════════════════════════════════════════════════════════════════════════════ 8 | // INVERTED INDEX CREATION TESTS 9 | // ═══════════════════════════════════════════════════════════════════════════════ 10 | 11 | func TestNewInvertedIndex(t *testing.T) { 12 | idx := NewInvertedIndex() 13 | 14 | if idx == nil { 15 | t.Fatal("NewInvertedIndex() returned nil") 16 | } 17 | 18 | if idx.PostingsList == nil { 19 | t.Error("PostingsList is nil") 20 | } 21 | 22 | if len(idx.PostingsList) != 0 { 23 | t.Errorf("New index has %d entries, want 0", len(idx.PostingsList)) 24 | } 25 | } 26 | 27 | // ═══════════════════════════════════════════════════════════════════════════════ 28 | // INDEXING TESTS 29 | // ═══════════════════════════════════════════════════════════════════════════════ 30 | 31 | func TestInvertedIndex_Index_SingleDocument(t *testing.T) { 32 | idx := NewInvertedIndex() 33 | 34 | // Index a simple document 35 | idx.Index(1, "quick brown fox") 36 | 37 | // Verify tokens were indexed 38 | tokens := []string{"quick", "brown", "fox"} 39 | for _, token := range tokens { 40 | if _, exists := idx.PostingsList[token]; !exists { 41 | t.Errorf("Token %q was not indexed", token) 42 | } 43 | } 44 | } 45 | 46 | func TestInvertedIndex_Index_MultipleDocuments(t *testing.T) { 47 | idx := NewInvertedIndex() 48 | 49 | // Index multiple documents 50 | idx.Index(1, "quick brown fox") 51 | idx.Index(2, "sleepy dog") 52 | idx.Index(3, "quick brown cats") 53 | 54 | // Check that all unique tokens are indexed (after stemming) 55 | expectedTokens := map[string]bool{ 56 | "quick": true, 57 | "brown": true, 58 | "fox": true, 59 | "sleepi": true, // stemmed from "sleepy" 60 | "dog": true, 61 | "cat": true, // stemmed from 
"cats"
	}

	for token := range expectedTokens {
		if _, exists := idx.PostingsList[token]; !exists {
			t.Errorf("Token %q was not indexed", token)
		}
	}
}

// TestInvertedIndex_Index_DuplicateWords verifies that a token occurring
// twice in one document yields two postings.
func TestInvertedIndex_Index_DuplicateWords(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick quick brown")

	skipList, exists := index.PostingsList["quick"]
	if !exists {
		t.Fatal("Token 'quick' was not indexed")
	}

	// Walk the skip list and count postings; the iterator starts positioned
	// on the first element, so that element is counted before advancing.
	occurrences := 0
	it := skipList.Iterator()
	if it.current != nil {
		occurrences++
	}
	for it.HasNext() {
		it.Next()
		occurrences++
	}

	if occurrences != 2 {
		t.Errorf("Token 'quick' has %d occurrences, want 2", occurrences)
	}
}

// TestInvertedIndex_Index_EmptyDocument verifies indexing "" creates no tokens.
func TestInvertedIndex_Index_EmptyDocument(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "")

	if got := len(index.PostingsList); got != 0 {
		t.Errorf("Empty document created %d tokens, want 0", got)
	}
}

// TestInvertedIndex_Index_StopWords verifies stop words are dropped by the
// analyzer while content words are kept.
func TestInvertedIndex_Index_StopWords(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "the quick brown fox")

	if _, exists := index.PostingsList["the"]; exists {
		t.Error("Stop word 'the' should not be indexed")
	}
	if _, exists := index.PostingsList["quick"]; !exists {
		t.Error("Token 'quick' should be indexed")
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// FIRST OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

func TestInvertedIndex_First_SingleOccurrence(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, err := index.First("quick")
	if err != nil {
		t.Fatalf("First() error = %v, want nil", err)
	}
	if got.GetDocumentID() != 1 {
		t.Errorf("First() document = %d, want 1", got.GetDocumentID())
	}
	if got.GetOffset() != 0 {
		t.Errorf("First() offset = %d, want 0", got.GetOffset())
	}
}

func TestInvertedIndex_First_MultipleOccurrences(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "brown fox")
	index.Index(2, "quick brown")
	index.Index(3, "brown dog")

	got, err := index.First("brown")
	if err != nil {
		t.Fatalf("First() error = %v, want nil", err)
	}

	// The earliest posting overall is Doc1, offset 0.
	if got.GetDocumentID() != 1 || got.GetOffset() != 0 {
		t.Errorf("First() = Doc%d:Pos%d, want Doc1:Pos0",
			got.GetDocumentID(), got.GetOffset())
	}
}

func TestInvertedIndex_First_NotFound(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	if _, err := index.First("elephant"); err != ErrNoPostingList {
		t.Errorf("First() error = %v, want %v", err, ErrNoPostingList)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// LAST OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

func TestInvertedIndex_Last_SingleOccurrence(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, err := index.Last("fox")
	if err != nil {
		t.Fatalf("Last() error = %v, want nil", err)
	}
	if got.GetDocumentID() != 1 || got.GetOffset() != 2 {
		t.Errorf("Last() = Doc%d:Pos%d, want Doc1:Pos2",
			got.GetDocumentID(), got.GetOffset())
	}
}

func TestInvertedIndex_Last_MultipleOccurrences(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "brown fox")
	index.Index(2, "quick brown")
	index.Index(3, "brown dog")

	got, err := index.Last("brown")
	if err != nil {
		t.Fatalf("Last() error = %v, want nil", err)
	}

	// The latest posting overall is Doc3, offset 0.
	if got.GetDocumentID() != 3 || got.GetOffset() != 0 {
		t.Errorf("Last() = Doc%d:Pos%d, want Doc3:Pos0",
			got.GetDocumentID(), got.GetOffset())
	}
}

func TestInvertedIndex_Last_NotFound(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	if _, err := index.Last("elephant"); err != ErrNoPostingList {
		t.Errorf("Last() error = %v, want %v", err, ErrNoPostingList)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// NEXT OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// Next from the beginning-of-file sentinel should behave like First.
func TestInvertedIndex_Next_FromBeginning(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, err := index.Next("quick", BOFDocument)
	if err != nil {
		t.Fatalf("Next() error = %v, want nil", err)
	}
	if got.GetDocumentID() != 1 || got.GetOffset() != 0 {
		t.Errorf("Next() = Doc%d:Pos%d, want Doc1:Pos0",
			got.GetDocumentID(), got.GetOffset())
	}
}

// Walking forward with Next visits postings in document order and finally
// yields the EOF sentinel.
func TestInvertedIndex_Next_MultipleOccurrences(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")
	index.Index(2, "quick dog")
	index.Index(3, "lazy quick")

	first, _ := index.Next("quick", BOFDocument)
	if first.GetDocumentID() != 1 {
		t.Errorf("First occurrence in Doc%d, want Doc1", first.GetDocumentID())
	}

	second, _ := index.Next("quick", first)
	if second.GetDocumentID() != 2 {
		t.Errorf("Second occurrence in Doc%d, want Doc2", second.GetDocumentID())
	}

	third, _ := index.Next("quick", second)
	if third.GetDocumentID() != 3 {
		t.Errorf("Third occurrence in Doc%d, want Doc3", third.GetDocumentID())
	}

	end, _ := index.Next("quick", third)
	if !end.IsEnd() {
		t.Error("Next() should return EOF after last occurrence")
	}
}

func TestInvertedIndex_Next_FromEOF(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, _ := index.Next("quick", EOFDocument)
	if !got.IsEnd() {
		t.Error("Next() from EOF should return EOF")
	}
}

func TestInvertedIndex_Next_NotFound(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	if _, err := index.Next("elephant", BOFDocument); err != ErrNoPostingList {
		t.Errorf("Next() error = %v, want %v", err, ErrNoPostingList)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// PREVIOUS OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// Previous from the end-of-file sentinel should behave like Last.
func TestInvertedIndex_Previous_FromEnd(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, err := index.Previous("fox", EOFDocument)
	if err != nil {
		t.Fatalf("Previous() error = %v, want nil", err)
	}
	if got.GetDocumentID() != 1 || got.GetOffset() != 2 {
		t.Errorf("Previous() = Doc%d:Pos%d, want Doc1:Pos2",
			got.GetDocumentID(), got.GetOffset())
	}
}

// Walking backward with Previous visits postings in reverse document order
// and finally yields the BOF sentinel.
func TestInvertedIndex_Previous_MultipleOccurrences(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")
	index.Index(2, "quick dog")
	index.Index(3, "lazy quick")

	third, _ := index.Previous("quick", EOFDocument)
	if third.GetDocumentID() != 3 {
		t.Errorf("Last occurrence in Doc%d, want Doc3", third.GetDocumentID())
	}

	second, _ := index.Previous("quick", third)
	if second.GetDocumentID() != 2 {
		t.Errorf("Second-to-last occurrence in Doc%d, want Doc2", second.GetDocumentID())
	}

	first, _ := index.Previous("quick", second)
	if first.GetDocumentID() != 1 {
		t.Errorf("First occurrence in Doc%d, want Doc1", first.GetDocumentID())
	}

	begin, _ := index.Previous("quick", first)
	if !begin.IsBeginning() {
		t.Error("Previous() should return BOF before first occurrence")
	}
}

func TestInvertedIndex_Previous_FromBOF(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, _ := index.Previous("quick", BOFDocument)
	if !got.IsBeginning() {
		t.Error("Previous() from BOF should return BOF")
	}
}

func TestInvertedIndex_Previous_NotFound(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	if _, err := index.Previous("elephant", EOFDocument); err != ErrNoPostingList {
		t.Errorf("Previous() error = %v, want %v", err, ErrNoPostingList)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// INTEGRATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

func TestInvertedIndex_ComplexScenario(t *testing.T) {
	idx := NewInvertedIndex()

	// Index multiple documents with overlapping vocabulary
	idx.Index(1, "the quick brown fox jumps over the lazy dog")
idx.Index(2, "the lazy brown dog sleeps") 377 | idx.Index(3, "quick brown foxes are clever") 378 | 379 | // Test 1: Verify "brown" appears in all three documents 380 | brownDocs := []int{} 381 | pos, _ := idx.First("brown") 382 | brownDocs = append(brownDocs, pos.GetDocumentID()) 383 | 384 | for !pos.IsEnd() { 385 | pos, _ = idx.Next("brown", pos) 386 | if !pos.IsEnd() { 387 | brownDocs = append(brownDocs, pos.GetDocumentID()) 388 | } 389 | } 390 | 391 | expectedDocs := []int{1, 2, 3} 392 | if len(brownDocs) != len(expectedDocs) { 393 | t.Errorf("Found 'brown' in %d documents, want %d", len(brownDocs), len(expectedDocs)) 394 | } 395 | 396 | for i, docID := range brownDocs { 397 | if docID != expectedDocs[i] { 398 | t.Errorf("Document %d: got Doc%d, want Doc%d", i, docID, expectedDocs[i]) 399 | } 400 | } 401 | 402 | // Test 2: Verify "quick" only appears in Doc1 and Doc3 403 | quickDocs := []int{} 404 | pos, _ = idx.First("quick") 405 | quickDocs = append(quickDocs, pos.GetDocumentID()) 406 | 407 | pos, _ = idx.Next("quick", pos) 408 | if !pos.IsEnd() { 409 | quickDocs = append(quickDocs, pos.GetDocumentID()) 410 | } 411 | 412 | expectedQuickDocs := []int{1, 3} 413 | if len(quickDocs) != len(expectedQuickDocs) { 414 | t.Errorf("Found 'quick' in %d documents, want %d", len(quickDocs), len(expectedQuickDocs)) 415 | } 416 | } 417 | 418 | func TestInvertedIndex_PositionOrdering(t *testing.T) { 419 | idx := NewInvertedIndex() 420 | 421 | // Index document where same word appears multiple times 422 | idx.Index(1, "fox fox fox") 423 | 424 | // Get all positions 425 | var positions []int 426 | pos, _ := idx.First("fox") 427 | positions = append(positions, pos.GetOffset()) 428 | 429 | for !pos.IsEnd() { 430 | pos, _ = idx.Next("fox", pos) 431 | if !pos.IsEnd() { 432 | positions = append(positions, pos.GetOffset()) 433 | } 434 | } 435 | 436 | // Verify positions are in order: 0, 1, 2 437 | expected := []int{0, 1, 2} 438 | if len(positions) != len(expected) { 439 | 
t.Fatalf("Found %d positions, want %d", len(positions), len(expected)) 440 | } 441 | 442 | for i, offset := range positions { 443 | if offset != expected[i] { 444 | t.Errorf("Position %d: offset = %d, want %d", i, offset, expected[i]) 445 | } 446 | } 447 | } 448 | 449 | // ═══════════════════════════════════════════════════════════════════════════════ 450 | // CONCURRENCY TESTS 451 | // ═══════════════════════════════════════════════════════════════════════════════ 452 | 453 | func TestInvertedIndex_ConcurrentIndexing(t *testing.T) { 454 | idx := NewInvertedIndex() 455 | 456 | // Index documents concurrently 457 | done := make(chan bool, 3) 458 | 459 | go func() { 460 | idx.Index(1, "quick brown fox") 461 | done <- true 462 | }() 463 | 464 | go func() { 465 | idx.Index(2, "sleepy dog") 466 | done <- true 467 | }() 468 | 469 | go func() { 470 | idx.Index(3, "quick brown cats") 471 | done <- true 472 | }() 473 | 474 | // Wait for all goroutines to complete 475 | <-done 476 | <-done 477 | <-done 478 | 479 | // Verify all documents were indexed (checking stemmed tokens) 480 | tokens := []string{"quick", "brown", "fox", "sleepi", "dog", "cat"} 481 | for _, token := range tokens { 482 | if _, exists := idx.PostingsList[token]; !exists { 483 | t.Errorf("Token %q was not indexed (concurrent indexing issue)", token) 484 | } 485 | } 486 | } 487 | -------------------------------------------------------------------------------- /query_test.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/RoaringBitmap/roaring" 7 | ) 8 | 9 | // ═══════════════════════════════════════════════════════════════════════════════ 10 | // QUERY BUILDER TESTS 11 | // ═══════════════════════════════════════════════════════════════════════════════ 12 | 13 | // setupTestIndex creates a test index with sample documents 14 | func setupTestIndex() *InvertedIndex { 15 | idx := NewInvertedIndex() 16 | 17 | 
// Document 1: "machine learning is fun" 18 | idx.Index(1, "machine learning is fun") 19 | 20 | // Document 2: "deep learning and machine learning" 21 | idx.Index(2, "deep learning and machine learning") 22 | 23 | // Document 3: "python programming is great" 24 | idx.Index(3, "python programming is great") 25 | 26 | // Document 4: "machine learning with python" 27 | idx.Index(4, "machine learning with python") 28 | 29 | // Document 5: "cats and dogs are pets" 30 | idx.Index(5, "cats and dogs are pets") 31 | 32 | return idx 33 | } 34 | 35 | // TestQueryBuilder_SingleTerm tests querying for a single term 36 | func TestQueryBuilder_SingleTerm(t *testing.T) { 37 | idx := setupTestIndex() 38 | 39 | // Query: Find documents with "machine" 40 | results := NewQueryBuilder(idx). 41 | Term("machine"). 42 | Execute() 43 | 44 | // Should match docs 1, 2, 4 45 | expected := []int{1, 2, 4} 46 | actual := bitmapToSlice(results) 47 | 48 | if !slicesEqual(actual, expected) { 49 | t.Errorf("Expected docs %v, got %v", expected, actual) 50 | } 51 | } 52 | 53 | // TestQueryBuilder_And tests AND operation 54 | func TestQueryBuilder_And(t *testing.T) { 55 | idx := setupTestIndex() 56 | 57 | // Query: Find documents with "machine" AND "python" 58 | results := NewQueryBuilder(idx). 59 | Term("machine"). 60 | And(). 61 | Term("python"). 62 | Execute() 63 | 64 | // Should match only doc 4 65 | expected := []int{4} 66 | actual := bitmapToSlice(results) 67 | 68 | if !slicesEqual(actual, expected) { 69 | t.Errorf("Expected docs %v, got %v", expected, actual) 70 | } 71 | } 72 | 73 | // TestQueryBuilder_Or tests OR operation 74 | func TestQueryBuilder_Or(t *testing.T) { 75 | idx := setupTestIndex() 76 | 77 | // Query: Find documents with "cats" OR "dogs" 78 | results := NewQueryBuilder(idx). 79 | Term("cats"). 80 | Or(). 81 | Term("dogs"). 
82 | Execute() 83 | 84 | // Should match doc 5 (which contains both) 85 | expected := []int{5} 86 | actual := bitmapToSlice(results) 87 | 88 | if !slicesEqual(actual, expected) { 89 | t.Errorf("Expected docs %v, got %v", expected, actual) 90 | } 91 | } 92 | 93 | // TestQueryBuilder_Not tests NOT operation 94 | func TestQueryBuilder_Not(t *testing.T) { 95 | idx := setupTestIndex() 96 | 97 | // Query: Find documents with "learning" but NOT "deep" 98 | results := NewQueryBuilder(idx). 99 | Term("learning"). 100 | And().Not(). 101 | Term("deep"). 102 | Execute() 103 | 104 | // Should match docs 1, 4 (not 2, which has "deep") 105 | expected := []int{1, 4} 106 | actual := bitmapToSlice(results) 107 | 108 | if !slicesEqual(actual, expected) { 109 | t.Errorf("Expected docs %v, got %v", expected, actual) 110 | } 111 | } 112 | 113 | // TestQueryBuilder_ComplexQuery tests a complex boolean query 114 | func TestQueryBuilder_ComplexQuery(t *testing.T) { 115 | idx := setupTestIndex() 116 | 117 | // Query: (machine OR python) AND learning 118 | results := NewQueryBuilder(idx). 119 | Group(func(q *QueryBuilder) { 120 | q.Term("machine").Or().Term("python") 121 | }). 122 | And(). 123 | Term("learning"). 124 | Execute() 125 | 126 | // Should match docs 1, 2, 4 127 | // Doc 1: has machine and learning 128 | // Doc 2: has machine and learning 129 | // Doc 3: has python but no learning 130 | // Doc 4: has machine, python, and learning 131 | expected := []int{1, 2, 4} 132 | actual := bitmapToSlice(results) 133 | 134 | if !slicesEqual(actual, expected) { 135 | t.Errorf("Expected docs %v, got %v", expected, actual) 136 | } 137 | } 138 | 139 | // TestQueryBuilder_Phrase tests phrase query 140 | func TestQueryBuilder_Phrase(t *testing.T) { 141 | idx := setupTestIndex() 142 | 143 | // Query: Find exact phrase "machine learning" 144 | results := NewQueryBuilder(idx). 145 | Phrase("machine learning"). 
146 | Execute() 147 | 148 | // Should match docs 1, 2, 4 149 | expected := []int{1, 2, 4} 150 | actual := bitmapToSlice(results) 151 | 152 | if !slicesEqual(actual, expected) { 153 | t.Errorf("Expected docs %v, got %v", expected, actual) 154 | } 155 | } 156 | 157 | // TestQueryBuilder_PhraseWithBoolean tests combining phrase and boolean 158 | func TestQueryBuilder_PhraseWithBoolean(t *testing.T) { 159 | idx := setupTestIndex() 160 | 161 | // Query: "machine learning" AND python 162 | results := NewQueryBuilder(idx). 163 | Phrase("machine learning"). 164 | And(). 165 | Term("python"). 166 | Execute() 167 | 168 | // Should match only doc 4 169 | expected := []int{4} 170 | actual := bitmapToSlice(results) 171 | 172 | if !slicesEqual(actual, expected) { 173 | t.Errorf("Expected docs %v, got %v", expected, actual) 174 | } 175 | } 176 | 177 | // TestQueryBuilder_ExecuteWithBM25 tests BM25 scoring 178 | func TestQueryBuilder_ExecuteWithBM25(t *testing.T) { 179 | idx := setupTestIndex() 180 | 181 | // Query: machine AND learning (with BM25 scoring) 182 | results := NewQueryBuilder(idx). 183 | Term("machine"). 184 | And(). 185 | Term("learning"). 
186 | ExecuteWithBM25(10) 187 | 188 | // Should return docs with positive scores 189 | if len(results) == 0 { 190 | t.Error("Expected BM25 results, got none") 191 | } 192 | 193 | // All results should have positive scores 194 | for _, match := range results { 195 | if match.Score <= 0 { 196 | t.Errorf("Expected positive score, got %f", match.Score) 197 | } 198 | } 199 | 200 | // Results should be sorted by score (descending) 201 | for i := 1; i < len(results); i++ { 202 | if results[i].Score > results[i-1].Score { 203 | t.Errorf("Results not sorted: score[%d]=%.2f > score[%d]=%.2f", 204 | i, results[i].Score, i-1, results[i-1].Score) 205 | } 206 | } 207 | } 208 | 209 | // TestQueryBuilder_EmptyQuery tests empty query 210 | func TestQueryBuilder_EmptyQuery(t *testing.T) { 211 | idx := setupTestIndex() 212 | 213 | // Empty query should return no results 214 | results := NewQueryBuilder(idx).Execute() 215 | 216 | if results.GetCardinality() != 0 { 217 | t.Errorf("Expected 0 results for empty query, got %d", results.GetCardinality()) 218 | } 219 | } 220 | 221 | // TestQueryBuilder_NonExistentTerm tests querying for non-existent term 222 | func TestQueryBuilder_NonExistentTerm(t *testing.T) { 223 | idx := setupTestIndex() 224 | 225 | // Query for a term that doesn't exist 226 | results := NewQueryBuilder(idx). 227 | Term("quantum"). 228 | Execute() 229 | 230 | if results.GetCardinality() != 0 { 231 | t.Errorf("Expected 0 results for non-existent term, got %d", results.GetCardinality()) 232 | } 233 | } 234 | 235 | // TestQueryBuilder_MultipleAnds tests chaining multiple AND operations 236 | func TestQueryBuilder_MultipleAnds(t *testing.T) { 237 | idx := setupTestIndex() 238 | 239 | // Query: machine AND learning AND python 240 | results := NewQueryBuilder(idx). 241 | Term("machine"). 242 | And().Term("learning"). 243 | And().Term("python"). 
244 | Execute() 245 | 246 | // Should match only doc 4 247 | expected := []int{4} 248 | actual := bitmapToSlice(results) 249 | 250 | if !slicesEqual(actual, expected) { 251 | t.Errorf("Expected docs %v, got %v", expected, actual) 252 | } 253 | } 254 | 255 | // TestQueryBuilder_MultipleOrs tests chaining multiple OR operations 256 | func TestQueryBuilder_MultipleOrs(t *testing.T) { 257 | idx := setupTestIndex() 258 | 259 | // Query: cats OR dogs OR pets 260 | results := NewQueryBuilder(idx). 261 | Term("cats"). 262 | Or().Term("dogs"). 263 | Or().Term("pets"). 264 | Execute() 265 | 266 | // Should match doc 5 267 | expected := []int{5} 268 | actual := bitmapToSlice(results) 269 | 270 | if !slicesEqual(actual, expected) { 271 | t.Errorf("Expected docs %v, got %v", expected, actual) 272 | } 273 | } 274 | 275 | // TestQueryBuilder_NestedGroups tests nested group operations 276 | func TestQueryBuilder_NestedGroups(t *testing.T) { 277 | idx := setupTestIndex() 278 | 279 | // Query: ((machine OR deep) AND learning) AND NOT python 280 | results := NewQueryBuilder(idx). 281 | Group(func(q *QueryBuilder) { 282 | q.Group(func(qq *QueryBuilder) { 283 | qq.Term("machine").Or().Term("deep") 284 | }).And().Term("learning") 285 | }). 286 | And().Not().Term("python"). 
287 | Execute() 288 | 289 | // Should match docs 1, 2 (not 4 which has python) 290 | expected := []int{1, 2} 291 | actual := bitmapToSlice(results) 292 | 293 | if !slicesEqual(actual, expected) { 294 | t.Errorf("Expected docs %v, got %v", expected, actual) 295 | } 296 | } 297 | 298 | // ═══════════════════════════════════════════════════════════════════════════════ 299 | // CONVENIENCE FUNCTION TESTS 300 | // ═══════════════════════════════════════════════════════════════════════════════ 301 | 302 | // TestAllOf tests AllOf convenience function 303 | func TestAllOf(t *testing.T) { 304 | idx := setupTestIndex() 305 | 306 | // Find docs with machine, learning, and python 307 | results := AllOf(idx, "machine", "learning", "python") 308 | 309 | expected := []int{4} 310 | actual := bitmapToSlice(results) 311 | 312 | if !slicesEqual(actual, expected) { 313 | t.Errorf("Expected docs %v, got %v", expected, actual) 314 | } 315 | } 316 | 317 | // TestAnyOf tests AnyOf convenience function 318 | func TestAnyOf(t *testing.T) { 319 | idx := setupTestIndex() 320 | 321 | // Find docs with cats, dogs, or python 322 | results := AnyOf(idx, "cats", "dogs", "python") 323 | 324 | // Should match docs 3, 4, 5 325 | expected := []int{3, 4, 5} 326 | actual := bitmapToSlice(results) 327 | 328 | if !slicesEqual(actual, expected) { 329 | t.Errorf("Expected docs %v, got %v", expected, actual) 330 | } 331 | } 332 | 333 | // TestTermExcluding tests TermExcluding convenience function 334 | func TestTermExcluding(t *testing.T) { 335 | idx := setupTestIndex() 336 | 337 | // Find docs with "learning" but not "deep" 338 | results := TermExcluding(idx, "learning", "deep") 339 | 340 | expected := []int{1, 4} 341 | actual := bitmapToSlice(results) 342 | 343 | if !slicesEqual(actual, expected) { 344 | t.Errorf("Expected docs %v, got %v", expected, actual) 345 | } 346 | } 347 | 348 | // TestAllOf_EmptyTerms tests AllOf with no terms 349 | func TestAllOf_EmptyTerms(t *testing.T) { 350 | idx := 
setupTestIndex() 351 | 352 | results := AllOf(idx) 353 | 354 | if results.GetCardinality() != 0 { 355 | t.Errorf("Expected 0 results for empty AllOf, got %d", results.GetCardinality()) 356 | } 357 | } 358 | 359 | // TestAnyOf_EmptyTerms tests AnyOf with no terms 360 | func TestAnyOf_EmptyTerms(t *testing.T) { 361 | idx := setupTestIndex() 362 | 363 | results := AnyOf(idx) 364 | 365 | if results.GetCardinality() != 0 { 366 | t.Errorf("Expected 0 results for empty AnyOf, got %d", results.GetCardinality()) 367 | } 368 | } 369 | 370 | // ═══════════════════════════════════════════════════════════════════════════════ 371 | // REAL-WORLD QUERY PATTERNS 372 | // ═══════════════════════════════════════════════════════════════════════════════ 373 | 374 | // TestQueryBuilder_SearchEnginePattern tests a typical search engine query 375 | func TestQueryBuilder_SearchEnginePattern(t *testing.T) { 376 | idx := setupTestIndex() 377 | 378 | // Typical search: "machine learning" (phrase) OR just "python" 379 | results := NewQueryBuilder(idx). 380 | Phrase("machine learning"). 381 | Or(). 382 | Term("python"). 383 | Execute() 384 | 385 | // Should match docs 1, 2, 3, 4 386 | expected := []int{1, 2, 3, 4} 387 | actual := bitmapToSlice(results) 388 | 389 | if !slicesEqual(actual, expected) { 390 | t.Errorf("Expected docs %v, got %v", expected, actual) 391 | } 392 | } 393 | 394 | // TestQueryBuilder_FilteringPattern tests filtering unwanted content 395 | func TestQueryBuilder_FilteringPattern(t *testing.T) { 396 | idx := setupTestIndex() 397 | 398 | // Find programming content but exclude python 399 | results := NewQueryBuilder(idx). 400 | Term("programming"). 401 | And().Not(). 402 | Term("python"). 
403 | Execute() 404 | 405 | // Should return no results (all programming docs have python) 406 | if results.GetCardinality() != 0 { 407 | t.Errorf("Expected 0 results, got %d", results.GetCardinality()) 408 | } 409 | } 410 | 411 | // TestQueryBuilder_CategoryPattern tests category-based search 412 | func TestQueryBuilder_CategoryPattern(t *testing.T) { 413 | idx := setupTestIndex() 414 | 415 | // Find AI/ML docs: (machine OR deep) AND learning 416 | results := NewQueryBuilder(idx). 417 | Group(func(q *QueryBuilder) { 418 | q.Term("machine").Or().Term("deep") 419 | }). 420 | And(). 421 | Term("learning"). 422 | Execute() 423 | 424 | expected := []int{1, 2, 4} 425 | actual := bitmapToSlice(results) 426 | 427 | if !slicesEqual(actual, expected) { 428 | t.Errorf("Expected docs %v, got %v", expected, actual) 429 | } 430 | } 431 | 432 | // ═══════════════════════════════════════════════════════════════════════════════ 433 | // PERFORMANCE TESTS 434 | // ═══════════════════════════════════════════════════════════════════════════════ 435 | 436 | // BenchmarkQueryBuilder_Simple benchmarks simple query 437 | func BenchmarkQueryBuilder_Simple(b *testing.B) { 438 | idx := setupTestIndex() 439 | 440 | b.ResetTimer() 441 | for i := 0; i < b.N; i++ { 442 | NewQueryBuilder(idx). 443 | Term("machine"). 444 | And(). 445 | Term("learning"). 446 | Execute() 447 | } 448 | } 449 | 450 | // BenchmarkQueryBuilder_Complex benchmarks complex query 451 | func BenchmarkQueryBuilder_Complex(b *testing.B) { 452 | idx := setupTestIndex() 453 | 454 | b.ResetTimer() 455 | for i := 0; i < b.N; i++ { 456 | NewQueryBuilder(idx). 457 | Group(func(q *QueryBuilder) { 458 | q.Term("machine").Or().Term("deep") 459 | }). 460 | And(). 461 | Term("learning"). 462 | And().Not(). 463 | Term("python"). 
464 | Execute() 465 | } 466 | } 467 | 468 | // BenchmarkQueryBuilder_WithBM25 benchmarks query with BM25 scoring 469 | func BenchmarkQueryBuilder_WithBM25(b *testing.B) { 470 | idx := setupTestIndex() 471 | 472 | b.ResetTimer() 473 | for i := 0; i < b.N; i++ { 474 | NewQueryBuilder(idx). 475 | Term("machine"). 476 | And(). 477 | Term("learning"). 478 | ExecuteWithBM25(10) 479 | } 480 | } 481 | 482 | // ═══════════════════════════════════════════════════════════════════════════════ 483 | // HELPER FUNCTIONS 484 | // ═══════════════════════════════════════════════════════════════════════════════ 485 | 486 | // bitmapToSlice converts a roaring bitmap to a sorted slice of ints 487 | func bitmapToSlice(bitmap *roaring.Bitmap) []int { 488 | if bitmap == nil { 489 | return []int{} 490 | } 491 | 492 | result := make([]int, 0, bitmap.GetCardinality()) 493 | iter := bitmap.Iterator() 494 | for iter.HasNext() { 495 | result = append(result, int(iter.Next())) 496 | } 497 | return result 498 | } 499 | 500 | // slicesEqual checks if two slices are equal 501 | func slicesEqual(a, b []int) bool { 502 | if len(a) != len(b) { 503 | return false 504 | } 505 | for i := range a { 506 | if a[i] != b[i] { 507 | return false 508 | } 509 | } 510 | return true 511 | } 512 | -------------------------------------------------------------------------------- /index.go: -------------------------------------------------------------------------------- 1 | // Package index implements an inverted index for full-text search 2 | // 3 | // ═══════════════════════════════════════════════════════════════════════════════ 4 | // WHAT IS AN INVERTED INDEX? 5 | // ═══════════════════════════════════════════════════════════════════════════════ 6 | // An inverted index is like the index at the back of a book, but for search engines. 
7 | // 8 | // Example: Given these documents: 9 | // Doc 1: "the quick brown fox" 10 | // Doc 2: "the lazy dog" 11 | // Doc 3: "quick brown dogs" 12 | // 13 | // The inverted index would look like: 14 | // "quick" → [Doc1:Pos1, Doc3:Pos0] 15 | // "brown" → [Doc1:Pos2, Doc3:Pos1] 16 | // "fox" → [Doc1:Pos3] 17 | // "lazy" → [Doc2:Pos1] 18 | // "dog" → [Doc2:Pos2] 19 | // "dogs" → [Doc3:Pos2] 20 | // 21 | // This allows us to: 22 | // 1. Find documents containing a word instantly (without scanning all docs) 23 | // 2. Find phrases by checking if word positions are consecutive 24 | // 3. Rank results by how close words appear to each other (proximity) 25 | // 26 | // ═══════════════════════════════════════════════════════════════════════════════ 27 | 28 | package blaze 29 | 30 | import ( 31 | "errors" 32 | "log/slog" 33 | "sync" 34 | 35 | "github.com/RoaringBitmap/roaring" 36 | ) 37 | 38 | // ═══════════════════════════════════════════════════════════════════════════════ 39 | // ERROR DEFINITIONS 40 | // ═══════════════════════════════════════════════════════════════════════════════ 41 | // We define errors as package-level variables so they can be compared with == 42 | // This is a Go best practice for error handling. 43 | var ( 44 | ErrNoPostingList = errors.New("no posting list exists for token") 45 | ErrNoNextElement = errors.New("no next element found") 46 | ErrNoPrevElement = errors.New("no previous element found") 47 | ) 48 | 49 | // ═══════════════════════════════════════════════════════════════════════════════ 50 | // BM25 RANKING SYSTEM 51 | // ═══════════════════════════════════════════════════════════════════════════════ 52 | // BM25 (Best Matching 25) is a ranking function used by search engines to estimate 53 | // the relevance of documents to a given search query. 54 | // 55 | // WHY BM25? 56 | // --------- 57 | // 1. Industry standard: Used by Elasticsearch, Solr, Lucene 58 | // 2. 
Accounts for document length (longer docs don't unfairly rank higher) 59 | // 3. Accounts for term frequency saturation (10 vs 100 occurrences matter less) 60 | // 4. Accounts for term rarity (rare terms are more significant) 61 | // 62 | // BM25 FORMULA: 63 | // ------------- 64 | // For each term in the query: 65 | // score += IDF(term) * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * (docLen / avgDocLen))) 66 | // 67 | // Where: 68 | // IDF = Inverse Document Frequency (how rare is this term?) 69 | // TF = Term Frequency (how often does term appear in this doc?) 70 | // k1 = Term frequency saturation parameter (typically 1.2-2.0) 71 | // b = Length normalization parameter (typically 0.75) 72 | // docLen = Length of this document 73 | // avgDocLen = Average document length in the corpus 74 | // 75 | // EXAMPLE: 76 | // -------- 77 | // Query: "machine learning" 78 | // Doc A: 100 words, contains "machine" 3 times, "learning" 2 times 79 | // Doc B: 500 words, contains "machine" 5 times, "learning" 8 times 80 | // 81 | // Despite Doc B having more occurrences, Doc A might score higher because: 82 | // 1. Doc A is shorter (length normalization) 83 | // 2. 
The density of query terms is higher in Doc A 84 | // ═══════════════════════════════════════════════════════════════════════════════ 85 | 86 | // BM25Parameters holds the tuning parameters for BM25 algorithm 87 | type BM25Parameters struct { 88 | K1 float64 // Term frequency saturation (typical: 1.2-2.0) 89 | B float64 // Length normalization (typical: 0.75) 90 | } 91 | 92 | // DefaultBM25Parameters returns the standard BM25 parameters 93 | func DefaultBM25Parameters() BM25Parameters { 94 | return BM25Parameters{ 95 | K1: 1.5, // Moderate term frequency saturation 96 | B: 0.75, // Standard length normalization 97 | } 98 | } 99 | 100 | // DocumentStats stores statistics about a single document 101 | type DocumentStats struct { 102 | DocID int // Document identifier 103 | Length int // Number of terms in the document 104 | TermFreqs map[string]int // How many times each term appears 105 | } 106 | 107 | // ═══════════════════════════════════════════════════════════════════════════════ 108 | // CORE DATA STRUCTURE: InvertedIndex with HYBRID STORAGE 109 | // ═══════════════════════════════════════════════════════════════════════════════ 110 | // The InvertedIndex uses a hybrid approach for maximum efficiency: 111 | // 112 | // Architecture: 113 | // 114 | // InvertedIndex 115 | // ├── DocBitmaps: map[string]*roaring.Bitmap (DOCUMENT-LEVEL) 116 | // │ ├── "quick" → Bitmap of document IDs [1, 3, 5, ...] 117 | // │ ├── "brown" → Bitmap of document IDs [1, 2, 7, ...] 118 | // │ └── "fox" → Bitmap of document IDs [3, 5, ...] 119 | // ├── PostingsList: map[string]SkipList (POSITION-LEVEL) 120 | // │ ├── "quick" → SkipList of exact positions 121 | // │ ├── "brown" → SkipList of exact positions 122 | // │ └── "fox" → SkipList of exact positions 123 | // └── mu: mutex for thread safety 124 | // 125 | // Why Hybrid Storage? 
126 | // - Roaring Bitmaps: Lightning-fast for document-level operations (AND, OR, NOT) 127 | // 10-100x memory compression, O(1) boolean operations 128 | // - Skip Lists: Essential for position-based queries (phrases, proximity) 129 | // 130 | // This gives us the best of both worlds! 131 | // ═══════════════════════════════════════════════════════════════════════════════ 132 | type InvertedIndex struct { 133 | mu sync.Mutex // Protects against concurrent access 134 | 135 | // DOCUMENT-LEVEL STORAGE (for fast document lookups and boolean queries) 136 | DocBitmaps map[string]*roaring.Bitmap // Term → Bitmap of document IDs 137 | 138 | // POSITION-LEVEL STORAGE (for phrase search, proximity) 139 | PostingsList map[string]SkipList // Term → Positions 140 | 141 | // =============================== 142 | // BM25 INDEXING DATA STRUCTURES 143 | // =============================== 144 | DocStats map[int]DocumentStats // DocID → statistics 145 | TotalDocs int // Total number of indexed documents 146 | TotalTerms int64 // Total number of terms across all docs 147 | BM25Params BM25Parameters // BM25 tuning parameters 148 | } 149 | 150 | // NewInvertedIndex creates a new empty inverted index with hybrid storage and BM25 support 151 | func NewInvertedIndex() *InvertedIndex { 152 | return &InvertedIndex{ 153 | DocBitmaps: make(map[string]*roaring.Bitmap), // Initialize document-level bitmaps 154 | PostingsList: make(map[string]SkipList), // Initialize position-level skip lists 155 | DocStats: make(map[int]DocumentStats), 156 | TotalDocs: 0, 157 | TotalTerms: 0, 158 | BM25Params: DefaultBM25Parameters(), 159 | } 160 | } 161 | 162 | // ═══════════════════════════════════════════════════════════════════════════════ 163 | // INDEXING: Building the Search Index 164 | // ═══════════════════════════════════════════════════════════════════════════════ 165 | 166 | // Index adds a document to the inverted index 167 | // 168 | // STEP-BY-STEP EXAMPLE: 169 | // ---------------------- 170 | 
// Input: docID=1, document="The quick brown fox" 171 | // 172 | // Step 1: Tokenization 173 | // 174 | // analyzer.Analyze() converts to: ["quick", "brown", "fox"] 175 | // (Note: "The" is removed as a stop word, and words are lowercased) 176 | // 177 | // Step 2: For each token, record its position 178 | // 179 | // Token "quick" at position 0 in document 1 180 | // Token "brown" at position 1 in document 1 181 | // Token "fox" at position 2 in document 1 182 | // 183 | // Step 3: Update the index 184 | // 185 | // PostingsList["quick"] ← add Position{DocID:1, Offset:0} 186 | // PostingsList["brown"] ← add Position{DocID:1, Offset:1} 187 | // PostingsList["fox"] ← add Position{DocID:1, Offset:2} 188 | // 189 | // Why record positions and not just document IDs? 190 | // - Positions let us do phrase search ("brown fox" requires consecutive positions) 191 | // - Positions let us rank by proximity (closer words = more relevant) 192 | // 193 | // Thread Safety Note: 194 | // - We lock the entire indexing operation to prevent race conditions 195 | // - If we didn't lock, two goroutines could corrupt the data structure 196 | // ═══════════════════════════════════════════════════════════════════════════════ 197 | // BM25 INDEXING 198 | // ═══════════════════════════════════════════════════════════════════════════════ 199 | // Index also enriches the index with BM25 statistics 200 | // 201 | // WHAT'S DIFFERENT WITH BM25: 202 | // --------------------------- 203 | // In addition to building the inverted index, we now track: 204 | // 1. Document length (number of terms) 205 | // 2. Term frequencies per document (how many times each term appears) 206 | // 3. Total number of documents (for IDF calculation) 207 | // 4. Total number of terms (for average document length) 208 | // 209 | // This metadata enables BM25 scoring later during search. 
210 | func (idx *InvertedIndex) Index(docID int, document string) { 211 | idx.mu.Lock() // Acquire lock - only one goroutine can index at a time 212 | defer idx.mu.Unlock() // Release lock when function returns (even if it panics) 213 | 214 | slog.Info("indexing document", slog.Int("docID", docID)) 215 | 216 | // STEP 1: Break document into searchable tokens 217 | // Example: "The Quick Brown Fox!" → ["quick", "brown", "fox"] 218 | tokens := Analyze(document) 219 | 220 | // STEP 2: Initialize document statistics 221 | docStats := DocumentStats{ 222 | DocID: docID, 223 | Length: len(tokens), 224 | TermFreqs: make(map[string]int), 225 | } 226 | 227 | // STEP 3: Index each token and track term frequencies 228 | for position, token := range tokens { 229 | idx.indexToken(token, docID, position) 230 | docStats.TermFreqs[token]++ 231 | } 232 | 233 | // STEP 4: Update global statistics 234 | idx.DocStats[docID] = docStats 235 | idx.TotalDocs++ 236 | idx.TotalTerms += int64(len(tokens)) 237 | } 238 | 239 | // indexToken adds a single token occurrence to the index (HYBRID STORAGE) 240 | // 241 | // HOW IT WORKS: 242 | // ------------- 243 | // 1. Update Roaring Bitmap (document-level) 244 | // - Set the bit for this document ID 245 | // - Enables fast document lookups and boolean operations 246 | // - Compressed storage (10-100x smaller than skip lists alone) 247 | // 248 | // 2. Update Skip List (position-level) 249 | // - Insert exact position (docID, offset) 250 | // - Enables phrase search and proximity ranking 251 | // - Maintains all position information 252 | // 253 | // 3. Best of both worlds! 
254 | // - Fast document queries via bitmaps 255 | // - Detailed position queries via skip lists 256 | // 257 | // DocumentID and Offset are stored as ints 258 | // - The SkipList uses sentinel values (BOF=MinInt, EOF=MaxInt) to mark boundaries 259 | // - All position values are integers (no casting needed) 260 | func (idx *InvertedIndex) indexToken(token string, docID, position int) { 261 | // STEP 1: Update roaring bitmap (document-level) 262 | // Create bitmap if this is the first time seeing this token 263 | if idx.DocBitmaps[token] == nil { 264 | idx.DocBitmaps[token] = roaring.NewBitmap() 265 | } 266 | // Set the bit for this document ID 267 | idx.DocBitmaps[token].Add(uint32(docID)) 268 | 269 | // STEP 2: Update skip list (position-level) 270 | // Check if this token already has a posting list 271 | skipList, exists := idx.getPostingList(token) 272 | if !exists { 273 | // First time seeing this token - create a new SkipList 274 | skipList = *NewSkipList() 275 | } 276 | 277 | // Add this occurrence to the token's posting list 278 | skipList.Insert(Position{ 279 | DocumentID: docID, // Which document? 280 | Offset: position, // Where in the document? 281 | }) 282 | 283 | // Save the updated SkipList back to the map 284 | // (In Go, maps don't update automatically when you modify a struct value) 285 | idx.PostingsList[token] = skipList 286 | } 287 | 288 | // getPostingList retrieves the posting list for a token 289 | // 290 | // This is a simple helper to avoid repeating map lookup code. 291 | // Returns (skipList, true) if found, (empty, false) if not found. 
292 | func (idx *InvertedIndex) getPostingList(token string) (SkipList, bool) { 293 | skipList, exists := idx.PostingsList[token] 294 | return skipList, exists 295 | } 296 | 297 | // ═══════════════════════════════════════════════════════════════════════════════ 298 | // BASIC SEARCH OPERATIONS 299 | // ═══════════════════════════════════════════════════════════════════════════════ 300 | // These four methods (First, Last, Next, Previous) form the foundation of 301 | // all search operations. Everything else is built on top of these primitives. 302 | // 303 | // Think of them like iterator operations: 304 | // - First: Go to the beginning 305 | // - Last: Go to the end 306 | // - Next: Move forward 307 | // - Previous: Move backward 308 | // ═══════════════════════════════════════════════════════════════════════════════ 309 | 310 | // First returns the first occurrence of a token in the index 311 | // 312 | // EXAMPLE: 313 | // -------- 314 | // Given: "quick" appears at [Doc1:Pos1, Doc3:Pos0, Doc5:Pos2] 315 | // First("quick") returns Doc3:Pos0 (the earliest occurrence) 316 | // 317 | // Use case: Start searching for a token from the beginning 318 | func (idx *InvertedIndex) First(token string) (Position, error) { 319 | skipList, exists := idx.getPostingList(token) 320 | if !exists { 321 | return EOFDocument, ErrNoPostingList 322 | } 323 | 324 | // The first position is at the bottom level (level 0) of the SkipList 325 | // The Head node points to the first real node via Tower[0] 326 | return skipList.Head.Tower[0].Key, nil 327 | } 328 | 329 | // Last returns the last occurrence of a token in the index 330 | // 331 | // EXAMPLE: 332 | // -------- 333 | // Given: "quick" appears at [Doc1:Pos1, Doc3:Pos0, Doc5:Pos2] 334 | // Last("quick") returns Doc5:Pos2 (the latest occurrence) 335 | // 336 | // Use case: Search backwards from the end 337 | func (idx *InvertedIndex) Last(token string) (Position, error) { 338 | skipList, exists := idx.getPostingList(token) 339 | if 
!exists { 340 | return EOFDocument, ErrNoPostingList 341 | } 342 | 343 | // Traverse to the end of the SkipList 344 | return skipList.Last(), nil 345 | } 346 | 347 | // Next finds the next occurrence of a token after the given position 348 | // 349 | // EXAMPLE: 350 | // -------- 351 | // Given: "brown" appears at [Doc1:Pos2, Doc3:Pos1, Doc3:Pos5, Doc5:Pos0] 352 | // Next("brown", Doc3:Pos1) returns Doc3:Pos5 353 | // Next("brown", Doc3:Pos5) returns Doc5:Pos0 354 | // Next("brown", Doc5:Pos0) returns EOF (no more occurrences) 355 | // 356 | // Special cases: 357 | // - If currentPos is BOF (beginning of file), return First 358 | // - If currentPos is already EOF (end of file), stay at EOF 359 | // 360 | // Use case: Iterate through all occurrences of a word 361 | func (idx *InvertedIndex) Next(token string, currentPos Position) (Position, error) { 362 | // Special case: Starting from the beginning 363 | if currentPos.IsBeginning() { 364 | return idx.First(token) 365 | } 366 | 367 | // Special case: Already at the end 368 | if currentPos.IsEnd() { 369 | return EOFDocument, nil 370 | } 371 | 372 | // Get the posting list for this token 373 | skipList, exists := idx.getPostingList(token) 374 | if !exists { 375 | return EOFDocument, ErrNoPostingList 376 | } 377 | 378 | // Find the next position after currentPos in the SkipList 379 | // FindGreaterThan returns the smallest position > currentPos 380 | nextPos, _ := skipList.FindGreaterThan(currentPos) 381 | return nextPos, nil 382 | } 383 | 384 | // Previous finds the previous occurrence of a token before the given position 385 | // 386 | // EXAMPLE: 387 | // -------- 388 | // Given: "brown" appears at [Doc1:Pos2, Doc3:Pos1, Doc3:Pos5, Doc5:Pos0] 389 | // Previous("brown", Doc5:Pos0) returns Doc3:Pos5 390 | // Previous("brown", Doc3:Pos5) returns Doc3:Pos1 391 | // Previous("brown", Doc1:Pos2) returns BOF (no earlier occurrences) 392 | // 393 | // Use case: Search backwards through occurrences 394 | func (idx 
*InvertedIndex) Previous(token string, currentPos Position) (Position, error) { 395 | // Special case: Starting from the end 396 | if currentPos.IsEnd() { 397 | return idx.Last(token) 398 | } 399 | 400 | // Special case: Already at the beginning 401 | if currentPos.IsBeginning() { 402 | return BOFDocument, nil 403 | } 404 | 405 | // Get the posting list for this token 406 | skipList, exists := idx.getPostingList(token) 407 | if !exists { 408 | return BOFDocument, ErrNoPostingList 409 | } 410 | 411 | // Find the previous position before currentPos in the SkipList 412 | // FindLessThan returns the largest position < currentPos 413 | prevPos, _ := skipList.FindLessThan(currentPos) 414 | return prevPos, nil 415 | } 416 | -------------------------------------------------------------------------------- /analyzer.go: -------------------------------------------------------------------------------- 1 | // ═══════════════════════════════════════════════════════════════════════════════ 2 | // TEXT ANALYSIS OVERVIEW 3 | // ═══════════════════════════════════════════════════════════════════════════════ 4 | // Text analysis transforms raw text into searchable tokens through a multi-stage 5 | // pipeline. This process is crucial for effective full-text search. 6 | // 7 | // ANALYSIS PIPELINE: 8 | // ------------------ 9 | // 1. Tokenization → Split text into words 10 | // 2. Lowercasing → Normalize case ("Quick" → "quick") 11 | // 3. Stop word removal → Remove common words ("the", "a", etc.) 12 | // 4. Length filtering → Remove very short tokens (< 2 chars) 13 | // 5. Stemming → Reduce words to root form ("running" → "run") 14 | // 15 | // EXAMPLE TRANSFORMATION: 16 | // ----------------------- 17 | // Input: "The Quick Brown Fox Jumps!" 
18 | // Step 1: ["The", "Quick", "Brown", "Fox", "Jumps"] (tokenize) 19 | // Step 2: ["the", "quick", "brown", "fox", "jumps"] (lowercase) 20 | // Step 3: ["quick", "brown", "fox", "jumps"] (remove stopwords) 21 | // Step 4: ["quick", "brown", "fox", "jumps"] (length filter - all pass) 22 | // Step 5: ["quick", "brown", "fox", "jump"] (stemming) 23 | // 24 | // WHY THIS MATTERS: 25 | // ----------------- 26 | // Proper analysis ensures: 27 | // - "Running" matches "run", "runs", "ran" 28 | // - "The dog" matches "DOG" (case insensitive) 29 | // - Common words don't pollute the index 30 | // - Search results are relevant and accurate 31 | // ═══════════════════════════════════════════════════════════════════════════════ 32 | 33 | package blaze 34 | 35 | import ( 36 | "strings" 37 | "unicode" 38 | 39 | snowballeng "github.com/kljensen/snowball/english" 40 | ) 41 | 42 | // AnalyzerConfig holds configuration options for text analysis 43 | // 44 | // This allows customization of the analysis pipeline without modifying code. 45 | // Future enhancements could add language support, custom stopwords, etc. 46 | type AnalyzerConfig struct { 47 | MinTokenLength int // Minimum token length to keep (default: 2) 48 | EnableStemming bool // Whether to apply stemming (default: true) 49 | EnableStopwords bool // Whether to remove stopwords (default: true) 50 | } 51 | 52 | // DefaultConfig returns the standard analyzer configuration 53 | func DefaultConfig() AnalyzerConfig { 54 | return AnalyzerConfig{ 55 | MinTokenLength: 2, 56 | EnableStemming: true, 57 | EnableStopwords: true, 58 | } 59 | } 60 | 61 | // Analyze transforms raw text into searchable tokens using the default pipeline 62 | // 63 | // This is the main entry point for text analysis. It applies all filters in sequence: 64 | // 1. Tokenization 65 | // 2. Lowercasing 66 | // 3. Stopword filtering 67 | // 4. Length filtering 68 | // 5. 
Stemming 69 | // 70 | // Example: 71 | // 72 | // tokens := Analyze("The quick brown fox jumps over the lazy dog") 73 | // // Returns: ["quick", "brown", "fox", "jump", "lazi", "dog"] 74 | func Analyze(text string) []string { 75 | return AnalyzeWithConfig(text, DefaultConfig()) 76 | } 77 | 78 | // AnalyzeWithConfig transforms text using a custom configuration 79 | // 80 | // This allows fine-grained control over the analysis pipeline. 81 | // 82 | // Example: 83 | // 84 | // config := AnalyzerConfig{MinTokenLength: 3, EnableStemming: false} 85 | // tokens := AnalyzeWithConfig("The quick brown fox", config) 86 | func AnalyzeWithConfig(text string, config AnalyzerConfig) []string { 87 | tokens := tokenize(text) 88 | tokens = lowercaseFilter(tokens) 89 | 90 | if config.EnableStopwords { 91 | tokens = stopwordFilter(tokens) 92 | } 93 | 94 | tokens = lengthFilter(tokens, config.MinTokenLength) 95 | 96 | if config.EnableStemming { 97 | tokens = stemmerFilter(tokens) 98 | } 99 | 100 | return tokens 101 | } 102 | 103 | // tokenize splits text into individual words 104 | // 105 | // ALGORITHM: 106 | // ---------- 107 | // Uses Unicode-aware splitting: any non-letter and non-digit character is a delimiter. 108 | // 109 | // Examples: 110 | // 111 | // "hello-world" → ["hello", "world"] 112 | // "user@email.com" → ["user", "email", "com"] 113 | // "price: $9.99" → ["price", "9", "99"] 114 | // "café" → ["café"] (Unicode letters preserved) 115 | // 116 | // Why FieldsFunc? 
117 | // - Handles Unicode properly (unlike simple string splitting) 118 | // - Treats multiple delimiters as one (no empty tokens) 119 | // - Fast and memory efficient (Go standard library optimization) 120 | func tokenize(text string) []string { 121 | return strings.FieldsFunc(text, func(r rune) bool { 122 | // Split on any character that is not a letter or a number 123 | return !unicode.IsLetter(r) && !unicode.IsNumber(r) 124 | }) 125 | } 126 | 127 | // lowercaseFilter normalizes token casing 128 | // 129 | // WHY IT MATTERS: 130 | // --------------- 131 | // Without lowercasing, "Quick", "quick", and "QUICK" would be treated as 132 | // different words, creating a poor search experience. 133 | // 134 | // Example: 135 | // 136 | // ["Hello", "World"] → ["hello", "world"] 137 | // 138 | // Performance Note: 139 | // - Pre-allocates slice to avoid dynamic growth 140 | // - Uses strings.ToLower for proper Unicode handling 141 | func lowercaseFilter(tokens []string) []string { 142 | r := make([]string, len(tokens)) 143 | for i, token := range tokens { 144 | r[i] = strings.ToLower(token) 145 | } 146 | return r 147 | } 148 | 149 | // stopwordFilter removes common English words that don't add search value 150 | // 151 | // STOPWORDS EXPLAINED: 152 | // -------------------- 153 | // Words like "the", "a", "is" appear in almost every document, so they: 154 | // - Waste index space 155 | // - Don't help distinguish documents 156 | // - Slow down search 157 | // 158 | // Example: 159 | // 160 | // ["the", "quick", "brown", "fox"] → ["quick", "brown", "fox"] 161 | // 162 | // Implementation Note: 163 | // - Uses map lookup for O(1) checking 164 | // - Pre-allocates capacity to reduce reallocations 165 | func stopwordFilter(tokens []string) []string { 166 | r := make([]string, 0, len(tokens)) 167 | for _, token := range tokens { 168 | if !isStopword(token) { 169 | r = append(r, token) 170 | } 171 | } 172 | return r 173 | } 174 | 175 | // lengthFilter removes tokens that are 
too short to be meaningful 176 | // 177 | // WHY FILTER BY LENGTH? 178 | // --------------------- 179 | // Very short tokens (1-2 characters) are often: 180 | // - Not semantically meaningful ("a", "i", "to") 181 | // - Result in too many false matches 182 | // - Already caught by stopword filter 183 | // 184 | // Example (minLength=2): 185 | // 186 | // ["a", "go", "cat", "i"] → ["go", "cat"] 187 | // 188 | // Performance: 189 | // - O(n) single pass 190 | // - Pre-allocated capacity 191 | func lengthFilter(tokens []string, minLength int) []string { 192 | r := make([]string, 0, len(tokens)) 193 | for _, token := range tokens { 194 | if len(token) >= minLength { 195 | r = append(r, token) 196 | } 197 | } 198 | return r 199 | } 200 | 201 | // stemmerFilter reduces words to their root form 202 | // 203 | // STEMMING EXPLAINED: 204 | // ------------------- 205 | // Stemming removes suffixes to find the word root: 206 | // 207 | // "running", "runs", "ran" → "run" 208 | // "connection", "connected", "connecting" → "connect" 209 | // 210 | // WHY IT MATTERS: 211 | // --------------- 212 | // Without stemming, a search for "run" wouldn't match documents containing 213 | // "running" or "runs", even though they're clearly related. 214 | // 215 | // ALGORITHM: 216 | // ---------- 217 | // Uses the Snowball (Porter2) stemmer, which applies linguistic rules 218 | // to remove common English suffixes. 
219 | // 220 | // Example: 221 | // 222 | // ["running", "quickly", "foxes"] → ["run", "quick", "fox"] 223 | // 224 | // Trade-offs: 225 | // + Improves recall (finds more relevant documents) 226 | // + Reduces index size (fewer unique terms) 227 | // - May over-stem (e.g., "university" → "univers") 228 | // - Language-specific (this implementation is English-only) 229 | func stemmerFilter(tokens []string) []string { 230 | r := make([]string, len(tokens)) 231 | for i, token := range tokens { 232 | r[i] = snowballeng.Stem(token, false) 233 | } 234 | return r 235 | } 236 | 237 | // isStopword checks if a token is a common English stopword 238 | // 239 | // Uses a hash map for O(1) lookup performance. 240 | // The map uses struct{} as values (0 bytes) instead of strings (16 bytes) 241 | // for memory efficiency. 242 | func isStopword(token string) bool { 243 | _, exists := englishStopwords[token] 244 | return exists 245 | } 246 | 247 | // englishStopwords contains common English words to exclude from indexing 248 | // 249 | // MEMORY OPTIMIZATION: 250 | // -------------------- 251 | // Uses struct{} (empty struct) as the value type instead of string or bool. 252 | // - struct{}: 0 bytes per entry 253 | // - string: 16 bytes per entry 254 | // - bool: 1 byte per entry 255 | // 256 | // For 300+ stopwords, this saves ~5KB of memory. 257 | // 258 | // STOPWORD SELECTION: 259 | // ------------------- 260 | // This list includes: 261 | // - Articles: a, an, the 262 | // - Prepositions: in, on, at, to 263 | // - Conjunctions: and, but, or 264 | // - Pronouns: he, she, it, they 265 | // - Common verbs: is, are, was, were 266 | // - Numbers: one, two, three, etc. 
267 | var englishStopwords = map[string]struct{}{ 268 | "a": {}, 269 | "about": {}, 270 | "above": {}, 271 | "across": {}, 272 | "after": {}, 273 | "afterwards": {}, 274 | "again": {}, 275 | "against": {}, 276 | "all": {}, 277 | "almost": {}, 278 | "alone": {}, 279 | "along": {}, 280 | "already": {}, 281 | "also": {}, 282 | "although": {}, 283 | "always": {}, 284 | "am": {}, 285 | "among": {}, 286 | "amongst": {}, 287 | "amoungst": {}, 288 | "amount": {}, 289 | "an": {}, 290 | "and": {}, 291 | "another": {}, 292 | "any": {}, 293 | "anyhow": {}, 294 | "anyone": {}, 295 | "anything": {}, 296 | "anyway": {}, 297 | "anywhere": {}, 298 | "are": {}, 299 | "around": {}, 300 | "as": {}, 301 | "at": {}, 302 | "back": {}, 303 | "be": {}, 304 | "became": {}, 305 | "because": {}, 306 | "become": {}, 307 | "becomes": {}, 308 | "becoming": {}, 309 | "been": {}, 310 | "before": {}, 311 | "beforehand": {}, 312 | "behind": {}, 313 | "being": {}, 314 | "below": {}, 315 | "beside": {}, 316 | "besides": {}, 317 | "between": {}, 318 | "beyond": {}, 319 | "bill": {}, 320 | "both": {}, 321 | "bottom": {}, 322 | "but": {}, 323 | "by": {}, 324 | "call": {}, 325 | "can": {}, 326 | "cannot": {}, 327 | "cant": {}, 328 | "co": {}, 329 | "con": {}, 330 | "could": {}, 331 | "couldnt": {}, 332 | "cry": {}, 333 | "de": {}, 334 | "describe": {}, 335 | "detail": {}, 336 | "do": {}, 337 | "done": {}, 338 | "down": {}, 339 | "due": {}, 340 | "during": {}, 341 | "each": {}, 342 | "eg": {}, 343 | "eight": {}, 344 | "either": {}, 345 | "eleven": {}, 346 | "else": {}, 347 | "elsewhere": {}, 348 | "empty": {}, 349 | "enough": {}, 350 | "etc": {}, 351 | "even": {}, 352 | "ever": {}, 353 | "every": {}, 354 | "everyone": {}, 355 | "everything": {}, 356 | "everywhere": {}, 357 | "except": {}, 358 | "few": {}, 359 | "fifteen": {}, 360 | "fify": {}, 361 | "fill": {}, 362 | "find": {}, 363 | "fire": {}, 364 | "first": {}, 365 | "five": {}, 366 | "for": {}, 367 | "former": {}, 368 | "formerly": {}, 369 | "forty": 
{}, 370 | "found": {}, 371 | "four": {}, 372 | "from": {}, 373 | "front": {}, 374 | "full": {}, 375 | "further": {}, 376 | "get": {}, 377 | "give": {}, 378 | "go": {}, 379 | "had": {}, 380 | "has": {}, 381 | "hasnt": {}, 382 | "have": {}, 383 | "he": {}, 384 | "hence": {}, 385 | "her": {}, 386 | "here": {}, 387 | "hereafter": {}, 388 | "hereby": {}, 389 | "herein": {}, 390 | "hereupon": {}, 391 | "hers": {}, 392 | "herself": {}, 393 | "him": {}, 394 | "himself": {}, 395 | "his": {}, 396 | "how": {}, 397 | "however": {}, 398 | "hundred": {}, 399 | "ie": {}, 400 | "if": {}, 401 | "in": {}, 402 | "inc": {}, 403 | "indeed": {}, 404 | "interest": {}, 405 | "into": {}, 406 | "is": {}, 407 | "it": {}, 408 | "its": {}, 409 | "itself": {}, 410 | "keep": {}, 411 | "last": {}, 412 | "latter": {}, 413 | "latterly": {}, 414 | "least": {}, 415 | "less": {}, 416 | "ltd": {}, 417 | "made": {}, 418 | "many": {}, 419 | "may": {}, 420 | "me": {}, 421 | "meanwhile": {}, 422 | "might": {}, 423 | "mill": {}, 424 | "mine": {}, 425 | "more": {}, 426 | "moreover": {}, 427 | "most": {}, 428 | "mostly": {}, 429 | "move": {}, 430 | "much": {}, 431 | "must": {}, 432 | "my": {}, 433 | "myself": {}, 434 | "name": {}, 435 | "namely": {}, 436 | "neither": {}, 437 | "never": {}, 438 | "nevertheless": {}, 439 | "next": {}, 440 | "nine": {}, 441 | "no": {}, 442 | "nobody": {}, 443 | "none": {}, 444 | "noone": {}, 445 | "nor": {}, 446 | "not": {}, 447 | "nothing": {}, 448 | "now": {}, 449 | "nowhere": {}, 450 | "of": {}, 451 | "off": {}, 452 | "often": {}, 453 | "on": {}, 454 | "once": {}, 455 | "one": {}, 456 | "only": {}, 457 | "onto": {}, 458 | "or": {}, 459 | "other": {}, 460 | "others": {}, 461 | "otherwise": {}, 462 | "our": {}, 463 | "ours": {}, 464 | "ourselves": {}, 465 | "out": {}, 466 | "over": {}, 467 | "own": {}, 468 | "part": {}, 469 | "per": {}, 470 | "perhaps": {}, 471 | "please": {}, 472 | "put": {}, 473 | "rather": {}, 474 | "re": {}, 475 | "same": {}, 476 | "see": {}, 477 | "seem": 
{}, 478 | "seemed": {}, 479 | "seeming": {}, 480 | "seems": {}, 481 | "serious": {}, 482 | "several": {}, 483 | "she": {}, 484 | "should": {}, 485 | "show": {}, 486 | "side": {}, 487 | "since": {}, 488 | "sincere": {}, 489 | "six": {}, 490 | "sixty": {}, 491 | "so": {}, 492 | "some": {}, 493 | "somehow": {}, 494 | "someone": {}, 495 | "something": {}, 496 | "sometime": {}, 497 | "sometimes": {}, 498 | "somewhere": {}, 499 | "still": {}, 500 | "such": {}, 501 | "system": {}, 502 | "take": {}, 503 | "ten": {}, 504 | "than": {}, 505 | "that": {}, 506 | "the": {}, 507 | "their": {}, 508 | "them": {}, 509 | "themselves": {}, 510 | "then": {}, 511 | "thence": {}, 512 | "there": {}, 513 | "thereafter": {}, 514 | "thereby": {}, 515 | "therefore": {}, 516 | "therein": {}, 517 | "thereupon": {}, 518 | "these": {}, 519 | "they": {}, 520 | "thickv": {}, 521 | "thin": {}, 522 | "third": {}, 523 | "this": {}, 524 | "those": {}, 525 | "though": {}, 526 | "three": {}, 527 | "through": {}, 528 | "throughout": {}, 529 | "thru": {}, 530 | "thus": {}, 531 | "to": {}, 532 | "together": {}, 533 | "too": {}, 534 | "top": {}, 535 | "toward": {}, 536 | "towards": {}, 537 | "twelve": {}, 538 | "twenty": {}, 539 | "two": {}, 540 | "un": {}, 541 | "under": {}, 542 | "until": {}, 543 | "up": {}, 544 | "upon": {}, 545 | "us": {}, 546 | "very": {}, 547 | "via": {}, 548 | "was": {}, 549 | "we": {}, 550 | "well": {}, 551 | "were": {}, 552 | "what": {}, 553 | "whatever": {}, 554 | "when": {}, 555 | "whence": {}, 556 | "whenever": {}, 557 | "where": {}, 558 | "whereafter": {}, 559 | "whereas": {}, 560 | "whereby": {}, 561 | "wherein": {}, 562 | "whereupon": {}, 563 | "wherever": {}, 564 | "whether": {}, 565 | "which": {}, 566 | "while": {}, 567 | "whither": {}, 568 | "who": {}, 569 | "whoever": {}, 570 | "whole": {}, 571 | "whom": {}, 572 | "whose": {}, 573 | "why": {}, 574 | "will": {}, 575 | "with": {}, 576 | "within": {}, 577 | "without": {}, 578 | "would": {}, 579 | "yet": {}, 580 | "you": {}, 
581 | "your": {}, 582 | "yours": {}, 583 | "yourself": {}, 584 | "yourselves": {}} 585 | -------------------------------------------------------------------------------- /skiplist_test.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | // ═══════════════════════════════════════════════════════════════════════════════ 9 | // POSITION TESTS 10 | // ═══════════════════════════════════════════════════════════════════════════════ 11 | 12 | func TestPosition_GetDocumentID(t *testing.T) { 13 | pos := Position{DocumentID: 42, Offset: 10} 14 | if got := pos.GetDocumentID(); got != 42 { 15 | t.Errorf("GetDocumentID() = %d, want 42", got) 16 | } 17 | } 18 | 19 | func TestPosition_GetOffset(t *testing.T) { 20 | pos := Position{DocumentID: 42, Offset: 10} 21 | if got := pos.GetOffset(); got != 10 { 22 | t.Errorf("GetOffset() = %d, want 10", got) 23 | } 24 | } 25 | 26 | func TestPosition_IsBeginning(t *testing.T) { 27 | tests := []struct { 28 | name string 29 | pos Position 30 | want bool 31 | }{ 32 | {"BOF position", Position{DocumentID: BOF, Offset: BOF}, true}, 33 | {"Regular position", Position{DocumentID: 1, Offset: 0}, false}, 34 | {"EOF position", Position{DocumentID: EOF, Offset: EOF}, false}, 35 | } 36 | 37 | for _, tt := range tests { 38 | t.Run(tt.name, func(t *testing.T) { 39 | if got := tt.pos.IsBeginning(); got != tt.want { 40 | t.Errorf("IsBeginning() = %v, want %v", got, tt.want) 41 | } 42 | }) 43 | } 44 | } 45 | 46 | func TestPosition_IsEnd(t *testing.T) { 47 | tests := []struct { 48 | name string 49 | pos Position 50 | want bool 51 | }{ 52 | {"EOF position", Position{DocumentID: EOF, Offset: EOF}, true}, 53 | {"Regular position", Position{DocumentID: 1, Offset: 0}, false}, 54 | {"BOF position", Position{DocumentID: BOF, Offset: BOF}, false}, 55 | } 56 | 57 | for _, tt := range tests { 58 | t.Run(tt.name, func(t *testing.T) { 59 | if got := tt.pos.IsEnd(); 
got != tt.want { 60 | t.Errorf("IsEnd() = %v, want %v", got, tt.want) 61 | } 62 | }) 63 | } 64 | } 65 | 66 | func TestPosition_IsBefore(t *testing.T) { 67 | tests := []struct { 68 | name string 69 | pos Position 70 | other Position 71 | want bool 72 | }{ 73 | { 74 | "Same doc, earlier offset", 75 | Position{DocumentID: 1, Offset: 5}, 76 | Position{DocumentID: 1, Offset: 10}, 77 | true, 78 | }, 79 | { 80 | "Same doc, later offset", 81 | Position{DocumentID: 1, Offset: 10}, 82 | Position{DocumentID: 1, Offset: 5}, 83 | false, 84 | }, 85 | { 86 | "Earlier doc", 87 | Position{DocumentID: 1, Offset: 100}, 88 | Position{DocumentID: 2, Offset: 0}, 89 | true, 90 | }, 91 | { 92 | "Later doc", 93 | Position{DocumentID: 2, Offset: 0}, 94 | Position{DocumentID: 1, Offset: 100}, 95 | false, 96 | }, 97 | { 98 | "BOF before regular", 99 | Position{DocumentID: BOF, Offset: BOF}, 100 | Position{DocumentID: 1, Offset: 0}, 101 | true, 102 | }, 103 | { 104 | "Regular before EOF", 105 | Position{DocumentID: 1, Offset: 0}, 106 | Position{DocumentID: EOF, Offset: EOF}, 107 | true, 108 | }, 109 | { 110 | "Same position", 111 | Position{DocumentID: 1, Offset: 5}, 112 | Position{DocumentID: 1, Offset: 5}, 113 | false, 114 | }, 115 | } 116 | 117 | for _, tt := range tests { 118 | t.Run(tt.name, func(t *testing.T) { 119 | if got := tt.pos.IsBefore(tt.other); got != tt.want { 120 | t.Errorf("IsBefore() = %v, want %v", got, tt.want) 121 | } 122 | }) 123 | } 124 | } 125 | 126 | func TestPosition_IsAfter(t *testing.T) { 127 | tests := []struct { 128 | name string 129 | pos Position 130 | other Position 131 | want bool 132 | }{ 133 | { 134 | "Same doc, later offset", 135 | Position{DocumentID: 1, Offset: 10}, 136 | Position{DocumentID: 1, Offset: 5}, 137 | true, 138 | }, 139 | { 140 | "Same doc, earlier offset", 141 | Position{DocumentID: 1, Offset: 5}, 142 | Position{DocumentID: 1, Offset: 10}, 143 | false, 144 | }, 145 | { 146 | "Later doc", 147 | Position{DocumentID: 2, Offset: 0}, 148 | 
Position{DocumentID: 1, Offset: 100}, 149 | true, 150 | }, 151 | { 152 | "Earlier doc", 153 | Position{DocumentID: 1, Offset: 100}, 154 | Position{DocumentID: 2, Offset: 0}, 155 | false, 156 | }, 157 | { 158 | "EOF after regular", 159 | Position{DocumentID: EOF, Offset: EOF}, 160 | Position{DocumentID: 1, Offset: 0}, 161 | true, 162 | }, 163 | { 164 | "Regular after BOF", 165 | Position{DocumentID: 1, Offset: 0}, 166 | Position{DocumentID: BOF, Offset: BOF}, 167 | true, 168 | }, 169 | { 170 | "Same position", 171 | Position{DocumentID: 1, Offset: 5}, 172 | Position{DocumentID: 1, Offset: 5}, 173 | false, 174 | }, 175 | } 176 | 177 | for _, tt := range tests { 178 | t.Run(tt.name, func(t *testing.T) { 179 | if got := tt.pos.IsAfter(tt.other); got != tt.want { 180 | t.Errorf("IsAfter() = %v, want %v", got, tt.want) 181 | } 182 | }) 183 | } 184 | } 185 | 186 | func TestPosition_Equals(t *testing.T) { 187 | tests := []struct { 188 | name string 189 | pos Position 190 | other Position 191 | want bool 192 | }{ 193 | { 194 | "Same position", 195 | Position{DocumentID: 1, Offset: 5}, 196 | Position{DocumentID: 1, Offset: 5}, 197 | true, 198 | }, 199 | { 200 | "Different offset", 201 | Position{DocumentID: 1, Offset: 5}, 202 | Position{DocumentID: 1, Offset: 10}, 203 | false, 204 | }, 205 | { 206 | "Different document", 207 | Position{DocumentID: 1, Offset: 5}, 208 | Position{DocumentID: 2, Offset: 5}, 209 | false, 210 | }, 211 | { 212 | "Both BOF", 213 | Position{DocumentID: BOF, Offset: BOF}, 214 | Position{DocumentID: BOF, Offset: BOF}, 215 | true, 216 | }, 217 | { 218 | "Both EOF", 219 | Position{DocumentID: EOF, Offset: EOF}, 220 | Position{DocumentID: EOF, Offset: EOF}, 221 | true, 222 | }, 223 | } 224 | 225 | for _, tt := range tests { 226 | t.Run(tt.name, func(t *testing.T) { 227 | if got := tt.pos.Equals(tt.other); got != tt.want { 228 | t.Errorf("Equals() = %v, want %v", got, tt.want) 229 | } 230 | }) 231 | } 232 | } 233 | 234 | // 
═══════════════════════════════════════════════════════════════════════════════
// SKIP LIST BASIC TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestNewSkipList verifies a fresh skip list has a non-nil head sentinel
// and an initial height of 1.
func TestNewSkipList(t *testing.T) {
	sl := NewSkipList()

	if sl.Head == nil {
		t.Error("NewSkipList() created nil Head")
	}

	if sl.Height != 1 {
		t.Errorf("NewSkipList() Height = %d, want 1", sl.Height)
	}
}

// TestSkipList_Insert_Single verifies a single inserted position can be
// found again and round-trips unchanged.
func TestSkipList_Insert_Single(t *testing.T) {
	sl := NewSkipList()
	pos := Position{DocumentID: 1, Offset: 5}

	sl.Insert(pos)

	// Verify the element was inserted
	found, err := sl.Find(pos)
	if err != nil {
		t.Errorf("Find() error = %v, want nil", err)
	}

	if !found.Equals(pos) {
		t.Errorf("Find() = %v, want %v", found, pos)
	}
}

// TestSkipList_Insert_Multiple inserts positions across several documents
// and verifies every one is findable afterwards.
func TestSkipList_Insert_Multiple(t *testing.T) {
	sl := NewSkipList()

	positions := []Position{
		{DocumentID: 1, Offset: 5},
		{DocumentID: 1, Offset: 10},
		{DocumentID: 2, Offset: 0},
		{DocumentID: 2, Offset: 15},
		{DocumentID: 3, Offset: 7},
	}

	// Insert all positions
	for _, pos := range positions {
		sl.Insert(pos)
	}

	// Verify all can be found
	for _, pos := range positions {
		found, err := sl.Find(pos)
		if err != nil {
			t.Errorf("Find(%v) error = %v, want nil", pos, err)
		}
		if !found.Equals(pos) {
			t.Errorf("Find(%v) = %v, want %v", pos, found, pos)
		}
	}
}

// TestSkipList_Insert_Duplicate verifies inserting the same position twice
// stores it only once (skip list acts as a set).
func TestSkipList_Insert_Duplicate(t *testing.T) {
	sl := NewSkipList()
	pos := Position{DocumentID: 1, Offset: 5}

	// Insert twice
	sl.Insert(pos)
	sl.Insert(pos)

	// Should only exist once
	found, err := sl.Find(pos)
	if err != nil {
		t.Errorf("Find() error = %v, want nil", err)
	}
	if !found.Equals(pos) {
		t.Errorf("Find() = %v, want %v", found, pos)
	}

	// Count elements using iterator.
	// The iterator starts positioned ON the first element, so it is counted
	// before the HasNext/Next loop consumes the rest.
	count := 0
	iter := sl.Iterator()
	// First element is at current position
	if iter.current != nil {
		count++
	}
	// Rest of elements via HasNext/Next
	for iter.HasNext() {
		iter.Next()
		count++
	}

	if count != 1 {
		t.Errorf("Skip list has %d elements, want 1", count)
	}
}

// TestSkipList_Insert_OutOfOrder inserts positions in a jumbled order and
// verifies iteration yields them fully sorted (doc-major, then offset).
func TestSkipList_Insert_OutOfOrder(t *testing.T) {
	sl := NewSkipList()

	// Insert in a deliberately jumbled (non-sorted) order
	positions := []Position{
		{DocumentID: 5, Offset: 10},
		{DocumentID: 3, Offset: 7},
		{DocumentID: 4, Offset: 2},
		{DocumentID: 1, Offset: 0},
		{DocumentID: 2, Offset: 5},
	}

	for _, pos := range positions {
		sl.Insert(pos)
	}

	// Verify they're stored in sorted order
	expected := []Position{
		{DocumentID: 1, Offset: 0},
		{DocumentID: 2, Offset: 5},
		{DocumentID: 3, Offset: 7},
		{DocumentID: 4, Offset: 2},
		{DocumentID: 5, Offset: 10},
	}

	// Get all positions using iterator
	var result []Position
	iter := sl.Iterator()
	// Get first element
	if iter.current != nil {
		result = append(result, iter.current.Key)
	}
	// Get remaining elements
	for iter.HasNext() {
		result = append(result, iter.Next())
	}

	if len(result) != len(expected) {
		t.Fatalf("Got %d positions, want %d", len(result), len(expected))
	}

	for idx, pos := range result {
		if !pos.Equals(expected[idx]) {
			t.Errorf("Position at index %d = %v, want %v", idx, pos, expected[idx])
		}
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// SEARCH AND FIND TESTS
//
═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_Find_NotFound verifies Find returns ErrKeyNotFound for a key
// that falls between existing entries.
func TestSkipList_Find_NotFound(t *testing.T) {
	sl := NewSkipList()
	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 2, Offset: 10})

	pos := Position{DocumentID: 1, Offset: 7}
	_, err := sl.Find(pos)

	if err != ErrKeyNotFound {
		t.Errorf("Find() error = %v, want %v", err, ErrKeyNotFound)
	}
}

// TestSkipList_Find_EmptyList verifies Find on an empty list reports
// ErrKeyNotFound rather than panicking.
func TestSkipList_Find_EmptyList(t *testing.T) {
	sl := NewSkipList()

	pos := Position{DocumentID: 1, Offset: 0}
	_, err := sl.Find(pos)

	if err != ErrKeyNotFound {
		t.Errorf("Find() error = %v, want %v", err, ErrKeyNotFound)
	}
}

// TestSkipList_FindLessThan table-tests strict predecessor lookup, including
// the boundary cases before the first element (BOFDocument + ErrNoElementFound).
func TestSkipList_FindLessThan(t *testing.T) {
	sl := NewSkipList()

	// Insert: 5, 10, 15, 20
	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 1, Offset: 10})
	sl.Insert(Position{DocumentID: 1, Offset: 15})
	sl.Insert(Position{DocumentID: 1, Offset: 20})

	tests := []struct {
		name    string
		key     Position
		want    Position
		wantErr error
	}{
		{
			"Find less than 17",
			Position{DocumentID: 1, Offset: 17},
			Position{DocumentID: 1, Offset: 15},
			nil,
		},
		{
			"Find less than 15",
			Position{DocumentID: 1, Offset: 15},
			Position{DocumentID: 1, Offset: 10},
			nil,
		},
		{
			"Find less than 5 (first element)",
			Position{DocumentID: 1, Offset: 5},
			BOFDocument,
			ErrNoElementFound,
		},
		{
			"Find less than 0 (before first)",
			Position{DocumentID: 1, Offset: 0},
			BOFDocument,
			ErrNoElementFound,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := sl.FindLessThan(tt.key)

			if err != tt.wantErr {
				t.Errorf("FindLessThan() error = %v, want %v", err, tt.wantErr)
			}

			if !got.Equals(tt.want) {
				t.Errorf("FindLessThan() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestSkipList_FindGreaterThan table-tests strict successor lookup, including
// the boundary cases past the last element (EOFDocument + ErrNoElementFound).
func TestSkipList_FindGreaterThan(t *testing.T) {
	sl := NewSkipList()

	// Insert: 5, 10, 15, 20
	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 1, Offset: 10})
	sl.Insert(Position{DocumentID: 1, Offset: 15})
	sl.Insert(Position{DocumentID: 1, Offset: 20})

	tests := []struct {
		name    string
		key     Position
		want    Position
		wantErr error
	}{
		{
			"Find greater than 10 (exists)",
			Position{DocumentID: 1, Offset: 10},
			Position{DocumentID: 1, Offset: 15},
			nil,
		},
		{
			"Find greater than 12 (doesn't exist)",
			Position{DocumentID: 1, Offset: 12},
			Position{DocumentID: 1, Offset: 15},
			nil,
		},
		{
			"Find greater than 20 (last element)",
			Position{DocumentID: 1, Offset: 20},
			EOFDocument,
			ErrNoElementFound,
		},
		{
			"Find greater than 25 (after last)",
			Position{DocumentID: 1, Offset: 25},
			EOFDocument,
			ErrNoElementFound,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := sl.FindGreaterThan(tt.key)

			if err != tt.wantErr {
				t.Errorf("FindGreaterThan() error = %v, want %v", err, tt.wantErr)
			}

			if !got.Equals(tt.want) {
				t.Errorf("FindGreaterThan() = %v, want %v", got, tt.want)
			}
		})
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// DELETE TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_Delete_Single verifies deleting the only element succeeds and
// the element is no longer findable.
func TestSkipList_Delete_Single(t *testing.T) {
	sl := NewSkipList()
	pos := Position{DocumentID: 1, Offset: 5}

	sl.Insert(pos)

	// Delete the element
	deleted := sl.Delete(pos)
	if !deleted {
		t.Error("Delete() = false, want true")
	}

	// Verify it's gone
	_, err := sl.Find(pos)
	if err != ErrKeyNotFound {
		t.Errorf("Find() after delete error = %v, want %v", err, ErrKeyNotFound)
	}
}

// TestSkipList_Delete_Middle verifies deleting an interior element leaves the
// remaining elements intact and findable.
func TestSkipList_Delete_Middle(t *testing.T) {
	sl := NewSkipList()

	// Insert: 5, 10, 15, 20
	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 1, Offset: 10})
	sl.Insert(Position{DocumentID: 1, Offset: 15})
	sl.Insert(Position{DocumentID: 1, Offset: 20})

	// Delete middle element
	deleted := sl.Delete(Position{DocumentID: 1, Offset: 10})
	if !deleted {
		t.Error("Delete() = false, want true")
	}

	// Verify it's gone
	_, err := sl.Find(Position{DocumentID: 1, Offset: 10})
	if err != ErrKeyNotFound {
		t.Errorf("Find() after delete error = %v, want %v", err, ErrKeyNotFound)
	}

	// Verify others still exist
	remaining := []Position{
		{DocumentID: 1, Offset: 5},
		{DocumentID: 1, Offset: 15},
		{DocumentID: 1, Offset: 20},
	}

	for _, pos := range remaining {
		_, err := sl.Find(pos)
		if err != nil {
			t.Errorf("Find(%v) error = %v, want nil", pos, err)
		}
	}
}

// TestSkipList_Delete_NotFound verifies deleting an absent key returns false.
func TestSkipList_Delete_NotFound(t *testing.T) {
	sl := NewSkipList()
	sl.Insert(Position{DocumentID: 1, Offset: 5})

	deleted := sl.Delete(Position{DocumentID: 1, Offset: 10})
	if deleted {
		t.Error("Delete() = true, want false (element not found)")
	}
}

// TestSkipList_Delete_EmptyList verifies deleting from an empty list returns
// false rather than panicking.
func TestSkipList_Delete_EmptyList(t *testing.T) {
	sl := NewSkipList()

	deleted := sl.Delete(Position{DocumentID: 1, Offset: 0})
	if deleted {
		t.Error("Delete() = true, want false (empty list)")
	}
}

// TestSkipList_Delete_All deletes every inserted element and verifies the
// list is empty afterwards.
func TestSkipList_Delete_All(t *testing.T) {
	sl := NewSkipList()

	positions := []Position{
		{DocumentID: 1, Offset: 5},
		{DocumentID: 1, Offset: 10},
		{DocumentID: 2, Offset: 15},
	}

	// Insert all
	for _, pos := range positions {
		sl.Insert(pos)
	}

	// Delete all
	for _, pos := range positions {
		deleted := sl.Delete(pos)
		if !deleted {
			t.Errorf("Delete(%v) = false, want true", pos)
		}
	}

	// Verify list is empty
	iter := sl.Iterator()
	if iter.HasNext() {
		t.Error("List should be empty after deleting all elements")
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// ITERATOR TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_Iterator_Empty verifies the iterator over an empty list has no
// elements and Next() degrades to the EOF sentinel.
func TestSkipList_Iterator_Empty(t *testing.T) {
	sl := NewSkipList()
	iter := sl.Iterator()

	if iter.HasNext() {
		t.Error("HasNext() = true for empty list, want false")
	}

	pos := iter.Next()
	if !pos.Equals(EOFDocument) {
		t.Errorf("Next() on empty list = %v, want %v", pos, EOFDocument)
	}
}

// TestSkipList_Iterator_Single verifies the iterator starts positioned on the
// first (and only) element, with no further elements.
func TestSkipList_Iterator_Single(t *testing.T) {
	sl := NewSkipList()
	expected := Position{DocumentID: 1, Offset: 5}
	sl.Insert(expected)

	iter := sl.Iterator()

	// First element is at current position
	if iter.current == nil {
		t.Fatal("Iterator current is nil, expected first element")
	}

	pos := iter.current.Key
	if !pos.Equals(expected) {
		t.Errorf("First element = %v, want %v", pos, expected)
	}

	// Should have no next elements
	if iter.HasNext() {
		t.Error("HasNext() = true for single element list, want false")
	}
}

// TestSkipList_Iterator_Multiple verifies full iteration returns every
// inserted element in sorted order.
func TestSkipList_Iterator_Multiple(t *testing.T) {
	sl := NewSkipList()

	expected := []Position{
		{DocumentID: 1, Offset: 5},
		{DocumentID: 1, Offset: 10},
		{DocumentID: 2, Offset: 0},
		{DocumentID: 2, Offset: 15},
		{DocumentID: 3, Offset: 7},
	}

	// Insert all
	for _, pos := range expected {
		sl.Insert(pos)
	}

	// Get all positions using iterator
	var result []Position
	iter := sl.Iterator()
	// Get first element (the iterator starts positioned on it)
	if iter.current != nil {
		result = append(result, iter.current.Key)
	}
	// Get remaining elements
	for iter.HasNext() {
		result = append(result, iter.Next())
	}

	if len(result) != len(expected) {
		t.Errorf("Iterator returned %d elements, want %d", len(result), len(expected))
	}

	for idx, pos := range result {
		if idx >= len(expected) {
			t.Fatalf("Iterator returned more elements than expected")
		}

		if !pos.Equals(expected[idx]) {
			t.Errorf("Position at index %d = %v, want %v", idx, pos, expected[idx])
		}
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// LAST OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_Last_Empty verifies Last() on an empty list yields the head
// sentinel's zero-value key.
func TestSkipList_Last_Empty(t *testing.T) {
	sl := NewSkipList()
	last := sl.Last()

	// In an empty list, Last() returns the head's key (which is zero value)
	if last.DocumentID != 0 || last.Offset != 0 {
		t.Errorf("Last() on empty list = %v, want zero position", last)
	}
}

// TestSkipList_Last_Single verifies Last() returns the only element.
func TestSkipList_Last_Single(t *testing.T) {
	sl := NewSkipList()
	expected := Position{DocumentID: 1, Offset: 5}
	sl.Insert(expected)

	last := sl.Last()
	if !last.Equals(expected) {
		t.Errorf("Last() = %v, want %v", last, expected)
	}
}

// TestSkipList_Last_Multiple verifies Last() returns the largest element.
func TestSkipList_Last_Multiple(t *testing.T) {
	sl := NewSkipList()

	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 2, Offset: 10})
	sl.Insert(Position{DocumentID: 3, Offset: 15})

	expected := Position{DocumentID: 3, Offset: 15}
	last := sl.Last()

	if !last.Equals(expected) {
		t.Errorf("Last() = %v, want %v", last, expected)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// EDGE CASE AND STRESS TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_SameDocument_DifferentOffsets verifies offsets within one
// document are kept in ascending order.
func TestSkipList_SameDocument_DifferentOffsets(t *testing.T) {
	sl := NewSkipList()

	// Insert multiple positions in the same document
	for offset := 0; offset < 10; offset++ {
		sl.Insert(Position{DocumentID: 1, Offset: offset})
	}

	// Verify all are present and in order
	var result []Position
	iter := sl.Iterator()
	// Get first element
	if iter.current != nil {
		result = append(result, iter.current.Key)
	}
	// Get remaining elements
	for iter.HasNext() {
		result = append(result, iter.Next())
	}

	if len(result) != 10 {
		t.Errorf("Found %d positions, want 10", len(result))
	}

	for offset, pos := range result {
		expected := Position{DocumentID: 1, Offset: offset}

		if !pos.Equals(expected) {
			t.Errorf("Position at offset %d = %v, want %v", offset, pos, expected)
		}
	}
}

// TestSkipList_MultipleDocs_MultipleOffsets verifies a doc × offset grid
// iterates in doc-major order.
func TestSkipList_MultipleDocs_MultipleOffsets(t *testing.T) {
	sl := NewSkipList()

	// Insert grid: 3 documents x 5 offsets each
	for doc := 1; doc <= 3; doc++ {
		for offset := 0; offset < 5; offset++ {
			sl.Insert(Position{DocumentID: doc, Offset: offset})
		}
	}

	// Get all positions using iterator
	var result []Position
	iter := sl.Iterator()
	// Get first element
	if iter.current != nil {
		result = append(result, iter.current.Key)
	}
	// Get remaining elements
	for iter.HasNext() {
		result = append(result, iter.Next())
	}

	if len(result) != 15 {
		t.Errorf("Found %d positions, want 15", len(result))
	}

	// Verify ordering (should be doc-major order)
	idx := 0
	for doc := 1; doc <= 3; doc++ {
		for offset := 0; offset < 5; offset++ {
			if idx >= len(result) {
				t.Fatal("Not enough positions in result")
			}

			expected := Position{DocumentID: doc, Offset: offset}

			if !result[idx].Equals(expected) {
				t.Errorf("Position at index %d = %v, want %v", idx, result[idx], expected)
			}

			idx++
		}
	}
}

// TestSkipList_LargeDataset inserts 1000 distinct positions (100 docs × 10
// offsets) and spot-checks membership plus total count.
func TestSkipList_LargeDataset(t *testing.T) {
	sl := NewSkipList()

	// Insert 1000 positions
	n := 1000
	for i := 0; i < n; i++ {
		sl.Insert(Position{DocumentID: i / 10, Offset: i % 10})
	}

	// Verify count
	count := 0
	iter := sl.Iterator()
	// Count first element
	if iter.current != nil {
		count++
	}
	// Count remaining elements
	for iter.HasNext() {
		iter.Next()
		count++
	}

	if count != n {
		t.Errorf("Found %d positions, want %d", count, n)
	}

	// Spot check some positions
	testPositions := []Position{
		{DocumentID: 0, Offset: 0},
		{DocumentID: 50, Offset: 5},
		{DocumentID: 99, Offset: 9},
	}

	for _, pos := range testPositions {
		found, err := sl.Find(pos)
		if err != nil {
			t.Errorf("Find(%v) error = %v, want nil", pos, err)
		}
		if !found.Equals(pos) {
			t.Errorf("Find(%v) = %v, want %v", pos, found, pos)
		}
	}
}

// TestSkipList_InfinityValues verifies the BOF/EOF sentinels are the extreme
// int values and order correctly against regular positions.
func TestSkipList_InfinityValues(t *testing.T) {
	// Test that sentinel values work correctly
	if BOF >= 0 {
		t.Error("BOF should be negative (math.MinInt)")
	}

	if EOF <= 0 {
		t.Error("EOF should be positive (math.MaxInt)")
	}

	if BOF != math.MinInt {
		t.Errorf("BOF should be math.MinInt, got %d", BOF)
	}

	if EOF != math.MaxInt {
		t.Errorf("EOF should be math.MaxInt, got %d", EOF)
	}

	// BOF should be less than any regular position
	regularPos := Position{DocumentID: 0, Offset: 0}
	if !BOFDocument.IsBefore(regularPos) {
		t.Error("BOF should be before any regular position")
	}

	// EOF should be greater than any regular position
	if !regularPos.IsBefore(EOFDocument) {
		t.Error("Any regular position should be before EOF")
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// BENCHMARK TESTS
// ═══════════════════════════════════════════════════════════════════════════════

func BenchmarkSkipList_Insert(b *testing.B) {
	sl := NewSkipList()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sl.Insert(Position{DocumentID: i / 1000, Offset: i % 1000})
	}
}

func BenchmarkSkipList_Find(b *testing.B) {
	sl := NewSkipList()

	// Pre-populate with 10000 elements
	for i := 0; i < 10000; i++ {
		sl.Insert(Position{DocumentID: i / 100, Offset: i % 100})
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sl.Find(Position{DocumentID: i / 100 % 100, Offset: i % 100})
	}
}

func BenchmarkSkipList_Delete(b *testing.B) {
	// Re-populate for each iteration
	// NOTE(review): the key {i/10 % 100, i%10} is not guaranteed to exist in
	// the freshly built 1000-element list for every i, so some iterations
	// measure a failed delete — confirm that is the intended workload.
	for i := 0; i < b.N; i++ {
		b.StopTimer()
		sl := NewSkipList()
		for j := 0; j < 1000; j++ {
			sl.Insert(Position{DocumentID: j / 10, Offset: j % 10})
		}
		b.StartTimer()

		sl.Delete(Position{DocumentID: i / 10 % 100, Offset: i % 10})
	}
}

func BenchmarkSkipList_Iterator(b *testing.B) {
	sl := NewSkipList()

	// Pre-populate
with 1000 elements
	for i := 0; i < 1000; i++ {
		sl.Insert(Position{DocumentID: i / 10, Offset: i % 10})
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		iter := sl.Iterator()
		// Process first element
		if iter.current != nil {
			_ = iter.current.Key
		}
		// Process remaining elements
		for iter.HasNext() {
			iter.Next()
		}
	}
}

func BenchmarkSkipList_FindLessThan(b *testing.B) {
	sl := NewSkipList()

	// Pre-populate with 10000 elements
	for i := 0; i < 10000; i++ {
		sl.Insert(Position{DocumentID: i / 100, Offset: i % 100})
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sl.FindLessThan(Position{DocumentID: i / 100 % 100, Offset: i % 100})
	}
}

func BenchmarkSkipList_FindGreaterThan(b *testing.B) {
	sl := NewSkipList()

	// Pre-populate with 10000 elements
	for i := 0; i < 10000; i++ {
		sl.Insert(Position{DocumentID: i / 100, Offset: i % 100})
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sl.FindGreaterThan(Position{DocumentID: i / 100 % 100, Offset: i % 100})
	}
}
--------------------------------------------------------------------------------
/serialization.go:
--------------------------------------------------------------------------------
package blaze

import (
	"bytes"
	"encoding/binary"
	"math"

	"github.com/RoaringBitmap/roaring"
)

// ═══════════════════════════════════════════════════════════════════════════════
// SERIALIZATION: Saving and Loading the Index
// ═══════════════════════════════════════════════════════════════════════════════
// Why serialize?
15 | // - Save index to disk for persistence 16 | // - Send index over network 17 | // - Create backups 18 | // 19 | // BINARY FORMAT: 20 | // -------------- 21 | // We use a custom binary format for efficiency: 22 | // - Smaller file size than JSON (important for large indexes) 23 | // - Faster to parse than JSON 24 | // - Preserves exact structure (including skip list towers) 25 | // 26 | // FORMAT STRUCTURE: 27 | // ----------------- 28 | // For each term: 29 | // [term_length: uint32][term: bytes] 30 | // [node_data_length: uint32][node_data: positions...] 31 | // [tower_data: for each node...] 32 | // 33 | // ENCODING STRATEGY: 34 | // ------------------ 35 | // The tricky part is encoding the skip list tower structure: 36 | // 1. Assign each node a sequential index (1, 2, 3, ...) 37 | // 2. Store node positions (DocID, Offset pairs) 38 | // 3. Store tower pointers as indices (not memory addresses!) 39 | // 40 | // Why use indices instead of pointers? 41 | // - Pointers are meaningless after deserialization (different memory locations) 42 | // - Indices are stable and can be reconstructed 43 | // 44 | // ═══════════════════════════════════════════════════════════════════════════════ 45 | 46 | // Encode serializes the inverted index to binary format 47 | // 48 | // COMPLETE EXAMPLE: 49 | // ----------------- 50 | // Index contains: 51 | // 52 | // "quick" → SkipList with nodes at [Doc1:Pos1, Doc3:Pos0] 53 | // "brown" → SkipList with nodes at [Doc1:Pos2] 54 | // 55 | // Encoded format: 56 | // 57 | // [5]['q','u','i','c','k'] ← Term name 58 | // [16][1,1,3,0] ← Node positions (2 positions × 8 bytes each) 59 | // [4][2][2][0] ← Tower structure (node1→node2, node2→nil) 60 | // [5]['b','r','o','w','n'] ← Next term 61 | // [8][1,2] ← Node position 62 | // [2][0] ← Tower structure (only one node, no next) 63 | // 64 | // The encoder object keeps track of our position in the output buffer. 
// Encode serializes the inverted index with HYBRID STORAGE including BM25 statistics
//
// BINARY FORMAT:
// --------------
// [Header]
//   - TotalDocs: uint32
//   - TotalTerms: uint64
//   - BM25.K1: float64
//   - BM25.B: float64
//   - NumDocStats: uint32
//
// [Document Statistics] (for each document)
//   - DocID: uint32
//   - Length: uint32
//   - NumTerms: uint32
//   - For each term:
//   - TermLength: uint32
//   - Term: bytes
//   - Frequency: uint32
//
// [Roaring Bitmaps] (NEW - for fast document lookups)
//   - NumBitmaps: uint32
//   - For each term:
//   - TermLength: uint32
//   - Term: bytes
//   - BitmapLength: uint32
//   - Bitmap: bytes (roaring's native serialization)
//
// [Posting Lists] (position data for phrase search)
//   - For each term...
//
// NOTE(review): the posting lists (and the doc-stats / bitmap sections below)
// are written by ranging over Go maps, whose iteration order is randomized —
// so two Encode() calls on the same index can produce different byte streams.
// The decoder presumably does not depend on term order (each record is
// self-describing), but byte-level reproducibility (checksums, diffs) is
// lost; consider sorting the keys if deterministic output is ever needed.
func (idx *InvertedIndex) Encode() ([]byte, error) {
	buf := new(bytes.Buffer)

	// Write header with BM25 metadata
	if err := idx.encodeHeader(buf); err != nil {
		return nil, err
	}

	// Write document statistics
	if err := idx.encodeDocStats(buf); err != nil {
		return nil, err
	}

	// Write roaring bitmaps (NEW!)
	if err := idx.encodeRoaringBitmaps(buf); err != nil {
		return nil, err
	}

	// Write posting lists (existing format)
	encoder := newIndexEncoder(buf)
	for term, skipList := range idx.PostingsList {
		if err := encoder.encodeTerm(term, skipList); err != nil {
			return nil, err
		}
	}

	return buf.Bytes(), nil
}

// encodeHeader writes the index metadata: corpus-wide counters, the BM25
// tuning parameters, and the number of per-document stat records that follow.
// All integers are little-endian; K1/B are written as IEEE-754 float64 bits.
func (idx *InvertedIndex) encodeHeader(buf *bytes.Buffer) error {
	// Write corpus statistics
	if err := binary.Write(buf, binary.LittleEndian, uint32(idx.TotalDocs)); err != nil {
		return err
	}
	if err := binary.Write(buf, binary.LittleEndian, uint64(idx.TotalTerms)); err != nil {
		return err
	}

	// Write BM25 parameters
	if err := binary.Write(buf, binary.LittleEndian, idx.BM25Params.K1); err != nil {
		return err
	}
	if err := binary.Write(buf, binary.LittleEndian, idx.BM25Params.B); err != nil {
		return err
	}

	// Write number of documents with statistics
	if err := binary.Write(buf, binary.LittleEndian, uint32(len(idx.DocStats))); err != nil {
		return err
	}

	return nil
}

// encodeDocStats writes document statistics for BM25: per document its ID,
// length, and the (term, frequency) pairs of its unique terms.
//
// NOTE(review): both idx.DocStats and docStats.TermFreqs are ranged as maps,
// so record order is nondeterministic between runs (see note on Encode).
func (idx *InvertedIndex) encodeDocStats(buf *bytes.Buffer) error {
	for _, docStats := range idx.DocStats {
		// Write document ID and length
		if err := binary.Write(buf, binary.LittleEndian, uint32(docStats.DocID)); err != nil {
			return err
		}
		if err := binary.Write(buf, binary.LittleEndian, uint32(docStats.Length)); err != nil {
			return err
		}

		// Write number of unique terms
		if err := binary.Write(buf, binary.LittleEndian, uint32(len(docStats.TermFreqs))); err != nil {
			return err
		}

		// Write each term and its frequency
		for term, freq := range docStats.TermFreqs {
			// Write term length and term
			termBytes := []byte(term)
			if err := binary.Write(buf, binary.LittleEndian, uint32(len(termBytes))); err != nil {
				return err
			}
			if _, err := buf.Write(termBytes); err != nil {
				return err
			}

			// Write frequency
			if err := binary.Write(buf, binary.LittleEndian, uint32(freq)); err != nil {
				return err
			}
		}
	}

	return nil
}

// encodeRoaringBitmaps writes the roaring bitmaps for document-level storage
//
// ROARING BITMAP SERIALIZATION:
// ------------------------------
// Roaring bitmaps have their own efficient binary format via ToBytes()
// We just need to wrap it with term names and lengths
//
// FORMAT:
// -------
// [NumBitmaps: uint32]
// For each bitmap:
//
//	[TermLength: uint32][Term: bytes]
//	[BitmapLength: uint32][Bitmap: bytes]
//
// EXAMPLE:
// --------
// Term "quick" appears in documents [1, 3, 5, 100, 500]
// Roaring serializes this to ~20 bytes (vs 40 bytes for raw integers!)
//
// COMPRESSION BENEFITS:
// ---------------------
// For term "the" in 500,000 documents:
// - Skip list: ~24 MB (500k nodes × 48 bytes)
// - Roaring bitmap: ~60 KB (400x compression!)
212 | func (idx *InvertedIndex) encodeRoaringBitmaps(buf *bytes.Buffer) error { 213 | // Write number of bitmaps 214 | if err := binary.Write(buf, binary.LittleEndian, uint32(len(idx.DocBitmaps))); err != nil { 215 | return err 216 | } 217 | 218 | // Write each term and its bitmap 219 | for term, bitmap := range idx.DocBitmaps { 220 | // Write term name 221 | termBytes := []byte(term) 222 | if err := binary.Write(buf, binary.LittleEndian, uint32(len(termBytes))); err != nil { 223 | return err 224 | } 225 | if _, err := buf.Write(termBytes); err != nil { 226 | return err 227 | } 228 | 229 | // Write roaring bitmap (it has its own compact serialization) 230 | bitmapBytes, err := bitmap.ToBytes() 231 | if err != nil { 232 | return err 233 | } 234 | if err := binary.Write(buf, binary.LittleEndian, uint32(len(bitmapBytes))); err != nil { 235 | return err 236 | } 237 | if _, err := buf.Write(bitmapBytes); err != nil { 238 | return err 239 | } 240 | } 241 | 242 | return nil 243 | } 244 | 245 | // indexEncoder handles the encoding process 246 | // 247 | // This encapsulates the encoding state and provides helper methods. 248 | // Using a struct is cleaner than passing a buffer around everywhere. 
type indexEncoder struct {
	buffer *bytes.Buffer // Accumulates the serialized data
}

// newIndexEncoder wraps the given buffer in an encoder; all write helpers
// append to this buffer.
func newIndexEncoder(buffer *bytes.Buffer) *indexEncoder {
	return &indexEncoder{
		buffer: buffer,
	}
}

// encodeTerm serializes a single term and its skip list
//
// FOUR-PHASE ENCODING:
// --------------------
// Phase 1: Write the term name
// Phase 2: Build the node-index map (pointer → stable integer index)
// Phase 3: Write node positions (DocID, Offset pairs)
// Phase 4: Write tower structure (how nodes link together)
func (e *indexEncoder) encodeTerm(term string, skipList SkipList) error {
	// PHASE 1: Write term name
	// Format: [length: uint32][bytes]
	if err := e.writeString(term); err != nil {
		return err
	}

	// PHASE 2: Build node index map
	// Assign each node a sequential index: Head=1, Next=2, etc.
	// This map lets us convert node pointers to indices
	nodeMap := e.buildNodeIndexMap(skipList)

	// PHASE 3: Write node positions
	// Format: [length: uint32][DocID: uint32][Offset: uint32]...
	nodeData := e.encodeNodePositions(skipList)
	if err := e.writeBytes(nodeData); err != nil {
		return err
	}

	// PHASE 4: Write tower structure
	// This is the most complex part - see encodeTowerStructure
	return e.encodeTowerStructure(skipList, nodeMap)
}

// writeString writes a length-prefixed string
//
// Format: [length: 4 bytes][string: length bytes]
//
// Example: "quick" (5 characters)
//
//	Binary: [0x05, 0x00, 0x00, 0x00, 'q', 'u', 'i', 'c', 'k']
//	        ^^^^^^^^^^^^^^^^^^^^      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
//	        length = 5                actual string bytes
func (e *indexEncoder) writeString(s string) error {
	data := []byte(s)

	// Write length as 32-bit unsigned integer (4 bytes)
	if err := binary.Write(e.buffer, binary.LittleEndian, uint32(len(data))); err != nil {
		return err
	}

	// Write the actual string bytes
	_, err := e.buffer.Write(data)
	return err
}

// writeBytes writes a length-prefixed byte array
//
// Same as writeString but for arbitrary byte data
func (e *indexEncoder) writeBytes(data []byte) error {
	// Write length prefix
	if err := binary.Write(e.buffer, binary.LittleEndian, uint32(len(data))); err != nil {
		return err
	}

	// Write the data
	_, err := e.buffer.Write(data)
	return err
}

// buildNodeIndexMap creates a mapping from node positions to sequential indices
//
// WHY DO WE NEED THIS?
// --------------------
// Skip list nodes are connected via pointers (memory addresses).
// We can't serialize pointers because:
// 1. Memory addresses change between program runs
// 2. Addresses are meaningless on different machines
//
// Solution: Assign each node a stable index (1, 2, 3, ...)
// Then we can say "Node 1 points to Node 3" instead of memory addresses.
//
// NOTE(review): the traversal below starts AT the Head sentinel, so Head
// itself receives index 1 and the first real node receives index 2 — the
// worked examples in the comments above ("{Doc1:Pos1} → Index 1") appear to
// be off by one relative to the code; verify against the decoder.
func (e *indexEncoder) buildNodeIndexMap(skipList SkipList) map[nodePosition]int {
	nodeMap := make(map[nodePosition]int)
	current := skipList.Head
	index := 1 // Start from 1 (0 means nil/null)

	// Traverse the bottom level of the skip list
	for current != nil {
		// Create a compact position identifier
		pos := nodePosition{
			DocID:    int32(current.Key.DocumentID),
			Position: int32(current.Key.Offset),
		}

		// Assign this node the next sequential index
		nodeMap[pos] = index
		index++

		// Move to next node
		current = current.Tower[0]
	}

	return nodeMap
}

// encodeNodePositions serializes all node positions (DocID, Offset pairs)
//
// FORMAT:
// -------
// For each node: [DocID: int32][Offset: int32]
//
// NOTE(review): like buildNodeIndexMap, this starts at the Head sentinel, so
// the sentinel's zero-value key is serialized as the first (DocID, Offset)
// pair; the decoder must account for that leading entry.
//
// The binary.Write return values are deliberately ignored here: writing to a
// bytes.Buffer cannot fail (Buffer.Write always returns a nil error).
func (e *indexEncoder) encodeNodePositions(skipList SkipList) []byte {
	buf := new(bytes.Buffer)
	current := skipList.Head

	// Traverse all nodes in the skip list
	for current != nil {
		// Write document ID (4 bytes)
		binary.Write(buf, binary.LittleEndian, int32(current.Key.DocumentID))

		// Write offset (4 bytes)
		binary.Write(buf, binary.LittleEndian, int32(current.Key.Offset))

		// Move to next node
		current = current.Tower[0]
	}

	return buf.Bytes()
}

// encodeTowerStructure serializes the skip list tower connections
//
// ENCODING STRATEGY:
// ------------------
// For each node (including the Head sentinel), encode which nodes its tower
// points to, as stable indices from nodeMap rather than memory addresses.
// Each node's tower is written as one length-prefixed record (see
// encodeTowerForNode for the per-node layout).
func (e *indexEncoder) encodeTowerStructure(skipList SkipList, nodeMap map[nodePosition]int) error {
	current := skipList.Head

	// Encode tower for each node in the skip list
	for current != nil {
		towerData := e.encodeTowerForNode(current, nodeMap)
		if err := e.writeBytes(towerData); err != nil {
			return err
		}
		current = current.Tower[0]
	}

	return nil
}

// encodeTowerForNode encodes the tower structure for a single node
//
// PROCESS:
// --------
// 1. Collect all non-nil tower pointers
// 2. Convert each pointer to its index (using nodeMap)
// 3. Write indices as uint16 values
//
// Special case: If tower is empty (no forward pointers), write [0]
//
// NOTE(review): indices are truncated to uint16, so a posting list with more
// than 65535 nodes for one term would silently wrap and corrupt the encoded
// tower links — confirm whether that bound is enforced elsewhere.
func (e *indexEncoder) encodeTowerForNode(node *Node, nodeMap map[nodePosition]int) []byte {
	buf := new(bytes.Buffer)

	// Collect all non-nil tower levels
	towerIndices := e.collectTowerIndices(node, nodeMap)

	if len(towerIndices) == 0 {
		// Empty tower - write a single zero
		binary.Write(buf, binary.LittleEndian, uint16(0))
	} else {
		// Write each index as a 2-byte value
		for _, index := range towerIndices {
			binary.Write(buf, binary.LittleEndian, uint16(index))
		}
	}

	return buf.Bytes()
}

// collectTowerIndices extracts tower pointers and converts them to indices
//
// Walks the node's tower from level 0 upward, stopping at the first nil
// pointer (towers are assumed contiguous from level 0), and maps each target
// node's (DocID, Offset) through nodeMap to its stable integer index.
//
// Example: tower [PtrA, PtrB, nil, ...] where nodeMap gives A→3, B→7
// yields [3, 7].
func (e *indexEncoder) collectTowerIndices(node *Node, nodeMap map[nodePosition]int) []int {
	var indices []int

	// Walk up the tower until we hit a nil pointer
	for level := 0; level < MaxHeight; level++ {
		if node.Tower[level] == nil {
			break // No more levels
		}

		// Get the position of the target node
		pos := nodePosition{
			DocID:    int32(node.Tower[level].Key.DocumentID),
			Position: int32(node.Tower[level].Key.Offset),
		}

		// Look up the target node's index
		indices = append(indices, nodeMap[pos])
	}

	return indices
}

// nodePosition represents a
compact node position for encoding 519 | // 520 | // We use int32 to match our internal representation: 521 | // - Document IDs are integers 522 | // - Positions are integers 523 | // - Sentinel values (BOF/EOF) use int as well 524 | // 525 | // int32 provides: 526 | // - 4 bytes per value (efficient storage) 527 | // - Range: -2,147,483,648 to 2,147,483,647 528 | // - Sufficient for document IDs and positions 529 | type nodePosition struct { 530 | DocID int32 531 | Position int32 532 | } 533 | 534 | // ═══════════════════════════════════════════════════════════════════════════════ 535 | // DESERIALIZATION: Loading the Index from Binary Data 536 | // ═══════════════════════════════════════════════════════════════════════════════ 537 | // This is the reverse of encoding - we read the binary data and reconstruct 538 | // the entire index structure including all skip list pointers. 539 | // 540 | // THREE-PHASE DECODING: 541 | // --------------------- 542 | // Phase 1: Read term names and node positions 543 | // Phase 2: Create node objects 544 | // Phase 3: Reconstruct tower pointers (the tricky part!) 545 | // 546 | // ═══════════════════════════════════════════════════════════════════════════════ 547 | 548 | // Decode deserializes binary data back into an inverted index 549 | // 550 | // PROCESS: 551 | // -------- 552 | // 1. Create a decoder to track our position in the byte array 553 | // 2. Repeatedly decode terms until we reach the end 554 | // 3. Reconstruct the PostingsList map 555 | // 556 | // EXAMPLE: 557 | // -------- 558 | // Input: [5]['quick'][16][1,1,3,0][4][2][2][0]... 
559 | // Output: PostingsList["quick"] = SkipList{...} 560 | // Decode deserializes binary data back into an inverted index with HYBRID STORAGE and BM25 stats 561 | func (idx *InvertedIndex) Decode(data []byte) error { 562 | offset := 0 563 | 564 | // Read header with BM25 metadata 565 | newOffset, err := idx.decodeHeader(data, offset) 566 | if err != nil { 567 | return err 568 | } 569 | offset = newOffset 570 | 571 | // Read document statistics 572 | newOffset, err = idx.decodeDocStats(data, offset) 573 | if err != nil { 574 | return err 575 | } 576 | offset = newOffset 577 | 578 | // Read roaring bitmaps (NEW!) 579 | newOffset, err = idx.decodeRoaringBitmaps(data, offset) 580 | if err != nil { 581 | return err 582 | } 583 | offset = newOffset 584 | 585 | // Read posting lists (existing format) 586 | decoder := newIndexDecoder(data, offset) 587 | recoveredIndex := make(map[string]SkipList) 588 | 589 | for !decoder.isComplete() { 590 | term, skipList, err := decoder.decodeTerm() 591 | if err != nil { 592 | return err 593 | } 594 | recoveredIndex[term] = skipList 595 | } 596 | 597 | idx.PostingsList = recoveredIndex 598 | return nil 599 | } 600 | 601 | // decodeHeader reads the index metadata 602 | func (idx *InvertedIndex) decodeHeader(data []byte, offset int) (int, error) { 603 | // Read corpus statistics 604 | idx.TotalDocs = int(binary.LittleEndian.Uint32(data[offset : offset+4])) 605 | offset += 4 606 | 607 | idx.TotalTerms = int64(binary.LittleEndian.Uint64(data[offset : offset+8])) 608 | offset += 8 609 | 610 | // Read BM25 parameters 611 | idx.BM25Params.K1 = math.Float64frombits(binary.LittleEndian.Uint64(data[offset : offset+8])) 612 | offset += 8 613 | 614 | idx.BM25Params.B = math.Float64frombits(binary.LittleEndian.Uint64(data[offset : offset+8])) 615 | offset += 8 616 | 617 | return offset, nil 618 | } 619 | 620 | // decodeDocStats reads document statistics 621 | func (idx *InvertedIndex) decodeDocStats(data []byte, offset int) (int, error) { 622 | // 
Read number of documents 623 | numDocs := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 624 | offset += 4 625 | 626 | idx.DocStats = make(map[int]DocumentStats, numDocs) 627 | 628 | for i := 0; i < numDocs; i++ { 629 | // Read document ID and length 630 | docID := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 631 | offset += 4 632 | 633 | length := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 634 | offset += 4 635 | 636 | // Read number of unique terms 637 | numTerms := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 638 | offset += 4 639 | 640 | // Initialize document stats 641 | docStats := DocumentStats{ 642 | DocID: docID, 643 | Length: length, 644 | TermFreqs: make(map[string]int, numTerms), 645 | } 646 | 647 | // Read each term and its frequency 648 | for j := 0; j < numTerms; j++ { 649 | // Read term length 650 | termLen := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 651 | offset += 4 652 | 653 | // Read term 654 | term := string(data[offset : offset+termLen]) 655 | offset += termLen 656 | 657 | // Read frequency 658 | freq := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 659 | offset += 4 660 | 661 | docStats.TermFreqs[term] = freq 662 | } 663 | 664 | idx.DocStats[docID] = docStats 665 | } 666 | 667 | return offset, nil 668 | } 669 | 670 | // decodeRoaringBitmaps reads the roaring bitmaps for document-level storage 671 | // 672 | // DESERIALIZATION: 673 | // ---------------- 674 | // Read each term and its roaring bitmap, reconstructing the DocBitmaps map 675 | // 676 | // FORMAT: 677 | // ------- 678 | // [NumBitmaps: uint32] 679 | // For each bitmap: 680 | // 681 | // [TermLength: uint32][Term: bytes] 682 | // [BitmapLength: uint32][Bitmap: bytes] 683 | // 684 | // RECOVERY: 685 | // --------- 686 | // We create a new roaring.Bitmap for each term and deserialize it 687 | // using roaring's native UnmarshalBinary method 688 | func (idx *InvertedIndex) decodeRoaringBitmaps(data []byte, 
offset int) (int, error) { 689 | // Read number of bitmaps 690 | numBitmaps := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 691 | offset += 4 692 | 693 | // Initialize the DocBitmaps map 694 | idx.DocBitmaps = make(map[string]*roaring.Bitmap, numBitmaps) 695 | 696 | // Read each term and its bitmap 697 | for i := 0; i < numBitmaps; i++ { 698 | // Read term length 699 | termLen := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 700 | offset += 4 701 | 702 | // Read term 703 | term := string(data[offset : offset+termLen]) 704 | offset += termLen 705 | 706 | // Read bitmap length 707 | bitmapLen := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 708 | offset += 4 709 | 710 | // Read and deserialize bitmap 711 | bitmap := roaring.NewBitmap() 712 | if err := bitmap.UnmarshalBinary(data[offset : offset+bitmapLen]); err != nil { 713 | return 0, err 714 | } 715 | offset += bitmapLen 716 | 717 | // Store in map 718 | idx.DocBitmaps[term] = bitmap 719 | } 720 | 721 | return offset, nil 722 | } 723 | 724 | // indexDecoder handles the decoding process 725 | // 726 | // State management: 727 | // - data: The full byte array we're decoding 728 | // - offset: Our current position in the array 729 | type indexDecoder struct { 730 | data []byte 731 | offset int 732 | } 733 | 734 | func newIndexDecoder(data []byte, offset int) *indexDecoder { 735 | return &indexDecoder{ 736 | data: data, 737 | offset: offset, 738 | } 739 | } 740 | 741 | // isComplete checks if we've decoded all the data 742 | func (d *indexDecoder) isComplete() bool { 743 | return d.offset >= len(d.data) 744 | } 745 | 746 | // decodeTerm decodes a single term and its skip list 747 | // 748 | // DECODING SEQUENCE: 749 | // ------------------ 750 | // 1. Read term name: "quick" 751 | // 2. Read node positions: [Doc1:Pos1, Doc3:Pos0] 752 | // 3. Create node objects with these positions 753 | // 4. Read tower structure and link nodes together 754 | // 5. 
Return the reconstructed SkipList 755 | func (d *indexDecoder) decodeTerm() (string, SkipList, error) { 756 | // Step 1: Read the term name 757 | term, err := d.readString() 758 | if err != nil { 759 | return "", SkipList{}, err 760 | } 761 | 762 | // Step 2: Read and decode node positions 763 | // Returns a map: Index → Node pointer 764 | nodeMap, err := d.decodeNodePositions() 765 | if err != nil { 766 | return "", SkipList{}, err 767 | } 768 | 769 | // Step 3: Decode tower structure (reconnect the nodes) 770 | height, err := d.decodeTowerStructure(nodeMap) 771 | if err != nil { 772 | return "", SkipList{}, err 773 | } 774 | 775 | // Step 4: Create the SkipList structure 776 | skipList := SkipList{ 777 | Head: nodeMap[1], // First node is always at index 1 778 | Height: height, 779 | } 780 | 781 | return term, skipList, nil 782 | } 783 | 784 | // readString reads a length-prefixed string 785 | // 786 | // Format: [length: 4 bytes][string: length bytes] 787 | // 788 | // EXAMPLE: 789 | // -------- 790 | // Data: [0x05, 0x00, 0x00, 0x00, 'q', 'u', 'i', 'c', 'k', ...] 791 | // 792 | // ^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 793 | // length = 5 string bytes 794 | // 795 | // Returns: "quick" 796 | // Advances offset by: 4 + 5 = 9 bytes 797 | func (d *indexDecoder) readString() (string, error) { 798 | // Read the length (4 bytes) 799 | length := int(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 800 | d.offset += 4 801 | 802 | // Read the string bytes 803 | str := string(d.data[d.offset : d.offset+length]) 804 | d.offset += length 805 | 806 | return str, nil 807 | } 808 | 809 | // decodeNodePositions reconstructs all nodes from their serialized positions 810 | // 811 | // INPUT FORMAT: 812 | // ------------- 813 | // [dataLength: 4 bytes][DocID: 4 bytes][Offset: 4 bytes]... 814 | // 815 | // PROCESS: 816 | // -------- 817 | // 1. Read data length: How many bytes of position data? 818 | // 2. Calculate number of values: dataLength / 4 819 | // 3. 
Read pairs of values: (DocID, Offset) 820 | // 4. Create Node objects 821 | // 5. Assign sequential indices: 1, 2, 3, ... 822 | // 823 | // EXAMPLE: 824 | // -------- 825 | // Data: [16][1][1][3][0] 826 | // 827 | // ^^^ 16 bytes of position data 828 | // ^^ DocID=1, Offset=1 → Node 1 829 | // ^^ DocID=3, Offset=0 → Node 2 830 | // 831 | // Result: map[1→Node{Doc1:Pos1}, 2→Node{Doc3:Pos0}] 832 | func (d *indexDecoder) decodeNodePositions() (map[int]*Node, error) { 833 | // Read the length of position data 834 | dataLength := int(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 835 | d.offset += 4 836 | 837 | nodeMap := make(map[int]*Node) 838 | nodeIndex := 1 839 | 840 | // Each position is 8 bytes: 4 for DocID + 4 for Offset 841 | // So numValues = dataLength / 4 gives us the total number of int32s 842 | // And we process them in pairs 843 | numValues := dataLength / 4 844 | 845 | for i := 0; i < numValues; i += 2 { 846 | // Read Document ID (as int32) 847 | docID := int32(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 848 | d.offset += 4 849 | 850 | // Read Offset (as int32) 851 | offset := int32(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 852 | d.offset += 4 853 | 854 | // Create a new node with this position 855 | node := &Node{ 856 | Key: Position{ 857 | DocumentID: int(docID), 858 | Offset: int(offset), 859 | }, 860 | } 861 | 862 | // Assign it a sequential index 863 | nodeMap[nodeIndex] = node 864 | nodeIndex++ 865 | } 866 | 867 | return nodeMap, nil 868 | } 869 | 870 | // decodeTowerStructure reconstructs the skip list tower connections 871 | // 872 | // THIS IS THE MAGIC STEP! 873 | // ----------------------- 874 | // We now have nodes, but they're not connected. 875 | // This function reads the tower indices and reconnects everything. 876 | // 877 | // INPUT FORMAT (for each node): 878 | // ----------------------------- 879 | // [towerLength: 4 bytes][index1: 2 bytes][index2: 2 bytes]... 
880 | // 881 | // EXAMPLE: 882 | // -------- 883 | // Node 1: [4][2][4] ← Tower has 2 levels: points to nodes 2 and 4 884 | // Node 2: [2][3] ← Tower has 1 level: points to node 3 885 | // Node 3: [2][0] ← Tower has 1 level: points to nothing (end) 886 | // 887 | // RECONSTRUCTION: 888 | // --------------- 889 | // For Node 1: 890 | // - Read indices: [2, 4] 891 | // - Set Tower[0] = nodeMap[2] 892 | // - Set Tower[1] = nodeMap[4] 893 | // 894 | // Result: Node 1 is now connected to nodes 2 and 4 at levels 0 and 1! 895 | func (d *indexDecoder) decodeTowerStructure(nodeMap map[int]*Node) (int, error) { 896 | maxHeight := 1 // Track the maximum tower height 897 | nodeCount := len(nodeMap) 898 | 899 | // Process tower data for each node 900 | for nodeIndex := 1; nodeIndex <= nodeCount; nodeIndex++ { 901 | // Read the length of tower data for this node 902 | towerLength := int(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 903 | d.offset += 4 904 | 905 | // Calculate how many indices are stored (each index is 2 bytes) 906 | numIndices := towerLength / 2 907 | 908 | // Read each tower level 909 | for level := 0; level < numIndices; level++ { 910 | // Read the target node index 911 | targetIndex := int(binary.LittleEndian.Uint16(d.data[d.offset : d.offset+2])) 912 | d.offset += 2 913 | 914 | // If index is not 0 (0 means nil), connect the nodes 915 | if targetIndex != 0 { 916 | nodeMap[nodeIndex].Tower[level] = nodeMap[targetIndex] 917 | 918 | // Track maximum height 919 | if level+1 > maxHeight { 920 | maxHeight = level + 1 921 | } 922 | } 923 | } 924 | } 925 | 926 | return maxHeight, nil 927 | } 928 | -------------------------------------------------------------------------------- /search.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "crypto/md5" 5 | "encoding/hex" 6 | "encoding/json" 7 | "fmt" 8 | "log/slog" 9 | "math" 10 | "sort" 11 | "strings" 12 | ) 13 | 14 | // 
═══════════════════════════════════════════════════════════════════════════════ 15 | // PHRASE SEARCH: Finding Multi-Word Sequences 16 | // ═══════════════════════════════════════════════════════════════════════════════ 17 | // Phrase search finds exact sequences of words. 18 | // 19 | // THE ALGORITHM: 20 | // -------------- 21 | // To find "quick brown fox", we need three words at consecutive positions 22 | // in the same document. 23 | // 24 | // Strategy: 25 | // 1. Find ANY occurrence of all three words (might not be consecutive) 26 | // 2. Walk backwards to find the start of the phrase 27 | // 3. Check if the positions are consecutive 28 | // 4. If yes, we found it! If no, try again from a different starting point 29 | // 30 | // VISUAL EXAMPLE: 31 | // --------------- 32 | // Document: "the quick brown dog ate the brown fox quickly" 33 | // Positions: 0 1 2 3 4 5 6 7 8 34 | // 35 | // Searching for "brown fox": 36 | // Attempt 1: 37 | // - Find "brown" (any occurrence): Pos 2 38 | // - Find "fox" after Pos 2: Pos 7 39 | // - Walk back from "fox" to find "brown": Pos 6 40 | // - Check: Are Pos 6 and Pos 7 consecutive? YES! → Found it! 
41 | // 42 | // ═══════════════════════════════════════════════════════════════════════════════ 43 | 44 | // NextPhrase finds the next occurrence of a phrase (sequence of words) in the index 45 | // 46 | // ALGORITHM WALKTHROUGH: 47 | // ---------------------- 48 | // Query: "quick brown fox" 49 | // StartPos: Beginning of file 50 | // 51 | // Step 1: Find the END of a potential phrase 52 | // - Find "quick" after startPos → maybe Doc2:Pos3 53 | // - Find "brown" after that → maybe Doc2:Pos4 54 | // - Find "fox" after that → maybe Doc2:Pos5 55 | // - endPos = Doc2:Pos5 56 | // 57 | // Step 2: Walk BACKWARDS to find the START 58 | // - From endPos, find previous "brown" → Doc2:Pos4 59 | // - From there, find previous "quick" → Doc2:Pos3 60 | // - phraseStart = Doc2:Pos3 61 | // 62 | // Step 3: Validate it's a real phrase 63 | // - Same document? Yes (both Doc2) 64 | // - Consecutive positions? Yes (3, 4, 5) 65 | // - Distance = 5 - 3 = 2 (which equals 3 words - 1) ✓ 66 | // 67 | // Step 4: If not valid, recurse from phraseStart 68 | // - This handles cases where words appear multiple times 69 | // 70 | // Why this algorithm? 
71 | // - It's efficient: We use the index to jump between occurrences 72 | // - It handles multiple occurrences: Recursion keeps searching 73 | // - It validates correctness: We check for consecutive positions 74 | func (idx *InvertedIndex) NextPhrase(query string, startPos Position) []Position { 75 | terms := strings.Fields(query) // Split "quick brown fox" → ["quick", "brown", "fox"] 76 | 77 | // STEP 1: Find the end of a potential phrase match 78 | endPos := idx.findPhraseEnd(terms, startPos) 79 | if endPos.IsEnd() { 80 | // No more occurrences of all words exist 81 | return []Position{EOFDocument, EOFDocument} 82 | } 83 | 84 | // STEP 2: Walk backwards to find where the phrase starts 85 | phraseStart := idx.findPhraseStart(terms, endPos) 86 | 87 | // STEP 3: Validate that we found a real consecutive phrase 88 | if idx.isValidPhrase(phraseStart, endPos, len(terms)) { 89 | // Success! Return [start, end] positions of the phrase 90 | return []Position{phraseStart, endPos} 91 | } 92 | 93 | // STEP 4: Not a valid phrase - try again from the start position 94 | // This handles cases like: "brown dog brown fox" when searching for "brown fox" 95 | return idx.NextPhrase(query, phraseStart) 96 | } 97 | 98 | // findPhraseEnd locates the ending position of a potential phrase 99 | // 100 | // HOW IT WORKS: 101 | // ------------- 102 | // Starting from startPos, we "hop" through the document finding each word. 
103 | // 104 | // Example: Finding "quick brown fox" starting from Doc1:Pos0 105 | // 106 | // Step 1: currentPos = Doc1:Pos0 107 | // Step 2: Find "quick" after Doc1:Pos0 → currentPos = Doc1:Pos3 108 | // Step 3: Find "brown" after Doc1:Pos3 → currentPos = Doc1:Pos4 109 | // Step 4: Find "fox" after Doc1:Pos4 → currentPos = Doc1:Pos5 110 | // Return: Doc1:Pos5 (position of the last word "fox") 111 | // 112 | // If any word isn't found, we return EOF (no phrase exists) 113 | func (idx *InvertedIndex) findPhraseEnd(terms []string, startPos Position) Position { 114 | currentPos := startPos 115 | 116 | // For each word in the phrase, find its next occurrence 117 | for _, term := range terms { 118 | currentPos, _ = idx.Next(term, currentPos) 119 | 120 | // If we can't find this word, the phrase doesn't exist 121 | if currentPos.IsEnd() { 122 | return EOFDocument 123 | } 124 | } 125 | 126 | // currentPos now points to the last word of the phrase 127 | return currentPos 128 | } 129 | 130 | // findPhraseStart walks backward to find where the phrase begins 131 | // 132 | // HOW IT WORKS: 133 | // ------------- 134 | // Starting from the END position, we walk backwards through the phrase. 135 | // 136 | // Example: We found "fox" at Doc1:Pos5, now find the start of "quick brown fox" 137 | // 138 | // Step 1: currentPos = Doc1:Pos5 (we're at "fox") 139 | // Step 2: Find "brown" before Doc1:Pos5 → currentPos = Doc1:Pos4 140 | // Step 3: Find "quick" before Doc1:Pos4 → currentPos = Doc1:Pos3 141 | // Return: Doc1:Pos3 (position of the first word "quick") 142 | // 143 | // Why skip the last word? 
144 | // - We already know where the last word is (at endPos) 145 | // - We only need to walk back through the first N-1 words 146 | func (idx *InvertedIndex) findPhraseStart(terms []string, endPos Position) Position { 147 | currentPos := endPos 148 | 149 | // Walk backwards through all words EXCEPT the last one 150 | // (we already know the last word's position - it's endPos) 151 | for i := len(terms) - 2; i >= 0; i-- { 152 | currentPos, _ = idx.Previous(terms[i], currentPos) 153 | } 154 | 155 | // currentPos now points to the first word of the phrase 156 | return currentPos 157 | } 158 | 159 | // isValidPhrase checks if positions form a valid consecutive phrase 160 | // 161 | // VALIDATION RULES: 162 | // ----------------- 163 | // For a valid phrase, we need: 164 | // 1. All words in the SAME document 165 | // 2. Words at CONSECUTIVE positions 166 | // 167 | // Example: Checking "quick brown fox" (3 words) 168 | // 169 | // start = Doc1:Pos3 170 | // end = Doc1:Pos5 171 | // 172 | // Check 1: Same document? Doc1 == Doc1 ✓ 173 | // Check 2: Consecutive? (5 - 3) == (3 - 1) → 2 == 2 ✓ 174 | // Result: VALID 175 | // 176 | // Counter-example: NOT a valid phrase 177 | // 178 | // start = Doc1:Pos3 179 | // end = Doc1:Pos7 180 | // 181 | // Check 2: Consecutive? 
(7 - 3) == (3 - 1) → 4 == 2 ✗ 182 | // Result: INVALID (there are extra words between them) 183 | func (idx *InvertedIndex) isValidPhrase(start, end Position, termCount int) bool { 184 | // Calculate expected distance for consecutive words 185 | // For 3 words, positions should be like [0,1,2] → distance = 2 186 | expectedDistance := termCount - 1 187 | 188 | // Calculate actual distance between start and end 189 | actualDistance := end.GetOffset() - start.GetOffset() 190 | 191 | // Both conditions must be true 192 | return start.DocumentID == end.DocumentID && actualDistance == expectedDistance 193 | } 194 | 195 | // FindAllPhrases finds ALL occurrences of a phrase in the entire index 196 | // 197 | // ALGORITHM: 198 | // ---------- 199 | // This is just a loop that repeatedly calls NextPhrase until we reach EOF. 200 | // 201 | // Example: Finding all occurrences of "brown fox" 202 | // 203 | // Iteration 1: 204 | // - Search from BOF → Found at Doc2:Pos[3-4] 205 | // - Add to results 206 | // - Continue from Doc2:Pos3 207 | // 208 | // Iteration 2: 209 | // - Search from Doc2:Pos3 → Found at Doc5:Pos[1-2] 210 | // - Add to results 211 | // - Continue from Doc5:Pos1 212 | // 213 | // Iteration 3: 214 | // - Search from Doc5:Pos1 → Returns EOF 215 | // - Stop searching 216 | // 217 | // Result: [[Doc2:Pos3-4], [Doc5:Pos1-2]] 218 | func (idx *InvertedIndex) FindAllPhrases(query string, startPos Position) [][]Position { 219 | var allMatches [][]Position 220 | currentPos := BOFDocument // Start from the beginning 221 | 222 | // Keep searching until we reach the end of file 223 | for !currentPos.IsEnd() { 224 | // Find the next occurrence of the phrase 225 | phrasePositions := idx.NextPhrase(query, currentPos) 226 | phraseStart := phrasePositions[0] 227 | 228 | // If we found a valid phrase (not EOF), add it to results 229 | if !phraseStart.IsEnd() { 230 | allMatches = append(allMatches, phrasePositions) 231 | } 232 | 233 | // Move to where we found the phrase to 
continue searching 234 | currentPos = phraseStart 235 | } 236 | 237 | return allMatches 238 | } 239 | 240 | // ═══════════════════════════════════════════════════════════════════════════════ 241 | // PROXIMITY SEARCH: Finding Documents Containing All Terms 242 | // ═══════════════════════════════════════════════════════════════════════════════ 243 | // A "cover" is a range of positions that contains ALL search terms. 244 | // Unlike phrase search, the words don't need to be consecutive or in order. 245 | // 246 | // EXAMPLE: 247 | // -------- 248 | // Document: "the quick brown dog jumped over the lazy fox" 249 | // Positions: 0 1 2 3 4 5 6 7 8 250 | // 251 | // Searching for ["quick", "fox"]: 252 | // Cover 1: Pos 1 to Pos 8 (entire range containing both words) 253 | // This is the MINIMAL cover (smallest range containing all terms) 254 | // 255 | // WHY USE COVERS? 256 | // --------------- 257 | // Covers are used for: 258 | // 1. Boolean search: Find documents with ALL terms (AND query) 259 | // 2. Proximity ranking: Closer terms = higher relevance 260 | // 3. Snippet generation: Show the most relevant part of a document 261 | // 262 | // THE ALGORITHM: 263 | // -------------- 264 | // To find a cover: 265 | // 1. Find the FURTHEST occurrence of any term (this is the cover end) 266 | // 2. Walk BACKWARDS to find the EARLIEST occurrence of each term 267 | // 3. Check if all terms are in the same document 268 | // 4. If yes, we found a cover! If no, try again. 
269 | // ═══════════════════════════════════════════════════════════════════════════════ 270 | 271 | // NextCover finds the next "cover" - a range containing all given tokens 272 | // 273 | // ALGORITHM WALKTHROUGH: 274 | // ---------------------- 275 | // Query: ["quick", "fox", "brown"] 276 | // StartPos: Beginning of file 277 | // 278 | // PHASE 1: Find the cover END (furthest position) 279 | // - Find "quick" after startPos → maybe Doc2:Pos1 280 | // - Find "fox" after startPos → maybe Doc2:Pos8 ← furthest 281 | // - Find "brown" after startPos → maybe Doc2:Pos2 282 | // - coverEnd = Doc2:Pos8 283 | // 284 | // PHASE 2: Find the cover START (earliest position before end) 285 | // - From Doc2:Pos9, find previous "quick" → Doc2:Pos1 ← earliest 286 | // - From Doc2:Pos9, find previous "fox" → Doc2:Pos8 287 | // - From Doc2:Pos9, find previous "brown" → Doc2:Pos2 288 | // - coverStart = Doc2:Pos1 289 | // 290 | // PHASE 3: Validate the cover 291 | // - Same document? Yes (all in Doc2) ✓ 292 | // - Return [Doc2:Pos1, Doc2:Pos8] 293 | // 294 | // If not same document, recurse from coverStart to find the next cover. 295 | // 296 | // Why this algorithm? 297 | // - Greedy approach: We find the furthest occurrence first 298 | // - Efficient: Uses index jumps instead of scanning 299 | // - Minimal covers: Always finds the smallest valid range 300 | func (idx *InvertedIndex) NextCover(tokens []string, startPos Position) []Position { 301 | // PHASE 1: Find the END of the cover (furthest position) 302 | coverEnd := idx.findCoverEnd(tokens, startPos) 303 | if coverEnd.IsEnd() { 304 | // Can't find all tokens - no cover exists 305 | return []Position{EOFDocument, EOFDocument} 306 | } 307 | 308 | // PHASE 2: Find the START of the cover (earliest position) 309 | coverStart := idx.findCoverStart(tokens, coverEnd) 310 | 311 | // PHASE 3: Validate the cover 312 | if coverStart.DocumentID == coverEnd.DocumentID { 313 | // Success! 
All tokens are in the same document 314 | return []Position{coverStart, coverEnd} 315 | } 316 | 317 | // Tokens span multiple documents - try again from coverStart 318 | return idx.NextCover(tokens, coverStart) 319 | } 320 | 321 | // findCoverEnd finds the furthest position among all tokens 322 | // 323 | // HOW IT WORKS: 324 | // ------------- 325 | // We find the next occurrence of EACH token and track the furthest one. 326 | // 327 | // Example: Finding cover end for ["quick", "brown", "fox"] from Doc1:Pos0 328 | // 329 | // Step 1: Find "quick" after Pos0 → Doc2:Pos1 330 | // maxPos = Doc2:Pos1 331 | // 332 | // Step 2: Find "brown" after Pos0 → Doc2:Pos2 333 | // Is Doc2:Pos2 after Doc2:Pos1? Yes 334 | // maxPos = Doc2:Pos2 335 | // 336 | // Step 3: Find "fox" after Pos0 → Doc2:Pos8 337 | // Is Doc2:Pos8 after Doc2:Pos2? Yes 338 | // maxPos = Doc2:Pos8 339 | // 340 | // Return: Doc2:Pos8 (the furthest position) 341 | // 342 | // Special case: If ANY token returns EOF, we can't form a cover 343 | func (idx *InvertedIndex) findCoverEnd(tokens []string, startPos Position) Position { 344 | maxPos := startPos 345 | 346 | for _, token := range tokens { 347 | // Find next occurrence of this token 348 | tokenPos, _ := idx.Next(token, startPos) 349 | 350 | // If any token is not found, we can't create a cover 351 | if tokenPos.IsEnd() { 352 | return EOFDocument 353 | } 354 | 355 | // Keep track of the furthest position 356 | if tokenPos.IsAfter(maxPos) { 357 | maxPos = tokenPos 358 | } 359 | } 360 | 361 | return maxPos 362 | } 363 | 364 | // findCoverStart finds the earliest position that still covers all tokens 365 | // 366 | // HOW IT WORKS: 367 | // ------------- 368 | // Starting from just after the cover end, we walk backwards to find each token. 
369 | // 370 | // Example: Finding cover start for ["quick", "brown", "fox"] 371 | // 372 | // with coverEnd at Doc2:Pos8 373 | // 374 | // searchBound = Doc2:Pos9 (one position after the end) 375 | // 376 | // Step 1: Find "quick" before Pos9 → Doc2:Pos1 ← earliest so far 377 | // minPos = Doc2:Pos1 378 | // 379 | // Step 2: Find "brown" before Pos9 → Doc2:Pos2 380 | // Is Doc2:Pos2 before Doc2:Pos1? No 381 | // minPos stays Doc2:Pos1 382 | // 383 | // Step 3: Find "fox" before Pos9 → Doc2:Pos8 384 | // Is Doc2:Pos8 before Doc2:Pos1? No 385 | // minPos stays Doc2:Pos1 386 | // 387 | // Return: Doc2:Pos1 (the earliest position) 388 | // 389 | // Why search from (endPos + 1)? 390 | // - Previous() returns positions STRICTLY BEFORE the search point 391 | // - By searching from endPos+1, we can find tokens AT endPos 392 | func (idx *InvertedIndex) findCoverStart(tokens []string, endPos Position) Position { 393 | minPos := BOFDocument 394 | 395 | // Create a search bound just after the cover end 396 | // This ensures we can find tokens AT the end position 397 | searchBound := Position{ 398 | DocumentID: endPos.DocumentID, 399 | Offset: endPos.Offset + 1, 400 | } 401 | 402 | for _, token := range tokens { 403 | // Find the previous occurrence of this token before searchBound 404 | tokenPos, _ := idx.Previous(token, searchBound) 405 | 406 | // Keep track of the earliest position 407 | if minPos.IsBeginning() || tokenPos.IsBefore(minPos) { 408 | minPos = tokenPos 409 | } 410 | } 411 | 412 | return minPos 413 | } 414 | 415 | // ═══════════════════════════════════════════════════════════════════════════════ 416 | // RANKING: Scoring Search Results by Relevance 417 | // ═══════════════════════════════════════════════════════════════════════════════ 418 | // Not all search results are equally relevant. We need to rank them! 419 | // 420 | // PROXIMITY RANKING: 421 | // ------------------ 422 | // The idea: Documents where search terms appear CLOSER together are more relevant. 
423 | // 424 | // Example: Searching for "machine learning" 425 | // Doc A: "machine learning is..." (distance: 1) → HIGH score 426 | // Doc B: "machine ... learning" (distance: 3) → MEDIUM score 427 | // Doc C: "machine ... ... ... learning" (distance: 5) → LOW score 428 | // 429 | // SCORING FORMULA: 430 | // ---------------- 431 | // For each cover in a document: 432 | // score += 1 / (coverEnd - coverStart + 1) 433 | // 434 | // Why this formula? 435 | // - Smaller distances → larger scores (inversely proportional) 436 | // - Multiple covers → higher score (sum of all covers) 437 | // - Simple and fast to compute 438 | // 439 | // EXAMPLE CALCULATION: 440 | // -------------------- 441 | // Document: "quick brown fox jumped over quick brown dog" 442 | // Positions: 0 1 2 3 4 5 6 7 443 | // 444 | // Searching for ["quick", "brown"]: 445 | // Cover 1: Pos[0-1] → score += 1/(1-0+1) = 1/2 = 0.5 446 | // Cover 2: Pos[5-6] → score += 1/(6-5+1) = 1/2 = 0.5 447 | // Total score: 1.0 448 | // 449 | // A document with terms closer together: 450 | // Document: "quick brown" 451 | // Positions: 0 1 452 | // Cover 1: Pos[0-1] → score = 1/(1-0+1) = 1/2 = 0.5 453 | // Total score: 0.5 (but only ONE occurrence) 454 | // 455 | // ═══════════════════════════════════════════════════════════════════════════════ 456 | 457 | // Match represents a search result with its positions and relevance score 458 | // 459 | // STRUCTURE: 460 | // ---------- 461 | // Offsets: The [start, end] positions of a cover in a document 462 | // Score: The relevance score (higher = more relevant) 463 | // 464 | // Example Match: 465 | // 466 | // Offsets: [Doc3:Pos1, Doc3:Pos5] ← This document matches from Pos1 to Pos5 467 | // Score: 2.7 ← Relevance score 468 | type Match struct { 469 | DocID int // Document identifier 470 | Offsets []Position // Where the match was found [start, end] 471 | Score float64 // How relevant is this match? 
472 | } 473 | 474 | // GetKey generates a unique identifier for the match 475 | func (m *Match) GetKey() (string, error) { 476 | data, err := json.Marshal(m.DocID) 477 | if err != nil { 478 | return "", err 479 | } 480 | hash := md5.Sum(data) 481 | return hex.EncodeToString(hash[:]), nil 482 | } 483 | 484 | // calculateIDF computes the Inverse Document Frequency for a term 485 | // 486 | // IDF FORMULA: 487 | // ------------ 488 | // IDF(term) = log((N - df + 0.5) / (df + 0.5) + 1) 489 | // 490 | // Where: 491 | // 492 | // N = total number of documents 493 | // df = number of documents containing the term 494 | // 495 | // INTUITION: 496 | // ---------- 497 | // - Rare terms (low df) get high IDF scores 498 | // - Common terms (high df) get low IDF scores 499 | // - This makes rare terms more important for ranking 500 | // 501 | // EXAMPLE: 502 | // -------- 503 | // Total docs: 1000 504 | // Term "the": appears in 950 docs → IDF ≈ 0.05 (very common, low importance) 505 | // Term "quantum": appears in 5 docs → IDF ≈ 5.3 (rare, high importance) 506 | // 507 | // PERFORMANCE BOOST WITH ROARING BITMAPS: 508 | // ---------------------------------------- 509 | // Instead of traversing skip lists to count documents, we use bitmaps: 510 | // - Old way: O(n) traverse skip list, count unique docs 511 | // - New way: O(1) bitmap.GetCardinality() 512 | // This is 10-100x faster for common terms! 
513 | func (idx *InvertedIndex) calculateIDF(term string) float64 { 514 | // Use roaring bitmap for instant document count 515 | bitmap, exists := idx.DocBitmaps[term] 516 | if !exists { 517 | return 0.0 518 | } 519 | 520 | // Get document frequency instantly from bitmap cardinality 521 | df := float64(bitmap.GetCardinality()) 522 | 523 | if df == 0 { 524 | return 0.0 525 | } 526 | 527 | N := float64(idx.TotalDocs) 528 | 529 | // BM25 IDF formula (with smoothing to avoid negative values) 530 | return math.Log((N-df+0.5)/(df+0.5) + 1.0) 531 | } 532 | 533 | // countDocsInPostingList counts unique documents in a posting list 534 | func (idx *InvertedIndex) countDocsInPostingList(skipList SkipList) int { 535 | uniqueDocs := make(map[int]bool) 536 | 537 | current := skipList.Head.Tower[0] 538 | for current != nil { 539 | docID := current.Key.GetDocumentID() 540 | uniqueDocs[docID] = true 541 | current = current.Tower[0] 542 | } 543 | 544 | return len(uniqueDocs) 545 | } 546 | 547 | // calculateBM25Score computes the BM25 score for a document given query terms 548 | // 549 | // BM25 ALGORITHM: 550 | // --------------- 551 | // 1. For each query term: 552 | // a. Calculate IDF(term) - how rare is this term? 553 | // b. Get term frequency in document 554 | // c. Apply saturation and length normalization 555 | // d. 
Accumulate score 556 | // 557 | // EXAMPLE CALCULATION: 558 | // -------------------- 559 | // Query: "machine learning" 560 | // Doc 5: 200 words, "machine" appears 3 times, "learning" appears 2 times 561 | // Corpus: 1000 docs, avg length 150 words 562 | // 563 | // For "machine" (appears in 100 docs): 564 | // 565 | // IDF = log((1000 - 100 + 0.5) / (100 + 0.5) + 1) ≈ 2.3 566 | // TF = 3 567 | // normalized_TF = (3 * 2.5) / (3 + 1.5 * (1 - 0.75 + 0.75 * (200/150))) 568 | // = 7.5 / 5.0 ≈ 1.5 569 | // score += 2.3 * 1.5 ≈ 3.45 570 | // 571 | // For "learning" (appears in 50 docs): 572 | // 573 | // IDF ≈ 2.9 574 | // TF = 2 575 | // normalized_TF ≈ 1.2 576 | // score += 2.9 * 1.2 ≈ 3.48 577 | // 578 | // Total BM25 score: 3.45 + 3.48 = 6.93 579 | func (idx *InvertedIndex) calculateBM25Score(docID int, queryTerms []string) float64 { 580 | docStats, exists := idx.DocStats[docID] 581 | if !exists { 582 | return 0.0 583 | } 584 | 585 | // Calculate average document length 586 | avgDocLen := float64(idx.TotalTerms) / float64(idx.TotalDocs) 587 | docLen := float64(docStats.Length) 588 | 589 | score := 0.0 590 | k1 := idx.BM25Params.K1 591 | b := idx.BM25Params.B 592 | 593 | // Process each query term 594 | for _, term := range queryTerms { 595 | // Get IDF for this term 596 | idf := idx.calculateIDF(term) 597 | 598 | // Get term frequency in this document 599 | tf := float64(docStats.TermFreqs[term]) 600 | 601 | if tf > 0 { 602 | // BM25 formula with length normalization 603 | numerator := tf * (k1 + 1) 604 | denominator := tf + k1*(1-b+b*(docLen/avgDocLen)) 605 | score += idf * (numerator / denominator) 606 | } 607 | } 608 | 609 | return score 610 | } 611 | 612 | // RankBM25 performs BM25 ranking of search results 613 | // 614 | // ALGORITHM: 615 | // ---------- 616 | // 1. Tokenize query 617 | // 2. Find all documents containing at least one query term 618 | // 3. Calculate BM25 score for each document 619 | // 4. Sort by score (descending) 620 | // 5. 
Return top K results 621 | // 622 | // EXAMPLE: 623 | // -------- 624 | // Query: "machine learning algorithms" 625 | // 626 | // Step 1: Tokenize → ["machine", "learning", "algorithms"] 627 | // 628 | // Step 2: Find candidate documents: 629 | // 630 | // "machine" appears in: [Doc1, Doc3, Doc5, Doc7] 631 | // "learning" appears in: [Doc1, Doc2, Doc5] 632 | // "algorithms" appears in: [Doc2, Doc5, Doc8] 633 | // Candidates: [Doc1, Doc2, Doc3, Doc5, Doc7, Doc8] 634 | // 635 | // Step 3: Calculate BM25 scores: 636 | // 637 | // Doc1: 12.5 (has "machine" and "learning") 638 | // Doc2: 8.3 (has "learning" and "algorithms") 639 | // Doc3: 3.2 (only has "machine") 640 | // Doc5: 15.7 (has all three terms!) 641 | // Doc7: 2.1 (only has "machine") 642 | // Doc8: 4.5 (only has "algorithms") 643 | // 644 | // Step 4: Sort: [Doc5, Doc1, Doc2, Doc8, Doc3, Doc7] 645 | // 646 | // Step 5: Return top 3: [Doc5, Doc1, Doc2] 647 | func (idx *InvertedIndex) RankBM25(query string, maxResults int) []Match { 648 | slog.Info("BM25 ranking", slog.String("query", query)) 649 | 650 | tokens := Analyze(query) 651 | if len(tokens) == 0 { 652 | return []Match{} 653 | } 654 | 655 | slog.Info("search tokens", slog.String("tokens", fmt.Sprintf("%v", tokens))) 656 | 657 | // Find all candidate documents (documents containing at least one query term) 658 | candidates := idx.findCandidateDocuments(tokens) 659 | 660 | // Calculate BM25 score for each candidate 661 | results := make([]Match, 0, len(candidates)) 662 | for docID := range candidates { 663 | score := idx.calculateBM25Score(docID, tokens) 664 | 665 | if score > 0 { 666 | results = append(results, Match{ 667 | DocID: docID, 668 | Offsets: candidates[docID], // Positions where terms appear 669 | Score: score, 670 | }) 671 | } 672 | } 673 | 674 | // Sort by score (descending) 675 | idx.sortMatchesByScore(results) 676 | 677 | // Return top K results 678 | return limitResults(results, maxResults) 679 | } 680 | 681 | // findCandidateDocuments 
finds all documents containing at least one query term 682 | // 683 | // Returns a map: DocID → Positions where query terms appear 684 | // 685 | // PERFORMANCE BOOST WITH ROARING BITMAPS: 686 | // ---------------------------------------- 687 | // We use a two-phase approach: 688 | // 1. Fast filtering: Use bitmaps to find candidate document IDs (O(1) per term) 689 | // 2. Position lookup: Only fetch positions for candidate documents 690 | // 691 | // OLD APPROACH: Traverse every skip list node (slow) 692 | // NEW APPROACH: Bitmap union + targeted position lookup (fast!) 693 | func (idx *InvertedIndex) findCandidateDocuments(tokens []string) map[int][]Position { 694 | candidates := make(map[int][]Position) 695 | 696 | // PHASE 1: Use bitmaps to quickly find all candidate document IDs 697 | candidateDocs := make(map[int]bool) 698 | for _, token := range tokens { 699 | bitmap, exists := idx.DocBitmaps[token] 700 | if !exists { 701 | continue 702 | } 703 | 704 | // Iterate through document IDs in the bitmap 705 | iter := bitmap.Iterator() 706 | for iter.HasNext() { 707 | docID := int(iter.Next()) 708 | candidateDocs[docID] = true 709 | } 710 | } 711 | 712 | // PHASE 2: For each candidate document, fetch positions from skip lists 713 | // This is still needed for BM25 scoring (we need exact positions) 714 | for _, token := range tokens { 715 | skipList, exists := idx.getPostingList(token) 716 | if !exists { 717 | continue 718 | } 719 | 720 | // Only traverse skip list for positions in candidate documents 721 | current := skipList.Head.Tower[0] 722 | for current != nil { 723 | docID := current.Key.GetDocumentID() 724 | // Only add if this is a candidate document 725 | if candidateDocs[docID] { 726 | candidates[docID] = append(candidates[docID], current.Key) 727 | } 728 | current = current.Tower[0] 729 | } 730 | } 731 | 732 | return candidates 733 | } 734 | 735 | // sortMatchesByScore sorts matches by score in descending order (higher scores first) 736 | func (idx 
*InvertedIndex) sortMatchesByScore(matches []Match) { 737 | sort.Slice(matches, func(i, j int) bool { 738 | return matches[i].Score > matches[j].Score 739 | }) 740 | } 741 | 742 | // RankProximity performs proximity-based ranking of search results 743 | // 744 | // THIS IS THE MAIN SEARCH FUNCTION! 745 | // 746 | // COMPLETE EXAMPLE: 747 | // ----------------- 748 | // Query: "machine learning" 749 | // MaxResults: 10 750 | // 751 | // Step 1: Tokenize query 752 | // 753 | // "machine learning" → ["machine", "learning"] 754 | // 755 | // Step 2: Find all covers (ranges containing both words) 756 | // 757 | // Doc1: Cover[0-1], Cover[5-6] → score = 0.5 + 0.5 = 1.0 758 | // Doc2: Cover[0-5] → score = 0.167 759 | // Doc3: Cover[2-3], Cover[10-11] → score = 0.5 + 0.5 = 1.0 760 | // Doc4: Cover[1-1] → Wait, both words at same position? Impossible! 761 | // (This means one word appears twice) 762 | // 763 | // Step 3: Return top 10 results 764 | // 765 | // Result: [Doc1, Doc3, Doc2] (sorted by score, limited to 10) 766 | // 767 | // ALGORITHM WALKTHROUGH: 768 | // ---------------------- 769 | // We iterate through ALL covers in the index, accumulating scores per document. 770 | // 771 | // Iteration 1: Find first cover → Doc1:Pos[0-1] 772 | // - New document Doc1 detected 773 | // - Calculate score: 1/(1-0+1) = 0.5 774 | // - Current document: Doc1, current score: 0.5 775 | // 776 | // Iteration 2: Find next cover → Doc1:Pos[5-6] 777 | // - Still in Doc1 (not a new document) 778 | // - Add to score: 0.5 + 1/(6-5+1) = 1.0 779 | // - Current document: Doc1, current score: 1.0 780 | // 781 | // Iteration 3: Find next cover → Doc2:Pos[0-5] 782 | // - New document Doc2 detected! 783 | // - Save previous: Match{Doc1, score=1.0} 784 | // - Start new: Doc2, score = 1/(5-0+1) = 0.167 785 | // 786 | // ... continue until EOF ... 
787 | // 788 | // Final step: Return top K results 789 | func (idx *InvertedIndex) RankProximity(query string, maxResults int) []Match { 790 | slog.Info("proximity ranking", slog.String("query", query)) 791 | 792 | // STEP 1: Tokenize the query (same as indexing) 793 | tokens := Analyze(query) 794 | if len(tokens) == 0 { 795 | // Empty query → no results 796 | return []Match{} 797 | } 798 | 799 | slog.Info("search tokens", slog.String("tokens", fmt.Sprintf("%v", tokens))) 800 | 801 | // STEP 2: Find and score all covers 802 | results := idx.collectProximityMatches(tokens) 803 | 804 | // STEP 3: Limit to top K results 805 | return limitResults(results, maxResults) 806 | } 807 | 808 | // collectProximityMatches finds and scores all proximity matches 809 | // 810 | // This is the core ranking loop that: 811 | // 1. Finds all covers 812 | // 2. Groups them by document 813 | // 3. Calculates cumulative scores per document 814 | // 815 | // STATE TRACKING: 816 | // --------------- 817 | // We maintain state across iterations: 818 | // - currentCandidate: The [start, end] positions of the current document's match 819 | // - currentScore: The accumulated score for the current document 820 | // - matches: The final list of all document matches 821 | // 822 | // TRANSITION DETECTION: 823 | // --------------------- 824 | // When we find a cover in a NEW document: 825 | // 826 | // → Save the current document's match 827 | // → Reset state for the new document 828 | func (idx *InvertedIndex) collectProximityMatches(tokens []string) []Match { 829 | var matches []Match 830 | 831 | // Find the first cover to initialize our state 832 | coverPositions := idx.NextCover(tokens, BOFDocument) 833 | coverStart, coverEnd := coverPositions[0], coverPositions[1] 834 | 835 | // Initialize tracking variables 836 | currentCandidate := []Position{coverStart, coverEnd} 837 | currentScore := 0.0 838 | 839 | // Loop through all covers until we reach EOF 840 | for !coverStart.IsEnd() { 841 | // 
DETECTION: Did we move to a new document? 842 | if currentCandidate[0].DocumentID < coverStart.DocumentID { 843 | // Yes! Save the previous document's match 844 | matches = append(matches, Match{ 845 | Offsets: currentCandidate, 846 | Score: currentScore, 847 | }) 848 | 849 | // Reset state for the new document 850 | currentCandidate = []Position{coverStart, coverEnd} 851 | currentScore = 0 852 | } 853 | 854 | // SCORING: Calculate proximity score for this cover 855 | // Formula: 1 / (distance + 1) 856 | // - Smaller distance → higher score 857 | // - +1 to avoid division by zero when start==end 858 | proximity := float64(coverEnd.Offset - coverStart.Offset + 1) 859 | currentScore += 1 / proximity 860 | 861 | // Find the next cover 862 | coverPositions = idx.NextCover(tokens, coverStart) 863 | coverStart, coverEnd = coverPositions[0], coverPositions[1] 864 | } 865 | 866 | // Don't forget the last document! 867 | // When we reach EOF, we still have one unsaved match 868 | if !currentCandidate[0].IsEnd() { 869 | matches = append(matches, Match{ 870 | Offsets: currentCandidate, 871 | Score: currentScore, 872 | }) 873 | } 874 | 875 | return matches 876 | } 877 | 878 | // limitResults returns at most maxResults items 879 | // 880 | // Simple helper to truncate the results list. 881 | // Uses math.Min to avoid index-out-of-bounds errors. 
882 | // 883 | // Example: 884 | // 885 | // matches = [Match1, Match2, Match3, Match4, Match5] 886 | // maxResults = 3 887 | // Returns: [Match1, Match2, Match3] 888 | func limitResults(matches []Match, maxResults int) []Match { 889 | limit := int(math.Min(float64(maxResults), float64(len(matches)))) 890 | return matches[:limit] 891 | } 892 | -------------------------------------------------------------------------------- /skiplist.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "errors" 5 | "math" 6 | "math/rand" 7 | "time" 8 | ) 9 | 10 | // ═══════════════════════════════════════════════════════════════════════════════ 11 | // WHAT IS A SKIP LIST? 12 | // ═══════════════════════════════════════════════════════════════════════════════ 13 | // A skip list is a probabilistic data structure that allows O(log n) search, 14 | // insert, and delete operations - similar to a balanced tree, but simpler! 15 | // 16 | // VISUAL REPRESENTATION: 17 | // ---------------------- 18 | // Think of it as a linked list with "express lanes": 19 | // 20 | // Level 3: HEAD -------------------------------------> [30] -----------> NULL 21 | // Level 2: HEAD ----------------> [15] -------------> [30] -----------> NULL 22 | // Level 1: HEAD -------> [10] --> [15] --> [20] ----> [30] -----------> NULL 23 | // Level 0: HEAD --> [5] -> [10] -> [15] -> [20] -> [25] -> [30] -> [35] -> NULL 24 | // ^^^ ^^^ ^^^ ^^^ ^^^ ^^^ ^^^ 25 | // Actual data in the skip list nodes 26 | // 27 | // HOW IT WORKS: 28 | // ------------- 29 | // - Level 0 (bottom): Contains ALL elements in sorted order 30 | // - Higher levels: Contain progressively fewer elements (like express lanes) 31 | // - Searching: Start at the highest level, drop down when needed 32 | // 33 | // SEARCH EXAMPLE (finding 20): 34 | // ----------------------------- 35 | // 1. Start at HEAD, Level 3 36 | // 2. Level 3: Move to 30? 
// No, 30 > 20, so drop to Level 2
// 3. Level 2: Move to 15? Yes, 15 < 20, advance to 15
// 4. Level 2: Move to 30? No, 30 > 20, so drop to Level 1
// 5. Level 1: Move to 20? Yes! Found it!
//
// Time Complexity: O(log n) average case
// - Each level skips roughly half the elements
// - Similar to binary search, but on a linked structure
//
// WHY USE SKIP LISTS IN A SEARCH ENGINE?
// ---------------------------------------
// 1. Fast lookups: O(log n) to find any position
// 2. Fast range queries: Find all positions in a document efficiently
// 3. Maintains sorted order: Essential for phrase search
// 4. Simple implementation: Easier than balanced trees (no rotations!)
// 5. Good cache locality: Level 0 can be traversed sequentially
//
// ═══════════════════════════════════════════════════════════════════════════════

// MaxHeight caps every node's tower. With heights halving in probability per
// level, 32 levels comfortably cover billions of elements.
const MaxHeight = 32 // Maximum tower height (supports billions of elements)

// ═══════════════════════════════════════════════════════════════════════════════
// SENTINEL VALUES
// ═══════════════════════════════════════════════════════════════════════════════
// EOF (MaxInt) and BOF (MinInt) are boundary markers guaranteeing
// BOF < any_real_position < EOF.
//
// This removes special cases from the search algorithms: instead of asking
// "is this the first call?" or "is the list empty?", callers simply start
// searching from BOF and stop when they reach EOF.
// Integer sentinels bracketing every real position: BOF < real < EOF always
// holds, which keeps the search algorithms free of "empty"/"first call"
// special cases.
var (
	EOF = math.MaxInt // End Of File: larger than any real position
	BOF = math.MinInt // Beginning Of File: smaller than any real position
)

// Errors returned by skip list lookup operations.
var (
	ErrKeyNotFound    = errors.New("key not found")
	ErrNoElementFound = errors.New("no element found")
)

// Position identifies a single word occurrence: the document it lives in and
// its zero-based word offset within that document.
//
// Positions order first by DocumentID, then by Offset:
//
//	Doc1:Pos5 < Doc1:Pos10 < Doc2:Pos0 < Doc2:Pos3
//
// Plain ints are used so the BOF/EOF sentinels fit without casting.
type Position struct {
	DocumentID int // Which document?
	Offset     int // Which word in the document? (0-indexed)
}

// Sentinel positions bracketing all real positions.
var (
	BOFDocument = Position{DocumentID: BOF, Offset: BOF} // Sorts before every real position
	EOFDocument = Position{DocumentID: EOF, Offset: EOF} // Sorts after every real position
)

// GetDocumentID reports which document this position belongs to.
// (Convenience accessor for a consistent API.)
func (p *Position) GetDocumentID() int {
	return p.DocumentID
}

// GetOffset reports the word offset within the document.
// (Convenience accessor for a consistent API.)
func (p *Position) GetOffset() int {
	return p.Offset
}

// IsBeginning reports whether this position is the BOF sentinel
// (i.e. there is nothing before it).
func (p *Position) IsBeginning() bool {
	return p.Offset == BOF
}

// IsEnd reports whether this position is the EOF sentinel
// (i.e. there is nothing after it).
func (p *Position) IsEnd() bool {
	return p.Offset == EOF
}
Same document AND A.Offset < B.Offset 163 | // 164 | // EXAMPLES: 165 | // --------- 166 | // Doc1:Pos5 < Doc1:Pos10 → true (same doc, 5 < 10) 167 | // Doc1:Pos5 < Doc2:Pos0 → true (doc 1 < doc 2) 168 | // Doc2:Pos0 < Doc1:Pos5 → false (doc 2 > doc 1) 169 | func (p *Position) IsBefore(other Position) bool { 170 | // Check document order first 171 | if p.DocumentID < other.DocumentID { 172 | return true 173 | } 174 | 175 | // Same document: check offset order 176 | return p.DocumentID == other.DocumentID && p.Offset < other.Offset 177 | } 178 | 179 | // IsAfter checks if this position comes after another position 180 | // 181 | // This is the opposite of IsBefore (with equality handled separately) 182 | func (p *Position) IsAfter(other Position) bool { 183 | // Check document order first 184 | if p.DocumentID > other.DocumentID { 185 | return true 186 | } 187 | 188 | // Same document: check offset order 189 | return p.DocumentID == other.DocumentID && p.Offset > other.Offset 190 | } 191 | 192 | // Equals checks if two positions are identical 193 | // 194 | // Example: 195 | // 196 | // Doc1:Pos5 == Doc1:Pos5 → true 197 | // Doc1:Pos5 == Doc1:Pos6 → false 198 | func (p *Position) Equals(other Position) bool { 199 | return p.DocumentID == other.DocumentID && p.Offset == other.Offset 200 | } 201 | 202 | // ═══════════════════════════════════════════════════════════════════════════════ 203 | // NODE: A Skip List Node 204 | // ═══════════════════════════════════════════════════════════════════════════════ 205 | // Each node stores: 206 | // 1. A Key (Position): The data we're storing 207 | // 2. 
A Tower: Array of pointers to next nodes at each level 208 | // 209 | // TOWER VISUALIZATION: 210 | // -------------------- 211 | // For a node with height 3: 212 | // 213 | // Tower[2] -----> (points to a node far ahead) 214 | // Tower[1] -----> (points to a node ahead) 215 | // Tower[0] -----> (points to the very next node) 216 | // 217 | // The higher the level, the further ahead we skip! 218 | // ═══════════════════════════════════════════════════════════════════════════════ 219 | type Node struct { 220 | Key Position // The position stored in this node 221 | Tower [MaxHeight]*Node // Array of forward pointers (one per level) 222 | } 223 | 224 | // ═══════════════════════════════════════════════════════════════════════════════ 225 | // SKIP LIST: The Main Data Structure 226 | // ═══════════════════════════════════════════════════════════════════════════════ 227 | type SkipList struct { 228 | Head *Node // Sentinel head node (doesn't contain real data) 229 | Height int // Current height of the tallest tower 230 | } 231 | 232 | // NewSkipList creates an empty skip list 233 | // 234 | // INITIAL STATE: 235 | // -------------- 236 | // HEAD (empty node) with no forward pointers 237 | // Height = 1 (even empty lists have level 0) 238 | func NewSkipList() *SkipList { 239 | return &SkipList{ 240 | Head: &Node{}, // Empty sentinel head 241 | Height: 1, 242 | } 243 | } 244 | 245 | // ═══════════════════════════════════════════════════════════════════════════════ 246 | // SEARCH: The Core Operation 247 | // ═══════════════════════════════════════════════════════════════════════════════ 248 | // Search is the foundation of all skip list operations. 249 | // It returns TWO things: 250 | // 1. The node with the exact key (or nil if not found) 251 | // 2. A "journey" array: the path we took to get there 252 | // 253 | // WHY RETURN THE JOURNEY? 254 | // ------------------------ 255 | // The journey tells us which node is BEFORE the target at each level. 
256 | // This is essential for: 257 | // - Insert: We need to know where to splice in the new node 258 | // - Delete: We need to know which nodes to update 259 | // - FindLessThan: The journey already contains the answer! 260 | // 261 | // SEARCH ALGORITHM: 262 | // ----------------- 263 | // Start at the highest level and work down: 264 | // 1. At each level, move right as far as possible (while staying < target) 265 | // 2. When we can't move right, drop down one level 266 | // 3. Repeat until we reach level 0 267 | // 4. Check if we found the exact key 268 | // 269 | // VISUAL EXAMPLE (searching for 20): 270 | // ----------------------------------- 271 | // Level 2: HEAD ------[10]------[30] Start at HEAD, level 2 272 | // ^^^ Can we jump to 10? Yes! (10 < 20) 273 | // ^^^ Can we jump to 30? No! (30 > 20) 274 | // Drop to level 1... 275 | // 276 | // Level 1: HEAD --[10]--[15]--[20]--[30] At 10, level 1 277 | // ^^^ Can we jump to 15? Yes! (15 < 20) 278 | // ^^^ Can we jump to 20? STOP! Check this 279 | // 280 | // Level 0: We'd check if 20 exists at level 0 281 | // 282 | // Journey captured: [level0: node15, level1: node15, level2: node10] 283 | // ═══════════════════════════════════════════════════════════════════════════════ 284 | 285 | // Search finds a key in the skip list and returns the path taken 286 | // 287 | // RETURN VALUES: 288 | // -------------- 289 | // 1. *Node: The node with exact key (nil if not found) 290 | // 2. [MaxHeight]*Node: Journey array - the predecessor at each level 291 | // 292 | // EXAMPLE: 293 | // -------- 294 | // Skip list: [5] -> [10] -> [15] -> [20] 295 | // Search(15) returns: 296 | // - found: Node{15} 297 | // - journey[0]: Node{10} (predecessor at level 0) 298 | // - journey[1]: Node{10} (predecessor at level 1) 299 | // - ... 
300 | func (sl *SkipList) Search(key Position) (*Node, [MaxHeight]*Node) { 301 | var journey [MaxHeight]*Node // Track the path we take 302 | current := sl.Head // Start at the sentinel head 303 | 304 | // Traverse from highest level down to level 0 305 | for level := sl.Height - 1; level >= 0; level-- { 306 | // Move forward as far as possible at this level 307 | current = sl.traverseLevel(current, key, level) 308 | 309 | // Record where we ended up at this level 310 | // (This is the predecessor for this level) 311 | journey[level] = current 312 | } 313 | 314 | // Check if we found an exact match 315 | // current now points to the largest node < key 316 | // So current.Tower[0] might be the exact key 317 | next := current.Tower[0] 318 | if next != nil && next.Key.Equals(key) { 319 | return next, journey // Found it! 320 | } 321 | 322 | return nil, journey // Not found, but journey is still useful 323 | } 324 | 325 | // traverseLevel advances along a single level as far as possible 326 | // 327 | // PROCESS: 328 | // -------- 329 | // Starting from 'start', move forward while next.Key < target 330 | // Stop when: next.Key >= target OR next == nil 331 | // 332 | // EXAMPLE: 333 | // -------- 334 | // Level: HEAD -> [5] -> [10] -> [15] -> [20] -> nil 335 | // Target: 17 336 | // 337 | // Step 1: At HEAD, next = 5, should advance? Yes (5 < 17) 338 | // Step 2: At 5, next = 10, should advance? Yes (10 < 17) 339 | // Step 3: At 10, next = 15, should advance? Yes (15 < 17) 340 | // Step 4: At 15, next = 20, should advance? No! (20 > 17) 341 | // Return: node 15 342 | func (sl *SkipList) traverseLevel(start *Node, target Position, level int) *Node { 343 | current := start 344 | 345 | // Keep moving forward while we can 346 | next := current.Tower[level] 347 | for next != nil { 348 | // Should we advance to the next node? 
349 | if sl.shouldAdvance(next.Key, target) { 350 | current = next // Yes, move forward 351 | next = current.Tower[level] // Update next to the next node 352 | } else { 353 | break // No, stop here 354 | } 355 | } 356 | 357 | return current 358 | } 359 | 360 | // shouldAdvance determines if we should move to the next node 361 | // 362 | // DECISION RULE: 363 | // -------------- 364 | // Advance if: next.Key < target 365 | // Stop if: next.Key >= target 366 | // 367 | // This ensures we stop at the largest node that's still less than target 368 | func (sl *SkipList) shouldAdvance(nodeKey, targetKey Position) bool { 369 | // Don't advance if we've reached or passed the target 370 | if nodeKey.Equals(targetKey) { 371 | return false 372 | } 373 | 374 | // Advance only if the node key is less than target 375 | return nodeKey.IsBefore(targetKey) 376 | } 377 | 378 | // ═══════════════════════════════════════════════════════════════════════════════ 379 | // FIND OPERATIONS: Building on Search 380 | // ═══════════════════════════════════════════════════════════════════════════════ 381 | // These operations use Search as a building block 382 | // ═══════════════════════════════════════════════════════════════════════════════ 383 | 384 | // Find searches for an exact key match 385 | // 386 | // # This is a simple wrapper around Search that only returns the key 387 | // 388 | // Example: 389 | // 390 | // Find(Doc1:Pos5) returns Doc1:Pos5 if it exists, else error 391 | func (sl *SkipList) Find(key Position) (Position, error) { 392 | found, _ := sl.Search(key) 393 | 394 | if found == nil { 395 | return EOFDocument, ErrKeyNotFound 396 | } 397 | 398 | return found.Key, nil 399 | } 400 | 401 | // FindLessThan finds the largest key less than the given key 402 | // 403 | // HOW IT WORKS: 404 | // ------------- 405 | // The journey from Search already gives us this answer! 
406 | // journey[0] is the largest node < key at the bottom level 407 | // 408 | // EXAMPLE: 409 | // -------- 410 | // Skip list: [5] -> [10] -> [15] -> [20] 411 | // FindLessThan(17) returns 15 412 | // FindLessThan(15) returns 10 413 | // FindLessThan(5) returns BOF (nothing before 5) 414 | // 415 | // USE CASE: 416 | // --------- 417 | // In search: "Find the previous occurrence of 'quick' before position X" 418 | func (sl *SkipList) FindLessThan(key Position) (Position, error) { 419 | _, journey := sl.Search(key) 420 | 421 | predecessor := journey[0] // The node before key at level 0 422 | 423 | // Check edge cases 424 | if predecessor == nil || predecessor == sl.Head { 425 | return BOFDocument, ErrNoElementFound 426 | } 427 | 428 | return predecessor.Key, nil 429 | } 430 | 431 | // FindGreaterThan finds the smallest key greater than the given key 432 | // 433 | // TWO CASES: 434 | // ---------- 435 | // 1. Key exists: Return the next node after it 436 | // 2. Key doesn't exist: Return the next node after where it would be 437 | // 438 | // EXAMPLE: 439 | // -------- 440 | // Skip list: [5] -> [10] -> [15] -> [20] 441 | // FindGreaterThan(10) returns 15 (next after 10) 442 | // FindGreaterThan(12) returns 15 (next after where 12 would be) 443 | // FindGreaterThan(20) returns EOF (nothing after 20) 444 | // 445 | // USE CASE: 446 | // --------- 447 | // In search: "Find the next occurrence of 'quick' after position X" 448 | func (sl *SkipList) FindGreaterThan(key Position) (Position, error) { 449 | found, journey := sl.Search(key) 450 | 451 | // CASE 1: Key exists - return its successor 452 | if found != nil { 453 | if found.Tower[0] != nil { 454 | return found.Tower[0].Key, nil 455 | } 456 | return EOFDocument, ErrNoElementFound 457 | } 458 | 459 | // CASE 2: Key doesn't exist - return next node after where it would be 460 | predecessor := journey[0] 461 | if predecessor != nil && predecessor.Tower[0] != nil { 462 | return predecessor.Tower[0].Key, nil 463 | } 
464 | 465 | return EOFDocument, ErrNoElementFound 466 | } 467 | 468 | // ═══════════════════════════════════════════════════════════════════════════════ 469 | // INSERT: Adding Elements to the Skip List 470 | // ═══════════════════════════════════════════════════════════════════════════════ 471 | // Insertion is a two-phase process: 472 | // 1. Search to find where the new element should go 473 | // 2. Splice the new node into the list at multiple levels 474 | // 475 | // PROBABILISTIC HEIGHT: 476 | // --------------------- 477 | // Each new node gets a random height (tower height): 478 | // - 50% chance of height 1 479 | // - 25% chance of height 2 480 | // - 12.5% chance of height 3 481 | // - ... 482 | // 483 | // This randomness is what makes skip lists work! 484 | // It ensures roughly logarithmic performance on average. 485 | // 486 | // INSERT EXAMPLE: 487 | // --------------- 488 | // Inserting 17 with height 2: 489 | // 490 | // Before: 491 | // Level 1: HEAD -------> [10] ------------> [20] 492 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [20] 493 | // 494 | // After: 495 | // Level 1: HEAD -------> [10] -> [17] ----> [20] 496 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [17] -> [20] 497 | // ^^^ 498 | // new node 499 | // ═══════════════════════════════════════════════════════════════════════════════ 500 | 501 | // Insert adds a new key to the skip list (or updates if it exists) 502 | // 503 | // ALGORITHM: 504 | // ---------- 505 | // 1. Search for the key (get the journey/path) 506 | // 2. If found, update the existing node 507 | // 3. If not found: 508 | // a. Generate a random height for the new node 509 | // b. Create the new node 510 | // c. Link it into the list at each level 511 | // d. 
Update the skip list's height if needed 512 | // 513 | // EXAMPLE WALKTHROUGH: 514 | // -------------------- 515 | // Inserting Doc2:Pos5 into skip list: [Doc1:Pos3, Doc2:Pos10] 516 | // 517 | // Step 1: Search(Doc2:Pos5) 518 | // - Not found 519 | // - journey[0] = Node{Doc1:Pos3} (predecessor at level 0) 520 | // 521 | // Step 2: Generate height = 2 (random) 522 | // 523 | // Step 3: Create Node{Doc2:Pos5} 524 | // 525 | // Step 4: Link at level 0 and level 1: 526 | // - Level 0: Doc1:Pos3 -> Doc2:Pos5 -> Doc2:Pos10 527 | // - Level 1: HEAD -> Doc2:Pos5 -> ... 528 | func (sl *SkipList) Insert(key Position) { 529 | found, journey := sl.Search(key) 530 | 531 | // If key already exists, just update it 532 | if found != nil { 533 | found.Key = key 534 | return 535 | } 536 | 537 | // Generate a random height for the new node 538 | height := sl.randomHeight() 539 | 540 | // Create the new node 541 | newNode := &Node{Key: key} 542 | 543 | // Link the node into the skip list 544 | sl.linkNode(newNode, journey, height) 545 | 546 | // Update skip list height if necessary 547 | if height > sl.Height { 548 | sl.Height = height 549 | } 550 | } 551 | 552 | // linkNode connects a new node into the skip list structure 553 | // 554 | // LINKING PROCESS (for each level): 555 | // ---------------------------------- 556 | // 1. Find the predecessor at this level (from journey) 557 | // 2. Set newNode.Tower[level] = predecessor.Tower[level] 558 | // 3. 
Set predecessor.Tower[level] = newNode 559 | // 560 | // VISUAL EXAMPLE (linking at level 1): 561 | // ------------------------------------- 562 | // Before: 563 | // 564 | // predecessor -> [oldNext] 565 | // 566 | // After: 567 | // 568 | // predecessor -> [newNode] -> [oldNext] 569 | // 570 | // The newNode "splices" itself between predecessor and oldNext 571 | func (sl *SkipList) linkNode(node *Node, journey [MaxHeight]*Node, height int) { 572 | // Link the node at each level up to its height 573 | for level := 0; level < height; level++ { 574 | predecessor := journey[level] 575 | 576 | // Edge case: If no predecessor at this level, use HEAD 577 | if predecessor == nil { 578 | predecessor = sl.Head 579 | } 580 | 581 | // Splice the node into the linked list at this level 582 | // 1. New node points to what predecessor was pointing to 583 | node.Tower[level] = predecessor.Tower[level] 584 | // 2. Predecessor now points to new node 585 | predecessor.Tower[level] = node 586 | } 587 | } 588 | 589 | // ═══════════════════════════════════════════════════════════════════════════════ 590 | // DELETE: Removing Elements from the Skip List 591 | // ═══════════════════════════════════════════════════════════════════════════════ 592 | // Deletion is the reverse of insertion: 593 | // 1. Search for the key 594 | // 2. Unlink it from all levels 595 | // 3. Clean up: reduce height if top levels are now empty 596 | // ═══════════════════════════════════════════════════════════════════════════════ 597 | 598 | // Delete removes a key from the skip list 599 | // 600 | // ALGORITHM: 601 | // ---------- 602 | // 1. Search for the key 603 | // 2. If not found, return false 604 | // 3. If found: 605 | // a. Unlink it from all levels 606 | // b. 
Shrink the skip list height if needed 607 | // 608 | // EXAMPLE: 609 | // -------- 610 | // Deleting 15: 611 | // 612 | // Before: 613 | // Level 1: HEAD -------> [10] -> [15] ----> [20] 614 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [20] 615 | // 616 | // After: 617 | // Level 1: HEAD -------> [10] ------------> [20] 618 | // Level 0: HEAD -> [5] -> [10] ------------> [20] 619 | // 620 | // (15 removed) 621 | func (sl *SkipList) Delete(key Position) bool { 622 | found, journey := sl.Search(key) 623 | 624 | // Key doesn't exist 625 | if found == nil { 626 | return false 627 | } 628 | 629 | // Unlink the node from all levels 630 | for level := 0; level < sl.Height; level++ { 631 | // If the predecessor at this level doesn't point to our node, 632 | // we've finished unlinking (node wasn't tall enough for higher levels) 633 | if journey[level].Tower[level] != found { 634 | break 635 | } 636 | 637 | // Bypass the node: predecessor points to node's successor 638 | journey[level].Tower[level] = found.Tower[level] 639 | } 640 | 641 | // Clean up: reduce height if top levels are now empty 642 | sl.shrink() 643 | return true 644 | } 645 | 646 | // ═══════════════════════════════════════════════════════════════════════════════ 647 | // UTILITY OPERATIONS 648 | // ═══════════════════════════════════════════════════════════════════════════════ 649 | 650 | // Last returns the last position in the skip list 651 | // 652 | // HOW IT WORKS: 653 | // ------------- 654 | // Simply traverse level 0 until we reach the end 655 | // 656 | // Example: 657 | // Skip list: [5] -> [10] -> [15] -> [20] -> nil 658 | // Last() returns 20 659 | func (sl *SkipList) Last() Position { 660 | current := sl.Head 661 | 662 | // Traverse the bottom level to the end 663 | for next := current.Tower[0]; next != nil; next = next.Tower[0] { 664 | current = next 665 | } 666 | 667 | return current.Key 668 | } 669 | 670 | // shrink reduces the height if top levels are empty 671 | // 672 | // WHY SHRINK? 
673 | // ----------- 674 | // After deletions, the top levels might become empty. 675 | // Shrinking improves performance by not searching empty levels. 676 | // 677 | // EXAMPLE: 678 | // -------- 679 | // Before (after deleting the only height-3 node): 680 | // Level 2: HEAD -> nil (empty!) 681 | // Level 1: HEAD -> [10] -> [20] 682 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [20] 683 | // Height: 3 684 | // 685 | // After shrinking: 686 | // Level 1: HEAD -> [10] -> [20] 687 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [20] 688 | // Height: 2 (top level removed) 689 | func (sl *SkipList) shrink() { 690 | // Check levels from top down 691 | for level := sl.Height - 1; level >= 0; level-- { 692 | if sl.Head.Tower[level] == nil { 693 | sl.Height-- // This level is empty, reduce height 694 | } else { 695 | break // Found a non-empty level, stop 696 | } 697 | } 698 | } 699 | 700 | // ═══════════════════════════════════════════════════════════════════════════════ 701 | // RANDOM HEIGHT GENERATION 702 | // ═══════════════════════════════════════════════════════════════════════════════ 703 | // This is the "magic" that makes skip lists work! 704 | // 705 | // THE COIN FLIP ALGORITHM: 706 | // ------------------------- 707 | // Flip a fair coin repeatedly: 708 | // - Heads: Increase height by 1, flip again 709 | // - Tails: Stop, return current height 710 | // 711 | // PROBABILITY DISTRIBUTION: 712 | // -------------------------- 713 | // Height 1: 50% (tails on first flip) 714 | // Height 2: 25% (heads then tails) 715 | // Height 3: 12.5% (heads, heads, tails) 716 | // Height 4: 6.25% (heads, heads, heads, tails) 717 | // ... 
718 | // 719 | // This creates a geometric distribution that ensures: 720 | // - Most nodes have height 1 (50%) 721 | // - Few nodes have height 2 (25%) 722 | // - Very few nodes have height 3 (12.5%) 723 | // - Extremely rare to have height > 10 724 | // 725 | // WHY THIS WORKS: 726 | // --------------- 727 | // With N elements and this distribution: 728 | // - Expected number of nodes at level 0: N 729 | // - Expected number of nodes at level 1: N/2 730 | // - Expected number of nodes at level 2: N/4 731 | // - Expected number of nodes at level 3: N/8 732 | // ... 733 | // 734 | // This creates O(log N) expected search time! 735 | // ═══════════════════════════════════════════════════════════════════════════════ 736 | 737 | // randomHeight generates a random height for a new node 738 | // 739 | // IMPLEMENTATION: 740 | // --------------- 741 | // 1. Start with height = 1 742 | // 2. Flip a coin (random < 0.5) 743 | // 3. If heads and not at max: increase height, repeat 744 | // 4. If tails or at max: return current height 745 | func (sl *SkipList) randomHeight() int { 746 | height := 1 747 | rng := rand.New(rand.NewSource(time.Now().UnixNano())) 748 | 749 | // Keep "flipping coins" (50% probability) 750 | for rng.Float64() < 0.5 && height < MaxHeight { 751 | height++ 752 | } 753 | 754 | return height 755 | } 756 | 757 | // ═══════════════════════════════════════════════════════════════════════════════ 758 | // ITERATOR: Sequential Access to Elements 759 | // ═══════════════════════════════════════════════════════════════════════════════ 760 | // While skip lists support fast random access, sometimes we need to 761 | // traverse all elements in order. The iterator provides this capability. 762 | // 763 | // USAGE PATTERN: 764 | // -------------- 765 | // iter := skipList.Iterator() 766 | // for iter.HasNext() { 767 | // pos := iter.Next() 768 | // // Process position... 
769 | // } 770 | // 771 | // EXAMPLE: 772 | // -------- 773 | // Skip list: [Doc1:Pos1, Doc1:Pos5, Doc2:Pos0, Doc2:Pos3] 774 | // 775 | // iter := skipList.Iterator() 776 | // iter.Next() → Doc1:Pos1 777 | // iter.Next() → Doc1:Pos5 778 | // iter.Next() → Doc2:Pos0 779 | // iter.Next() → Doc2:Pos3 780 | // iter.Next() → EOF 781 | // ═══════════════════════════════════════════════════════════════════════════════ 782 | 783 | // Iterator provides sequential access to skip list elements 784 | // 785 | // IMPLEMENTATION NOTE: 786 | // -------------------- 787 | // We only traverse level 0 (the bottom level) which contains all elements 788 | // in sorted order. Higher levels are just shortcuts for searching. 789 | type Iterator struct { 790 | current *Node // The current position in the iteration 791 | } 792 | 793 | // Iterator creates a new iterator starting at the first element 794 | // 795 | // INITIALIZATION: 796 | // --------------- 797 | // We start at the first real element (sl.Head.Tower[0]) 798 | // NOT at the Head itself (which is just a sentinel) 799 | func (sl *SkipList) Iterator() *Iterator { 800 | return &Iterator{current: sl.Head.Tower[0]} 801 | } 802 | 803 | // HasNext checks if there are more elements to iterate 804 | // 805 | // LOGIC: 806 | // ------ 807 | // There are more elements if: 808 | // - current is not nil (we haven't fallen off the end), AND 809 | // - current.Tower[0] is not nil (there's a next element) 810 | // 811 | // Example states: 812 | // - HasNext() == true: current -> [next] -> ... 813 | // - HasNext() == false: current -> nil (at the last element) 814 | func (it *Iterator) HasNext() bool { 815 | return it.current != nil && it.current.Tower[0] != nil 816 | } 817 | 818 | // Next advances to and returns the next position 819 | // 820 | // PROCESS: 821 | // -------- 822 | // 1. Move to the next node 823 | // 2. If we've reached the end, return EOF 824 | // 3. 
Otherwise, return the current position 825 | // 826 | // IMPORTANT: 827 | // ---------- 828 | // Always check HasNext() before calling Next() to avoid 829 | // returning EOF unexpectedly! 830 | // 831 | // EXAMPLE USAGE: 832 | // -------------- 833 | // iter := skipList.Iterator() 834 | // 835 | // for iter.HasNext() { 836 | // pos := iter.Next() 837 | // fmt.Printf("Doc %d, Pos %d\n", pos.GetDocumentID(), pos.GetOffset()) 838 | // } 839 | func (it *Iterator) Next() Position { 840 | // Check if we're already at the end 841 | if it.current == nil { 842 | return EOFDocument 843 | } 844 | 845 | // Move to the next node 846 | it.current = it.current.Tower[0] 847 | 848 | // Check if we've reached the end after moving 849 | if it.current == nil { 850 | return EOFDocument 851 | } 852 | 853 | // Return the current position 854 | return it.current.Key 855 | } 856 | 857 | // ═══════════════════════════════════════════════════════════════════════════════ 858 | // SKIP LIST SUMMARY 859 | // ═══════════════════════════════════════════════════════════════════════════════ 860 | // 861 | // KEY CONCEPTS: 862 | // ------------- 863 | // 1. Multiple levels: Express lanes for faster searching 864 | // 2. Probabilistic balancing: Random heights keep it balanced on average 865 | // 3. Sorted order: Always maintains elements in sorted order 866 | // 4. O(log n) operations: Search, insert, delete all average O(log n) 867 | // 868 | // WHY IT'S PERFECT FOR SEARCH ENGINES: 869 | // ------------------------------------- 870 | // 1. Fast positional lookups: Find any document/position quickly 871 | // 2. Range queries: Find all positions in a document efficiently 872 | // 3. Sorted iteration: Process results in order 873 | // 4. Simple implementation: No complex tree rotations needed 874 | // 5. 
Good cache performance: Sequential access on level 0 875 | // 876 | // OPERATIONS SUMMARY: 877 | // ------------------- 878 | // - Search(key): Find exact key or where it would be → O(log n) 879 | // - Insert(key): Add new element → O(log n) 880 | // - Delete(key): Remove element → O(log n) 881 | // - Find(key): Check if key exists → O(log n) 882 | // - FindLessThan(key): Find predecessor → O(log n) 883 | // - FindGreaterThan(key): Find successor → O(log n) 884 | // - Last(): Find last element → O(n) worst case, O(1) with tail pointer 885 | // - Iterator(): Sequential traversal → O(n) for all elements 886 | // 887 | // SPACE COMPLEXITY: 888 | // ----------------- 889 | // - Average: O(n) where n is the number of elements 890 | // - Each node has ~2 pointers on average (geometric distribution) 891 | // - Worst case: O(n * MaxHeight) but extremely unlikely 892 | // 893 | // PERFORMANCE CHARACTERISTICS: 894 | // ----------------------------- 895 | // - Search: O(log n) expected, O(n) worst case (very rare) 896 | // - Insert: O(log n) expected, O(n) worst case (very rare) 897 | // - Delete: O(log n) expected, O(n) worst case (very rare) 898 | // - Space: O(n) expected, O(n * log n) worst case 899 | // 900 | // The "worst case" scenarios are so rare they're not practically relevant. 901 | // The randomization ensures good performance with extremely high probability. 902 | // 903 | // COMPARISON TO OTHER DATA STRUCTURES: 904 | // ------------------------------------- 905 | // vs. Balanced Trees (AVL, Red-Black): 906 | // + Simpler implementation (no rotations) 907 | // + Better constant factors in practice 908 | // + Lock-free variants easier to implement 909 | // - Slightly worse worst-case guarantees (probabilistic vs deterministic) 910 | // 911 | // vs. 
Hash Tables: 912 | // + Maintains sorted order (hash tables don't) 913 | // + Supports range queries efficiently 914 | // + No rehashing needed 915 | // - Slower than hash tables for exact lookups (O(log n) vs O(1)) 916 | // 917 | // vs. Arrays: 918 | // + Fast insertion/deletion (no shifting elements) 919 | // + Dynamic sizing (no reallocation) 920 | // - Slower random access (O(log n) vs O(1)) 921 | // - More memory overhead (pointers) 922 | // 923 | // REAL-WORLD APPLICATIONS: 924 | // ------------------------- 925 | // 1. Database indexes (LevelDB, RocksDB use skip lists) 926 | // 2. In-memory caches (Redis sorted sets use skip lists) 927 | // 3. Search engines (inverted indexes like this one!) 928 | // 4. Concurrent data structures (easier to make lock-free than trees) 929 | // 5. Time-series databases (sorted by timestamp) 930 | // 931 | // ═══════════════════════════════════════════════════════════════════════════════ 932 | --------------------------------------------------------------------------------