├── media └── image.png ├── .gitattributes ├── go.mod ├── .gitignore ├── LICENSE ├── .github └── workflows │ └── static.yml ├── go.sum ├── Makefile ├── query.go ├── index_test.go ├── query_test.go ├── index.go ├── analyzer.go ├── skiplist_test.go ├── serialization.go ├── search.go └── skiplist.go /media/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wizenheimer/blaze/HEAD/media/image.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Exclude YAML and HTML files from language statistics 2 | *.yaml linguist-vendored 3 | *.yml linguist-vendored 4 | *.html linguist-vendored 5 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/wizenheimer/blaze 2 | 3 | go 1.24.2 4 | 5 | require ( 6 | github.com/RoaringBitmap/roaring v1.9.4 7 | github.com/kljensen/snowball v0.10.0 8 | ) 9 | 10 | require ( 11 | github.com/bits-and-blooms/bitset v1.12.0 // indirect 12 | github.com/mschoch/smat v0.2.0 // indirect 13 | ) 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | bin/ 8 | dist/ 9 | 10 | # Test binary, built with `go test -c` 11 | *.test 12 | 13 | # Output of the go coverage tool 14 | *.out 15 | coverage.html 16 | coverage.out 17 | 18 | # Go workspace file 19 | go.work 20 | go.work.sum 21 | 22 | # Dependency directories 23 | vendor/ 24 | 25 | # Go build cache 26 | .cache/ 27 | 28 | # IDEs and editors 29 | .vscode/ 30 | .idea/ 31 | *.swp 32 | *.swo 33 | *~ 34 | .DS_Store 35 | 36 | # Environment variables 37 | 
.env 38 | .env.local 39 | .env.*.local 40 | 41 | # Temporary files 42 | tmp/ 43 | temp/ 44 | *.tmp 45 | 46 | # Logs 47 | *.log 48 | 49 | # OS generated files 50 | Thumbs.db 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 wizenheimer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/static.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy static content to Pages 3 | on: 4 | # Runs on pushes targeting the default branch 5 | push: 6 | branches: ["main"] 7 | # Allows you to run this workflow manually from the Actions tab 8 | workflow_dispatch: 9 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 15 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 16 | concurrency: 17 | group: "pages" 18 | cancel-in-progress: false 19 | jobs: 20 | # Single deploy job since we're just deploying 21 | deploy: 22 | environment: 23 | name: github-pages 24 | url: ${{ steps.deployment.outputs.page_url }} 25 | runs-on: ubuntu-latest 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | - name: Setup Pages 30 | uses: actions/configure-pages@v5 31 | - name: Upload artifact 32 | uses: actions/upload-pages-artifact@v3 33 | with: 34 | # Upload only the public directory 35 | path: './public' 36 | - name: Deploy to GitHub Pages 37 | id: deployment 38 | uses: actions/deploy-pages@v4 39 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ= 2 | github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= 3 | github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= 4 | github.com/bits-and-blooms/bitset 
v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= 5 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 6 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 7 | github.com/kljensen/snowball v0.10.0 h1:8qgaBLraSuUVHtGH5tJ+VdGpqgfcaE2WkswL/C3nVhY= 8 | github.com/kljensen/snowball v0.10.0/go.mod h1:bJcxtur1W5Qw4fVj9tk5W88zyRcGQQjqahFErdcDTHk= 9 | github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= 10 | github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= 11 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 12 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 13 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 14 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 15 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 16 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 17 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= 18 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Color definitions 2 | RED := \033[0;31m 3 | GREEN := \033[0;32m 4 | YELLOW := \033[0;33m 5 | BLUE := \033[0;34m 6 | MAGENTA := \033[0;35m 7 | CYAN := \033[0;36m 8 | RESET := \033[0m 9 | 10 | # Project configuration 11 | PKG_NAME := blaze 12 | GO_FILES := $(shell find . 
-type f -name '*.go' -not -path "./vendor/*") 13 | 14 | .PHONY: all test clean fmt lint vet help deps tidy bench test-coverage check 15 | 16 | # Default target 17 | all: help 18 | 19 | # Run tests 20 | test: 21 | @echo "$(CYAN)Running tests...$(RESET)" 22 | @go test -v -race -coverprofile=coverage.out ./... 23 | @echo "$(GREEN)✓ Tests complete$(RESET)" 24 | 25 | # Run tests with coverage report 26 | test-coverage: test 27 | @echo "$(CYAN)Generating coverage report...$(RESET)" 28 | @go tool cover -html=coverage.out -o coverage.html 29 | @echo "$(GREEN)✓ Coverage report: coverage.html$(RESET)" 30 | 31 | # Run benchmarks 32 | bench: 33 | @echo "$(CYAN)Running benchmarks...$(RESET)" 34 | @go test -bench=. -benchmem ./... 35 | @echo "$(GREEN)✓ Benchmarks complete$(RESET)" 36 | 37 | # Format code 38 | fmt: 39 | @echo "$(CYAN)Formatting code...$(RESET)" 40 | @gofmt -s -w $(GO_FILES) 41 | @echo "$(GREEN)✓ Code formatted$(RESET)" 42 | 43 | # Lint code 44 | lint: 45 | @echo "$(CYAN)Running linter...$(RESET)" 46 | @if command -v golangci-lint >/dev/null 2>&1; then \ 47 | golangci-lint run ./...; \ 48 | echo "$(GREEN)✓ Linting complete$(RESET)"; \ 49 | else \ 50 | echo "$(YELLOW)⚠ golangci-lint not installed. Run: brew install golangci-lint$(RESET)"; \ 51 | fi 52 | 53 | # Run go vet 54 | vet: 55 | @echo "$(CYAN)Running go vet...$(RESET)" 56 | @go vet ./... 
57 | @echo "$(GREEN)✓ Vet complete$(RESET)" 58 | 59 | # Check for common issues 60 | check: fmt vet lint test 61 | @echo "$(GREEN)✓ All checks passed$(RESET)" 62 | 63 | # Download dependencies 64 | deps: 65 | @echo "$(CYAN)Downloading dependencies...$(RESET)" 66 | @go mod download 67 | @go mod verify 68 | @echo "$(GREEN)✓ Dependencies downloaded$(RESET)" 69 | 70 | # Tidy dependencies 71 | tidy: 72 | @echo "$(CYAN)Tidying dependencies...$(RESET)" 73 | @go mod tidy 74 | @echo "$(GREEN)✓ Dependencies tidied$(RESET)" 75 | 76 | # Clean generated files and build artifacts 77 | clean: 78 | @echo "$(CYAN)Cleaning...$(RESET)" 79 | @rm -f coverage.out coverage.html 80 | @echo "$(GREEN)✓ Clean complete$(RESET)" 81 | 82 | # Display help 83 | help: 84 | @echo "$(CYAN)" 85 | @echo "╔╗ ╦ ╔═╗╔═╗╔═╗" 86 | @echo "╠╩╗║ ╠═╣╔═╝║╣ " 87 | @echo "╚═╝╩═╝╩ ╩╚═╝╚═╝" 88 | @echo "$(RESET)" 89 | @echo "$(MAGENTA)Fast Search Index Library$(RESET)" 90 | @echo "" 91 | @echo "$(MAGENTA)═══════════════════════════════════════════════$(RESET)" 92 | @echo "" 93 | @echo "$(YELLOW)Development Commands:$(RESET)" 94 | @echo " $(GREEN)make fmt$(RESET) - Format Go code" 95 | @echo " $(GREEN)make vet$(RESET) - Run go vet" 96 | @echo " $(GREEN)make lint$(RESET) - Run golangci-lint" 97 | @echo " $(GREEN)make check$(RESET) - Run fmt, vet, lint, and test" 98 | @echo "" 99 | @echo "$(YELLOW)Testing Commands:$(RESET)" 100 | @echo " $(GREEN)make test$(RESET) - Run tests with race detector" 101 | @echo " $(GREEN)make test-coverage$(RESET) - Run tests and generate coverage report" 102 | @echo " $(GREEN)make bench$(RESET) - Run benchmarks" 103 | @echo "" 104 | @echo "$(YELLOW)Dependency Commands:$(RESET)" 105 | @echo " $(GREEN)make deps$(RESET) - Download dependencies" 106 | @echo " $(GREEN)make tidy$(RESET) - Tidy dependencies" 107 | @echo "" 108 | @echo "$(YELLOW)Utility Commands:$(RESET)" 109 | @echo " $(GREEN)make clean$(RESET) - Remove generated files and artifacts" 110 | @echo " $(GREEN)make help$(RESET) - 
Display this help message" 111 | @echo "" 112 | @echo "$(MAGENTA)═══════════════════════════════════════════════$(RESET)" 113 | 114 | -------------------------------------------------------------------------------- /query.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "github.com/RoaringBitmap/roaring" 5 | ) 6 | 7 | // ═══════════════════════════════════════════════════════════════════════════════ 8 | // QUERY BUILDER: Type-Safe Boolean Queries with Roaring Bitmaps 9 | // ═══════════════════════════════════════════════════════════════════════════════ 10 | // Instead of parsing strings like "machine AND learning", use a fluent API: 11 | // 12 | // EXAMPLE USAGE: 13 | // -------------- 14 | // Query: Find documents with "machine" AND "learning" 15 | // 16 | // results := NewQueryBuilder(index). 17 | // Term("machine"). 18 | // And(). 19 | // Term("learning"). 20 | // Execute() 21 | // 22 | // Query: Find documents with ("cat" OR "dog") but NOT "snake" 23 | // 24 | // results := NewQueryBuilder(index). 25 | // Group(func(q *QueryBuilder) { 26 | // q.Term("cat").Or().Term("dog") 27 | // }). 28 | // And().Not().Term("snake"). 29 | // Execute() 30 | // 31 | // WHY BUILDER PATTERN? 
32 | // -------------------- 33 | // ✓ Type-safe: Compiler catches errors 34 | // ✓ IDE-friendly: Auto-completion works 35 | // ✓ Fluent: Reads like natural language 36 | // ✓ Fast: Direct bitmap operations (no parsing overhead) 37 | // ✓ Composable: Easy to build complex queries programmatically 38 | // ═══════════════════════════════════════════════════════════════════════════════ 39 | 40 | // QueryBuilder provides a fluent interface for building boolean queries 41 | type QueryBuilder struct { 42 | index *InvertedIndex 43 | stack []*roaring.Bitmap // Stack of intermediate results 44 | ops []QueryOp // Stack of pending operations 45 | negate bool // Whether next term should be negated 46 | terms []string // Track terms for BM25 scoring 47 | } 48 | 49 | // QueryOp represents a pending boolean operation 50 | type QueryOp int 51 | 52 | const ( 53 | OpNone QueryOp = iota 54 | OpAnd 55 | OpOr 56 | ) 57 | 58 | // NewQueryBuilder creates a new query builder 59 | // 60 | // EXAMPLE: 61 | // -------- 62 | // 63 | // qb := NewQueryBuilder(index) 64 | // results := qb.Term("machine").And().Term("learning").Execute() 65 | func NewQueryBuilder(index *InvertedIndex) *QueryBuilder { 66 | return &QueryBuilder{ 67 | index: index, 68 | stack: make([]*roaring.Bitmap, 0), 69 | ops: make([]QueryOp, 0), 70 | negate: false, 71 | terms: make([]string, 0), 72 | } 73 | } 74 | 75 | // Term adds a term to the query 76 | // 77 | // WHAT IT DOES: 78 | // ------------- 79 | // 1. Gets the roaring bitmap for the term (instant document lookup) 80 | // 2. Applies any pending NOT operation 81 | // 3. Combines with previous results using AND/OR 82 | // 83 | // EXAMPLE: 84 | // -------- 85 | // 86 | // qb.Term("machine") // Find all docs with "machine" 87 | // 88 | // PERFORMANCE: 89 | // ------------ 90 | // O(1) bitmap lookup - no skip list traversal needed! 91 | func (qb *QueryBuilder) Term(term string) *QueryBuilder { 92 | // Analyze the term (lowercase, stem, etc.) 
93 | tokens := Analyze(term) 94 | if len(tokens) == 0 { 95 | // Empty term - push empty bitmap 96 | qb.pushBitmap(roaring.NewBitmap()) 97 | return qb 98 | } 99 | 100 | // Track term for BM25 scoring (if not negated) 101 | analyzedTerm := tokens[0] 102 | if !qb.negate { 103 | qb.terms = append(qb.terms, analyzedTerm) 104 | } 105 | 106 | // Get bitmap for the analyzed term 107 | bitmap := qb.getTermBitmap(analyzedTerm) 108 | 109 | // Apply negation if needed 110 | if qb.negate { 111 | bitmap = qb.negateBitmap(bitmap) 112 | qb.negate = false 113 | } 114 | 115 | qb.pushBitmap(bitmap) 116 | return qb 117 | } 118 | 119 | // Phrase adds a phrase query (exact sequence of words) 120 | // 121 | // WHAT IT DOES: 122 | // ------------- 123 | // 1. Analyzes the phrase (just like during indexing) 124 | // 2. Uses skip lists to find exact phrase matches 125 | // 3. Converts results to a bitmap for boolean operations 126 | // 127 | // EXAMPLE: 128 | // -------- 129 | // 130 | // qb.Phrase("machine learning") // Find exact phrase 131 | // 132 | // NOTE: Phrase queries need position information, so we use skip lists 133 | func (qb *QueryBuilder) Phrase(phrase string) *QueryBuilder { 134 | // Analyze the phrase to match what was indexed 135 | // This converts "Machine Learning" to "machin learn" etc. 136 | tokens := Analyze(phrase) 137 | if len(tokens) == 0 { 138 | qb.pushBitmap(roaring.NewBitmap()) 139 | return qb 140 | } 141 | 142 | // Track terms for BM25 scoring (if not negated) 143 | if !qb.negate { 144 | qb.terms = append(qb.terms, tokens...) 
145 | } 146 | 147 | // Reconstruct the analyzed phrase 148 | analyzedPhrase := "" 149 | for i, token := range tokens { 150 | if i > 0 { 151 | analyzedPhrase += " " 152 | } 153 | analyzedPhrase += token 154 | } 155 | 156 | // Use existing phrase search from skip lists 157 | matches := qb.index.FindAllPhrases(analyzedPhrase, BOFDocument) 158 | 159 | // Convert to bitmap 160 | bitmap := roaring.NewBitmap() 161 | for _, match := range matches { 162 | if !match[0].IsEnd() { 163 | bitmap.Add(uint32(match[0].GetDocumentID())) 164 | } 165 | } 166 | 167 | // Apply negation if needed 168 | if qb.negate { 169 | bitmap = qb.negateBitmap(bitmap) 170 | qb.negate = false 171 | } 172 | 173 | qb.pushBitmap(bitmap) 174 | return qb 175 | } 176 | 177 | // And adds an AND operation 178 | // 179 | // EXAMPLE: 180 | // -------- 181 | // 182 | // qb.Term("machine").And().Term("learning") 183 | // // Returns docs with BOTH "machine" AND "learning" 184 | // 185 | // PERFORMANCE: 186 | // ------------ 187 | // Roaring bitmap intersection: O(1) for compressed chunks 188 | func (qb *QueryBuilder) And() *QueryBuilder { 189 | qb.ops = append(qb.ops, OpAnd) 190 | return qb 191 | } 192 | 193 | // Or adds an OR operation 194 | // 195 | // EXAMPLE: 196 | // -------- 197 | // 198 | // qb.Term("cat").Or().Term("dog") 199 | // // Returns docs with "cat" OR "dog" (or both) 200 | // 201 | // PERFORMANCE: 202 | // ------------ 203 | // Roaring bitmap union: O(1) for compressed chunks 204 | func (qb *QueryBuilder) Or() *QueryBuilder { 205 | qb.ops = append(qb.ops, OpOr) 206 | return qb 207 | } 208 | 209 | // Not negates the next term 210 | // 211 | // EXAMPLE: 212 | // -------- 213 | // 214 | // qb.Term("python").And().Not().Term("snake") 215 | // // Returns docs with "python" but NOT "snake" 216 | // 217 | // PERFORMANCE: 218 | // ------------ 219 | // Roaring bitmap difference: O(1) for compressed chunks 220 | func (qb *QueryBuilder) Not() *QueryBuilder { 221 | qb.negate = true 222 | return qb 223 | } 
224 | 225 | // Group creates a sub-query with its own scope 226 | // 227 | // EXAMPLE: 228 | // -------- 229 | // 230 | // qb.Group(func(q *QueryBuilder) { 231 | // q.Term("cat").Or().Term("dog") 232 | // }).And().Term("pet") 233 | // // Returns: (cat OR dog) AND pet 234 | // 235 | // USE CASE: Control operator precedence 236 | func (qb *QueryBuilder) Group(fn func(*QueryBuilder)) *QueryBuilder { 237 | // Create a new sub-query 238 | subQuery := NewQueryBuilder(qb.index) 239 | 240 | // Execute the group function 241 | fn(subQuery) 242 | 243 | // Get the result from the sub-query 244 | result := subQuery.Execute() 245 | 246 | // Apply negation if needed 247 | if qb.negate { 248 | result = qb.negateBitmap(result) 249 | qb.negate = false 250 | } 251 | 252 | qb.pushBitmap(result) 253 | return qb 254 | } 255 | 256 | // Execute runs the query and returns matching document IDs as a bitmap 257 | // 258 | // ALGORITHM: 259 | // ---------- 260 | // 1. Process all terms and operations in order 261 | // 2. Apply AND/OR operations using roaring bitmap operations 262 | // 3. Return final bitmap of matching documents 263 | // 264 | // EXAMPLE: 265 | // -------- 266 | // 267 | // qb := NewQueryBuilder(index) 268 | // results := qb.Term("machine").And().Term("learning").Execute() 269 | // // results is a roaring.Bitmap with doc IDs 270 | // 271 | // PERFORMANCE: 272 | // ------------ 273 | // All operations use optimized roaring bitmap operations: 274 | // - AND: bitmap intersection (fast!) 275 | // - OR: bitmap union (fast!) 276 | // - NOT: bitmap difference (fast!) 
277 | func (qb *QueryBuilder) Execute() *roaring.Bitmap { 278 | if len(qb.stack) == 0 { 279 | return roaring.NewBitmap() 280 | } 281 | 282 | // Process the stack with operations 283 | result := qb.stack[0] 284 | for i := 1; i < len(qb.stack); i++ { 285 | if i-1 < len(qb.ops) { 286 | op := qb.ops[i-1] 287 | switch op { 288 | case OpAnd: 289 | // Intersection: docs in BOTH bitmaps 290 | result = roaring.And(result, qb.stack[i]) 291 | case OpOr: 292 | // Union: docs in EITHER bitmap 293 | result = roaring.Or(result, qb.stack[i]) 294 | } 295 | } 296 | } 297 | 298 | return result 299 | } 300 | 301 | // ExecuteWithBM25 runs the query and returns ranked results using BM25 302 | // 303 | // ALGORITHM: 304 | // ---------- 305 | // 1. Execute boolean query → Get bitmap of matching docs 306 | // 2. Extract terms from the query 307 | // 3. Calculate BM25 score for each matching document 308 | // 4. Sort by score and return top K 309 | // 310 | // EXAMPLE: 311 | // -------- 312 | // 313 | // qb := NewQueryBuilder(index) 314 | // matches := qb.Term("machine").And().Term("learning"). 
315 | // ExecuteWithBM25(10) 316 | // // Returns top 10 matches sorted by BM25 score 317 | func (qb *QueryBuilder) ExecuteWithBM25(maxResults int) []Match { 318 | // Execute boolean query 319 | resultBitmap := qb.Execute() 320 | 321 | // Extract terms for BM25 scoring 322 | terms := qb.extractTerms() 323 | 324 | // Score each matching document 325 | var results []Match 326 | iter := resultBitmap.Iterator() 327 | for iter.HasNext() { 328 | docID := int(iter.Next()) 329 | score := qb.index.calculateBM25Score(docID, terms) 330 | 331 | if score > 0 { 332 | results = append(results, Match{ 333 | DocID: docID, 334 | Score: score, 335 | }) 336 | } 337 | } 338 | 339 | // Sort by score (descending) 340 | qb.index.sortMatchesByScore(results) 341 | 342 | // Return top K 343 | return limitResults(results, maxResults) 344 | } 345 | 346 | // ═══════════════════════════════════════════════════════════════════════════════ 347 | // INTERNAL HELPER METHODS 348 | // ═══════════════════════════════════════════════════════════════════════════════ 349 | 350 | // getTermBitmap retrieves the roaring bitmap for a term 351 | func (qb *QueryBuilder) getTermBitmap(term string) *roaring.Bitmap { 352 | if bitmap, exists := qb.index.DocBitmaps[term]; exists { 353 | return bitmap.Clone() // Clone to avoid modifying original 354 | } 355 | return roaring.NewBitmap() // Empty bitmap if term not found 356 | } 357 | 358 | // negateBitmap returns all documents EXCEPT those in the bitmap 359 | func (qb *QueryBuilder) negateBitmap(bitmap *roaring.Bitmap) *roaring.Bitmap { 360 | // Create bitmap of all documents 361 | allDocs := roaring.NewBitmap() 362 | for docID := range qb.index.DocStats { 363 | allDocs.Add(uint32(docID)) 364 | } 365 | 366 | // Return difference: all docs - bitmap 367 | return roaring.AndNot(allDocs, bitmap) 368 | } 369 | 370 | // pushBitmap pushes a bitmap onto the stack 371 | func (qb *QueryBuilder) pushBitmap(bitmap *roaring.Bitmap) { 372 | qb.stack = append(qb.stack, bitmap) 373 | 
} 374 | 375 | // extractTerms extracts all terms used in the query for BM25 scoring 376 | func (qb *QueryBuilder) extractTerms() []string { 377 | return qb.terms 378 | } 379 | 380 | // ═══════════════════════════════════════════════════════════════════════════════ 381 | // CONVENIENCE METHODS FOR COMMON PATTERNS 382 | // ═══════════════════════════════════════════════════════════════════════════════ 383 | 384 | // AllOf finds documents containing ALL of the given terms (AND) 385 | // 386 | // EXAMPLE: 387 | // -------- 388 | // 389 | // results := AllOf(index, "machine", "learning", "python") 390 | // // Same as: Term("machine").And().Term("learning").And().Term("python") 391 | func AllOf(index *InvertedIndex, terms ...string) *roaring.Bitmap { 392 | if len(terms) == 0 { 393 | return roaring.NewBitmap() 394 | } 395 | 396 | qb := NewQueryBuilder(index).Term(terms[0]) 397 | for i := 1; i < len(terms); i++ { 398 | qb.And().Term(terms[i]) 399 | } 400 | return qb.Execute() 401 | } 402 | 403 | // AnyOf finds documents containing ANY of the given terms (OR) 404 | // 405 | // EXAMPLE: 406 | // -------- 407 | // 408 | // results := AnyOf(index, "cat", "dog", "bird") 409 | // // Same as: Term("cat").Or().Term("dog").Or().Term("bird") 410 | func AnyOf(index *InvertedIndex, terms ...string) *roaring.Bitmap { 411 | if len(terms) == 0 { 412 | return roaring.NewBitmap() 413 | } 414 | 415 | qb := NewQueryBuilder(index).Term(terms[0]) 416 | for i := 1; i < len(terms); i++ { 417 | qb.Or().Term(terms[i]) 418 | } 419 | return qb.Execute() 420 | } 421 | 422 | // TermExcluding finds documents with a term but excluding another 423 | // 424 | // EXAMPLE: 425 | // -------- 426 | // 427 | // results := TermExcluding(index, "python", "snake") 428 | // // Same as: Term("python").And().Not().Term("snake") 429 | func TermExcluding(index *InvertedIndex, include, exclude string) *roaring.Bitmap { 430 | return NewQueryBuilder(index). 431 | Term(include). 432 | And().Not().Term(exclude). 
433 | Execute() 434 | } 435 | -------------------------------------------------------------------------------- /index_test.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | // ═══════════════════════════════════════════════════════════════════════════════ 8 | // INVERTED INDEX CREATION TESTS 9 | // ═══════════════════════════════════════════════════════════════════════════════ 10 | 11 | func TestNewInvertedIndex(t *testing.T) { 12 | idx := NewInvertedIndex() 13 | 14 | if idx == nil { 15 | t.Fatal("NewInvertedIndex() returned nil") 16 | } 17 | 18 | if idx.PostingsList == nil { 19 | t.Error("PostingsList is nil") 20 | } 21 | 22 | if len(idx.PostingsList) != 0 { 23 | t.Errorf("New index has %d entries, want 0", len(idx.PostingsList)) 24 | } 25 | } 26 | 27 | // ═══════════════════════════════════════════════════════════════════════════════ 28 | // INDEXING TESTS 29 | // ═══════════════════════════════════════════════════════════════════════════════ 30 | 31 | func TestInvertedIndex_Index_SingleDocument(t *testing.T) { 32 | idx := NewInvertedIndex() 33 | 34 | // Index a simple document 35 | idx.Index(1, "quick brown fox") 36 | 37 | // Verify tokens were indexed 38 | tokens := []string{"quick", "brown", "fox"} 39 | for _, token := range tokens { 40 | if _, exists := idx.PostingsList[token]; !exists { 41 | t.Errorf("Token %q was not indexed", token) 42 | } 43 | } 44 | } 45 | 46 | func TestInvertedIndex_Index_MultipleDocuments(t *testing.T) { 47 | idx := NewInvertedIndex() 48 | 49 | // Index multiple documents 50 | idx.Index(1, "quick brown fox") 51 | idx.Index(2, "sleepy dog") 52 | idx.Index(3, "quick brown cats") 53 | 54 | // Check that all unique tokens are indexed (after stemming) 55 | expectedTokens := map[string]bool{ 56 | "quick": true, 57 | "brown": true, 58 | "fox": true, 59 | "sleepi": true, // stemmed from "sleepy" 60 | "dog": true, 61 | "cat": true, // stemmed from 
"cats"
	}

	for token := range expectedTokens {
		if _, exists := idx.PostingsList[token]; !exists {
			t.Errorf("Token %q was not indexed", token)
		}
	}
}

// TestInvertedIndex_Index_DuplicateWords verifies that a token occurring
// twice in one document yields two postings.
func TestInvertedIndex_Index_DuplicateWords(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick quick brown")

	skipList, exists := index.PostingsList["quick"]
	if !exists {
		t.Fatal("Token 'quick' was not indexed")
	}

	// Walk the skip list and count postings; the iterator starts positioned
	// on the first element, so that element is counted before advancing.
	occurrences := 0
	it := skipList.Iterator()
	if it.current != nil {
		occurrences++
	}
	for it.HasNext() {
		it.Next()
		occurrences++
	}

	if occurrences != 2 {
		t.Errorf("Token 'quick' has %d occurrences, want 2", occurrences)
	}
}

// TestInvertedIndex_Index_EmptyDocument verifies indexing "" creates no tokens.
func TestInvertedIndex_Index_EmptyDocument(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "")

	if got := len(index.PostingsList); got != 0 {
		t.Errorf("Empty document created %d tokens, want 0", got)
	}
}

// TestInvertedIndex_Index_StopWords verifies stop words are dropped by the
// analyzer while content words are kept.
func TestInvertedIndex_Index_StopWords(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "the quick brown fox")

	if _, exists := index.PostingsList["the"]; exists {
		t.Error("Stop word 'the' should not be indexed")
	}
	if _, exists := index.PostingsList["quick"]; !exists {
		t.Error("Token 'quick' should be indexed")
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// FIRST OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

func TestInvertedIndex_First_SingleOccurrence(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, err := index.First("quick")
	if err != nil {
		t.Fatalf("First() error = %v, want nil", err)
	}
	if got.GetDocumentID() != 1 {
		t.Errorf("First() document = %d, want 1", got.GetDocumentID())
	}
	if got.GetOffset() != 0 {
		t.Errorf("First() offset = %d, want 0", got.GetOffset())
	}
}

func TestInvertedIndex_First_MultipleOccurrences(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "brown fox")
	index.Index(2, "quick brown")
	index.Index(3, "brown dog")

	got, err := index.First("brown")
	if err != nil {
		t.Fatalf("First() error = %v, want nil", err)
	}

	// The earliest posting overall is Doc1, offset 0.
	if got.GetDocumentID() != 1 || got.GetOffset() != 0 {
		t.Errorf("First() = Doc%d:Pos%d, want Doc1:Pos0",
			got.GetDocumentID(), got.GetOffset())
	}
}

func TestInvertedIndex_First_NotFound(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	if _, err := index.First("elephant"); err != ErrNoPostingList {
		t.Errorf("First() error = %v, want %v", err, ErrNoPostingList)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// LAST OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

func TestInvertedIndex_Last_SingleOccurrence(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, err := index.Last("fox")
	if err != nil {
		t.Fatalf("Last() error = %v, want nil", err)
	}
	if got.GetDocumentID() != 1 || got.GetOffset() != 2 {
		t.Errorf("Last() = Doc%d:Pos%d, want Doc1:Pos2",
			got.GetDocumentID(), got.GetOffset())
	}
}

func TestInvertedIndex_Last_MultipleOccurrences(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "brown fox")
	index.Index(2, "quick brown")
	index.Index(3, "brown dog")

	got, err := index.Last("brown")
	if err != nil {
		t.Fatalf("Last() error = %v, want nil", err)
	}

	// The latest posting overall is Doc3, offset 0.
	if got.GetDocumentID() != 3 || got.GetOffset() != 0 {
		t.Errorf("Last() = Doc%d:Pos%d, want Doc3:Pos0",
			got.GetDocumentID(), got.GetOffset())
	}
}

func TestInvertedIndex_Last_NotFound(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	if _, err := index.Last("elephant"); err != ErrNoPostingList {
		t.Errorf("Last() error = %v, want %v", err, ErrNoPostingList)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// NEXT OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// Next from the beginning-of-file sentinel should behave like First.
func TestInvertedIndex_Next_FromBeginning(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, err := index.Next("quick", BOFDocument)
	if err != nil {
		t.Fatalf("Next() error = %v, want nil", err)
	}
	if got.GetDocumentID() != 1 || got.GetOffset() != 0 {
		t.Errorf("Next() = Doc%d:Pos%d, want Doc1:Pos0",
			got.GetDocumentID(), got.GetOffset())
	}
}

// Walking forward with Next visits postings in document order and finally
// yields the EOF sentinel.
func TestInvertedIndex_Next_MultipleOccurrences(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")
	index.Index(2, "quick dog")
	index.Index(3, "lazy quick")

	first, _ := index.Next("quick", BOFDocument)
	if first.GetDocumentID() != 1 {
		t.Errorf("First occurrence in Doc%d, want Doc1", first.GetDocumentID())
	}

	second, _ := index.Next("quick", first)
	if second.GetDocumentID() != 2 {
		t.Errorf("Second occurrence in Doc%d, want Doc2", second.GetDocumentID())
	}

	third, _ := index.Next("quick", second)
	if third.GetDocumentID() != 3 {
		t.Errorf("Third occurrence in Doc%d, want Doc3", third.GetDocumentID())
	}

	end, _ := index.Next("quick", third)
	if !end.IsEnd() {
		t.Error("Next() should return EOF after last occurrence")
	}
}

func TestInvertedIndex_Next_FromEOF(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, _ := index.Next("quick", EOFDocument)
	if !got.IsEnd() {
		t.Error("Next() from EOF should return EOF")
	}
}

func TestInvertedIndex_Next_NotFound(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	if _, err := index.Next("elephant", BOFDocument); err != ErrNoPostingList {
		t.Errorf("Next() error = %v, want %v", err, ErrNoPostingList)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// PREVIOUS OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// Previous from the end-of-file sentinel should behave like Last.
func TestInvertedIndex_Previous_FromEnd(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, err := index.Previous("fox", EOFDocument)
	if err != nil {
		t.Fatalf("Previous() error = %v, want nil", err)
	}
	if got.GetDocumentID() != 1 || got.GetOffset() != 2 {
		t.Errorf("Previous() = Doc%d:Pos%d, want Doc1:Pos2",
			got.GetDocumentID(), got.GetOffset())
	}
}

// Walking backward with Previous visits postings in reverse document order
// and finally yields the BOF sentinel.
func TestInvertedIndex_Previous_MultipleOccurrences(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")
	index.Index(2, "quick dog")
	index.Index(3, "lazy quick")

	third, _ := index.Previous("quick", EOFDocument)
	if third.GetDocumentID() != 3 {
		t.Errorf("Last occurrence in Doc%d, want Doc3", third.GetDocumentID())
	}

	second, _ := index.Previous("quick", third)
	if second.GetDocumentID() != 2 {
		t.Errorf("Second-to-last occurrence in Doc%d, want Doc2", second.GetDocumentID())
	}

	first, _ := index.Previous("quick", second)
	if first.GetDocumentID() != 1 {
		t.Errorf("First occurrence in Doc%d, want Doc1", first.GetDocumentID())
	}

	begin, _ := index.Previous("quick", first)
	if !begin.IsBeginning() {
		t.Error("Previous() should return BOF before first occurrence")
	}
}

func TestInvertedIndex_Previous_FromBOF(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	got, _ := index.Previous("quick", BOFDocument)
	if !got.IsBeginning() {
		t.Error("Previous() from BOF should return BOF")
	}
}

func TestInvertedIndex_Previous_NotFound(t *testing.T) {
	index := NewInvertedIndex()
	index.Index(1, "quick brown fox")

	if _, err := index.Previous("elephant", EOFDocument); err != ErrNoPostingList {
		t.Errorf("Previous() error = %v, want %v", err, ErrNoPostingList)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// INTEGRATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

func TestInvertedIndex_ComplexScenario(t *testing.T) {
	idx := NewInvertedIndex()

	// Index multiple documents with overlapping vocabulary
	idx.Index(1, "the quick brown fox jumps over the lazy dog")
idx.Index(2, "the lazy brown dog sleeps") 377 | idx.Index(3, "quick brown foxes are clever") 378 | 379 | // Test 1: Verify "brown" appears in all three documents 380 | brownDocs := []int{} 381 | pos, _ := idx.First("brown") 382 | brownDocs = append(brownDocs, pos.GetDocumentID()) 383 | 384 | for !pos.IsEnd() { 385 | pos, _ = idx.Next("brown", pos) 386 | if !pos.IsEnd() { 387 | brownDocs = append(brownDocs, pos.GetDocumentID()) 388 | } 389 | } 390 | 391 | expectedDocs := []int{1, 2, 3} 392 | if len(brownDocs) != len(expectedDocs) { 393 | t.Errorf("Found 'brown' in %d documents, want %d", len(brownDocs), len(expectedDocs)) 394 | } 395 | 396 | for i, docID := range brownDocs { 397 | if docID != expectedDocs[i] { 398 | t.Errorf("Document %d: got Doc%d, want Doc%d", i, docID, expectedDocs[i]) 399 | } 400 | } 401 | 402 | // Test 2: Verify "quick" only appears in Doc1 and Doc3 403 | quickDocs := []int{} 404 | pos, _ = idx.First("quick") 405 | quickDocs = append(quickDocs, pos.GetDocumentID()) 406 | 407 | pos, _ = idx.Next("quick", pos) 408 | if !pos.IsEnd() { 409 | quickDocs = append(quickDocs, pos.GetDocumentID()) 410 | } 411 | 412 | expectedQuickDocs := []int{1, 3} 413 | if len(quickDocs) != len(expectedQuickDocs) { 414 | t.Errorf("Found 'quick' in %d documents, want %d", len(quickDocs), len(expectedQuickDocs)) 415 | } 416 | } 417 | 418 | func TestInvertedIndex_PositionOrdering(t *testing.T) { 419 | idx := NewInvertedIndex() 420 | 421 | // Index document where same word appears multiple times 422 | idx.Index(1, "fox fox fox") 423 | 424 | // Get all positions 425 | var positions []int 426 | pos, _ := idx.First("fox") 427 | positions = append(positions, pos.GetOffset()) 428 | 429 | for !pos.IsEnd() { 430 | pos, _ = idx.Next("fox", pos) 431 | if !pos.IsEnd() { 432 | positions = append(positions, pos.GetOffset()) 433 | } 434 | } 435 | 436 | // Verify positions are in order: 0, 1, 2 437 | expected := []int{0, 1, 2} 438 | if len(positions) != len(expected) { 439 | 
t.Fatalf("Found %d positions, want %d", len(positions), len(expected)) 440 | } 441 | 442 | for i, offset := range positions { 443 | if offset != expected[i] { 444 | t.Errorf("Position %d: offset = %d, want %d", i, offset, expected[i]) 445 | } 446 | } 447 | } 448 | 449 | // ═══════════════════════════════════════════════════════════════════════════════ 450 | // CONCURRENCY TESTS 451 | // ═══════════════════════════════════════════════════════════════════════════════ 452 | 453 | func TestInvertedIndex_ConcurrentIndexing(t *testing.T) { 454 | idx := NewInvertedIndex() 455 | 456 | // Index documents concurrently 457 | done := make(chan bool, 3) 458 | 459 | go func() { 460 | idx.Index(1, "quick brown fox") 461 | done <- true 462 | }() 463 | 464 | go func() { 465 | idx.Index(2, "sleepy dog") 466 | done <- true 467 | }() 468 | 469 | go func() { 470 | idx.Index(3, "quick brown cats") 471 | done <- true 472 | }() 473 | 474 | // Wait for all goroutines to complete 475 | <-done 476 | <-done 477 | <-done 478 | 479 | // Verify all documents were indexed (checking stemmed tokens) 480 | tokens := []string{"quick", "brown", "fox", "sleepi", "dog", "cat"} 481 | for _, token := range tokens { 482 | if _, exists := idx.PostingsList[token]; !exists { 483 | t.Errorf("Token %q was not indexed (concurrent indexing issue)", token) 484 | } 485 | } 486 | } 487 | -------------------------------------------------------------------------------- /query_test.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/RoaringBitmap/roaring" 7 | ) 8 | 9 | // ═══════════════════════════════════════════════════════════════════════════════ 10 | // QUERY BUILDER TESTS 11 | // ═══════════════════════════════════════════════════════════════════════════════ 12 | 13 | // setupTestIndex creates a test index with sample documents 14 | func setupTestIndex() *InvertedIndex { 15 | idx := NewInvertedIndex() 16 | 17 | 
// Document 1: "machine learning is fun" 18 | idx.Index(1, "machine learning is fun") 19 | 20 | // Document 2: "deep learning and machine learning" 21 | idx.Index(2, "deep learning and machine learning") 22 | 23 | // Document 3: "python programming is great" 24 | idx.Index(3, "python programming is great") 25 | 26 | // Document 4: "machine learning with python" 27 | idx.Index(4, "machine learning with python") 28 | 29 | // Document 5: "cats and dogs are pets" 30 | idx.Index(5, "cats and dogs are pets") 31 | 32 | return idx 33 | } 34 | 35 | // TestQueryBuilder_SingleTerm tests querying for a single term 36 | func TestQueryBuilder_SingleTerm(t *testing.T) { 37 | idx := setupTestIndex() 38 | 39 | // Query: Find documents with "machine" 40 | results := NewQueryBuilder(idx). 41 | Term("machine"). 42 | Execute() 43 | 44 | // Should match docs 1, 2, 4 45 | expected := []int{1, 2, 4} 46 | actual := bitmapToSlice(results) 47 | 48 | if !slicesEqual(actual, expected) { 49 | t.Errorf("Expected docs %v, got %v", expected, actual) 50 | } 51 | } 52 | 53 | // TestQueryBuilder_And tests AND operation 54 | func TestQueryBuilder_And(t *testing.T) { 55 | idx := setupTestIndex() 56 | 57 | // Query: Find documents with "machine" AND "python" 58 | results := NewQueryBuilder(idx). 59 | Term("machine"). 60 | And(). 61 | Term("python"). 62 | Execute() 63 | 64 | // Should match only doc 4 65 | expected := []int{4} 66 | actual := bitmapToSlice(results) 67 | 68 | if !slicesEqual(actual, expected) { 69 | t.Errorf("Expected docs %v, got %v", expected, actual) 70 | } 71 | } 72 | 73 | // TestQueryBuilder_Or tests OR operation 74 | func TestQueryBuilder_Or(t *testing.T) { 75 | idx := setupTestIndex() 76 | 77 | // Query: Find documents with "cats" OR "dogs" 78 | results := NewQueryBuilder(idx). 79 | Term("cats"). 80 | Or(). 81 | Term("dogs"). 
82 | Execute() 83 | 84 | // Should match doc 5 (which contains both) 85 | expected := []int{5} 86 | actual := bitmapToSlice(results) 87 | 88 | if !slicesEqual(actual, expected) { 89 | t.Errorf("Expected docs %v, got %v", expected, actual) 90 | } 91 | } 92 | 93 | // TestQueryBuilder_Not tests NOT operation 94 | func TestQueryBuilder_Not(t *testing.T) { 95 | idx := setupTestIndex() 96 | 97 | // Query: Find documents with "learning" but NOT "deep" 98 | results := NewQueryBuilder(idx). 99 | Term("learning"). 100 | And().Not(). 101 | Term("deep"). 102 | Execute() 103 | 104 | // Should match docs 1, 4 (not 2, which has "deep") 105 | expected := []int{1, 4} 106 | actual := bitmapToSlice(results) 107 | 108 | if !slicesEqual(actual, expected) { 109 | t.Errorf("Expected docs %v, got %v", expected, actual) 110 | } 111 | } 112 | 113 | // TestQueryBuilder_ComplexQuery tests a complex boolean query 114 | func TestQueryBuilder_ComplexQuery(t *testing.T) { 115 | idx := setupTestIndex() 116 | 117 | // Query: (machine OR python) AND learning 118 | results := NewQueryBuilder(idx). 119 | Group(func(q *QueryBuilder) { 120 | q.Term("machine").Or().Term("python") 121 | }). 122 | And(). 123 | Term("learning"). 124 | Execute() 125 | 126 | // Should match docs 1, 2, 4 127 | // Doc 1: has machine and learning 128 | // Doc 2: has machine and learning 129 | // Doc 3: has python but no learning 130 | // Doc 4: has machine, python, and learning 131 | expected := []int{1, 2, 4} 132 | actual := bitmapToSlice(results) 133 | 134 | if !slicesEqual(actual, expected) { 135 | t.Errorf("Expected docs %v, got %v", expected, actual) 136 | } 137 | } 138 | 139 | // TestQueryBuilder_Phrase tests phrase query 140 | func TestQueryBuilder_Phrase(t *testing.T) { 141 | idx := setupTestIndex() 142 | 143 | // Query: Find exact phrase "machine learning" 144 | results := NewQueryBuilder(idx). 145 | Phrase("machine learning"). 
146 | Execute() 147 | 148 | // Should match docs 1, 2, 4 149 | expected := []int{1, 2, 4} 150 | actual := bitmapToSlice(results) 151 | 152 | if !slicesEqual(actual, expected) { 153 | t.Errorf("Expected docs %v, got %v", expected, actual) 154 | } 155 | } 156 | 157 | // TestQueryBuilder_PhraseWithBoolean tests combining phrase and boolean 158 | func TestQueryBuilder_PhraseWithBoolean(t *testing.T) { 159 | idx := setupTestIndex() 160 | 161 | // Query: "machine learning" AND python 162 | results := NewQueryBuilder(idx). 163 | Phrase("machine learning"). 164 | And(). 165 | Term("python"). 166 | Execute() 167 | 168 | // Should match only doc 4 169 | expected := []int{4} 170 | actual := bitmapToSlice(results) 171 | 172 | if !slicesEqual(actual, expected) { 173 | t.Errorf("Expected docs %v, got %v", expected, actual) 174 | } 175 | } 176 | 177 | // TestQueryBuilder_ExecuteWithBM25 tests BM25 scoring 178 | func TestQueryBuilder_ExecuteWithBM25(t *testing.T) { 179 | idx := setupTestIndex() 180 | 181 | // Query: machine AND learning (with BM25 scoring) 182 | results := NewQueryBuilder(idx). 183 | Term("machine"). 184 | And(). 185 | Term("learning"). 
186 | ExecuteWithBM25(10) 187 | 188 | // Should return docs with positive scores 189 | if len(results) == 0 { 190 | t.Error("Expected BM25 results, got none") 191 | } 192 | 193 | // All results should have positive scores 194 | for _, match := range results { 195 | if match.Score <= 0 { 196 | t.Errorf("Expected positive score, got %f", match.Score) 197 | } 198 | } 199 | 200 | // Results should be sorted by score (descending) 201 | for i := 1; i < len(results); i++ { 202 | if results[i].Score > results[i-1].Score { 203 | t.Errorf("Results not sorted: score[%d]=%.2f > score[%d]=%.2f", 204 | i, results[i].Score, i-1, results[i-1].Score) 205 | } 206 | } 207 | } 208 | 209 | // TestQueryBuilder_EmptyQuery tests empty query 210 | func TestQueryBuilder_EmptyQuery(t *testing.T) { 211 | idx := setupTestIndex() 212 | 213 | // Empty query should return no results 214 | results := NewQueryBuilder(idx).Execute() 215 | 216 | if results.GetCardinality() != 0 { 217 | t.Errorf("Expected 0 results for empty query, got %d", results.GetCardinality()) 218 | } 219 | } 220 | 221 | // TestQueryBuilder_NonExistentTerm tests querying for non-existent term 222 | func TestQueryBuilder_NonExistentTerm(t *testing.T) { 223 | idx := setupTestIndex() 224 | 225 | // Query for a term that doesn't exist 226 | results := NewQueryBuilder(idx). 227 | Term("quantum"). 228 | Execute() 229 | 230 | if results.GetCardinality() != 0 { 231 | t.Errorf("Expected 0 results for non-existent term, got %d", results.GetCardinality()) 232 | } 233 | } 234 | 235 | // TestQueryBuilder_MultipleAnds tests chaining multiple AND operations 236 | func TestQueryBuilder_MultipleAnds(t *testing.T) { 237 | idx := setupTestIndex() 238 | 239 | // Query: machine AND learning AND python 240 | results := NewQueryBuilder(idx). 241 | Term("machine"). 242 | And().Term("learning"). 243 | And().Term("python"). 
244 | Execute() 245 | 246 | // Should match only doc 4 247 | expected := []int{4} 248 | actual := bitmapToSlice(results) 249 | 250 | if !slicesEqual(actual, expected) { 251 | t.Errorf("Expected docs %v, got %v", expected, actual) 252 | } 253 | } 254 | 255 | // TestQueryBuilder_MultipleOrs tests chaining multiple OR operations 256 | func TestQueryBuilder_MultipleOrs(t *testing.T) { 257 | idx := setupTestIndex() 258 | 259 | // Query: cats OR dogs OR pets 260 | results := NewQueryBuilder(idx). 261 | Term("cats"). 262 | Or().Term("dogs"). 263 | Or().Term("pets"). 264 | Execute() 265 | 266 | // Should match doc 5 267 | expected := []int{5} 268 | actual := bitmapToSlice(results) 269 | 270 | if !slicesEqual(actual, expected) { 271 | t.Errorf("Expected docs %v, got %v", expected, actual) 272 | } 273 | } 274 | 275 | // TestQueryBuilder_NestedGroups tests nested group operations 276 | func TestQueryBuilder_NestedGroups(t *testing.T) { 277 | idx := setupTestIndex() 278 | 279 | // Query: ((machine OR deep) AND learning) AND NOT python 280 | results := NewQueryBuilder(idx). 281 | Group(func(q *QueryBuilder) { 282 | q.Group(func(qq *QueryBuilder) { 283 | qq.Term("machine").Or().Term("deep") 284 | }).And().Term("learning") 285 | }). 286 | And().Not().Term("python"). 
287 | Execute() 288 | 289 | // Should match docs 1, 2 (not 4 which has python) 290 | expected := []int{1, 2} 291 | actual := bitmapToSlice(results) 292 | 293 | if !slicesEqual(actual, expected) { 294 | t.Errorf("Expected docs %v, got %v", expected, actual) 295 | } 296 | } 297 | 298 | // ═══════════════════════════════════════════════════════════════════════════════ 299 | // CONVENIENCE FUNCTION TESTS 300 | // ═══════════════════════════════════════════════════════════════════════════════ 301 | 302 | // TestAllOf tests AllOf convenience function 303 | func TestAllOf(t *testing.T) { 304 | idx := setupTestIndex() 305 | 306 | // Find docs with machine, learning, and python 307 | results := AllOf(idx, "machine", "learning", "python") 308 | 309 | expected := []int{4} 310 | actual := bitmapToSlice(results) 311 | 312 | if !slicesEqual(actual, expected) { 313 | t.Errorf("Expected docs %v, got %v", expected, actual) 314 | } 315 | } 316 | 317 | // TestAnyOf tests AnyOf convenience function 318 | func TestAnyOf(t *testing.T) { 319 | idx := setupTestIndex() 320 | 321 | // Find docs with cats, dogs, or python 322 | results := AnyOf(idx, "cats", "dogs", "python") 323 | 324 | // Should match docs 3, 4, 5 325 | expected := []int{3, 4, 5} 326 | actual := bitmapToSlice(results) 327 | 328 | if !slicesEqual(actual, expected) { 329 | t.Errorf("Expected docs %v, got %v", expected, actual) 330 | } 331 | } 332 | 333 | // TestTermExcluding tests TermExcluding convenience function 334 | func TestTermExcluding(t *testing.T) { 335 | idx := setupTestIndex() 336 | 337 | // Find docs with "learning" but not "deep" 338 | results := TermExcluding(idx, "learning", "deep") 339 | 340 | expected := []int{1, 4} 341 | actual := bitmapToSlice(results) 342 | 343 | if !slicesEqual(actual, expected) { 344 | t.Errorf("Expected docs %v, got %v", expected, actual) 345 | } 346 | } 347 | 348 | // TestAllOf_EmptyTerms tests AllOf with no terms 349 | func TestAllOf_EmptyTerms(t *testing.T) { 350 | idx := 
setupTestIndex() 351 | 352 | results := AllOf(idx) 353 | 354 | if results.GetCardinality() != 0 { 355 | t.Errorf("Expected 0 results for empty AllOf, got %d", results.GetCardinality()) 356 | } 357 | } 358 | 359 | // TestAnyOf_EmptyTerms tests AnyOf with no terms 360 | func TestAnyOf_EmptyTerms(t *testing.T) { 361 | idx := setupTestIndex() 362 | 363 | results := AnyOf(idx) 364 | 365 | if results.GetCardinality() != 0 { 366 | t.Errorf("Expected 0 results for empty AnyOf, got %d", results.GetCardinality()) 367 | } 368 | } 369 | 370 | // ═══════════════════════════════════════════════════════════════════════════════ 371 | // REAL-WORLD QUERY PATTERNS 372 | // ═══════════════════════════════════════════════════════════════════════════════ 373 | 374 | // TestQueryBuilder_SearchEnginePattern tests a typical search engine query 375 | func TestQueryBuilder_SearchEnginePattern(t *testing.T) { 376 | idx := setupTestIndex() 377 | 378 | // Typical search: "machine learning" (phrase) OR just "python" 379 | results := NewQueryBuilder(idx). 380 | Phrase("machine learning"). 381 | Or(). 382 | Term("python"). 383 | Execute() 384 | 385 | // Should match docs 1, 2, 3, 4 386 | expected := []int{1, 2, 3, 4} 387 | actual := bitmapToSlice(results) 388 | 389 | if !slicesEqual(actual, expected) { 390 | t.Errorf("Expected docs %v, got %v", expected, actual) 391 | } 392 | } 393 | 394 | // TestQueryBuilder_FilteringPattern tests filtering unwanted content 395 | func TestQueryBuilder_FilteringPattern(t *testing.T) { 396 | idx := setupTestIndex() 397 | 398 | // Find programming content but exclude python 399 | results := NewQueryBuilder(idx). 400 | Term("programming"). 401 | And().Not(). 402 | Term("python"). 
403 | Execute() 404 | 405 | // Should return no results (all programming docs have python) 406 | if results.GetCardinality() != 0 { 407 | t.Errorf("Expected 0 results, got %d", results.GetCardinality()) 408 | } 409 | } 410 | 411 | // TestQueryBuilder_CategoryPattern tests category-based search 412 | func TestQueryBuilder_CategoryPattern(t *testing.T) { 413 | idx := setupTestIndex() 414 | 415 | // Find AI/ML docs: (machine OR deep) AND learning 416 | results := NewQueryBuilder(idx). 417 | Group(func(q *QueryBuilder) { 418 | q.Term("machine").Or().Term("deep") 419 | }). 420 | And(). 421 | Term("learning"). 422 | Execute() 423 | 424 | expected := []int{1, 2, 4} 425 | actual := bitmapToSlice(results) 426 | 427 | if !slicesEqual(actual, expected) { 428 | t.Errorf("Expected docs %v, got %v", expected, actual) 429 | } 430 | } 431 | 432 | // ═══════════════════════════════════════════════════════════════════════════════ 433 | // PERFORMANCE TESTS 434 | // ═══════════════════════════════════════════════════════════════════════════════ 435 | 436 | // BenchmarkQueryBuilder_Simple benchmarks simple query 437 | func BenchmarkQueryBuilder_Simple(b *testing.B) { 438 | idx := setupTestIndex() 439 | 440 | b.ResetTimer() 441 | for i := 0; i < b.N; i++ { 442 | NewQueryBuilder(idx). 443 | Term("machine"). 444 | And(). 445 | Term("learning"). 446 | Execute() 447 | } 448 | } 449 | 450 | // BenchmarkQueryBuilder_Complex benchmarks complex query 451 | func BenchmarkQueryBuilder_Complex(b *testing.B) { 452 | idx := setupTestIndex() 453 | 454 | b.ResetTimer() 455 | for i := 0; i < b.N; i++ { 456 | NewQueryBuilder(idx). 457 | Group(func(q *QueryBuilder) { 458 | q.Term("machine").Or().Term("deep") 459 | }). 460 | And(). 461 | Term("learning"). 462 | And().Not(). 463 | Term("python"). 
464 | Execute() 465 | } 466 | } 467 | 468 | // BenchmarkQueryBuilder_WithBM25 benchmarks query with BM25 scoring 469 | func BenchmarkQueryBuilder_WithBM25(b *testing.B) { 470 | idx := setupTestIndex() 471 | 472 | b.ResetTimer() 473 | for i := 0; i < b.N; i++ { 474 | NewQueryBuilder(idx). 475 | Term("machine"). 476 | And(). 477 | Term("learning"). 478 | ExecuteWithBM25(10) 479 | } 480 | } 481 | 482 | // ═══════════════════════════════════════════════════════════════════════════════ 483 | // HELPER FUNCTIONS 484 | // ═══════════════════════════════════════════════════════════════════════════════ 485 | 486 | // bitmapToSlice converts a roaring bitmap to a sorted slice of ints 487 | func bitmapToSlice(bitmap *roaring.Bitmap) []int { 488 | if bitmap == nil { 489 | return []int{} 490 | } 491 | 492 | result := make([]int, 0, bitmap.GetCardinality()) 493 | iter := bitmap.Iterator() 494 | for iter.HasNext() { 495 | result = append(result, int(iter.Next())) 496 | } 497 | return result 498 | } 499 | 500 | // slicesEqual checks if two slices are equal 501 | func slicesEqual(a, b []int) bool { 502 | if len(a) != len(b) { 503 | return false 504 | } 505 | for i := range a { 506 | if a[i] != b[i] { 507 | return false 508 | } 509 | } 510 | return true 511 | } 512 | -------------------------------------------------------------------------------- /index.go: -------------------------------------------------------------------------------- 1 | // Package index implements an inverted index for full-text search 2 | // 3 | // ═══════════════════════════════════════════════════════════════════════════════ 4 | // WHAT IS AN INVERTED INDEX? 5 | // ═══════════════════════════════════════════════════════════════════════════════ 6 | // An inverted index is like the index at the back of a book, but for search engines. 
7 | // 8 | // Example: Given these documents: 9 | // Doc 1: "the quick brown fox" 10 | // Doc 2: "the lazy dog" 11 | // Doc 3: "quick brown dogs" 12 | // 13 | // The inverted index would look like: 14 | // "quick" → [Doc1:Pos1, Doc3:Pos0] 15 | // "brown" → [Doc1:Pos2, Doc3:Pos1] 16 | // "fox" → [Doc1:Pos3] 17 | // "lazy" → [Doc2:Pos1] 18 | // "dog" → [Doc2:Pos2] 19 | // "dogs" → [Doc3:Pos2] 20 | // 21 | // This allows us to: 22 | // 1. Find documents containing a word instantly (without scanning all docs) 23 | // 2. Find phrases by checking if word positions are consecutive 24 | // 3. Rank results by how close words appear to each other (proximity) 25 | // 26 | // ═══════════════════════════════════════════════════════════════════════════════ 27 | 28 | package blaze 29 | 30 | import ( 31 | "errors" 32 | "log/slog" 33 | "sync" 34 | 35 | "github.com/RoaringBitmap/roaring" 36 | ) 37 | 38 | // ═══════════════════════════════════════════════════════════════════════════════ 39 | // ERROR DEFINITIONS 40 | // ═══════════════════════════════════════════════════════════════════════════════ 41 | // We define errors as package-level variables so they can be compared with == 42 | // This is a Go best practice for error handling. 43 | var ( 44 | ErrNoPostingList = errors.New("no posting list exists for token") 45 | ErrNoNextElement = errors.New("no next element found") 46 | ErrNoPrevElement = errors.New("no previous element found") 47 | ) 48 | 49 | // ═══════════════════════════════════════════════════════════════════════════════ 50 | // BM25 RANKING SYSTEM 51 | // ═══════════════════════════════════════════════════════════════════════════════ 52 | // BM25 (Best Matching 25) is a ranking function used by search engines to estimate 53 | // the relevance of documents to a given search query. 54 | // 55 | // WHY BM25? 56 | // --------- 57 | // 1. Industry standard: Used by Elasticsearch, Solr, Lucene 58 | // 2. 
Accounts for document length (longer docs don't unfairly rank higher) 59 | // 3. Accounts for term frequency saturation (10 vs 100 occurrences matter less) 60 | // 4. Accounts for term rarity (rare terms are more significant) 61 | // 62 | // BM25 FORMULA: 63 | // ------------- 64 | // For each term in the query: 65 | // score += IDF(term) * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * (docLen / avgDocLen))) 66 | // 67 | // Where: 68 | // IDF = Inverse Document Frequency (how rare is this term?) 69 | // TF = Term Frequency (how often does term appear in this doc?) 70 | // k1 = Term frequency saturation parameter (typically 1.2-2.0) 71 | // b = Length normalization parameter (typically 0.75) 72 | // docLen = Length of this document 73 | // avgDocLen = Average document length in the corpus 74 | // 75 | // EXAMPLE: 76 | // -------- 77 | // Query: "machine learning" 78 | // Doc A: 100 words, contains "machine" 3 times, "learning" 2 times 79 | // Doc B: 500 words, contains "machine" 5 times, "learning" 8 times 80 | // 81 | // Despite Doc B having more occurrences, Doc A might score higher because: 82 | // 1. Doc A is shorter (length normalization) 83 | // 2. 
The density of query terms is higher in Doc A 84 | // ═══════════════════════════════════════════════════════════════════════════════ 85 | 86 | // BM25Parameters holds the tuning parameters for BM25 algorithm 87 | type BM25Parameters struct { 88 | K1 float64 // Term frequency saturation (typical: 1.2-2.0) 89 | B float64 // Length normalization (typical: 0.75) 90 | } 91 | 92 | // DefaultBM25Parameters returns the standard BM25 parameters 93 | func DefaultBM25Parameters() BM25Parameters { 94 | return BM25Parameters{ 95 | K1: 1.5, // Moderate term frequency saturation 96 | B: 0.75, // Standard length normalization 97 | } 98 | } 99 | 100 | // DocumentStats stores statistics about a single document 101 | type DocumentStats struct { 102 | DocID int // Document identifier 103 | Length int // Number of terms in the document 104 | TermFreqs map[string]int // How many times each term appears 105 | } 106 | 107 | // ═══════════════════════════════════════════════════════════════════════════════ 108 | // CORE DATA STRUCTURE: InvertedIndex with HYBRID STORAGE 109 | // ═══════════════════════════════════════════════════════════════════════════════ 110 | // The InvertedIndex uses a hybrid approach for maximum efficiency: 111 | // 112 | // Architecture: 113 | // 114 | // InvertedIndex 115 | // ├── DocBitmaps: map[string]*roaring.Bitmap (DOCUMENT-LEVEL) 116 | // │ ├── "quick" → Bitmap of document IDs [1, 3, 5, ...] 117 | // │ ├── "brown" → Bitmap of document IDs [1, 2, 7, ...] 118 | // │ └── "fox" → Bitmap of document IDs [3, 5, ...] 119 | // ├── PostingsList: map[string]SkipList (POSITION-LEVEL) 120 | // │ ├── "quick" → SkipList of exact positions 121 | // │ ├── "brown" → SkipList of exact positions 122 | // │ └── "fox" → SkipList of exact positions 123 | // └── mu: mutex for thread safety 124 | // 125 | // Why Hybrid Storage? 
126 | // - Roaring Bitmaps: Lightning-fast for document-level operations (AND, OR, NOT) 127 | // 10-100x memory compression, O(1) boolean operations 128 | // - Skip Lists: Essential for position-based queries (phrases, proximity) 129 | // 130 | // This gives us the best of both worlds! 131 | // ═══════════════════════════════════════════════════════════════════════════════ 132 | type InvertedIndex struct { 133 | mu sync.Mutex // Protects against concurrent access 134 | 135 | // DOCUMENT-LEVEL STORAGE (for fast document lookups and boolean queries) 136 | DocBitmaps map[string]*roaring.Bitmap // Term → Bitmap of document IDs 137 | 138 | // POSITION-LEVEL STORAGE (for phrase search, proximity) 139 | PostingsList map[string]SkipList // Term → Positions 140 | 141 | // =============================== 142 | // BM25 INDEXING DATA STRUCTURES 143 | // =============================== 144 | DocStats map[int]DocumentStats // DocID → statistics 145 | TotalDocs int // Total number of indexed documents 146 | TotalTerms int64 // Total number of terms across all docs 147 | BM25Params BM25Parameters // BM25 tuning parameters 148 | } 149 | 150 | // NewInvertedIndex creates a new empty inverted index with hybrid storage and BM25 support 151 | func NewInvertedIndex() *InvertedIndex { 152 | return &InvertedIndex{ 153 | DocBitmaps: make(map[string]*roaring.Bitmap), // Initialize document-level bitmaps 154 | PostingsList: make(map[string]SkipList), // Initialize position-level skip lists 155 | DocStats: make(map[int]DocumentStats), 156 | TotalDocs: 0, 157 | TotalTerms: 0, 158 | BM25Params: DefaultBM25Parameters(), 159 | } 160 | } 161 | 162 | // ═══════════════════════════════════════════════════════════════════════════════ 163 | // INDEXING: Building the Search Index 164 | // ═══════════════════════════════════════════════════════════════════════════════ 165 | 166 | // Index adds a document to the inverted index 167 | // 168 | // STEP-BY-STEP EXAMPLE: 169 | // ---------------------- 170 | 
// Input: docID=1, document="The quick brown fox" 171 | // 172 | // Step 1: Tokenization 173 | // 174 | // analyzer.Analyze() converts to: ["quick", "brown", "fox"] 175 | // (Note: "The" is removed as a stop word, and words are lowercased) 176 | // 177 | // Step 2: For each token, record its position 178 | // 179 | // Token "quick" at position 0 in document 1 180 | // Token "brown" at position 1 in document 1 181 | // Token "fox" at position 2 in document 1 182 | // 183 | // Step 3: Update the index 184 | // 185 | // PostingsList["quick"] ← add Position{DocID:1, Offset:0} 186 | // PostingsList["brown"] ← add Position{DocID:1, Offset:1} 187 | // PostingsList["fox"] ← add Position{DocID:1, Offset:2} 188 | // 189 | // Why record positions and not just document IDs? 190 | // - Positions let us do phrase search ("brown fox" requires consecutive positions) 191 | // - Positions let us rank by proximity (closer words = more relevant) 192 | // 193 | // Thread Safety Note: 194 | // - We lock the entire indexing operation to prevent race conditions 195 | // - If we didn't lock, two goroutines could corrupt the data structure 196 | // ═══════════════════════════════════════════════════════════════════════════════ 197 | // BM25 INDEXING 198 | // ═══════════════════════════════════════════════════════════════════════════════ 199 | // Index also enriches the index with BM25 statistics 200 | // 201 | // WHAT'S DIFFERENT WITH BM25: 202 | // --------------------------- 203 | // In addition to building the inverted index, we now track: 204 | // 1. Document length (number of terms) 205 | // 2. Term frequencies per document (how many times each term appears) 206 | // 3. Total number of documents (for IDF calculation) 207 | // 4. Total number of terms (for average document length) 208 | // 209 | // This metadata enables BM25 scoring later during search. 
210 | func (idx *InvertedIndex) Index(docID int, document string) { 211 | idx.mu.Lock() // Acquire lock - only one goroutine can index at a time 212 | defer idx.mu.Unlock() // Release lock when function returns (even if it panics) 213 | 214 | slog.Info("indexing document", slog.Int("docID", docID)) 215 | 216 | // STEP 1: Break document into searchable tokens 217 | // Example: "The Quick Brown Fox!" → ["quick", "brown", "fox"] 218 | tokens := Analyze(document) 219 | 220 | // STEP 2: Initialize document statistics 221 | docStats := DocumentStats{ 222 | DocID: docID, 223 | Length: len(tokens), 224 | TermFreqs: make(map[string]int), 225 | } 226 | 227 | // STEP 3: Index each token and track term frequencies 228 | for position, token := range tokens { 229 | idx.indexToken(token, docID, position) 230 | docStats.TermFreqs[token]++ 231 | } 232 | 233 | // STEP 4: Update global statistics 234 | idx.DocStats[docID] = docStats 235 | idx.TotalDocs++ 236 | idx.TotalTerms += int64(len(tokens)) 237 | } 238 | 239 | // indexToken adds a single token occurrence to the index (HYBRID STORAGE) 240 | // 241 | // HOW IT WORKS: 242 | // ------------- 243 | // 1. Update Roaring Bitmap (document-level) 244 | // - Set the bit for this document ID 245 | // - Enables fast document lookups and boolean operations 246 | // - Compressed storage (10-100x smaller than skip lists alone) 247 | // 248 | // 2. Update Skip List (position-level) 249 | // - Insert exact position (docID, offset) 250 | // - Enables phrase search and proximity ranking 251 | // - Maintains all position information 252 | // 253 | // 3. Best of both worlds! 
254 | // - Fast document queries via bitmaps 255 | // - Detailed position queries via skip lists 256 | // 257 | // DocumentID and Offset are stored as ints 258 | // - The SkipList uses sentinel values (BOF=MinInt, EOF=MaxInt) to mark boundaries 259 | // - All position values are integers (no casting needed) 260 | func (idx *InvertedIndex) indexToken(token string, docID, position int) { 261 | // STEP 1: Update roaring bitmap (document-level) 262 | // Create bitmap if this is the first time seeing this token 263 | if idx.DocBitmaps[token] == nil { 264 | idx.DocBitmaps[token] = roaring.NewBitmap() 265 | } 266 | // Set the bit for this document ID 267 | idx.DocBitmaps[token].Add(uint32(docID)) 268 | 269 | // STEP 2: Update skip list (position-level) 270 | // Check if this token already has a posting list 271 | skipList, exists := idx.getPostingList(token) 272 | if !exists { 273 | // First time seeing this token - create a new SkipList 274 | skipList = *NewSkipList() 275 | } 276 | 277 | // Add this occurrence to the token's posting list 278 | skipList.Insert(Position{ 279 | DocumentID: docID, // Which document? 280 | Offset: position, // Where in the document? 281 | }) 282 | 283 | // Save the updated SkipList back to the map 284 | // (In Go, maps don't update automatically when you modify a struct value) 285 | idx.PostingsList[token] = skipList 286 | } 287 | 288 | // getPostingList retrieves the posting list for a token 289 | // 290 | // This is a simple helper to avoid repeating map lookup code. 291 | // Returns (skipList, true) if found, (empty, false) if not found. 
292 | func (idx *InvertedIndex) getPostingList(token string) (SkipList, bool) { 293 | skipList, exists := idx.PostingsList[token] 294 | return skipList, exists 295 | } 296 | 297 | // ═══════════════════════════════════════════════════════════════════════════════ 298 | // BASIC SEARCH OPERATIONS 299 | // ═══════════════════════════════════════════════════════════════════════════════ 300 | // These four methods (First, Last, Next, Previous) form the foundation of 301 | // all search operations. Everything else is built on top of these primitives. 302 | // 303 | // Think of them like iterator operations: 304 | // - First: Go to the beginning 305 | // - Last: Go to the end 306 | // - Next: Move forward 307 | // - Previous: Move backward 308 | // ═══════════════════════════════════════════════════════════════════════════════ 309 | 310 | // First returns the first occurrence of a token in the index 311 | // 312 | // EXAMPLE: 313 | // -------- 314 | // Given: "quick" appears at [Doc1:Pos1, Doc3:Pos0, Doc5:Pos2] 315 | // First("quick") returns Doc3:Pos0 (the earliest occurrence) 316 | // 317 | // Use case: Start searching for a token from the beginning 318 | func (idx *InvertedIndex) First(token string) (Position, error) { 319 | skipList, exists := idx.getPostingList(token) 320 | if !exists { 321 | return EOFDocument, ErrNoPostingList 322 | } 323 | 324 | // The first position is at the bottom level (level 0) of the SkipList 325 | // The Head node points to the first real node via Tower[0] 326 | return skipList.Head.Tower[0].Key, nil 327 | } 328 | 329 | // Last returns the last occurrence of a token in the index 330 | // 331 | // EXAMPLE: 332 | // -------- 333 | // Given: "quick" appears at [Doc1:Pos1, Doc3:Pos0, Doc5:Pos2] 334 | // Last("quick") returns Doc5:Pos2 (the latest occurrence) 335 | // 336 | // Use case: Search backwards from the end 337 | func (idx *InvertedIndex) Last(token string) (Position, error) { 338 | skipList, exists := idx.getPostingList(token) 339 | if 
!exists { 340 | return EOFDocument, ErrNoPostingList 341 | } 342 | 343 | // Traverse to the end of the SkipList 344 | return skipList.Last(), nil 345 | } 346 | 347 | // Next finds the next occurrence of a token after the given position 348 | // 349 | // EXAMPLE: 350 | // -------- 351 | // Given: "brown" appears at [Doc1:Pos2, Doc3:Pos1, Doc3:Pos5, Doc5:Pos0] 352 | // Next("brown", Doc3:Pos1) returns Doc3:Pos5 353 | // Next("brown", Doc3:Pos5) returns Doc5:Pos0 354 | // Next("brown", Doc5:Pos0) returns EOF (no more occurrences) 355 | // 356 | // Special cases: 357 | // - If currentPos is BOF (beginning of file), return First 358 | // - If currentPos is already EOF (end of file), stay at EOF 359 | // 360 | // Use case: Iterate through all occurrences of a word 361 | func (idx *InvertedIndex) Next(token string, currentPos Position) (Position, error) { 362 | // Special case: Starting from the beginning 363 | if currentPos.IsBeginning() { 364 | return idx.First(token) 365 | } 366 | 367 | // Special case: Already at the end 368 | if currentPos.IsEnd() { 369 | return EOFDocument, nil 370 | } 371 | 372 | // Get the posting list for this token 373 | skipList, exists := idx.getPostingList(token) 374 | if !exists { 375 | return EOFDocument, ErrNoPostingList 376 | } 377 | 378 | // Find the next position after currentPos in the SkipList 379 | // FindGreaterThan returns the smallest position > currentPos 380 | nextPos, _ := skipList.FindGreaterThan(currentPos) 381 | return nextPos, nil 382 | } 383 | 384 | // Previous finds the previous occurrence of a token before the given position 385 | // 386 | // EXAMPLE: 387 | // -------- 388 | // Given: "brown" appears at [Doc1:Pos2, Doc3:Pos1, Doc3:Pos5, Doc5:Pos0] 389 | // Previous("brown", Doc5:Pos0) returns Doc3:Pos5 390 | // Previous("brown", Doc3:Pos5) returns Doc3:Pos1 391 | // Previous("brown", Doc1:Pos2) returns BOF (no earlier occurrences) 392 | // 393 | // Use case: Search backwards through occurrences 394 | func (idx 
*InvertedIndex) Previous(token string, currentPos Position) (Position, error) { 395 | // Special case: Starting from the end 396 | if currentPos.IsEnd() { 397 | return idx.Last(token) 398 | } 399 | 400 | // Special case: Already at the beginning 401 | if currentPos.IsBeginning() { 402 | return BOFDocument, nil 403 | } 404 | 405 | // Get the posting list for this token 406 | skipList, exists := idx.getPostingList(token) 407 | if !exists { 408 | return BOFDocument, ErrNoPostingList 409 | } 410 | 411 | // Find the previous position before currentPos in the SkipList 412 | // FindLessThan returns the largest position < currentPos 413 | prevPos, _ := skipList.FindLessThan(currentPos) 414 | return prevPos, nil 415 | } 416 | -------------------------------------------------------------------------------- /analyzer.go: -------------------------------------------------------------------------------- 1 | // ═══════════════════════════════════════════════════════════════════════════════ 2 | // TEXT ANALYSIS OVERVIEW 3 | // ═══════════════════════════════════════════════════════════════════════════════ 4 | // Text analysis transforms raw text into searchable tokens through a multi-stage 5 | // pipeline. This process is crucial for effective full-text search. 6 | // 7 | // ANALYSIS PIPELINE: 8 | // ------------------ 9 | // 1. Tokenization → Split text into words 10 | // 2. Lowercasing → Normalize case ("Quick" → "quick") 11 | // 3. Stop word removal → Remove common words ("the", "a", etc.) 12 | // 4. Length filtering → Remove very short tokens (< 2 chars) 13 | // 5. Stemming → Reduce words to root form ("running" → "run") 14 | // 15 | // EXAMPLE TRANSFORMATION: 16 | // ----------------------- 17 | // Input: "The Quick Brown Fox Jumps!" 
18 | // Step 1: ["The", "Quick", "Brown", "Fox", "Jumps"] (tokenize) 19 | // Step 2: ["the", "quick", "brown", "fox", "jumps"] (lowercase) 20 | // Step 3: ["quick", "brown", "fox", "jumps"] (remove stopwords) 21 | // Step 4: ["quick", "brown", "fox", "jumps"] (length filter - all pass) 22 | // Step 5: ["quick", "brown", "fox", "jump"] (stemming) 23 | // 24 | // WHY THIS MATTERS: 25 | // ----------------- 26 | // Proper analysis ensures: 27 | // - "Running" matches "run", "runs", "ran" 28 | // - "The dog" matches "DOG" (case insensitive) 29 | // - Common words don't pollute the index 30 | // - Search results are relevant and accurate 31 | // ═══════════════════════════════════════════════════════════════════════════════ 32 | 33 | package blaze 34 | 35 | import ( 36 | "strings" 37 | "unicode" 38 | 39 | snowballeng "github.com/kljensen/snowball/english" 40 | ) 41 | 42 | // AnalyzerConfig holds configuration options for text analysis 43 | // 44 | // This allows customization of the analysis pipeline without modifying code. 45 | // Future enhancements could add language support, custom stopwords, etc. 46 | type AnalyzerConfig struct { 47 | MinTokenLength int // Minimum token length to keep (default: 2) 48 | EnableStemming bool // Whether to apply stemming (default: true) 49 | EnableStopwords bool // Whether to remove stopwords (default: true) 50 | } 51 | 52 | // DefaultConfig returns the standard analyzer configuration 53 | func DefaultConfig() AnalyzerConfig { 54 | return AnalyzerConfig{ 55 | MinTokenLength: 2, 56 | EnableStemming: true, 57 | EnableStopwords: true, 58 | } 59 | } 60 | 61 | // Analyze transforms raw text into searchable tokens using the default pipeline 62 | // 63 | // This is the main entry point for text analysis. It applies all filters in sequence: 64 | // 1. Tokenization 65 | // 2. Lowercasing 66 | // 3. Stopword filtering 67 | // 4. Length filtering 68 | // 5. 
Stemming 69 | // 70 | // Example: 71 | // 72 | // tokens := Analyze("The quick brown fox jumps over the lazy dog") 73 | // // Returns: ["quick", "brown", "fox", "jump", "lazi", "dog"] 74 | func Analyze(text string) []string { 75 | return AnalyzeWithConfig(text, DefaultConfig()) 76 | } 77 | 78 | // AnalyzeWithConfig transforms text using a custom configuration 79 | // 80 | // This allows fine-grained control over the analysis pipeline. 81 | // 82 | // Example: 83 | // 84 | // config := AnalyzerConfig{MinTokenLength: 3, EnableStemming: false} 85 | // tokens := AnalyzeWithConfig("The quick brown fox", config) 86 | func AnalyzeWithConfig(text string, config AnalyzerConfig) []string { 87 | tokens := tokenize(text) 88 | tokens = lowercaseFilter(tokens) 89 | 90 | if config.EnableStopwords { 91 | tokens = stopwordFilter(tokens) 92 | } 93 | 94 | tokens = lengthFilter(tokens, config.MinTokenLength) 95 | 96 | if config.EnableStemming { 97 | tokens = stemmerFilter(tokens) 98 | } 99 | 100 | return tokens 101 | } 102 | 103 | // tokenize splits text into individual words 104 | // 105 | // ALGORITHM: 106 | // ---------- 107 | // Uses Unicode-aware splitting: any non-letter and non-digit character is a delimiter. 108 | // 109 | // Examples: 110 | // 111 | // "hello-world" → ["hello", "world"] 112 | // "user@email.com" → ["user", "email", "com"] 113 | // "price: $9.99" → ["price", "9", "99"] 114 | // "café" → ["café"] (Unicode letters preserved) 115 | // 116 | // Why FieldsFunc? 
117 | // - Handles Unicode properly (unlike simple string splitting) 118 | // - Treats multiple delimiters as one (no empty tokens) 119 | // - Fast and memory efficient (Go standard library optimization) 120 | func tokenize(text string) []string { 121 | return strings.FieldsFunc(text, func(r rune) bool { 122 | // Split on any character that is not a letter or a number 123 | return !unicode.IsLetter(r) && !unicode.IsNumber(r) 124 | }) 125 | } 126 | 127 | // lowercaseFilter normalizes token casing 128 | // 129 | // WHY IT MATTERS: 130 | // --------------- 131 | // Without lowercasing, "Quick", "quick", and "QUICK" would be treated as 132 | // different words, creating a poor search experience. 133 | // 134 | // Example: 135 | // 136 | // ["Hello", "World"] → ["hello", "world"] 137 | // 138 | // Performance Note: 139 | // - Pre-allocates slice to avoid dynamic growth 140 | // - Uses strings.ToLower for proper Unicode handling 141 | func lowercaseFilter(tokens []string) []string { 142 | r := make([]string, len(tokens)) 143 | for i, token := range tokens { 144 | r[i] = strings.ToLower(token) 145 | } 146 | return r 147 | } 148 | 149 | // stopwordFilter removes common English words that don't add search value 150 | // 151 | // STOPWORDS EXPLAINED: 152 | // -------------------- 153 | // Words like "the", "a", "is" appear in almost every document, so they: 154 | // - Waste index space 155 | // - Don't help distinguish documents 156 | // - Slow down search 157 | // 158 | // Example: 159 | // 160 | // ["the", "quick", "brown", "fox"] → ["quick", "brown", "fox"] 161 | // 162 | // Implementation Note: 163 | // - Uses map lookup for O(1) checking 164 | // - Pre-allocates capacity to reduce reallocations 165 | func stopwordFilter(tokens []string) []string { 166 | r := make([]string, 0, len(tokens)) 167 | for _, token := range tokens { 168 | if !isStopword(token) { 169 | r = append(r, token) 170 | } 171 | } 172 | return r 173 | } 174 | 175 | // lengthFilter removes tokens that are 
too short to be meaningful 176 | // 177 | // WHY FILTER BY LENGTH? 178 | // --------------------- 179 | // Very short tokens (1-2 characters) are often: 180 | // - Not semantically meaningful ("a", "i", "to") 181 | // - Result in too many false matches 182 | // - Already caught by stopword filter 183 | // 184 | // Example (minLength=2): 185 | // 186 | // ["a", "go", "cat", "i"] → ["go", "cat"] 187 | // 188 | // Performance: 189 | // - O(n) single pass 190 | // - Pre-allocated capacity 191 | func lengthFilter(tokens []string, minLength int) []string { 192 | r := make([]string, 0, len(tokens)) 193 | for _, token := range tokens { 194 | if len(token) >= minLength { 195 | r = append(r, token) 196 | } 197 | } 198 | return r 199 | } 200 | 201 | // stemmerFilter reduces words to their root form 202 | // 203 | // STEMMING EXPLAINED: 204 | // ------------------- 205 | // Stemming removes suffixes to find the word root: 206 | // 207 | // "running", "runs", "ran" → "run" 208 | // "connection", "connected", "connecting" → "connect" 209 | // 210 | // WHY IT MATTERS: 211 | // --------------- 212 | // Without stemming, a search for "run" wouldn't match documents containing 213 | // "running" or "runs", even though they're clearly related. 214 | // 215 | // ALGORITHM: 216 | // ---------- 217 | // Uses the Snowball (Porter2) stemmer, which applies linguistic rules 218 | // to remove common English suffixes. 
219 | // 220 | // Example: 221 | // 222 | // ["running", "quickly", "foxes"] → ["run", "quick", "fox"] 223 | // 224 | // Trade-offs: 225 | // + Improves recall (finds more relevant documents) 226 | // + Reduces index size (fewer unique terms) 227 | // - May over-stem (e.g., "university" → "univers") 228 | // - Language-specific (this implementation is English-only) 229 | func stemmerFilter(tokens []string) []string { 230 | r := make([]string, len(tokens)) 231 | for i, token := range tokens { 232 | r[i] = snowballeng.Stem(token, false) 233 | } 234 | return r 235 | } 236 | 237 | // isStopword checks if a token is a common English stopword 238 | // 239 | // Uses a hash map for O(1) lookup performance. 240 | // The map uses struct{} as values (0 bytes) instead of strings (16 bytes) 241 | // for memory efficiency. 242 | func isStopword(token string) bool { 243 | _, exists := englishStopwords[token] 244 | return exists 245 | } 246 | 247 | // englishStopwords contains common English words to exclude from indexing 248 | // 249 | // MEMORY OPTIMIZATION: 250 | // -------------------- 251 | // Uses struct{} (empty struct) as the value type instead of string or bool. 252 | // - struct{}: 0 bytes per entry 253 | // - string: 16 bytes per entry 254 | // - bool: 1 byte per entry 255 | // 256 | // For 300+ stopwords, this saves ~5KB of memory. 257 | // 258 | // STOPWORD SELECTION: 259 | // ------------------- 260 | // This list includes: 261 | // - Articles: a, an, the 262 | // - Prepositions: in, on, at, to 263 | // - Conjunctions: and, but, or 264 | // - Pronouns: he, she, it, they 265 | // - Common verbs: is, are, was, were 266 | // - Numbers: one, two, three, etc. 
267 | var englishStopwords = map[string]struct{}{ 268 | "a": {}, 269 | "about": {}, 270 | "above": {}, 271 | "across": {}, 272 | "after": {}, 273 | "afterwards": {}, 274 | "again": {}, 275 | "against": {}, 276 | "all": {}, 277 | "almost": {}, 278 | "alone": {}, 279 | "along": {}, 280 | "already": {}, 281 | "also": {}, 282 | "although": {}, 283 | "always": {}, 284 | "am": {}, 285 | "among": {}, 286 | "amongst": {}, 287 | "amoungst": {}, 288 | "amount": {}, 289 | "an": {}, 290 | "and": {}, 291 | "another": {}, 292 | "any": {}, 293 | "anyhow": {}, 294 | "anyone": {}, 295 | "anything": {}, 296 | "anyway": {}, 297 | "anywhere": {}, 298 | "are": {}, 299 | "around": {}, 300 | "as": {}, 301 | "at": {}, 302 | "back": {}, 303 | "be": {}, 304 | "became": {}, 305 | "because": {}, 306 | "become": {}, 307 | "becomes": {}, 308 | "becoming": {}, 309 | "been": {}, 310 | "before": {}, 311 | "beforehand": {}, 312 | "behind": {}, 313 | "being": {}, 314 | "below": {}, 315 | "beside": {}, 316 | "besides": {}, 317 | "between": {}, 318 | "beyond": {}, 319 | "bill": {}, 320 | "both": {}, 321 | "bottom": {}, 322 | "but": {}, 323 | "by": {}, 324 | "call": {}, 325 | "can": {}, 326 | "cannot": {}, 327 | "cant": {}, 328 | "co": {}, 329 | "con": {}, 330 | "could": {}, 331 | "couldnt": {}, 332 | "cry": {}, 333 | "de": {}, 334 | "describe": {}, 335 | "detail": {}, 336 | "do": {}, 337 | "done": {}, 338 | "down": {}, 339 | "due": {}, 340 | "during": {}, 341 | "each": {}, 342 | "eg": {}, 343 | "eight": {}, 344 | "either": {}, 345 | "eleven": {}, 346 | "else": {}, 347 | "elsewhere": {}, 348 | "empty": {}, 349 | "enough": {}, 350 | "etc": {}, 351 | "even": {}, 352 | "ever": {}, 353 | "every": {}, 354 | "everyone": {}, 355 | "everything": {}, 356 | "everywhere": {}, 357 | "except": {}, 358 | "few": {}, 359 | "fifteen": {}, 360 | "fify": {}, 361 | "fill": {}, 362 | "find": {}, 363 | "fire": {}, 364 | "first": {}, 365 | "five": {}, 366 | "for": {}, 367 | "former": {}, 368 | "formerly": {}, 369 | "forty": 
{}, 370 | "found": {}, 371 | "four": {}, 372 | "from": {}, 373 | "front": {}, 374 | "full": {}, 375 | "further": {}, 376 | "get": {}, 377 | "give": {}, 378 | "go": {}, 379 | "had": {}, 380 | "has": {}, 381 | "hasnt": {}, 382 | "have": {}, 383 | "he": {}, 384 | "hence": {}, 385 | "her": {}, 386 | "here": {}, 387 | "hereafter": {}, 388 | "hereby": {}, 389 | "herein": {}, 390 | "hereupon": {}, 391 | "hers": {}, 392 | "herself": {}, 393 | "him": {}, 394 | "himself": {}, 395 | "his": {}, 396 | "how": {}, 397 | "however": {}, 398 | "hundred": {}, 399 | "ie": {}, 400 | "if": {}, 401 | "in": {}, 402 | "inc": {}, 403 | "indeed": {}, 404 | "interest": {}, 405 | "into": {}, 406 | "is": {}, 407 | "it": {}, 408 | "its": {}, 409 | "itself": {}, 410 | "keep": {}, 411 | "last": {}, 412 | "latter": {}, 413 | "latterly": {}, 414 | "least": {}, 415 | "less": {}, 416 | "ltd": {}, 417 | "made": {}, 418 | "many": {}, 419 | "may": {}, 420 | "me": {}, 421 | "meanwhile": {}, 422 | "might": {}, 423 | "mill": {}, 424 | "mine": {}, 425 | "more": {}, 426 | "moreover": {}, 427 | "most": {}, 428 | "mostly": {}, 429 | "move": {}, 430 | "much": {}, 431 | "must": {}, 432 | "my": {}, 433 | "myself": {}, 434 | "name": {}, 435 | "namely": {}, 436 | "neither": {}, 437 | "never": {}, 438 | "nevertheless": {}, 439 | "next": {}, 440 | "nine": {}, 441 | "no": {}, 442 | "nobody": {}, 443 | "none": {}, 444 | "noone": {}, 445 | "nor": {}, 446 | "not": {}, 447 | "nothing": {}, 448 | "now": {}, 449 | "nowhere": {}, 450 | "of": {}, 451 | "off": {}, 452 | "often": {}, 453 | "on": {}, 454 | "once": {}, 455 | "one": {}, 456 | "only": {}, 457 | "onto": {}, 458 | "or": {}, 459 | "other": {}, 460 | "others": {}, 461 | "otherwise": {}, 462 | "our": {}, 463 | "ours": {}, 464 | "ourselves": {}, 465 | "out": {}, 466 | "over": {}, 467 | "own": {}, 468 | "part": {}, 469 | "per": {}, 470 | "perhaps": {}, 471 | "please": {}, 472 | "put": {}, 473 | "rather": {}, 474 | "re": {}, 475 | "same": {}, 476 | "see": {}, 477 | "seem": 
{}, 478 | "seemed": {}, 479 | "seeming": {}, 480 | "seems": {}, 481 | "serious": {}, 482 | "several": {}, 483 | "she": {}, 484 | "should": {}, 485 | "show": {}, 486 | "side": {}, 487 | "since": {}, 488 | "sincere": {}, 489 | "six": {}, 490 | "sixty": {}, 491 | "so": {}, 492 | "some": {}, 493 | "somehow": {}, 494 | "someone": {}, 495 | "something": {}, 496 | "sometime": {}, 497 | "sometimes": {}, 498 | "somewhere": {}, 499 | "still": {}, 500 | "such": {}, 501 | "system": {}, 502 | "take": {}, 503 | "ten": {}, 504 | "than": {}, 505 | "that": {}, 506 | "the": {}, 507 | "their": {}, 508 | "them": {}, 509 | "themselves": {}, 510 | "then": {}, 511 | "thence": {}, 512 | "there": {}, 513 | "thereafter": {}, 514 | "thereby": {}, 515 | "therefore": {}, 516 | "therein": {}, 517 | "thereupon": {}, 518 | "these": {}, 519 | "they": {}, 520 | "thickv": {}, 521 | "thin": {}, 522 | "third": {}, 523 | "this": {}, 524 | "those": {}, 525 | "though": {}, 526 | "three": {}, 527 | "through": {}, 528 | "throughout": {}, 529 | "thru": {}, 530 | "thus": {}, 531 | "to": {}, 532 | "together": {}, 533 | "too": {}, 534 | "top": {}, 535 | "toward": {}, 536 | "towards": {}, 537 | "twelve": {}, 538 | "twenty": {}, 539 | "two": {}, 540 | "un": {}, 541 | "under": {}, 542 | "until": {}, 543 | "up": {}, 544 | "upon": {}, 545 | "us": {}, 546 | "very": {}, 547 | "via": {}, 548 | "was": {}, 549 | "we": {}, 550 | "well": {}, 551 | "were": {}, 552 | "what": {}, 553 | "whatever": {}, 554 | "when": {}, 555 | "whence": {}, 556 | "whenever": {}, 557 | "where": {}, 558 | "whereafter": {}, 559 | "whereas": {}, 560 | "whereby": {}, 561 | "wherein": {}, 562 | "whereupon": {}, 563 | "wherever": {}, 564 | "whether": {}, 565 | "which": {}, 566 | "while": {}, 567 | "whither": {}, 568 | "who": {}, 569 | "whoever": {}, 570 | "whole": {}, 571 | "whom": {}, 572 | "whose": {}, 573 | "why": {}, 574 | "will": {}, 575 | "with": {}, 576 | "within": {}, 577 | "without": {}, 578 | "would": {}, 579 | "yet": {}, 580 | "you": {}, 
581 | "your": {}, 582 | "yours": {}, 583 | "yourself": {}, 584 | "yourselves": {}} 585 | -------------------------------------------------------------------------------- /skiplist_test.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | // ═══════════════════════════════════════════════════════════════════════════════ 9 | // POSITION TESTS 10 | // ═══════════════════════════════════════════════════════════════════════════════ 11 | 12 | func TestPosition_GetDocumentID(t *testing.T) { 13 | pos := Position{DocumentID: 42, Offset: 10} 14 | if got := pos.GetDocumentID(); got != 42 { 15 | t.Errorf("GetDocumentID() = %d, want 42", got) 16 | } 17 | } 18 | 19 | func TestPosition_GetOffset(t *testing.T) { 20 | pos := Position{DocumentID: 42, Offset: 10} 21 | if got := pos.GetOffset(); got != 10 { 22 | t.Errorf("GetOffset() = %d, want 10", got) 23 | } 24 | } 25 | 26 | func TestPosition_IsBeginning(t *testing.T) { 27 | tests := []struct { 28 | name string 29 | pos Position 30 | want bool 31 | }{ 32 | {"BOF position", Position{DocumentID: BOF, Offset: BOF}, true}, 33 | {"Regular position", Position{DocumentID: 1, Offset: 0}, false}, 34 | {"EOF position", Position{DocumentID: EOF, Offset: EOF}, false}, 35 | } 36 | 37 | for _, tt := range tests { 38 | t.Run(tt.name, func(t *testing.T) { 39 | if got := tt.pos.IsBeginning(); got != tt.want { 40 | t.Errorf("IsBeginning() = %v, want %v", got, tt.want) 41 | } 42 | }) 43 | } 44 | } 45 | 46 | func TestPosition_IsEnd(t *testing.T) { 47 | tests := []struct { 48 | name string 49 | pos Position 50 | want bool 51 | }{ 52 | {"EOF position", Position{DocumentID: EOF, Offset: EOF}, true}, 53 | {"Regular position", Position{DocumentID: 1, Offset: 0}, false}, 54 | {"BOF position", Position{DocumentID: BOF, Offset: BOF}, false}, 55 | } 56 | 57 | for _, tt := range tests { 58 | t.Run(tt.name, func(t *testing.T) { 59 | if got := tt.pos.IsEnd(); 
got != tt.want { 60 | t.Errorf("IsEnd() = %v, want %v", got, tt.want) 61 | } 62 | }) 63 | } 64 | } 65 | 66 | func TestPosition_IsBefore(t *testing.T) { 67 | tests := []struct { 68 | name string 69 | pos Position 70 | other Position 71 | want bool 72 | }{ 73 | { 74 | "Same doc, earlier offset", 75 | Position{DocumentID: 1, Offset: 5}, 76 | Position{DocumentID: 1, Offset: 10}, 77 | true, 78 | }, 79 | { 80 | "Same doc, later offset", 81 | Position{DocumentID: 1, Offset: 10}, 82 | Position{DocumentID: 1, Offset: 5}, 83 | false, 84 | }, 85 | { 86 | "Earlier doc", 87 | Position{DocumentID: 1, Offset: 100}, 88 | Position{DocumentID: 2, Offset: 0}, 89 | true, 90 | }, 91 | { 92 | "Later doc", 93 | Position{DocumentID: 2, Offset: 0}, 94 | Position{DocumentID: 1, Offset: 100}, 95 | false, 96 | }, 97 | { 98 | "BOF before regular", 99 | Position{DocumentID: BOF, Offset: BOF}, 100 | Position{DocumentID: 1, Offset: 0}, 101 | true, 102 | }, 103 | { 104 | "Regular before EOF", 105 | Position{DocumentID: 1, Offset: 0}, 106 | Position{DocumentID: EOF, Offset: EOF}, 107 | true, 108 | }, 109 | { 110 | "Same position", 111 | Position{DocumentID: 1, Offset: 5}, 112 | Position{DocumentID: 1, Offset: 5}, 113 | false, 114 | }, 115 | } 116 | 117 | for _, tt := range tests { 118 | t.Run(tt.name, func(t *testing.T) { 119 | if got := tt.pos.IsBefore(tt.other); got != tt.want { 120 | t.Errorf("IsBefore() = %v, want %v", got, tt.want) 121 | } 122 | }) 123 | } 124 | } 125 | 126 | func TestPosition_IsAfter(t *testing.T) { 127 | tests := []struct { 128 | name string 129 | pos Position 130 | other Position 131 | want bool 132 | }{ 133 | { 134 | "Same doc, later offset", 135 | Position{DocumentID: 1, Offset: 10}, 136 | Position{DocumentID: 1, Offset: 5}, 137 | true, 138 | }, 139 | { 140 | "Same doc, earlier offset", 141 | Position{DocumentID: 1, Offset: 5}, 142 | Position{DocumentID: 1, Offset: 10}, 143 | false, 144 | }, 145 | { 146 | "Later doc", 147 | Position{DocumentID: 2, Offset: 0}, 148 | 
Position{DocumentID: 1, Offset: 100}, 149 | true, 150 | }, 151 | { 152 | "Earlier doc", 153 | Position{DocumentID: 1, Offset: 100}, 154 | Position{DocumentID: 2, Offset: 0}, 155 | false, 156 | }, 157 | { 158 | "EOF after regular", 159 | Position{DocumentID: EOF, Offset: EOF}, 160 | Position{DocumentID: 1, Offset: 0}, 161 | true, 162 | }, 163 | { 164 | "Regular after BOF", 165 | Position{DocumentID: 1, Offset: 0}, 166 | Position{DocumentID: BOF, Offset: BOF}, 167 | true, 168 | }, 169 | { 170 | "Same position", 171 | Position{DocumentID: 1, Offset: 5}, 172 | Position{DocumentID: 1, Offset: 5}, 173 | false, 174 | }, 175 | } 176 | 177 | for _, tt := range tests { 178 | t.Run(tt.name, func(t *testing.T) { 179 | if got := tt.pos.IsAfter(tt.other); got != tt.want { 180 | t.Errorf("IsAfter() = %v, want %v", got, tt.want) 181 | } 182 | }) 183 | } 184 | } 185 | 186 | func TestPosition_Equals(t *testing.T) { 187 | tests := []struct { 188 | name string 189 | pos Position 190 | other Position 191 | want bool 192 | }{ 193 | { 194 | "Same position", 195 | Position{DocumentID: 1, Offset: 5}, 196 | Position{DocumentID: 1, Offset: 5}, 197 | true, 198 | }, 199 | { 200 | "Different offset", 201 | Position{DocumentID: 1, Offset: 5}, 202 | Position{DocumentID: 1, Offset: 10}, 203 | false, 204 | }, 205 | { 206 | "Different document", 207 | Position{DocumentID: 1, Offset: 5}, 208 | Position{DocumentID: 2, Offset: 5}, 209 | false, 210 | }, 211 | { 212 | "Both BOF", 213 | Position{DocumentID: BOF, Offset: BOF}, 214 | Position{DocumentID: BOF, Offset: BOF}, 215 | true, 216 | }, 217 | { 218 | "Both EOF", 219 | Position{DocumentID: EOF, Offset: EOF}, 220 | Position{DocumentID: EOF, Offset: EOF}, 221 | true, 222 | }, 223 | } 224 | 225 | for _, tt := range tests { 226 | t.Run(tt.name, func(t *testing.T) { 227 | if got := tt.pos.Equals(tt.other); got != tt.want { 228 | t.Errorf("Equals() = %v, want %v", got, tt.want) 229 | } 230 | }) 231 | } 232 | } 233 | 234 | // 
═══════════════════════════════════════════════════════════════════════════════
// SKIP LIST BASIC TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestNewSkipList verifies a fresh skip list has a non-nil head sentinel
// and an initial height of 1.
func TestNewSkipList(t *testing.T) {
	sl := NewSkipList()

	if sl.Head == nil {
		t.Error("NewSkipList() created nil Head")
	}

	if sl.Height != 1 {
		t.Errorf("NewSkipList() Height = %d, want 1", sl.Height)
	}
}

// TestSkipList_Insert_Single verifies a single inserted position can be
// found again and round-trips unchanged.
func TestSkipList_Insert_Single(t *testing.T) {
	sl := NewSkipList()
	pos := Position{DocumentID: 1, Offset: 5}

	sl.Insert(pos)

	// Verify the element was inserted
	found, err := sl.Find(pos)
	if err != nil {
		t.Errorf("Find() error = %v, want nil", err)
	}

	if !found.Equals(pos) {
		t.Errorf("Find() = %v, want %v", found, pos)
	}
}

// TestSkipList_Insert_Multiple inserts positions across several documents
// and verifies every one is findable afterwards.
func TestSkipList_Insert_Multiple(t *testing.T) {
	sl := NewSkipList()

	positions := []Position{
		{DocumentID: 1, Offset: 5},
		{DocumentID: 1, Offset: 10},
		{DocumentID: 2, Offset: 0},
		{DocumentID: 2, Offset: 15},
		{DocumentID: 3, Offset: 7},
	}

	// Insert all positions
	for _, pos := range positions {
		sl.Insert(pos)
	}

	// Verify all can be found
	for _, pos := range positions {
		found, err := sl.Find(pos)
		if err != nil {
			t.Errorf("Find(%v) error = %v, want nil", pos, err)
		}
		if !found.Equals(pos) {
			t.Errorf("Find(%v) = %v, want %v", pos, found, pos)
		}
	}
}

// TestSkipList_Insert_Duplicate verifies inserting the same position twice
// stores it only once (skip list acts as a set).
func TestSkipList_Insert_Duplicate(t *testing.T) {
	sl := NewSkipList()
	pos := Position{DocumentID: 1, Offset: 5}

	// Insert twice
	sl.Insert(pos)
	sl.Insert(pos)

	// Should only exist once
	found, err := sl.Find(pos)
	if err != nil {
		t.Errorf("Find() error = %v, want nil", err)
	}
	if !found.Equals(pos) {
		t.Errorf("Find() = %v, want %v", found, pos)
	}

	// Count elements using iterator.
	// The iterator starts positioned ON the first element, so it is counted
	// before the HasNext/Next loop consumes the rest.
	count := 0
	iter := sl.Iterator()
	// First element is at current position
	if iter.current != nil {
		count++
	}
	// Rest of elements via HasNext/Next
	for iter.HasNext() {
		iter.Next()
		count++
	}

	if count != 1 {
		t.Errorf("Skip list has %d elements, want 1", count)
	}
}

// TestSkipList_Insert_OutOfOrder inserts positions in a jumbled order and
// verifies iteration yields them fully sorted (doc-major, then offset).
func TestSkipList_Insert_OutOfOrder(t *testing.T) {
	sl := NewSkipList()

	// Insert in a deliberately jumbled (non-sorted) order
	positions := []Position{
		{DocumentID: 5, Offset: 10},
		{DocumentID: 3, Offset: 7},
		{DocumentID: 4, Offset: 2},
		{DocumentID: 1, Offset: 0},
		{DocumentID: 2, Offset: 5},
	}

	for _, pos := range positions {
		sl.Insert(pos)
	}

	// Verify they're stored in sorted order
	expected := []Position{
		{DocumentID: 1, Offset: 0},
		{DocumentID: 2, Offset: 5},
		{DocumentID: 3, Offset: 7},
		{DocumentID: 4, Offset: 2},
		{DocumentID: 5, Offset: 10},
	}

	// Get all positions using iterator
	var result []Position
	iter := sl.Iterator()
	// Get first element
	if iter.current != nil {
		result = append(result, iter.current.Key)
	}
	// Get remaining elements
	for iter.HasNext() {
		result = append(result, iter.Next())
	}

	if len(result) != len(expected) {
		t.Fatalf("Got %d positions, want %d", len(result), len(expected))
	}

	for idx, pos := range result {
		if !pos.Equals(expected[idx]) {
			t.Errorf("Position at index %d = %v, want %v", idx, pos, expected[idx])
		}
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// SEARCH AND FIND TESTS
//
═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_Find_NotFound verifies Find returns ErrKeyNotFound for a key
// that falls between existing entries.
func TestSkipList_Find_NotFound(t *testing.T) {
	sl := NewSkipList()
	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 2, Offset: 10})

	pos := Position{DocumentID: 1, Offset: 7}
	_, err := sl.Find(pos)

	if err != ErrKeyNotFound {
		t.Errorf("Find() error = %v, want %v", err, ErrKeyNotFound)
	}
}

// TestSkipList_Find_EmptyList verifies Find on an empty list reports
// ErrKeyNotFound rather than panicking.
func TestSkipList_Find_EmptyList(t *testing.T) {
	sl := NewSkipList()

	pos := Position{DocumentID: 1, Offset: 0}
	_, err := sl.Find(pos)

	if err != ErrKeyNotFound {
		t.Errorf("Find() error = %v, want %v", err, ErrKeyNotFound)
	}
}

// TestSkipList_FindLessThan table-tests strict predecessor lookup, including
// the boundary cases before the first element (BOFDocument + ErrNoElementFound).
func TestSkipList_FindLessThan(t *testing.T) {
	sl := NewSkipList()

	// Insert: 5, 10, 15, 20
	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 1, Offset: 10})
	sl.Insert(Position{DocumentID: 1, Offset: 15})
	sl.Insert(Position{DocumentID: 1, Offset: 20})

	tests := []struct {
		name    string
		key     Position
		want    Position
		wantErr error
	}{
		{
			"Find less than 17",
			Position{DocumentID: 1, Offset: 17},
			Position{DocumentID: 1, Offset: 15},
			nil,
		},
		{
			"Find less than 15",
			Position{DocumentID: 1, Offset: 15},
			Position{DocumentID: 1, Offset: 10},
			nil,
		},
		{
			"Find less than 5 (first element)",
			Position{DocumentID: 1, Offset: 5},
			BOFDocument,
			ErrNoElementFound,
		},
		{
			"Find less than 0 (before first)",
			Position{DocumentID: 1, Offset: 0},
			BOFDocument,
			ErrNoElementFound,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := sl.FindLessThan(tt.key)

			if err != tt.wantErr {
				t.Errorf("FindLessThan() error = %v, want %v", err, tt.wantErr)
			}

			if !got.Equals(tt.want) {
				t.Errorf("FindLessThan() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestSkipList_FindGreaterThan table-tests strict successor lookup, including
// the boundary cases past the last element (EOFDocument + ErrNoElementFound).
func TestSkipList_FindGreaterThan(t *testing.T) {
	sl := NewSkipList()

	// Insert: 5, 10, 15, 20
	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 1, Offset: 10})
	sl.Insert(Position{DocumentID: 1, Offset: 15})
	sl.Insert(Position{DocumentID: 1, Offset: 20})

	tests := []struct {
		name    string
		key     Position
		want    Position
		wantErr error
	}{
		{
			"Find greater than 10 (exists)",
			Position{DocumentID: 1, Offset: 10},
			Position{DocumentID: 1, Offset: 15},
			nil,
		},
		{
			"Find greater than 12 (doesn't exist)",
			Position{DocumentID: 1, Offset: 12},
			Position{DocumentID: 1, Offset: 15},
			nil,
		},
		{
			"Find greater than 20 (last element)",
			Position{DocumentID: 1, Offset: 20},
			EOFDocument,
			ErrNoElementFound,
		},
		{
			"Find greater than 25 (after last)",
			Position{DocumentID: 1, Offset: 25},
			EOFDocument,
			ErrNoElementFound,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := sl.FindGreaterThan(tt.key)

			if err != tt.wantErr {
				t.Errorf("FindGreaterThan() error = %v, want %v", err, tt.wantErr)
			}

			if !got.Equals(tt.want) {
				t.Errorf("FindGreaterThan() = %v, want %v", got, tt.want)
			}
		})
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// DELETE TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_Delete_Single verifies deleting the only element succeeds and
// the element is no longer findable.
func TestSkipList_Delete_Single(t *testing.T) {
	sl := NewSkipList()
	pos := Position{DocumentID: 1, Offset: 5}

	sl.Insert(pos)

	// Delete the element
	deleted := sl.Delete(pos)
	if !deleted {
		t.Error("Delete() = false, want true")
	}

	// Verify it's gone
	_, err := sl.Find(pos)
	if err != ErrKeyNotFound {
		t.Errorf("Find() after delete error = %v, want %v", err, ErrKeyNotFound)
	}
}

// TestSkipList_Delete_Middle verifies deleting an interior element leaves the
// remaining elements intact and findable.
func TestSkipList_Delete_Middle(t *testing.T) {
	sl := NewSkipList()

	// Insert: 5, 10, 15, 20
	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 1, Offset: 10})
	sl.Insert(Position{DocumentID: 1, Offset: 15})
	sl.Insert(Position{DocumentID: 1, Offset: 20})

	// Delete middle element
	deleted := sl.Delete(Position{DocumentID: 1, Offset: 10})
	if !deleted {
		t.Error("Delete() = false, want true")
	}

	// Verify it's gone
	_, err := sl.Find(Position{DocumentID: 1, Offset: 10})
	if err != ErrKeyNotFound {
		t.Errorf("Find() after delete error = %v, want %v", err, ErrKeyNotFound)
	}

	// Verify others still exist
	remaining := []Position{
		{DocumentID: 1, Offset: 5},
		{DocumentID: 1, Offset: 15},
		{DocumentID: 1, Offset: 20},
	}

	for _, pos := range remaining {
		_, err := sl.Find(pos)
		if err != nil {
			t.Errorf("Find(%v) error = %v, want nil", pos, err)
		}
	}
}

// TestSkipList_Delete_NotFound verifies deleting an absent key returns false.
func TestSkipList_Delete_NotFound(t *testing.T) {
	sl := NewSkipList()
	sl.Insert(Position{DocumentID: 1, Offset: 5})

	deleted := sl.Delete(Position{DocumentID: 1, Offset: 10})
	if deleted {
		t.Error("Delete() = true, want false (element not found)")
	}
}

// TestSkipList_Delete_EmptyList verifies deleting from an empty list returns
// false rather than panicking.
func TestSkipList_Delete_EmptyList(t *testing.T) {
	sl := NewSkipList()

	deleted := sl.Delete(Position{DocumentID: 1, Offset: 0})
	if deleted {
		t.Error("Delete() = true, want false (empty list)")
	}
}

// TestSkipList_Delete_All deletes every inserted element and verifies the
// list is empty afterwards.
func TestSkipList_Delete_All(t *testing.T) {
	sl := NewSkipList()

	positions := []Position{
		{DocumentID: 1, Offset: 5},
		{DocumentID: 1, Offset: 10},
		{DocumentID: 2, Offset: 15},
	}

	// Insert all
	for _, pos := range positions {
		sl.Insert(pos)
	}

	// Delete all
	for _, pos := range positions {
		deleted := sl.Delete(pos)
		if !deleted {
			t.Errorf("Delete(%v) = false, want true", pos)
		}
	}

	// Verify list is empty
	iter := sl.Iterator()
	if iter.HasNext() {
		t.Error("List should be empty after deleting all elements")
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// ITERATOR TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_Iterator_Empty verifies the iterator over an empty list has no
// elements and Next() degrades to the EOF sentinel.
func TestSkipList_Iterator_Empty(t *testing.T) {
	sl := NewSkipList()
	iter := sl.Iterator()

	if iter.HasNext() {
		t.Error("HasNext() = true for empty list, want false")
	}

	pos := iter.Next()
	if !pos.Equals(EOFDocument) {
		t.Errorf("Next() on empty list = %v, want %v", pos, EOFDocument)
	}
}

// TestSkipList_Iterator_Single verifies the iterator starts positioned on the
// first (and only) element, with no further elements.
func TestSkipList_Iterator_Single(t *testing.T) {
	sl := NewSkipList()
	expected := Position{DocumentID: 1, Offset: 5}
	sl.Insert(expected)

	iter := sl.Iterator()

	// First element is at current position
	if iter.current == nil {
		t.Fatal("Iterator current is nil, expected first element")
	}

	pos := iter.current.Key
	if !pos.Equals(expected) {
		t.Errorf("First element = %v, want %v", pos, expected)
	}

	// Should have no next elements
	if iter.HasNext() {
		t.Error("HasNext() = true for single element list, want false")
	}
}

// TestSkipList_Iterator_Multiple verifies full iteration returns every
// inserted element in sorted order.
func TestSkipList_Iterator_Multiple(t *testing.T) {
	sl := NewSkipList()

	expected := []Position{
		{DocumentID: 1, Offset: 5},
		{DocumentID: 1, Offset: 10},
		{DocumentID: 2, Offset: 0},
		{DocumentID: 2, Offset: 15},
		{DocumentID: 3, Offset: 7},
	}

	// Insert all
	for _, pos := range expected {
		sl.Insert(pos)
	}

	// Get all positions using iterator
	var result []Position
	iter := sl.Iterator()
	// Get first element (the iterator starts positioned on it)
	if iter.current != nil {
		result = append(result, iter.current.Key)
	}
	// Get remaining elements
	for iter.HasNext() {
		result = append(result, iter.Next())
	}

	if len(result) != len(expected) {
		t.Errorf("Iterator returned %d elements, want %d", len(result), len(expected))
	}

	for idx, pos := range result {
		if idx >= len(expected) {
			t.Fatalf("Iterator returned more elements than expected")
		}

		if !pos.Equals(expected[idx]) {
			t.Errorf("Position at index %d = %v, want %v", idx, pos, expected[idx])
		}
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// LAST OPERATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_Last_Empty verifies Last() on an empty list yields the head
// sentinel's zero-value key.
func TestSkipList_Last_Empty(t *testing.T) {
	sl := NewSkipList()
	last := sl.Last()

	// In an empty list, Last() returns the head's key (which is zero value)
	if last.DocumentID != 0 || last.Offset != 0 {
		t.Errorf("Last() on empty list = %v, want zero position", last)
	}
}

// TestSkipList_Last_Single verifies Last() returns the only element.
func TestSkipList_Last_Single(t *testing.T) {
	sl := NewSkipList()
	expected := Position{DocumentID: 1, Offset: 5}
	sl.Insert(expected)

	last := sl.Last()
	if !last.Equals(expected) {
		t.Errorf("Last() = %v, want %v", last, expected)
	}
}

// TestSkipList_Last_Multiple verifies Last() returns the largest element.
func TestSkipList_Last_Multiple(t *testing.T) {
	sl := NewSkipList()

	sl.Insert(Position{DocumentID: 1, Offset: 5})
	sl.Insert(Position{DocumentID: 2, Offset: 10})
	sl.Insert(Position{DocumentID: 3, Offset: 15})

	expected := Position{DocumentID: 3, Offset: 15}
	last := sl.Last()

	if !last.Equals(expected) {
		t.Errorf("Last() = %v, want %v", last, expected)
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// EDGE CASE AND STRESS TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// TestSkipList_SameDocument_DifferentOffsets verifies offsets within one
// document are kept in ascending order.
func TestSkipList_SameDocument_DifferentOffsets(t *testing.T) {
	sl := NewSkipList()

	// Insert multiple positions in the same document
	for offset := 0; offset < 10; offset++ {
		sl.Insert(Position{DocumentID: 1, Offset: offset})
	}

	// Verify all are present and in order
	var result []Position
	iter := sl.Iterator()
	// Get first element
	if iter.current != nil {
		result = append(result, iter.current.Key)
	}
	// Get remaining elements
	for iter.HasNext() {
		result = append(result, iter.Next())
	}

	if len(result) != 10 {
		t.Errorf("Found %d positions, want 10", len(result))
	}

	for offset, pos := range result {
		expected := Position{DocumentID: 1, Offset: offset}

		if !pos.Equals(expected) {
			t.Errorf("Position at offset %d = %v, want %v", offset, pos, expected)
		}
	}
}

// TestSkipList_MultipleDocs_MultipleOffsets verifies a doc × offset grid
// iterates in doc-major order.
func TestSkipList_MultipleDocs_MultipleOffsets(t *testing.T) {
	sl := NewSkipList()

	// Insert grid: 3 documents x 5 offsets each
	for doc := 1; doc <= 3; doc++ {
		for offset := 0; offset < 5; offset++ {
			sl.Insert(Position{DocumentID: doc, Offset: offset})
		}
	}

	// Get all positions using iterator
	var result []Position
	iter := sl.Iterator()
	// Get first element
	if iter.current != nil {
		result = append(result, iter.current.Key)
	}
	// Get remaining elements
	for iter.HasNext() {
		result = append(result, iter.Next())
	}

	if len(result) != 15 {
		t.Errorf("Found %d positions, want 15", len(result))
	}

	// Verify ordering (should be doc-major order)
	idx := 0
	for doc := 1; doc <= 3; doc++ {
		for offset := 0; offset < 5; offset++ {
			if idx >= len(result) {
				t.Fatal("Not enough positions in result")
			}

			expected := Position{DocumentID: doc, Offset: offset}

			if !result[idx].Equals(expected) {
				t.Errorf("Position at index %d = %v, want %v", idx, result[idx], expected)
			}

			idx++
		}
	}
}

// TestSkipList_LargeDataset inserts 1000 distinct positions (100 docs × 10
// offsets) and spot-checks membership plus total count.
func TestSkipList_LargeDataset(t *testing.T) {
	sl := NewSkipList()

	// Insert 1000 positions
	n := 1000
	for i := 0; i < n; i++ {
		sl.Insert(Position{DocumentID: i / 10, Offset: i % 10})
	}

	// Verify count
	count := 0
	iter := sl.Iterator()
	// Count first element
	if iter.current != nil {
		count++
	}
	// Count remaining elements
	for iter.HasNext() {
		iter.Next()
		count++
	}

	if count != n {
		t.Errorf("Found %d positions, want %d", count, n)
	}

	// Spot check some positions
	testPositions := []Position{
		{DocumentID: 0, Offset: 0},
		{DocumentID: 50, Offset: 5},
		{DocumentID: 99, Offset: 9},
	}

	for _, pos := range testPositions {
		found, err := sl.Find(pos)
		if err != nil {
			t.Errorf("Find(%v) error = %v, want nil", pos, err)
		}
		if !found.Equals(pos) {
			t.Errorf("Find(%v) = %v, want %v", pos, found, pos)
		}
	}
}

// TestSkipList_InfinityValues verifies the BOF/EOF sentinels are the extreme
// int values and order correctly against regular positions.
func TestSkipList_InfinityValues(t *testing.T) {
	// Test that sentinel values work correctly
	if BOF >= 0 {
		t.Error("BOF should be negative (math.MinInt)")
	}

	if EOF <= 0 {
		t.Error("EOF should be positive (math.MaxInt)")
	}

	if BOF != math.MinInt {
		t.Errorf("BOF should be math.MinInt, got %d", BOF)
	}

	if EOF != math.MaxInt {
		t.Errorf("EOF should be math.MaxInt, got %d", EOF)
	}

	// BOF should be less than any regular position
	regularPos := Position{DocumentID: 0, Offset: 0}
	if !BOFDocument.IsBefore(regularPos) {
		t.Error("BOF should be before any regular position")
	}

	// EOF should be greater than any regular position
	if !regularPos.IsBefore(EOFDocument) {
		t.Error("Any regular position should be before EOF")
	}
}

// ═══════════════════════════════════════════════════════════════════════════════
// BENCHMARK TESTS
// ═══════════════════════════════════════════════════════════════════════════════

func BenchmarkSkipList_Insert(b *testing.B) {
	sl := NewSkipList()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sl.Insert(Position{DocumentID: i / 1000, Offset: i % 1000})
	}
}

func BenchmarkSkipList_Find(b *testing.B) {
	sl := NewSkipList()

	// Pre-populate with 10000 elements
	for i := 0; i < 10000; i++ {
		sl.Insert(Position{DocumentID: i / 100, Offset: i % 100})
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sl.Find(Position{DocumentID: i / 100 % 100, Offset: i % 100})
	}
}

func BenchmarkSkipList_Delete(b *testing.B) {
	// Re-populate for each iteration
	// NOTE(review): the key {i/10 % 100, i%10} is not guaranteed to exist in
	// the freshly built 1000-element list for every i, so some iterations
	// measure a failed delete — confirm that is the intended workload.
	for i := 0; i < b.N; i++ {
		b.StopTimer()
		sl := NewSkipList()
		for j := 0; j < 1000; j++ {
			sl.Insert(Position{DocumentID: j / 10, Offset: j % 10})
		}
		b.StartTimer()

		sl.Delete(Position{DocumentID: i / 10 % 100, Offset: i % 10})
	}
}

func BenchmarkSkipList_Iterator(b *testing.B) {
	sl := NewSkipList()

	// Pre-populate
with 1000 elements
	for i := 0; i < 1000; i++ {
		sl.Insert(Position{DocumentID: i / 10, Offset: i % 10})
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		iter := sl.Iterator()
		// Process first element
		if iter.current != nil {
			_ = iter.current.Key
		}
		// Process remaining elements
		for iter.HasNext() {
			iter.Next()
		}
	}
}

func BenchmarkSkipList_FindLessThan(b *testing.B) {
	sl := NewSkipList()

	// Pre-populate with 10000 elements
	for i := 0; i < 10000; i++ {
		sl.Insert(Position{DocumentID: i / 100, Offset: i % 100})
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sl.FindLessThan(Position{DocumentID: i / 100 % 100, Offset: i % 100})
	}
}

func BenchmarkSkipList_FindGreaterThan(b *testing.B) {
	sl := NewSkipList()

	// Pre-populate with 10000 elements
	for i := 0; i < 10000; i++ {
		sl.Insert(Position{DocumentID: i / 100, Offset: i % 100})
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sl.FindGreaterThan(Position{DocumentID: i / 100 % 100, Offset: i % 100})
	}
}
--------------------------------------------------------------------------------
/serialization.go:
--------------------------------------------------------------------------------
package blaze

import (
	"bytes"
	"encoding/binary"
	"math"

	"github.com/RoaringBitmap/roaring"
)

// ═══════════════════════════════════════════════════════════════════════════════
// SERIALIZATION: Saving and Loading the Index
// ═══════════════════════════════════════════════════════════════════════════════
// Why serialize?
15 | // - Save index to disk for persistence 16 | // - Send index over network 17 | // - Create backups 18 | // 19 | // BINARY FORMAT: 20 | // -------------- 21 | // We use a custom binary format for efficiency: 22 | // - Smaller file size than JSON (important for large indexes) 23 | // - Faster to parse than JSON 24 | // - Preserves exact structure (including skip list towers) 25 | // 26 | // FORMAT STRUCTURE: 27 | // ----------------- 28 | // For each term: 29 | // [term_length: uint32][term: bytes] 30 | // [node_data_length: uint32][node_data: positions...] 31 | // [tower_data: for each node...] 32 | // 33 | // ENCODING STRATEGY: 34 | // ------------------ 35 | // The tricky part is encoding the skip list tower structure: 36 | // 1. Assign each node a sequential index (1, 2, 3, ...) 37 | // 2. Store node positions (DocID, Offset pairs) 38 | // 3. Store tower pointers as indices (not memory addresses!) 39 | // 40 | // Why use indices instead of pointers? 41 | // - Pointers are meaningless after deserialization (different memory locations) 42 | // - Indices are stable and can be reconstructed 43 | // 44 | // ═══════════════════════════════════════════════════════════════════════════════ 45 | 46 | // Encode serializes the inverted index to binary format 47 | // 48 | // COMPLETE EXAMPLE: 49 | // ----------------- 50 | // Index contains: 51 | // 52 | // "quick" → SkipList with nodes at [Doc1:Pos1, Doc3:Pos0] 53 | // "brown" → SkipList with nodes at [Doc1:Pos2] 54 | // 55 | // Encoded format: 56 | // 57 | // [5]['q','u','i','c','k'] ← Term name 58 | // [16][1,1,3,0] ← Node positions (2 positions × 8 bytes each) 59 | // [4][2][2][0] ← Tower structure (node1→node2, node2→nil) 60 | // [5]['b','r','o','w','n'] ← Next term 61 | // [8][1,2] ← Node position 62 | // [2][0] ← Tower structure (only one node, no next) 63 | // 64 | // The encoder object keeps track of our position in the output buffer. 
// Encode serializes the inverted index with HYBRID STORAGE including BM25 statistics
//
// BINARY FORMAT:
// --------------
// [Header]
//   - TotalDocs: uint32
//   - TotalTerms: uint64
//   - BM25.K1: float64
//   - BM25.B: float64
//   - NumDocStats: uint32
//
// [Document Statistics] (for each document)
//   - DocID: uint32
//   - Length: uint32
//   - NumTerms: uint32
//   - For each term:
//   - TermLength: uint32
//   - Term: bytes
//   - Frequency: uint32
//
// [Roaring Bitmaps] (NEW - for fast document lookups)
//   - NumBitmaps: uint32
//   - For each term:
//   - TermLength: uint32
//   - Term: bytes
//   - BitmapLength: uint32
//   - Bitmap: bytes (roaring's native serialization)
//
// [Posting Lists] (position data for phrase search)
//   - For each term...
//
// NOTE(review): the posting lists (and the doc-stats / bitmap sections below)
// are written by ranging over Go maps, whose iteration order is randomized —
// so two Encode() calls on the same index can produce different byte streams.
// The decoder presumably does not depend on term order (each record is
// self-describing), but byte-level reproducibility (checksums, diffs) is
// lost; consider sorting the keys if deterministic output is ever needed.
func (idx *InvertedIndex) Encode() ([]byte, error) {
	buf := new(bytes.Buffer)

	// Write header with BM25 metadata
	if err := idx.encodeHeader(buf); err != nil {
		return nil, err
	}

	// Write document statistics
	if err := idx.encodeDocStats(buf); err != nil {
		return nil, err
	}

	// Write roaring bitmaps (NEW!)
	if err := idx.encodeRoaringBitmaps(buf); err != nil {
		return nil, err
	}

	// Write posting lists (existing format)
	encoder := newIndexEncoder(buf)
	for term, skipList := range idx.PostingsList {
		if err := encoder.encodeTerm(term, skipList); err != nil {
			return nil, err
		}
	}

	return buf.Bytes(), nil
}

// encodeHeader writes the index metadata: corpus-wide counters, the BM25
// tuning parameters, and the number of per-document stat records that follow.
// All integers are little-endian; K1/B are written as IEEE-754 float64 bits.
func (idx *InvertedIndex) encodeHeader(buf *bytes.Buffer) error {
	// Write corpus statistics
	if err := binary.Write(buf, binary.LittleEndian, uint32(idx.TotalDocs)); err != nil {
		return err
	}
	if err := binary.Write(buf, binary.LittleEndian, uint64(idx.TotalTerms)); err != nil {
		return err
	}

	// Write BM25 parameters
	if err := binary.Write(buf, binary.LittleEndian, idx.BM25Params.K1); err != nil {
		return err
	}
	if err := binary.Write(buf, binary.LittleEndian, idx.BM25Params.B); err != nil {
		return err
	}

	// Write number of documents with statistics
	if err := binary.Write(buf, binary.LittleEndian, uint32(len(idx.DocStats))); err != nil {
		return err
	}

	return nil
}

// encodeDocStats writes document statistics for BM25: per document its ID,
// length, and the (term, frequency) pairs of its unique terms.
//
// NOTE(review): both idx.DocStats and docStats.TermFreqs are ranged as maps,
// so record order is nondeterministic between runs (see note on Encode).
func (idx *InvertedIndex) encodeDocStats(buf *bytes.Buffer) error {
	for _, docStats := range idx.DocStats {
		// Write document ID and length
		if err := binary.Write(buf, binary.LittleEndian, uint32(docStats.DocID)); err != nil {
			return err
		}
		if err := binary.Write(buf, binary.LittleEndian, uint32(docStats.Length)); err != nil {
			return err
		}

		// Write number of unique terms
		if err := binary.Write(buf, binary.LittleEndian, uint32(len(docStats.TermFreqs))); err != nil {
			return err
		}

		// Write each term and its frequency
		for term, freq := range docStats.TermFreqs {
			// Write term length and term
			termBytes := []byte(term)
			if err := binary.Write(buf, binary.LittleEndian, uint32(len(termBytes))); err != nil {
				return err
			}
			if _, err := buf.Write(termBytes); err != nil {
				return err
			}

			// Write frequency
			if err := binary.Write(buf, binary.LittleEndian, uint32(freq)); err != nil {
				return err
			}
		}
	}

	return nil
}

// encodeRoaringBitmaps writes the roaring bitmaps for document-level storage
//
// ROARING BITMAP SERIALIZATION:
// ------------------------------
// Roaring bitmaps have their own efficient binary format via ToBytes()
// We just need to wrap it with term names and lengths
//
// FORMAT:
// -------
// [NumBitmaps: uint32]
// For each bitmap:
//
//	[TermLength: uint32][Term: bytes]
//	[BitmapLength: uint32][Bitmap: bytes]
//
// EXAMPLE:
// --------
// Term "quick" appears in documents [1, 3, 5, 100, 500]
// Roaring serializes this to ~20 bytes (vs 40 bytes for raw integers!)
//
// COMPRESSION BENEFITS:
// ---------------------
// For term "the" in 500,000 documents:
// - Skip list: ~24 MB (500k nodes × 48 bytes)
// - Roaring bitmap: ~60 KB (400x compression!)
212 | func (idx *InvertedIndex) encodeRoaringBitmaps(buf *bytes.Buffer) error { 213 | // Write number of bitmaps 214 | if err := binary.Write(buf, binary.LittleEndian, uint32(len(idx.DocBitmaps))); err != nil { 215 | return err 216 | } 217 | 218 | // Write each term and its bitmap 219 | for term, bitmap := range idx.DocBitmaps { 220 | // Write term name 221 | termBytes := []byte(term) 222 | if err := binary.Write(buf, binary.LittleEndian, uint32(len(termBytes))); err != nil { 223 | return err 224 | } 225 | if _, err := buf.Write(termBytes); err != nil { 226 | return err 227 | } 228 | 229 | // Write roaring bitmap (it has its own compact serialization) 230 | bitmapBytes, err := bitmap.ToBytes() 231 | if err != nil { 232 | return err 233 | } 234 | if err := binary.Write(buf, binary.LittleEndian, uint32(len(bitmapBytes))); err != nil { 235 | return err 236 | } 237 | if _, err := buf.Write(bitmapBytes); err != nil { 238 | return err 239 | } 240 | } 241 | 242 | return nil 243 | } 244 | 245 | // indexEncoder handles the encoding process 246 | // 247 | // This encapsulates the encoding state and provides helper methods. 248 | // Using a struct is cleaner than passing a buffer around everywhere. 
type indexEncoder struct {
	buffer *bytes.Buffer // Accumulates the serialized data
}

// newIndexEncoder wraps the given buffer in an encoder; all write helpers
// append to this buffer.
func newIndexEncoder(buffer *bytes.Buffer) *indexEncoder {
	return &indexEncoder{
		buffer: buffer,
	}
}

// encodeTerm serializes a single term and its skip list
//
// FOUR-PHASE ENCODING:
// --------------------
// Phase 1: Write the term name
// Phase 2: Build the node-index map (pointer → stable integer index)
// Phase 3: Write node positions (DocID, Offset pairs)
// Phase 4: Write tower structure (how nodes link together)
func (e *indexEncoder) encodeTerm(term string, skipList SkipList) error {
	// PHASE 1: Write term name
	// Format: [length: uint32][bytes]
	if err := e.writeString(term); err != nil {
		return err
	}

	// PHASE 2: Build node index map
	// Assign each node a sequential index: Head=1, Next=2, etc.
	// This map lets us convert node pointers to indices
	nodeMap := e.buildNodeIndexMap(skipList)

	// PHASE 3: Write node positions
	// Format: [length: uint32][DocID: uint32][Offset: uint32]...
	nodeData := e.encodeNodePositions(skipList)
	if err := e.writeBytes(nodeData); err != nil {
		return err
	}

	// PHASE 4: Write tower structure
	// This is the most complex part - see encodeTowerStructure
	return e.encodeTowerStructure(skipList, nodeMap)
}

// writeString writes a length-prefixed string
//
// Format: [length: 4 bytes][string: length bytes]
//
// Example: "quick" (5 characters)
//
//	Binary: [0x05, 0x00, 0x00, 0x00, 'q', 'u', 'i', 'c', 'k']
//	        ^^^^^^^^^^^^^^^^^^^^      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
//	        length = 5                actual string bytes
func (e *indexEncoder) writeString(s string) error {
	data := []byte(s)

	// Write length as 32-bit unsigned integer (4 bytes)
	if err := binary.Write(e.buffer, binary.LittleEndian, uint32(len(data))); err != nil {
		return err
	}

	// Write the actual string bytes
	_, err := e.buffer.Write(data)
	return err
}

// writeBytes writes a length-prefixed byte array
//
// Same as writeString but for arbitrary byte data
func (e *indexEncoder) writeBytes(data []byte) error {
	// Write length prefix
	if err := binary.Write(e.buffer, binary.LittleEndian, uint32(len(data))); err != nil {
		return err
	}

	// Write the data
	_, err := e.buffer.Write(data)
	return err
}

// buildNodeIndexMap creates a mapping from node positions to sequential indices
//
// WHY DO WE NEED THIS?
// --------------------
// Skip list nodes are connected via pointers (memory addresses).
// We can't serialize pointers because:
// 1. Memory addresses change between program runs
// 2. Addresses are meaningless on different machines
//
// Solution: Assign each node a stable index (1, 2, 3, ...)
// Then we can say "Node 1 points to Node 3" instead of memory addresses.
//
// NOTE(review): the traversal below starts AT the Head sentinel, so Head
// itself receives index 1 and the first real node receives index 2 — the
// worked examples in the comments above ("{Doc1:Pos1} → Index 1") appear to
// be off by one relative to the code; verify against the decoder.
func (e *indexEncoder) buildNodeIndexMap(skipList SkipList) map[nodePosition]int {
	nodeMap := make(map[nodePosition]int)
	current := skipList.Head
	index := 1 // Start from 1 (0 means nil/null)

	// Traverse the bottom level of the skip list
	for current != nil {
		// Create a compact position identifier
		pos := nodePosition{
			DocID:    int32(current.Key.DocumentID),
			Position: int32(current.Key.Offset),
		}

		// Assign this node the next sequential index
		nodeMap[pos] = index
		index++

		// Move to next node
		current = current.Tower[0]
	}

	return nodeMap
}

// encodeNodePositions serializes all node positions (DocID, Offset pairs)
//
// FORMAT:
// -------
// For each node: [DocID: int32][Offset: int32]
//
// NOTE(review): like buildNodeIndexMap, this starts at the Head sentinel, so
// the sentinel's zero-value key is serialized as the first (DocID, Offset)
// pair; the decoder must account for that leading entry.
//
// The binary.Write return values are deliberately ignored here: writing to a
// bytes.Buffer cannot fail (Buffer.Write always returns a nil error).
func (e *indexEncoder) encodeNodePositions(skipList SkipList) []byte {
	buf := new(bytes.Buffer)
	current := skipList.Head

	// Traverse all nodes in the skip list
	for current != nil {
		// Write document ID (4 bytes)
		binary.Write(buf, binary.LittleEndian, int32(current.Key.DocumentID))

		// Write offset (4 bytes)
		binary.Write(buf, binary.LittleEndian, int32(current.Key.Offset))

		// Move to next node
		current = current.Tower[0]
	}

	return buf.Bytes()
}

// encodeTowerStructure serializes the skip list tower connections
//
// ENCODING STRATEGY:
// ------------------
// For each node (including the Head sentinel), encode which nodes its tower
// points to, as stable indices from nodeMap rather than memory addresses.
// Each node's tower is written as one length-prefixed record (see
// encodeTowerForNode for the per-node layout).
func (e *indexEncoder) encodeTowerStructure(skipList SkipList, nodeMap map[nodePosition]int) error {
	current := skipList.Head

	// Encode tower for each node in the skip list
	for current != nil {
		towerData := e.encodeTowerForNode(current, nodeMap)
		if err := e.writeBytes(towerData); err != nil {
			return err
		}
		current = current.Tower[0]
	}

	return nil
}

// encodeTowerForNode encodes the tower structure for a single node
//
// PROCESS:
// --------
// 1. Collect all non-nil tower pointers
// 2. Convert each pointer to its index (using nodeMap)
// 3. Write indices as uint16 values
//
// Special case: If tower is empty (no forward pointers), write [0]
//
// NOTE(review): indices are truncated to uint16, so a posting list with more
// than 65535 nodes for one term would silently wrap and corrupt the encoded
// tower links — confirm whether that bound is enforced elsewhere.
func (e *indexEncoder) encodeTowerForNode(node *Node, nodeMap map[nodePosition]int) []byte {
	buf := new(bytes.Buffer)

	// Collect all non-nil tower levels
	towerIndices := e.collectTowerIndices(node, nodeMap)

	if len(towerIndices) == 0 {
		// Empty tower - write a single zero
		binary.Write(buf, binary.LittleEndian, uint16(0))
	} else {
		// Write each index as a 2-byte value
		for _, index := range towerIndices {
			binary.Write(buf, binary.LittleEndian, uint16(index))
		}
	}

	return buf.Bytes()
}

// collectTowerIndices extracts tower pointers and converts them to indices
//
// Walks the node's tower from level 0 upward, stopping at the first nil
// pointer (towers are assumed contiguous from level 0), and maps each target
// node's (DocID, Offset) through nodeMap to its stable integer index.
//
// Example: tower [PtrA, PtrB, nil, ...] where nodeMap gives A→3, B→7
// yields [3, 7].
func (e *indexEncoder) collectTowerIndices(node *Node, nodeMap map[nodePosition]int) []int {
	var indices []int

	// Walk up the tower until we hit a nil pointer
	for level := 0; level < MaxHeight; level++ {
		if node.Tower[level] == nil {
			break // No more levels
		}

		// Get the position of the target node
		pos := nodePosition{
			DocID:    int32(node.Tower[level].Key.DocumentID),
			Position: int32(node.Tower[level].Key.Offset),
		}

		// Look up the target node's index
		indices = append(indices, nodeMap[pos])
	}

	return indices
}

// nodePosition represents a
compact node position for encoding 519 | // 520 | // We use int32 to match our internal representation: 521 | // - Document IDs are integers 522 | // - Positions are integers 523 | // - Sentinel values (BOF/EOF) use int as well 524 | // 525 | // int32 provides: 526 | // - 4 bytes per value (efficient storage) 527 | // - Range: -2,147,483,648 to 2,147,483,647 528 | // - Sufficient for document IDs and positions 529 | type nodePosition struct { 530 | DocID int32 531 | Position int32 532 | } 533 | 534 | // ═══════════════════════════════════════════════════════════════════════════════ 535 | // DESERIALIZATION: Loading the Index from Binary Data 536 | // ═══════════════════════════════════════════════════════════════════════════════ 537 | // This is the reverse of encoding - we read the binary data and reconstruct 538 | // the entire index structure including all skip list pointers. 539 | // 540 | // THREE-PHASE DECODING: 541 | // --------------------- 542 | // Phase 1: Read term names and node positions 543 | // Phase 2: Create node objects 544 | // Phase 3: Reconstruct tower pointers (the tricky part!) 545 | // 546 | // ═══════════════════════════════════════════════════════════════════════════════ 547 | 548 | // Decode deserializes binary data back into an inverted index 549 | // 550 | // PROCESS: 551 | // -------- 552 | // 1. Create a decoder to track our position in the byte array 553 | // 2. Repeatedly decode terms until we reach the end 554 | // 3. Reconstruct the PostingsList map 555 | // 556 | // EXAMPLE: 557 | // -------- 558 | // Input: [5]['quick'][16][1,1,3,0][4][2][2][0]... 
559 | // Output: PostingsList["quick"] = SkipList{...} 560 | // Decode deserializes binary data back into an inverted index with HYBRID STORAGE and BM25 stats 561 | func (idx *InvertedIndex) Decode(data []byte) error { 562 | offset := 0 563 | 564 | // Read header with BM25 metadata 565 | newOffset, err := idx.decodeHeader(data, offset) 566 | if err != nil { 567 | return err 568 | } 569 | offset = newOffset 570 | 571 | // Read document statistics 572 | newOffset, err = idx.decodeDocStats(data, offset) 573 | if err != nil { 574 | return err 575 | } 576 | offset = newOffset 577 | 578 | // Read roaring bitmaps (NEW!) 579 | newOffset, err = idx.decodeRoaringBitmaps(data, offset) 580 | if err != nil { 581 | return err 582 | } 583 | offset = newOffset 584 | 585 | // Read posting lists (existing format) 586 | decoder := newIndexDecoder(data, offset) 587 | recoveredIndex := make(map[string]SkipList) 588 | 589 | for !decoder.isComplete() { 590 | term, skipList, err := decoder.decodeTerm() 591 | if err != nil { 592 | return err 593 | } 594 | recoveredIndex[term] = skipList 595 | } 596 | 597 | idx.PostingsList = recoveredIndex 598 | return nil 599 | } 600 | 601 | // decodeHeader reads the index metadata 602 | func (idx *InvertedIndex) decodeHeader(data []byte, offset int) (int, error) { 603 | // Read corpus statistics 604 | idx.TotalDocs = int(binary.LittleEndian.Uint32(data[offset : offset+4])) 605 | offset += 4 606 | 607 | idx.TotalTerms = int64(binary.LittleEndian.Uint64(data[offset : offset+8])) 608 | offset += 8 609 | 610 | // Read BM25 parameters 611 | idx.BM25Params.K1 = math.Float64frombits(binary.LittleEndian.Uint64(data[offset : offset+8])) 612 | offset += 8 613 | 614 | idx.BM25Params.B = math.Float64frombits(binary.LittleEndian.Uint64(data[offset : offset+8])) 615 | offset += 8 616 | 617 | return offset, nil 618 | } 619 | 620 | // decodeDocStats reads document statistics 621 | func (idx *InvertedIndex) decodeDocStats(data []byte, offset int) (int, error) { 622 | // 
Read number of documents 623 | numDocs := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 624 | offset += 4 625 | 626 | idx.DocStats = make(map[int]DocumentStats, numDocs) 627 | 628 | for i := 0; i < numDocs; i++ { 629 | // Read document ID and length 630 | docID := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 631 | offset += 4 632 | 633 | length := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 634 | offset += 4 635 | 636 | // Read number of unique terms 637 | numTerms := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 638 | offset += 4 639 | 640 | // Initialize document stats 641 | docStats := DocumentStats{ 642 | DocID: docID, 643 | Length: length, 644 | TermFreqs: make(map[string]int, numTerms), 645 | } 646 | 647 | // Read each term and its frequency 648 | for j := 0; j < numTerms; j++ { 649 | // Read term length 650 | termLen := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 651 | offset += 4 652 | 653 | // Read term 654 | term := string(data[offset : offset+termLen]) 655 | offset += termLen 656 | 657 | // Read frequency 658 | freq := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 659 | offset += 4 660 | 661 | docStats.TermFreqs[term] = freq 662 | } 663 | 664 | idx.DocStats[docID] = docStats 665 | } 666 | 667 | return offset, nil 668 | } 669 | 670 | // decodeRoaringBitmaps reads the roaring bitmaps for document-level storage 671 | // 672 | // DESERIALIZATION: 673 | // ---------------- 674 | // Read each term and its roaring bitmap, reconstructing the DocBitmaps map 675 | // 676 | // FORMAT: 677 | // ------- 678 | // [NumBitmaps: uint32] 679 | // For each bitmap: 680 | // 681 | // [TermLength: uint32][Term: bytes] 682 | // [BitmapLength: uint32][Bitmap: bytes] 683 | // 684 | // RECOVERY: 685 | // --------- 686 | // We create a new roaring.Bitmap for each term and deserialize it 687 | // using roaring's native UnmarshalBinary method 688 | func (idx *InvertedIndex) decodeRoaringBitmaps(data []byte, 
offset int) (int, error) { 689 | // Read number of bitmaps 690 | numBitmaps := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 691 | offset += 4 692 | 693 | // Initialize the DocBitmaps map 694 | idx.DocBitmaps = make(map[string]*roaring.Bitmap, numBitmaps) 695 | 696 | // Read each term and its bitmap 697 | for i := 0; i < numBitmaps; i++ { 698 | // Read term length 699 | termLen := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 700 | offset += 4 701 | 702 | // Read term 703 | term := string(data[offset : offset+termLen]) 704 | offset += termLen 705 | 706 | // Read bitmap length 707 | bitmapLen := int(binary.LittleEndian.Uint32(data[offset : offset+4])) 708 | offset += 4 709 | 710 | // Read and deserialize bitmap 711 | bitmap := roaring.NewBitmap() 712 | if err := bitmap.UnmarshalBinary(data[offset : offset+bitmapLen]); err != nil { 713 | return 0, err 714 | } 715 | offset += bitmapLen 716 | 717 | // Store in map 718 | idx.DocBitmaps[term] = bitmap 719 | } 720 | 721 | return offset, nil 722 | } 723 | 724 | // indexDecoder handles the decoding process 725 | // 726 | // State management: 727 | // - data: The full byte array we're decoding 728 | // - offset: Our current position in the array 729 | type indexDecoder struct { 730 | data []byte 731 | offset int 732 | } 733 | 734 | func newIndexDecoder(data []byte, offset int) *indexDecoder { 735 | return &indexDecoder{ 736 | data: data, 737 | offset: offset, 738 | } 739 | } 740 | 741 | // isComplete checks if we've decoded all the data 742 | func (d *indexDecoder) isComplete() bool { 743 | return d.offset >= len(d.data) 744 | } 745 | 746 | // decodeTerm decodes a single term and its skip list 747 | // 748 | // DECODING SEQUENCE: 749 | // ------------------ 750 | // 1. Read term name: "quick" 751 | // 2. Read node positions: [Doc1:Pos1, Doc3:Pos0] 752 | // 3. Create node objects with these positions 753 | // 4. Read tower structure and link nodes together 754 | // 5. 
Return the reconstructed SkipList 755 | func (d *indexDecoder) decodeTerm() (string, SkipList, error) { 756 | // Step 1: Read the term name 757 | term, err := d.readString() 758 | if err != nil { 759 | return "", SkipList{}, err 760 | } 761 | 762 | // Step 2: Read and decode node positions 763 | // Returns a map: Index → Node pointer 764 | nodeMap, err := d.decodeNodePositions() 765 | if err != nil { 766 | return "", SkipList{}, err 767 | } 768 | 769 | // Step 3: Decode tower structure (reconnect the nodes) 770 | height, err := d.decodeTowerStructure(nodeMap) 771 | if err != nil { 772 | return "", SkipList{}, err 773 | } 774 | 775 | // Step 4: Create the SkipList structure 776 | skipList := SkipList{ 777 | Head: nodeMap[1], // First node is always at index 1 778 | Height: height, 779 | } 780 | 781 | return term, skipList, nil 782 | } 783 | 784 | // readString reads a length-prefixed string 785 | // 786 | // Format: [length: 4 bytes][string: length bytes] 787 | // 788 | // EXAMPLE: 789 | // -------- 790 | // Data: [0x05, 0x00, 0x00, 0x00, 'q', 'u', 'i', 'c', 'k', ...] 791 | // 792 | // ^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 793 | // length = 5 string bytes 794 | // 795 | // Returns: "quick" 796 | // Advances offset by: 4 + 5 = 9 bytes 797 | func (d *indexDecoder) readString() (string, error) { 798 | // Read the length (4 bytes) 799 | length := int(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 800 | d.offset += 4 801 | 802 | // Read the string bytes 803 | str := string(d.data[d.offset : d.offset+length]) 804 | d.offset += length 805 | 806 | return str, nil 807 | } 808 | 809 | // decodeNodePositions reconstructs all nodes from their serialized positions 810 | // 811 | // INPUT FORMAT: 812 | // ------------- 813 | // [dataLength: 4 bytes][DocID: 4 bytes][Offset: 4 bytes]... 814 | // 815 | // PROCESS: 816 | // -------- 817 | // 1. Read data length: How many bytes of position data? 818 | // 2. Calculate number of values: dataLength / 4 819 | // 3. 
Read pairs of values: (DocID, Offset) 820 | // 4. Create Node objects 821 | // 5. Assign sequential indices: 1, 2, 3, ... 822 | // 823 | // EXAMPLE: 824 | // -------- 825 | // Data: [16][1][1][3][0] 826 | // 827 | // ^^^ 16 bytes of position data 828 | // ^^ DocID=1, Offset=1 → Node 1 829 | // ^^ DocID=3, Offset=0 → Node 2 830 | // 831 | // Result: map[1→Node{Doc1:Pos1}, 2→Node{Doc3:Pos0}] 832 | func (d *indexDecoder) decodeNodePositions() (map[int]*Node, error) { 833 | // Read the length of position data 834 | dataLength := int(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 835 | d.offset += 4 836 | 837 | nodeMap := make(map[int]*Node) 838 | nodeIndex := 1 839 | 840 | // Each position is 8 bytes: 4 for DocID + 4 for Offset 841 | // So numValues = dataLength / 4 gives us the total number of int32s 842 | // And we process them in pairs 843 | numValues := dataLength / 4 844 | 845 | for i := 0; i < numValues; i += 2 { 846 | // Read Document ID (as int32) 847 | docID := int32(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 848 | d.offset += 4 849 | 850 | // Read Offset (as int32) 851 | offset := int32(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 852 | d.offset += 4 853 | 854 | // Create a new node with this position 855 | node := &Node{ 856 | Key: Position{ 857 | DocumentID: int(docID), 858 | Offset: int(offset), 859 | }, 860 | } 861 | 862 | // Assign it a sequential index 863 | nodeMap[nodeIndex] = node 864 | nodeIndex++ 865 | } 866 | 867 | return nodeMap, nil 868 | } 869 | 870 | // decodeTowerStructure reconstructs the skip list tower connections 871 | // 872 | // THIS IS THE MAGIC STEP! 873 | // ----------------------- 874 | // We now have nodes, but they're not connected. 875 | // This function reads the tower indices and reconnects everything. 876 | // 877 | // INPUT FORMAT (for each node): 878 | // ----------------------------- 879 | // [towerLength: 4 bytes][index1: 2 bytes][index2: 2 bytes]... 
880 | // 881 | // EXAMPLE: 882 | // -------- 883 | // Node 1: [4][2][4] ← Tower has 2 levels: points to nodes 2 and 4 884 | // Node 2: [2][3] ← Tower has 1 level: points to node 3 885 | // Node 3: [2][0] ← Tower has 1 level: points to nothing (end) 886 | // 887 | // RECONSTRUCTION: 888 | // --------------- 889 | // For Node 1: 890 | // - Read indices: [2, 4] 891 | // - Set Tower[0] = nodeMap[2] 892 | // - Set Tower[1] = nodeMap[4] 893 | // 894 | // Result: Node 1 is now connected to nodes 2 and 4 at levels 0 and 1! 895 | func (d *indexDecoder) decodeTowerStructure(nodeMap map[int]*Node) (int, error) { 896 | maxHeight := 1 // Track the maximum tower height 897 | nodeCount := len(nodeMap) 898 | 899 | // Process tower data for each node 900 | for nodeIndex := 1; nodeIndex <= nodeCount; nodeIndex++ { 901 | // Read the length of tower data for this node 902 | towerLength := int(binary.LittleEndian.Uint32(d.data[d.offset : d.offset+4])) 903 | d.offset += 4 904 | 905 | // Calculate how many indices are stored (each index is 2 bytes) 906 | numIndices := towerLength / 2 907 | 908 | // Read each tower level 909 | for level := 0; level < numIndices; level++ { 910 | // Read the target node index 911 | targetIndex := int(binary.LittleEndian.Uint16(d.data[d.offset : d.offset+2])) 912 | d.offset += 2 913 | 914 | // If index is not 0 (0 means nil), connect the nodes 915 | if targetIndex != 0 { 916 | nodeMap[nodeIndex].Tower[level] = nodeMap[targetIndex] 917 | 918 | // Track maximum height 919 | if level+1 > maxHeight { 920 | maxHeight = level + 1 921 | } 922 | } 923 | } 924 | } 925 | 926 | return maxHeight, nil 927 | } 928 | -------------------------------------------------------------------------------- /search.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "crypto/md5" 5 | "encoding/hex" 6 | "encoding/json" 7 | "fmt" 8 | "log/slog" 9 | "math" 10 | "sort" 11 | "strings" 12 | ) 13 | 14 | // 
═══════════════════════════════════════════════════════════════════════════════ 15 | // PHRASE SEARCH: Finding Multi-Word Sequences 16 | // ═══════════════════════════════════════════════════════════════════════════════ 17 | // Phrase search finds exact sequences of words. 18 | // 19 | // THE ALGORITHM: 20 | // -------------- 21 | // To find "quick brown fox", we need three words at consecutive positions 22 | // in the same document. 23 | // 24 | // Strategy: 25 | // 1. Find ANY occurrence of all three words (might not be consecutive) 26 | // 2. Walk backwards to find the start of the phrase 27 | // 3. Check if the positions are consecutive 28 | // 4. If yes, we found it! If no, try again from a different starting point 29 | // 30 | // VISUAL EXAMPLE: 31 | // --------------- 32 | // Document: "the quick brown dog ate the brown fox quickly" 33 | // Positions: 0 1 2 3 4 5 6 7 8 34 | // 35 | // Searching for "brown fox": 36 | // Attempt 1: 37 | // - Find "brown" (any occurrence): Pos 2 38 | // - Find "fox" after Pos 2: Pos 7 39 | // - Walk back from "fox" to find "brown": Pos 6 40 | // - Check: Are Pos 6 and Pos 7 consecutive? YES! → Found it! 
41 | // 42 | // ═══════════════════════════════════════════════════════════════════════════════ 43 | 44 | // NextPhrase finds the next occurrence of a phrase (sequence of words) in the index 45 | // 46 | // ALGORITHM WALKTHROUGH: 47 | // ---------------------- 48 | // Query: "quick brown fox" 49 | // StartPos: Beginning of file 50 | // 51 | // Step 1: Find the END of a potential phrase 52 | // - Find "quick" after startPos → maybe Doc2:Pos3 53 | // - Find "brown" after that → maybe Doc2:Pos4 54 | // - Find "fox" after that → maybe Doc2:Pos5 55 | // - endPos = Doc2:Pos5 56 | // 57 | // Step 2: Walk BACKWARDS to find the START 58 | // - From endPos, find previous "brown" → Doc2:Pos4 59 | // - From there, find previous "quick" → Doc2:Pos3 60 | // - phraseStart = Doc2:Pos3 61 | // 62 | // Step 3: Validate it's a real phrase 63 | // - Same document? Yes (both Doc2) 64 | // - Consecutive positions? Yes (3, 4, 5) 65 | // - Distance = 5 - 3 = 2 (which equals 3 words - 1) ✓ 66 | // 67 | // Step 4: If not valid, recurse from phraseStart 68 | // - This handles cases where words appear multiple times 69 | // 70 | // Why this algorithm? 
71 | // - It's efficient: We use the index to jump between occurrences 72 | // - It handles multiple occurrences: Recursion keeps searching 73 | // - It validates correctness: We check for consecutive positions 74 | func (idx *InvertedIndex) NextPhrase(query string, startPos Position) []Position { 75 | terms := strings.Fields(query) // Split "quick brown fox" → ["quick", "brown", "fox"] 76 | 77 | // STEP 1: Find the end of a potential phrase match 78 | endPos := idx.findPhraseEnd(terms, startPos) 79 | if endPos.IsEnd() { 80 | // No more occurrences of all words exist 81 | return []Position{EOFDocument, EOFDocument} 82 | } 83 | 84 | // STEP 2: Walk backwards to find where the phrase starts 85 | phraseStart := idx.findPhraseStart(terms, endPos) 86 | 87 | // STEP 3: Validate that we found a real consecutive phrase 88 | if idx.isValidPhrase(phraseStart, endPos, len(terms)) { 89 | // Success! Return [start, end] positions of the phrase 90 | return []Position{phraseStart, endPos} 91 | } 92 | 93 | // STEP 4: Not a valid phrase - try again from the start position 94 | // This handles cases like: "brown dog brown fox" when searching for "brown fox" 95 | return idx.NextPhrase(query, phraseStart) 96 | } 97 | 98 | // findPhraseEnd locates the ending position of a potential phrase 99 | // 100 | // HOW IT WORKS: 101 | // ------------- 102 | // Starting from startPos, we "hop" through the document finding each word. 
103 | // 104 | // Example: Finding "quick brown fox" starting from Doc1:Pos0 105 | // 106 | // Step 1: currentPos = Doc1:Pos0 107 | // Step 2: Find "quick" after Doc1:Pos0 → currentPos = Doc1:Pos3 108 | // Step 3: Find "brown" after Doc1:Pos3 → currentPos = Doc1:Pos4 109 | // Step 4: Find "fox" after Doc1:Pos4 → currentPos = Doc1:Pos5 110 | // Return: Doc1:Pos5 (position of the last word "fox") 111 | // 112 | // If any word isn't found, we return EOF (no phrase exists) 113 | func (idx *InvertedIndex) findPhraseEnd(terms []string, startPos Position) Position { 114 | currentPos := startPos 115 | 116 | // For each word in the phrase, find its next occurrence 117 | for _, term := range terms { 118 | currentPos, _ = idx.Next(term, currentPos) 119 | 120 | // If we can't find this word, the phrase doesn't exist 121 | if currentPos.IsEnd() { 122 | return EOFDocument 123 | } 124 | } 125 | 126 | // currentPos now points to the last word of the phrase 127 | return currentPos 128 | } 129 | 130 | // findPhraseStart walks backward to find where the phrase begins 131 | // 132 | // HOW IT WORKS: 133 | // ------------- 134 | // Starting from the END position, we walk backwards through the phrase. 135 | // 136 | // Example: We found "fox" at Doc1:Pos5, now find the start of "quick brown fox" 137 | // 138 | // Step 1: currentPos = Doc1:Pos5 (we're at "fox") 139 | // Step 2: Find "brown" before Doc1:Pos5 → currentPos = Doc1:Pos4 140 | // Step 3: Find "quick" before Doc1:Pos4 → currentPos = Doc1:Pos3 141 | // Return: Doc1:Pos3 (position of the first word "quick") 142 | // 143 | // Why skip the last word? 
144 | // - We already know where the last word is (at endPos) 145 | // - We only need to walk back through the first N-1 words 146 | func (idx *InvertedIndex) findPhraseStart(terms []string, endPos Position) Position { 147 | currentPos := endPos 148 | 149 | // Walk backwards through all words EXCEPT the last one 150 | // (we already know the last word's position - it's endPos) 151 | for i := len(terms) - 2; i >= 0; i-- { 152 | currentPos, _ = idx.Previous(terms[i], currentPos) 153 | } 154 | 155 | // currentPos now points to the first word of the phrase 156 | return currentPos 157 | } 158 | 159 | // isValidPhrase checks if positions form a valid consecutive phrase 160 | // 161 | // VALIDATION RULES: 162 | // ----------------- 163 | // For a valid phrase, we need: 164 | // 1. All words in the SAME document 165 | // 2. Words at CONSECUTIVE positions 166 | // 167 | // Example: Checking "quick brown fox" (3 words) 168 | // 169 | // start = Doc1:Pos3 170 | // end = Doc1:Pos5 171 | // 172 | // Check 1: Same document? Doc1 == Doc1 ✓ 173 | // Check 2: Consecutive? (5 - 3) == (3 - 1) → 2 == 2 ✓ 174 | // Result: VALID 175 | // 176 | // Counter-example: NOT a valid phrase 177 | // 178 | // start = Doc1:Pos3 179 | // end = Doc1:Pos7 180 | // 181 | // Check 2: Consecutive? 
(7 - 3) == (3 - 1) → 4 == 2 ✗ 182 | // Result: INVALID (there are extra words between them) 183 | func (idx *InvertedIndex) isValidPhrase(start, end Position, termCount int) bool { 184 | // Calculate expected distance for consecutive words 185 | // For 3 words, positions should be like [0,1,2] → distance = 2 186 | expectedDistance := termCount - 1 187 | 188 | // Calculate actual distance between start and end 189 | actualDistance := end.GetOffset() - start.GetOffset() 190 | 191 | // Both conditions must be true 192 | return start.DocumentID == end.DocumentID && actualDistance == expectedDistance 193 | } 194 | 195 | // FindAllPhrases finds ALL occurrences of a phrase in the entire index 196 | // 197 | // ALGORITHM: 198 | // ---------- 199 | // This is just a loop that repeatedly calls NextPhrase until we reach EOF. 200 | // 201 | // Example: Finding all occurrences of "brown fox" 202 | // 203 | // Iteration 1: 204 | // - Search from BOF → Found at Doc2:Pos[3-4] 205 | // - Add to results 206 | // - Continue from Doc2:Pos3 207 | // 208 | // Iteration 2: 209 | // - Search from Doc2:Pos3 → Found at Doc5:Pos[1-2] 210 | // - Add to results 211 | // - Continue from Doc5:Pos1 212 | // 213 | // Iteration 3: 214 | // - Search from Doc5:Pos1 → Returns EOF 215 | // - Stop searching 216 | // 217 | // Result: [[Doc2:Pos3-4], [Doc5:Pos1-2]] 218 | func (idx *InvertedIndex) FindAllPhrases(query string, startPos Position) [][]Position { 219 | var allMatches [][]Position 220 | currentPos := BOFDocument // Start from the beginning 221 | 222 | // Keep searching until we reach the end of file 223 | for !currentPos.IsEnd() { 224 | // Find the next occurrence of the phrase 225 | phrasePositions := idx.NextPhrase(query, currentPos) 226 | phraseStart := phrasePositions[0] 227 | 228 | // If we found a valid phrase (not EOF), add it to results 229 | if !phraseStart.IsEnd() { 230 | allMatches = append(allMatches, phrasePositions) 231 | } 232 | 233 | // Move to where we found the phrase to 
continue searching 234 | currentPos = phraseStart 235 | } 236 | 237 | return allMatches 238 | } 239 | 240 | // ═══════════════════════════════════════════════════════════════════════════════ 241 | // PROXIMITY SEARCH: Finding Documents Containing All Terms 242 | // ═══════════════════════════════════════════════════════════════════════════════ 243 | // A "cover" is a range of positions that contains ALL search terms. 244 | // Unlike phrase search, the words don't need to be consecutive or in order. 245 | // 246 | // EXAMPLE: 247 | // -------- 248 | // Document: "the quick brown dog jumped over the lazy fox" 249 | // Positions: 0 1 2 3 4 5 6 7 8 250 | // 251 | // Searching for ["quick", "fox"]: 252 | // Cover 1: Pos 1 to Pos 8 (entire range containing both words) 253 | // This is the MINIMAL cover (smallest range containing all terms) 254 | // 255 | // WHY USE COVERS? 256 | // --------------- 257 | // Covers are used for: 258 | // 1. Boolean search: Find documents with ALL terms (AND query) 259 | // 2. Proximity ranking: Closer terms = higher relevance 260 | // 3. Snippet generation: Show the most relevant part of a document 261 | // 262 | // THE ALGORITHM: 263 | // -------------- 264 | // To find a cover: 265 | // 1. Find the FURTHEST occurrence of any term (this is the cover end) 266 | // 2. Walk BACKWARDS to find the EARLIEST occurrence of each term 267 | // 3. Check if all terms are in the same document 268 | // 4. If yes, we found a cover! If no, try again. 
269 | // ═══════════════════════════════════════════════════════════════════════════════ 270 | 271 | // NextCover finds the next "cover" - a range containing all given tokens 272 | // 273 | // ALGORITHM WALKTHROUGH: 274 | // ---------------------- 275 | // Query: ["quick", "fox", "brown"] 276 | // StartPos: Beginning of file 277 | // 278 | // PHASE 1: Find the cover END (furthest position) 279 | // - Find "quick" after startPos → maybe Doc2:Pos1 280 | // - Find "fox" after startPos → maybe Doc2:Pos8 ← furthest 281 | // - Find "brown" after startPos → maybe Doc2:Pos2 282 | // - coverEnd = Doc2:Pos8 283 | // 284 | // PHASE 2: Find the cover START (earliest position before end) 285 | // - From Doc2:Pos9, find previous "quick" → Doc2:Pos1 ← earliest 286 | // - From Doc2:Pos9, find previous "fox" → Doc2:Pos8 287 | // - From Doc2:Pos9, find previous "brown" → Doc2:Pos2 288 | // - coverStart = Doc2:Pos1 289 | // 290 | // PHASE 3: Validate the cover 291 | // - Same document? Yes (all in Doc2) ✓ 292 | // - Return [Doc2:Pos1, Doc2:Pos8] 293 | // 294 | // If not same document, recurse from coverStart to find the next cover. 295 | // 296 | // Why this algorithm? 297 | // - Greedy approach: We find the furthest occurrence first 298 | // - Efficient: Uses index jumps instead of scanning 299 | // - Minimal covers: Always finds the smallest valid range 300 | func (idx *InvertedIndex) NextCover(tokens []string, startPos Position) []Position { 301 | // PHASE 1: Find the END of the cover (furthest position) 302 | coverEnd := idx.findCoverEnd(tokens, startPos) 303 | if coverEnd.IsEnd() { 304 | // Can't find all tokens - no cover exists 305 | return []Position{EOFDocument, EOFDocument} 306 | } 307 | 308 | // PHASE 2: Find the START of the cover (earliest position) 309 | coverStart := idx.findCoverStart(tokens, coverEnd) 310 | 311 | // PHASE 3: Validate the cover 312 | if coverStart.DocumentID == coverEnd.DocumentID { 313 | // Success! 
All tokens are in the same document 314 | return []Position{coverStart, coverEnd} 315 | } 316 | 317 | // Tokens span multiple documents - try again from coverStart 318 | return idx.NextCover(tokens, coverStart) 319 | } 320 | 321 | // findCoverEnd finds the furthest position among all tokens 322 | // 323 | // HOW IT WORKS: 324 | // ------------- 325 | // We find the next occurrence of EACH token and track the furthest one. 326 | // 327 | // Example: Finding cover end for ["quick", "brown", "fox"] from Doc1:Pos0 328 | // 329 | // Step 1: Find "quick" after Pos0 → Doc2:Pos1 330 | // maxPos = Doc2:Pos1 331 | // 332 | // Step 2: Find "brown" after Pos0 → Doc2:Pos2 333 | // Is Doc2:Pos2 after Doc2:Pos1? Yes 334 | // maxPos = Doc2:Pos2 335 | // 336 | // Step 3: Find "fox" after Pos0 → Doc2:Pos8 337 | // Is Doc2:Pos8 after Doc2:Pos2? Yes 338 | // maxPos = Doc2:Pos8 339 | // 340 | // Return: Doc2:Pos8 (the furthest position) 341 | // 342 | // Special case: If ANY token returns EOF, we can't form a cover 343 | func (idx *InvertedIndex) findCoverEnd(tokens []string, startPos Position) Position { 344 | maxPos := startPos 345 | 346 | for _, token := range tokens { 347 | // Find next occurrence of this token 348 | tokenPos, _ := idx.Next(token, startPos) 349 | 350 | // If any token is not found, we can't create a cover 351 | if tokenPos.IsEnd() { 352 | return EOFDocument 353 | } 354 | 355 | // Keep track of the furthest position 356 | if tokenPos.IsAfter(maxPos) { 357 | maxPos = tokenPos 358 | } 359 | } 360 | 361 | return maxPos 362 | } 363 | 364 | // findCoverStart finds the earliest position that still covers all tokens 365 | // 366 | // HOW IT WORKS: 367 | // ------------- 368 | // Starting from just after the cover end, we walk backwards to find each token. 
369 | // 370 | // Example: Finding cover start for ["quick", "brown", "fox"] 371 | // 372 | // with coverEnd at Doc2:Pos8 373 | // 374 | // searchBound = Doc2:Pos9 (one position after the end) 375 | // 376 | // Step 1: Find "quick" before Pos9 → Doc2:Pos1 ← earliest so far 377 | // minPos = Doc2:Pos1 378 | // 379 | // Step 2: Find "brown" before Pos9 → Doc2:Pos2 380 | // Is Doc2:Pos2 before Doc2:Pos1? No 381 | // minPos stays Doc2:Pos1 382 | // 383 | // Step 3: Find "fox" before Pos9 → Doc2:Pos8 384 | // Is Doc2:Pos8 before Doc2:Pos1? No 385 | // minPos stays Doc2:Pos1 386 | // 387 | // Return: Doc2:Pos1 (the earliest position) 388 | // 389 | // Why search from (endPos + 1)? 390 | // - Previous() returns positions STRICTLY BEFORE the search point 391 | // - By searching from endPos+1, we can find tokens AT endPos 392 | func (idx *InvertedIndex) findCoverStart(tokens []string, endPos Position) Position { 393 | minPos := BOFDocument 394 | 395 | // Create a search bound just after the cover end 396 | // This ensures we can find tokens AT the end position 397 | searchBound := Position{ 398 | DocumentID: endPos.DocumentID, 399 | Offset: endPos.Offset + 1, 400 | } 401 | 402 | for _, token := range tokens { 403 | // Find the previous occurrence of this token before searchBound 404 | tokenPos, _ := idx.Previous(token, searchBound) 405 | 406 | // Keep track of the earliest position 407 | if minPos.IsBeginning() || tokenPos.IsBefore(minPos) { 408 | minPos = tokenPos 409 | } 410 | } 411 | 412 | return minPos 413 | } 414 | 415 | // ═══════════════════════════════════════════════════════════════════════════════ 416 | // RANKING: Scoring Search Results by Relevance 417 | // ═══════════════════════════════════════════════════════════════════════════════ 418 | // Not all search results are equally relevant. We need to rank them! 419 | // 420 | // PROXIMITY RANKING: 421 | // ------------------ 422 | // The idea: Documents where search terms appear CLOSER together are more relevant. 
423 | // 424 | // Example: Searching for "machine learning" 425 | // Doc A: "machine learning is..." (distance: 1) → HIGH score 426 | // Doc B: "machine ... learning" (distance: 3) → MEDIUM score 427 | // Doc C: "machine ... ... ... learning" (distance: 5) → LOW score 428 | // 429 | // SCORING FORMULA: 430 | // ---------------- 431 | // For each cover in a document: 432 | // score += 1 / (coverEnd - coverStart + 1) 433 | // 434 | // Why this formula? 435 | // - Smaller distances → larger scores (inversely proportional) 436 | // - Multiple covers → higher score (sum of all covers) 437 | // - Simple and fast to compute 438 | // 439 | // EXAMPLE CALCULATION: 440 | // -------------------- 441 | // Document: "quick brown fox jumped over quick brown dog" 442 | // Positions: 0 1 2 3 4 5 6 7 443 | // 444 | // Searching for ["quick", "brown"]: 445 | // Cover 1: Pos[0-1] → score += 1/(1-0+1) = 1/2 = 0.5 446 | // Cover 2: Pos[5-6] → score += 1/(6-5+1) = 1/2 = 0.5 447 | // Total score: 1.0 448 | // 449 | // A document with terms closer together: 450 | // Document: "quick brown" 451 | // Positions: 0 1 452 | // Cover 1: Pos[0-1] → score = 1/(1-0+1) = 1/2 = 0.5 453 | // Total score: 0.5 (but only ONE occurrence) 454 | // 455 | // ═══════════════════════════════════════════════════════════════════════════════ 456 | 457 | // Match represents a search result with its positions and relevance score 458 | // 459 | // STRUCTURE: 460 | // ---------- 461 | // Offsets: The [start, end] positions of a cover in a document 462 | // Score: The relevance score (higher = more relevant) 463 | // 464 | // Example Match: 465 | // 466 | // Offsets: [Doc3:Pos1, Doc3:Pos5] ← This document matches from Pos1 to Pos5 467 | // Score: 2.7 ← Relevance score 468 | type Match struct { 469 | DocID int // Document identifier 470 | Offsets []Position // Where the match was found [start, end] 471 | Score float64 // How relevant is this match? 
472 | } 473 | 474 | // GetKey generates a unique identifier for the match 475 | func (m *Match) GetKey() (string, error) { 476 | data, err := json.Marshal(m.DocID) 477 | if err != nil { 478 | return "", err 479 | } 480 | hash := md5.Sum(data) 481 | return hex.EncodeToString(hash[:]), nil 482 | } 483 | 484 | // calculateIDF computes the Inverse Document Frequency for a term 485 | // 486 | // IDF FORMULA: 487 | // ------------ 488 | // IDF(term) = log((N - df + 0.5) / (df + 0.5) + 1) 489 | // 490 | // Where: 491 | // 492 | // N = total number of documents 493 | // df = number of documents containing the term 494 | // 495 | // INTUITION: 496 | // ---------- 497 | // - Rare terms (low df) get high IDF scores 498 | // - Common terms (high df) get low IDF scores 499 | // - This makes rare terms more important for ranking 500 | // 501 | // EXAMPLE: 502 | // -------- 503 | // Total docs: 1000 504 | // Term "the": appears in 950 docs → IDF ≈ 0.05 (very common, low importance) 505 | // Term "quantum": appears in 5 docs → IDF ≈ 5.3 (rare, high importance) 506 | // 507 | // PERFORMANCE BOOST WITH ROARING BITMAPS: 508 | // ---------------------------------------- 509 | // Instead of traversing skip lists to count documents, we use bitmaps: 510 | // - Old way: O(n) traverse skip list, count unique docs 511 | // - New way: O(1) bitmap.GetCardinality() 512 | // This is 10-100x faster for common terms! 
513 | func (idx *InvertedIndex) calculateIDF(term string) float64 { 514 | // Use roaring bitmap for instant document count 515 | bitmap, exists := idx.DocBitmaps[term] 516 | if !exists { 517 | return 0.0 518 | } 519 | 520 | // Get document frequency instantly from bitmap cardinality 521 | df := float64(bitmap.GetCardinality()) 522 | 523 | if df == 0 { 524 | return 0.0 525 | } 526 | 527 | N := float64(idx.TotalDocs) 528 | 529 | // BM25 IDF formula (with smoothing to avoid negative values) 530 | return math.Log((N-df+0.5)/(df+0.5) + 1.0) 531 | } 532 | 533 | // countDocsInPostingList counts unique documents in a posting list 534 | func (idx *InvertedIndex) countDocsInPostingList(skipList SkipList) int { 535 | uniqueDocs := make(map[int]bool) 536 | 537 | current := skipList.Head.Tower[0] 538 | for current != nil { 539 | docID := current.Key.GetDocumentID() 540 | uniqueDocs[docID] = true 541 | current = current.Tower[0] 542 | } 543 | 544 | return len(uniqueDocs) 545 | } 546 | 547 | // calculateBM25Score computes the BM25 score for a document given query terms 548 | // 549 | // BM25 ALGORITHM: 550 | // --------------- 551 | // 1. For each query term: 552 | // a. Calculate IDF(term) - how rare is this term? 553 | // b. Get term frequency in document 554 | // c. Apply saturation and length normalization 555 | // d. 
Accumulate score 556 | // 557 | // EXAMPLE CALCULATION: 558 | // -------------------- 559 | // Query: "machine learning" 560 | // Doc 5: 200 words, "machine" appears 3 times, "learning" appears 2 times 561 | // Corpus: 1000 docs, avg length 150 words 562 | // 563 | // For "machine" (appears in 100 docs): 564 | // 565 | // IDF = log((1000 - 100 + 0.5) / (100 + 0.5) + 1) ≈ 2.3 566 | // TF = 3 567 | // normalized_TF = (3 * 2.5) / (3 + 1.5 * (1 - 0.75 + 0.75 * (200/150))) 568 | // = 7.5 / 5.0 ≈ 1.5 569 | // score += 2.3 * 1.5 ≈ 3.45 570 | // 571 | // For "learning" (appears in 50 docs): 572 | // 573 | // IDF ≈ 2.9 574 | // TF = 2 575 | // normalized_TF ≈ 1.2 576 | // score += 2.9 * 1.2 ≈ 3.48 577 | // 578 | // Total BM25 score: 3.45 + 3.48 = 6.93 579 | func (idx *InvertedIndex) calculateBM25Score(docID int, queryTerms []string) float64 { 580 | docStats, exists := idx.DocStats[docID] 581 | if !exists { 582 | return 0.0 583 | } 584 | 585 | // Calculate average document length 586 | avgDocLen := float64(idx.TotalTerms) / float64(idx.TotalDocs) 587 | docLen := float64(docStats.Length) 588 | 589 | score := 0.0 590 | k1 := idx.BM25Params.K1 591 | b := idx.BM25Params.B 592 | 593 | // Process each query term 594 | for _, term := range queryTerms { 595 | // Get IDF for this term 596 | idf := idx.calculateIDF(term) 597 | 598 | // Get term frequency in this document 599 | tf := float64(docStats.TermFreqs[term]) 600 | 601 | if tf > 0 { 602 | // BM25 formula with length normalization 603 | numerator := tf * (k1 + 1) 604 | denominator := tf + k1*(1-b+b*(docLen/avgDocLen)) 605 | score += idf * (numerator / denominator) 606 | } 607 | } 608 | 609 | return score 610 | } 611 | 612 | // RankBM25 performs BM25 ranking of search results 613 | // 614 | // ALGORITHM: 615 | // ---------- 616 | // 1. Tokenize query 617 | // 2. Find all documents containing at least one query term 618 | // 3. Calculate BM25 score for each document 619 | // 4. Sort by score (descending) 620 | // 5. 
Return top K results 621 | // 622 | // EXAMPLE: 623 | // -------- 624 | // Query: "machine learning algorithms" 625 | // 626 | // Step 1: Tokenize → ["machine", "learning", "algorithms"] 627 | // 628 | // Step 2: Find candidate documents: 629 | // 630 | // "machine" appears in: [Doc1, Doc3, Doc5, Doc7] 631 | // "learning" appears in: [Doc1, Doc2, Doc5] 632 | // "algorithms" appears in: [Doc2, Doc5, Doc8] 633 | // Candidates: [Doc1, Doc2, Doc3, Doc5, Doc7, Doc8] 634 | // 635 | // Step 3: Calculate BM25 scores: 636 | // 637 | // Doc1: 12.5 (has "machine" and "learning") 638 | // Doc2: 8.3 (has "learning" and "algorithms") 639 | // Doc3: 3.2 (only has "machine") 640 | // Doc5: 15.7 (has all three terms!) 641 | // Doc7: 2.1 (only has "machine") 642 | // Doc8: 4.5 (only has "algorithms") 643 | // 644 | // Step 4: Sort: [Doc5, Doc1, Doc2, Doc8, Doc3, Doc7] 645 | // 646 | // Step 5: Return top 3: [Doc5, Doc1, Doc2] 647 | func (idx *InvertedIndex) RankBM25(query string, maxResults int) []Match { 648 | slog.Info("BM25 ranking", slog.String("query", query)) 649 | 650 | tokens := Analyze(query) 651 | if len(tokens) == 0 { 652 | return []Match{} 653 | } 654 | 655 | slog.Info("search tokens", slog.String("tokens", fmt.Sprintf("%v", tokens))) 656 | 657 | // Find all candidate documents (documents containing at least one query term) 658 | candidates := idx.findCandidateDocuments(tokens) 659 | 660 | // Calculate BM25 score for each candidate 661 | results := make([]Match, 0, len(candidates)) 662 | for docID := range candidates { 663 | score := idx.calculateBM25Score(docID, tokens) 664 | 665 | if score > 0 { 666 | results = append(results, Match{ 667 | DocID: docID, 668 | Offsets: candidates[docID], // Positions where terms appear 669 | Score: score, 670 | }) 671 | } 672 | } 673 | 674 | // Sort by score (descending) 675 | idx.sortMatchesByScore(results) 676 | 677 | // Return top K results 678 | return limitResults(results, maxResults) 679 | } 680 | 681 | // findCandidateDocuments 
finds all documents containing at least one query term 682 | // 683 | // Returns a map: DocID → Positions where query terms appear 684 | // 685 | // PERFORMANCE BOOST WITH ROARING BITMAPS: 686 | // ---------------------------------------- 687 | // We use a two-phase approach: 688 | // 1. Fast filtering: Use bitmaps to find candidate document IDs (O(1) per term) 689 | // 2. Position lookup: Only fetch positions for candidate documents 690 | // 691 | // OLD APPROACH: Traverse every skip list node (slow) 692 | // NEW APPROACH: Bitmap union + targeted position lookup (fast!) 693 | func (idx *InvertedIndex) findCandidateDocuments(tokens []string) map[int][]Position { 694 | candidates := make(map[int][]Position) 695 | 696 | // PHASE 1: Use bitmaps to quickly find all candidate document IDs 697 | candidateDocs := make(map[int]bool) 698 | for _, token := range tokens { 699 | bitmap, exists := idx.DocBitmaps[token] 700 | if !exists { 701 | continue 702 | } 703 | 704 | // Iterate through document IDs in the bitmap 705 | iter := bitmap.Iterator() 706 | for iter.HasNext() { 707 | docID := int(iter.Next()) 708 | candidateDocs[docID] = true 709 | } 710 | } 711 | 712 | // PHASE 2: For each candidate document, fetch positions from skip lists 713 | // This is still needed for BM25 scoring (we need exact positions) 714 | for _, token := range tokens { 715 | skipList, exists := idx.getPostingList(token) 716 | if !exists { 717 | continue 718 | } 719 | 720 | // Only traverse skip list for positions in candidate documents 721 | current := skipList.Head.Tower[0] 722 | for current != nil { 723 | docID := current.Key.GetDocumentID() 724 | // Only add if this is a candidate document 725 | if candidateDocs[docID] { 726 | candidates[docID] = append(candidates[docID], current.Key) 727 | } 728 | current = current.Tower[0] 729 | } 730 | } 731 | 732 | return candidates 733 | } 734 | 735 | // sortMatchesByScore sorts matches by score in descending order (higher scores first) 736 | func (idx 
*InvertedIndex) sortMatchesByScore(matches []Match) { 737 | sort.Slice(matches, func(i, j int) bool { 738 | return matches[i].Score > matches[j].Score 739 | }) 740 | } 741 | 742 | // RankProximity performs proximity-based ranking of search results 743 | // 744 | // THIS IS THE MAIN SEARCH FUNCTION! 745 | // 746 | // COMPLETE EXAMPLE: 747 | // ----------------- 748 | // Query: "machine learning" 749 | // MaxResults: 10 750 | // 751 | // Step 1: Tokenize query 752 | // 753 | // "machine learning" → ["machine", "learning"] 754 | // 755 | // Step 2: Find all covers (ranges containing both words) 756 | // 757 | // Doc1: Cover[0-1], Cover[5-6] → score = 0.5 + 0.5 = 1.0 758 | // Doc2: Cover[0-5] → score = 0.167 759 | // Doc3: Cover[2-3], Cover[10-11] → score = 0.5 + 0.5 = 1.0 760 | // Doc4: Cover[1-1] → Wait, both words at same position? Impossible! 761 | // (This means one word appears twice) 762 | // 763 | // Step 3: Return top 10 results 764 | // 765 | // Result: [Doc1, Doc3, Doc2] (sorted by score, limited to 10) 766 | // 767 | // ALGORITHM WALKTHROUGH: 768 | // ---------------------- 769 | // We iterate through ALL covers in the index, accumulating scores per document. 770 | // 771 | // Iteration 1: Find first cover → Doc1:Pos[0-1] 772 | // - New document Doc1 detected 773 | // - Calculate score: 1/(1-0+1) = 0.5 774 | // - Current document: Doc1, current score: 0.5 775 | // 776 | // Iteration 2: Find next cover → Doc1:Pos[5-6] 777 | // - Still in Doc1 (not a new document) 778 | // - Add to score: 0.5 + 1/(6-5+1) = 1.0 779 | // - Current document: Doc1, current score: 1.0 780 | // 781 | // Iteration 3: Find next cover → Doc2:Pos[0-5] 782 | // - New document Doc2 detected! 783 | // - Save previous: Match{Doc1, score=1.0} 784 | // - Start new: Doc2, score = 1/(5-0+1) = 0.167 785 | // 786 | // ... continue until EOF ... 
787 | // 788 | // Final step: Return top K results 789 | func (idx *InvertedIndex) RankProximity(query string, maxResults int) []Match { 790 | slog.Info("proximity ranking", slog.String("query", query)) 791 | 792 | // STEP 1: Tokenize the query (same as indexing) 793 | tokens := Analyze(query) 794 | if len(tokens) == 0 { 795 | // Empty query → no results 796 | return []Match{} 797 | } 798 | 799 | slog.Info("search tokens", slog.String("tokens", fmt.Sprintf("%v", tokens))) 800 | 801 | // STEP 2: Find and score all covers 802 | results := idx.collectProximityMatches(tokens) 803 | 804 | // STEP 3: Limit to top K results 805 | return limitResults(results, maxResults) 806 | } 807 | 808 | // collectProximityMatches finds and scores all proximity matches 809 | // 810 | // This is the core ranking loop that: 811 | // 1. Finds all covers 812 | // 2. Groups them by document 813 | // 3. Calculates cumulative scores per document 814 | // 815 | // STATE TRACKING: 816 | // --------------- 817 | // We maintain state across iterations: 818 | // - currentCandidate: The [start, end] positions of the current document's match 819 | // - currentScore: The accumulated score for the current document 820 | // - matches: The final list of all document matches 821 | // 822 | // TRANSITION DETECTION: 823 | // --------------------- 824 | // When we find a cover in a NEW document: 825 | // 826 | // → Save the current document's match 827 | // → Reset state for the new document 828 | func (idx *InvertedIndex) collectProximityMatches(tokens []string) []Match { 829 | var matches []Match 830 | 831 | // Find the first cover to initialize our state 832 | coverPositions := idx.NextCover(tokens, BOFDocument) 833 | coverStart, coverEnd := coverPositions[0], coverPositions[1] 834 | 835 | // Initialize tracking variables 836 | currentCandidate := []Position{coverStart, coverEnd} 837 | currentScore := 0.0 838 | 839 | // Loop through all covers until we reach EOF 840 | for !coverStart.IsEnd() { 841 | // 
DETECTION: Did we move to a new document? 842 | if currentCandidate[0].DocumentID < coverStart.DocumentID { 843 | // Yes! Save the previous document's match 844 | matches = append(matches, Match{ 845 | Offsets: currentCandidate, 846 | Score: currentScore, 847 | }) 848 | 849 | // Reset state for the new document 850 | currentCandidate = []Position{coverStart, coverEnd} 851 | currentScore = 0 852 | } 853 | 854 | // SCORING: Calculate proximity score for this cover 855 | // Formula: 1 / (distance + 1) 856 | // - Smaller distance → higher score 857 | // - +1 to avoid division by zero when start==end 858 | proximity := float64(coverEnd.Offset - coverStart.Offset + 1) 859 | currentScore += 1 / proximity 860 | 861 | // Find the next cover 862 | coverPositions = idx.NextCover(tokens, coverStart) 863 | coverStart, coverEnd = coverPositions[0], coverPositions[1] 864 | } 865 | 866 | // Don't forget the last document! 867 | // When we reach EOF, we still have one unsaved match 868 | if !currentCandidate[0].IsEnd() { 869 | matches = append(matches, Match{ 870 | Offsets: currentCandidate, 871 | Score: currentScore, 872 | }) 873 | } 874 | 875 | return matches 876 | } 877 | 878 | // limitResults returns at most maxResults items 879 | // 880 | // Simple helper to truncate the results list. 881 | // Uses math.Min to avoid index-out-of-bounds errors. 
882 | // 883 | // Example: 884 | // 885 | // matches = [Match1, Match2, Match3, Match4, Match5] 886 | // maxResults = 3 887 | // Returns: [Match1, Match2, Match3] 888 | func limitResults(matches []Match, maxResults int) []Match { 889 | limit := int(math.Min(float64(maxResults), float64(len(matches)))) 890 | return matches[:limit] 891 | } 892 | -------------------------------------------------------------------------------- /skiplist.go: -------------------------------------------------------------------------------- 1 | package blaze 2 | 3 | import ( 4 | "errors" 5 | "math" 6 | "math/rand" 7 | "time" 8 | ) 9 | 10 | // ═══════════════════════════════════════════════════════════════════════════════ 11 | // WHAT IS A SKIP LIST? 12 | // ═══════════════════════════════════════════════════════════════════════════════ 13 | // A skip list is a probabilistic data structure that allows O(log n) search, 14 | // insert, and delete operations - similar to a balanced tree, but simpler! 15 | // 16 | // VISUAL REPRESENTATION: 17 | // ---------------------- 18 | // Think of it as a linked list with "express lanes": 19 | // 20 | // Level 3: HEAD -------------------------------------> [30] -----------> NULL 21 | // Level 2: HEAD ----------------> [15] -------------> [30] -----------> NULL 22 | // Level 1: HEAD -------> [10] --> [15] --> [20] ----> [30] -----------> NULL 23 | // Level 0: HEAD --> [5] -> [10] -> [15] -> [20] -> [25] -> [30] -> [35] -> NULL 24 | // ^^^ ^^^ ^^^ ^^^ ^^^ ^^^ ^^^ 25 | // Actual data in the skip list nodes 26 | // 27 | // HOW IT WORKS: 28 | // ------------- 29 | // - Level 0 (bottom): Contains ALL elements in sorted order 30 | // - Higher levels: Contain progressively fewer elements (like express lanes) 31 | // - Searching: Start at the highest level, drop down when needed 32 | // 33 | // SEARCH EXAMPLE (finding 20): 34 | // ----------------------------- 35 | // 1. Start at HEAD, Level 3 36 | // 2. Level 3: Move to 30? 
// No, 30 > 20, so drop to Level 2
// 3. Level 2: Move to 15? Yes, 15 < 20, advance to 15
// 4. Level 2: Move to 30? No, 30 > 20, so drop to Level 1
// 5. Level 1: Move to 20? Yes! Found it!
//
// Time Complexity: O(log n) average case
// - Each level skips roughly half the elements
// - Similar to binary search, but on a linked structure
//
// WHY USE SKIP LISTS IN A SEARCH ENGINE?
// ---------------------------------------
// 1. Fast lookups: O(log n) to find any position
// 2. Fast range queries: Find all positions in a document efficiently
// 3. Maintains sorted order: Essential for phrase search
// 4. Simple implementation: Easier than balanced trees (no rotations!)
// 5. Good cache locality: Level 0 can be traversed sequentially
//
// ═══════════════════════════════════════════════════════════════════════════════

// MaxHeight caps every node's tower. With heights halving in probability per
// level, 32 levels comfortably cover billions of elements.
const MaxHeight = 32 // Maximum tower height (supports billions of elements)

// ═══════════════════════════════════════════════════════════════════════════════
// SENTINEL VALUES
// ═══════════════════════════════════════════════════════════════════════════════
// EOF (MaxInt) and BOF (MinInt) are boundary markers guaranteeing
// BOF < any_real_position < EOF.
//
// This removes special cases from the search algorithms: instead of asking
// "is this the first call?" or "is the list empty?", callers simply start
// searching from BOF and stop when they reach EOF.
// Integer sentinels bracketing every real position: BOF < real < EOF always
// holds, which keeps the search algorithms free of "empty"/"first call"
// special cases.
var (
	EOF = math.MaxInt // End Of File: larger than any real position
	BOF = math.MinInt // Beginning Of File: smaller than any real position
)

// Errors returned by skip list lookup operations.
var (
	ErrKeyNotFound    = errors.New("key not found")
	ErrNoElementFound = errors.New("no element found")
)

// Position identifies a single word occurrence: the document it lives in and
// its zero-based word offset within that document.
//
// Positions order first by DocumentID, then by Offset:
//
//	Doc1:Pos5 < Doc1:Pos10 < Doc2:Pos0 < Doc2:Pos3
//
// Plain ints are used so the BOF/EOF sentinels fit without casting.
type Position struct {
	DocumentID int // Which document?
	Offset     int // Which word in the document? (0-indexed)
}

// Sentinel positions bracketing all real positions.
var (
	BOFDocument = Position{DocumentID: BOF, Offset: BOF} // Sorts before every real position
	EOFDocument = Position{DocumentID: EOF, Offset: EOF} // Sorts after every real position
)

// GetDocumentID reports which document this position belongs to.
// (Convenience accessor for a consistent API.)
func (p *Position) GetDocumentID() int {
	return p.DocumentID
}

// GetOffset reports the word offset within the document.
// (Convenience accessor for a consistent API.)
func (p *Position) GetOffset() int {
	return p.Offset
}

// IsBeginning reports whether this position is the BOF sentinel
// (i.e. there is nothing before it).
func (p *Position) IsBeginning() bool {
	return p.Offset == BOF
}

// IsEnd reports whether this position is the EOF sentinel
// (i.e. there is nothing after it).
func (p *Position) IsEnd() bool {
	return p.Offset == EOF
}
Same document AND A.Offset < B.Offset 163 | // 164 | // EXAMPLES: 165 | // --------- 166 | // Doc1:Pos5 < Doc1:Pos10 → true (same doc, 5 < 10) 167 | // Doc1:Pos5 < Doc2:Pos0 → true (doc 1 < doc 2) 168 | // Doc2:Pos0 < Doc1:Pos5 → false (doc 2 > doc 1) 169 | func (p *Position) IsBefore(other Position) bool { 170 | // Check document order first 171 | if p.DocumentID < other.DocumentID { 172 | return true 173 | } 174 | 175 | // Same document: check offset order 176 | return p.DocumentID == other.DocumentID && p.Offset < other.Offset 177 | } 178 | 179 | // IsAfter checks if this position comes after another position 180 | // 181 | // This is the opposite of IsBefore (with equality handled separately) 182 | func (p *Position) IsAfter(other Position) bool { 183 | // Check document order first 184 | if p.DocumentID > other.DocumentID { 185 | return true 186 | } 187 | 188 | // Same document: check offset order 189 | return p.DocumentID == other.DocumentID && p.Offset > other.Offset 190 | } 191 | 192 | // Equals checks if two positions are identical 193 | // 194 | // Example: 195 | // 196 | // Doc1:Pos5 == Doc1:Pos5 → true 197 | // Doc1:Pos5 == Doc1:Pos6 → false 198 | func (p *Position) Equals(other Position) bool { 199 | return p.DocumentID == other.DocumentID && p.Offset == other.Offset 200 | } 201 | 202 | // ═══════════════════════════════════════════════════════════════════════════════ 203 | // NODE: A Skip List Node 204 | // ═══════════════════════════════════════════════════════════════════════════════ 205 | // Each node stores: 206 | // 1. A Key (Position): The data we're storing 207 | // 2. 
A Tower: Array of pointers to next nodes at each level 208 | // 209 | // TOWER VISUALIZATION: 210 | // -------------------- 211 | // For a node with height 3: 212 | // 213 | // Tower[2] -----> (points to a node far ahead) 214 | // Tower[1] -----> (points to a node ahead) 215 | // Tower[0] -----> (points to the very next node) 216 | // 217 | // The higher the level, the further ahead we skip! 218 | // ═══════════════════════════════════════════════════════════════════════════════ 219 | type Node struct { 220 | Key Position // The position stored in this node 221 | Tower [MaxHeight]*Node // Array of forward pointers (one per level) 222 | } 223 | 224 | // ═══════════════════════════════════════════════════════════════════════════════ 225 | // SKIP LIST: The Main Data Structure 226 | // ═══════════════════════════════════════════════════════════════════════════════ 227 | type SkipList struct { 228 | Head *Node // Sentinel head node (doesn't contain real data) 229 | Height int // Current height of the tallest tower 230 | } 231 | 232 | // NewSkipList creates an empty skip list 233 | // 234 | // INITIAL STATE: 235 | // -------------- 236 | // HEAD (empty node) with no forward pointers 237 | // Height = 1 (even empty lists have level 0) 238 | func NewSkipList() *SkipList { 239 | return &SkipList{ 240 | Head: &Node{}, // Empty sentinel head 241 | Height: 1, 242 | } 243 | } 244 | 245 | // ═══════════════════════════════════════════════════════════════════════════════ 246 | // SEARCH: The Core Operation 247 | // ═══════════════════════════════════════════════════════════════════════════════ 248 | // Search is the foundation of all skip list operations. 249 | // It returns TWO things: 250 | // 1. The node with the exact key (or nil if not found) 251 | // 2. A "journey" array: the path we took to get there 252 | // 253 | // WHY RETURN THE JOURNEY? 254 | // ------------------------ 255 | // The journey tells us which node is BEFORE the target at each level. 
256 | // This is essential for: 257 | // - Insert: We need to know where to splice in the new node 258 | // - Delete: We need to know which nodes to update 259 | // - FindLessThan: The journey already contains the answer! 260 | // 261 | // SEARCH ALGORITHM: 262 | // ----------------- 263 | // Start at the highest level and work down: 264 | // 1. At each level, move right as far as possible (while staying < target) 265 | // 2. When we can't move right, drop down one level 266 | // 3. Repeat until we reach level 0 267 | // 4. Check if we found the exact key 268 | // 269 | // VISUAL EXAMPLE (searching for 20): 270 | // ----------------------------------- 271 | // Level 2: HEAD ------[10]------[30] Start at HEAD, level 2 272 | // ^^^ Can we jump to 10? Yes! (10 < 20) 273 | // ^^^ Can we jump to 30? No! (30 > 20) 274 | // Drop to level 1... 275 | // 276 | // Level 1: HEAD --[10]--[15]--[20]--[30] At 10, level 1 277 | // ^^^ Can we jump to 15? Yes! (15 < 20) 278 | // ^^^ Can we jump to 20? STOP! Check this 279 | // 280 | // Level 0: We'd check if 20 exists at level 0 281 | // 282 | // Journey captured: [level0: node15, level1: node15, level2: node10] 283 | // ═══════════════════════════════════════════════════════════════════════════════ 284 | 285 | // Search finds a key in the skip list and returns the path taken 286 | // 287 | // RETURN VALUES: 288 | // -------------- 289 | // 1. *Node: The node with exact key (nil if not found) 290 | // 2. [MaxHeight]*Node: Journey array - the predecessor at each level 291 | // 292 | // EXAMPLE: 293 | // -------- 294 | // Skip list: [5] -> [10] -> [15] -> [20] 295 | // Search(15) returns: 296 | // - found: Node{15} 297 | // - journey[0]: Node{10} (predecessor at level 0) 298 | // - journey[1]: Node{10} (predecessor at level 1) 299 | // - ... 
300 | func (sl *SkipList) Search(key Position) (*Node, [MaxHeight]*Node) { 301 | var journey [MaxHeight]*Node // Track the path we take 302 | current := sl.Head // Start at the sentinel head 303 | 304 | // Traverse from highest level down to level 0 305 | for level := sl.Height - 1; level >= 0; level-- { 306 | // Move forward as far as possible at this level 307 | current = sl.traverseLevel(current, key, level) 308 | 309 | // Record where we ended up at this level 310 | // (This is the predecessor for this level) 311 | journey[level] = current 312 | } 313 | 314 | // Check if we found an exact match 315 | // current now points to the largest node < key 316 | // So current.Tower[0] might be the exact key 317 | next := current.Tower[0] 318 | if next != nil && next.Key.Equals(key) { 319 | return next, journey // Found it! 320 | } 321 | 322 | return nil, journey // Not found, but journey is still useful 323 | } 324 | 325 | // traverseLevel advances along a single level as far as possible 326 | // 327 | // PROCESS: 328 | // -------- 329 | // Starting from 'start', move forward while next.Key < target 330 | // Stop when: next.Key >= target OR next == nil 331 | // 332 | // EXAMPLE: 333 | // -------- 334 | // Level: HEAD -> [5] -> [10] -> [15] -> [20] -> nil 335 | // Target: 17 336 | // 337 | // Step 1: At HEAD, next = 5, should advance? Yes (5 < 17) 338 | // Step 2: At 5, next = 10, should advance? Yes (10 < 17) 339 | // Step 3: At 10, next = 15, should advance? Yes (15 < 17) 340 | // Step 4: At 15, next = 20, should advance? No! (20 > 17) 341 | // Return: node 15 342 | func (sl *SkipList) traverseLevel(start *Node, target Position, level int) *Node { 343 | current := start 344 | 345 | // Keep moving forward while we can 346 | next := current.Tower[level] 347 | for next != nil { 348 | // Should we advance to the next node? 
349 | if sl.shouldAdvance(next.Key, target) { 350 | current = next // Yes, move forward 351 | next = current.Tower[level] // Update next to the next node 352 | } else { 353 | break // No, stop here 354 | } 355 | } 356 | 357 | return current 358 | } 359 | 360 | // shouldAdvance determines if we should move to the next node 361 | // 362 | // DECISION RULE: 363 | // -------------- 364 | // Advance if: next.Key < target 365 | // Stop if: next.Key >= target 366 | // 367 | // This ensures we stop at the largest node that's still less than target 368 | func (sl *SkipList) shouldAdvance(nodeKey, targetKey Position) bool { 369 | // Don't advance if we've reached or passed the target 370 | if nodeKey.Equals(targetKey) { 371 | return false 372 | } 373 | 374 | // Advance only if the node key is less than target 375 | return nodeKey.IsBefore(targetKey) 376 | } 377 | 378 | // ═══════════════════════════════════════════════════════════════════════════════ 379 | // FIND OPERATIONS: Building on Search 380 | // ═══════════════════════════════════════════════════════════════════════════════ 381 | // These operations use Search as a building block 382 | // ═══════════════════════════════════════════════════════════════════════════════ 383 | 384 | // Find searches for an exact key match 385 | // 386 | // # This is a simple wrapper around Search that only returns the key 387 | // 388 | // Example: 389 | // 390 | // Find(Doc1:Pos5) returns Doc1:Pos5 if it exists, else error 391 | func (sl *SkipList) Find(key Position) (Position, error) { 392 | found, _ := sl.Search(key) 393 | 394 | if found == nil { 395 | return EOFDocument, ErrKeyNotFound 396 | } 397 | 398 | return found.Key, nil 399 | } 400 | 401 | // FindLessThan finds the largest key less than the given key 402 | // 403 | // HOW IT WORKS: 404 | // ------------- 405 | // The journey from Search already gives us this answer! 
406 | // journey[0] is the largest node < key at the bottom level 407 | // 408 | // EXAMPLE: 409 | // -------- 410 | // Skip list: [5] -> [10] -> [15] -> [20] 411 | // FindLessThan(17) returns 15 412 | // FindLessThan(15) returns 10 413 | // FindLessThan(5) returns BOF (nothing before 5) 414 | // 415 | // USE CASE: 416 | // --------- 417 | // In search: "Find the previous occurrence of 'quick' before position X" 418 | func (sl *SkipList) FindLessThan(key Position) (Position, error) { 419 | _, journey := sl.Search(key) 420 | 421 | predecessor := journey[0] // The node before key at level 0 422 | 423 | // Check edge cases 424 | if predecessor == nil || predecessor == sl.Head { 425 | return BOFDocument, ErrNoElementFound 426 | } 427 | 428 | return predecessor.Key, nil 429 | } 430 | 431 | // FindGreaterThan finds the smallest key greater than the given key 432 | // 433 | // TWO CASES: 434 | // ---------- 435 | // 1. Key exists: Return the next node after it 436 | // 2. Key doesn't exist: Return the next node after where it would be 437 | // 438 | // EXAMPLE: 439 | // -------- 440 | // Skip list: [5] -> [10] -> [15] -> [20] 441 | // FindGreaterThan(10) returns 15 (next after 10) 442 | // FindGreaterThan(12) returns 15 (next after where 12 would be) 443 | // FindGreaterThan(20) returns EOF (nothing after 20) 444 | // 445 | // USE CASE: 446 | // --------- 447 | // In search: "Find the next occurrence of 'quick' after position X" 448 | func (sl *SkipList) FindGreaterThan(key Position) (Position, error) { 449 | found, journey := sl.Search(key) 450 | 451 | // CASE 1: Key exists - return its successor 452 | if found != nil { 453 | if found.Tower[0] != nil { 454 | return found.Tower[0].Key, nil 455 | } 456 | return EOFDocument, ErrNoElementFound 457 | } 458 | 459 | // CASE 2: Key doesn't exist - return next node after where it would be 460 | predecessor := journey[0] 461 | if predecessor != nil && predecessor.Tower[0] != nil { 462 | return predecessor.Tower[0].Key, nil 463 | } 
464 | 465 | return EOFDocument, ErrNoElementFound 466 | } 467 | 468 | // ═══════════════════════════════════════════════════════════════════════════════ 469 | // INSERT: Adding Elements to the Skip List 470 | // ═══════════════════════════════════════════════════════════════════════════════ 471 | // Insertion is a two-phase process: 472 | // 1. Search to find where the new element should go 473 | // 2. Splice the new node into the list at multiple levels 474 | // 475 | // PROBABILISTIC HEIGHT: 476 | // --------------------- 477 | // Each new node gets a random height (tower height): 478 | // - 50% chance of height 1 479 | // - 25% chance of height 2 480 | // - 12.5% chance of height 3 481 | // - ... 482 | // 483 | // This randomness is what makes skip lists work! 484 | // It ensures roughly logarithmic performance on average. 485 | // 486 | // INSERT EXAMPLE: 487 | // --------------- 488 | // Inserting 17 with height 2: 489 | // 490 | // Before: 491 | // Level 1: HEAD -------> [10] ------------> [20] 492 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [20] 493 | // 494 | // After: 495 | // Level 1: HEAD -------> [10] -> [17] ----> [20] 496 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [17] -> [20] 497 | // ^^^ 498 | // new node 499 | // ═══════════════════════════════════════════════════════════════════════════════ 500 | 501 | // Insert adds a new key to the skip list (or updates if it exists) 502 | // 503 | // ALGORITHM: 504 | // ---------- 505 | // 1. Search for the key (get the journey/path) 506 | // 2. If found, update the existing node 507 | // 3. If not found: 508 | // a. Generate a random height for the new node 509 | // b. Create the new node 510 | // c. Link it into the list at each level 511 | // d. 
Update the skip list's height if needed 512 | // 513 | // EXAMPLE WALKTHROUGH: 514 | // -------------------- 515 | // Inserting Doc2:Pos5 into skip list: [Doc1:Pos3, Doc2:Pos10] 516 | // 517 | // Step 1: Search(Doc2:Pos5) 518 | // - Not found 519 | // - journey[0] = Node{Doc1:Pos3} (predecessor at level 0) 520 | // 521 | // Step 2: Generate height = 2 (random) 522 | // 523 | // Step 3: Create Node{Doc2:Pos5} 524 | // 525 | // Step 4: Link at level 0 and level 1: 526 | // - Level 0: Doc1:Pos3 -> Doc2:Pos5 -> Doc2:Pos10 527 | // - Level 1: HEAD -> Doc2:Pos5 -> ... 528 | func (sl *SkipList) Insert(key Position) { 529 | found, journey := sl.Search(key) 530 | 531 | // If key already exists, just update it 532 | if found != nil { 533 | found.Key = key 534 | return 535 | } 536 | 537 | // Generate a random height for the new node 538 | height := sl.randomHeight() 539 | 540 | // Create the new node 541 | newNode := &Node{Key: key} 542 | 543 | // Link the node into the skip list 544 | sl.linkNode(newNode, journey, height) 545 | 546 | // Update skip list height if necessary 547 | if height > sl.Height { 548 | sl.Height = height 549 | } 550 | } 551 | 552 | // linkNode connects a new node into the skip list structure 553 | // 554 | // LINKING PROCESS (for each level): 555 | // ---------------------------------- 556 | // 1. Find the predecessor at this level (from journey) 557 | // 2. Set newNode.Tower[level] = predecessor.Tower[level] 558 | // 3. 
Set predecessor.Tower[level] = newNode 559 | // 560 | // VISUAL EXAMPLE (linking at level 1): 561 | // ------------------------------------- 562 | // Before: 563 | // 564 | // predecessor -> [oldNext] 565 | // 566 | // After: 567 | // 568 | // predecessor -> [newNode] -> [oldNext] 569 | // 570 | // The newNode "splices" itself between predecessor and oldNext 571 | func (sl *SkipList) linkNode(node *Node, journey [MaxHeight]*Node, height int) { 572 | // Link the node at each level up to its height 573 | for level := 0; level < height; level++ { 574 | predecessor := journey[level] 575 | 576 | // Edge case: If no predecessor at this level, use HEAD 577 | if predecessor == nil { 578 | predecessor = sl.Head 579 | } 580 | 581 | // Splice the node into the linked list at this level 582 | // 1. New node points to what predecessor was pointing to 583 | node.Tower[level] = predecessor.Tower[level] 584 | // 2. Predecessor now points to new node 585 | predecessor.Tower[level] = node 586 | } 587 | } 588 | 589 | // ═══════════════════════════════════════════════════════════════════════════════ 590 | // DELETE: Removing Elements from the Skip List 591 | // ═══════════════════════════════════════════════════════════════════════════════ 592 | // Deletion is the reverse of insertion: 593 | // 1. Search for the key 594 | // 2. Unlink it from all levels 595 | // 3. Clean up: reduce height if top levels are now empty 596 | // ═══════════════════════════════════════════════════════════════════════════════ 597 | 598 | // Delete removes a key from the skip list 599 | // 600 | // ALGORITHM: 601 | // ---------- 602 | // 1. Search for the key 603 | // 2. If not found, return false 604 | // 3. If found: 605 | // a. Unlink it from all levels 606 | // b. 
Shrink the skip list height if needed 607 | // 608 | // EXAMPLE: 609 | // -------- 610 | // Deleting 15: 611 | // 612 | // Before: 613 | // Level 1: HEAD -------> [10] -> [15] ----> [20] 614 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [20] 615 | // 616 | // After: 617 | // Level 1: HEAD -------> [10] ------------> [20] 618 | // Level 0: HEAD -> [5] -> [10] ------------> [20] 619 | // 620 | // (15 removed) 621 | func (sl *SkipList) Delete(key Position) bool { 622 | found, journey := sl.Search(key) 623 | 624 | // Key doesn't exist 625 | if found == nil { 626 | return false 627 | } 628 | 629 | // Unlink the node from all levels 630 | for level := 0; level < sl.Height; level++ { 631 | // If the predecessor at this level doesn't point to our node, 632 | // we've finished unlinking (node wasn't tall enough for higher levels) 633 | if journey[level].Tower[level] != found { 634 | break 635 | } 636 | 637 | // Bypass the node: predecessor points to node's successor 638 | journey[level].Tower[level] = found.Tower[level] 639 | } 640 | 641 | // Clean up: reduce height if top levels are now empty 642 | sl.shrink() 643 | return true 644 | } 645 | 646 | // ═══════════════════════════════════════════════════════════════════════════════ 647 | // UTILITY OPERATIONS 648 | // ═══════════════════════════════════════════════════════════════════════════════ 649 | 650 | // Last returns the last position in the skip list 651 | // 652 | // HOW IT WORKS: 653 | // ------------- 654 | // Simply traverse level 0 until we reach the end 655 | // 656 | // Example: 657 | // Skip list: [5] -> [10] -> [15] -> [20] -> nil 658 | // Last() returns 20 659 | func (sl *SkipList) Last() Position { 660 | current := sl.Head 661 | 662 | // Traverse the bottom level to the end 663 | for next := current.Tower[0]; next != nil; next = next.Tower[0] { 664 | current = next 665 | } 666 | 667 | return current.Key 668 | } 669 | 670 | // shrink reduces the height if top levels are empty 671 | // 672 | // WHY SHRINK? 
673 | // ----------- 674 | // After deletions, the top levels might become empty. 675 | // Shrinking improves performance by not searching empty levels. 676 | // 677 | // EXAMPLE: 678 | // -------- 679 | // Before (after deleting the only height-3 node): 680 | // Level 2: HEAD -> nil (empty!) 681 | // Level 1: HEAD -> [10] -> [20] 682 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [20] 683 | // Height: 3 684 | // 685 | // After shrinking: 686 | // Level 1: HEAD -> [10] -> [20] 687 | // Level 0: HEAD -> [5] -> [10] -> [15] -> [20] 688 | // Height: 2 (top level removed) 689 | func (sl *SkipList) shrink() { 690 | // Check levels from top down 691 | for level := sl.Height - 1; level >= 0; level-- { 692 | if sl.Head.Tower[level] == nil { 693 | sl.Height-- // This level is empty, reduce height 694 | } else { 695 | break // Found a non-empty level, stop 696 | } 697 | } 698 | } 699 | 700 | // ═══════════════════════════════════════════════════════════════════════════════ 701 | // RANDOM HEIGHT GENERATION 702 | // ═══════════════════════════════════════════════════════════════════════════════ 703 | // This is the "magic" that makes skip lists work! 704 | // 705 | // THE COIN FLIP ALGORITHM: 706 | // ------------------------- 707 | // Flip a fair coin repeatedly: 708 | // - Heads: Increase height by 1, flip again 709 | // - Tails: Stop, return current height 710 | // 711 | // PROBABILITY DISTRIBUTION: 712 | // -------------------------- 713 | // Height 1: 50% (tails on first flip) 714 | // Height 2: 25% (heads then tails) 715 | // Height 3: 12.5% (heads, heads, tails) 716 | // Height 4: 6.25% (heads, heads, heads, tails) 717 | // ... 
718 | // 719 | // This creates a geometric distribution that ensures: 720 | // - Most nodes have height 1 (50%) 721 | // - Few nodes have height 2 (25%) 722 | // - Very few nodes have height 3 (12.5%) 723 | // - Extremely rare to have height > 10 724 | // 725 | // WHY THIS WORKS: 726 | // --------------- 727 | // With N elements and this distribution: 728 | // - Expected number of nodes at level 0: N 729 | // - Expected number of nodes at level 1: N/2 730 | // - Expected number of nodes at level 2: N/4 731 | // - Expected number of nodes at level 3: N/8 732 | // ... 733 | // 734 | // This creates O(log N) expected search time! 735 | // ═══════════════════════════════════════════════════════════════════════════════ 736 | 737 | // randomHeight generates a random height for a new node 738 | // 739 | // IMPLEMENTATION: 740 | // --------------- 741 | // 1. Start with height = 1 742 | // 2. Flip a coin (random < 0.5) 743 | // 3. If heads and not at max: increase height, repeat 744 | // 4. If tails or at max: return current height 745 | func (sl *SkipList) randomHeight() int { 746 | height := 1 747 | rng := rand.New(rand.NewSource(time.Now().UnixNano())) 748 | 749 | // Keep "flipping coins" (50% probability) 750 | for rng.Float64() < 0.5 && height < MaxHeight { 751 | height++ 752 | } 753 | 754 | return height 755 | } 756 | 757 | // ═══════════════════════════════════════════════════════════════════════════════ 758 | // ITERATOR: Sequential Access to Elements 759 | // ═══════════════════════════════════════════════════════════════════════════════ 760 | // While skip lists support fast random access, sometimes we need to 761 | // traverse all elements in order. The iterator provides this capability. 762 | // 763 | // USAGE PATTERN: 764 | // -------------- 765 | // iter := skipList.Iterator() 766 | // for iter.HasNext() { 767 | // pos := iter.Next() 768 | // // Process position... 
769 | // } 770 | // 771 | // EXAMPLE: 772 | // -------- 773 | // Skip list: [Doc1:Pos1, Doc1:Pos5, Doc2:Pos0, Doc2:Pos3] 774 | // 775 | // iter := skipList.Iterator() 776 | // iter.Next() → Doc1:Pos1 777 | // iter.Next() → Doc1:Pos5 778 | // iter.Next() → Doc2:Pos0 779 | // iter.Next() → Doc2:Pos3 780 | // iter.Next() → EOF 781 | // ═══════════════════════════════════════════════════════════════════════════════ 782 | 783 | // Iterator provides sequential access to skip list elements 784 | // 785 | // IMPLEMENTATION NOTE: 786 | // -------------------- 787 | // We only traverse level 0 (the bottom level) which contains all elements 788 | // in sorted order. Higher levels are just shortcuts for searching. 789 | type Iterator struct { 790 | current *Node // The current position in the iteration 791 | } 792 | 793 | // Iterator creates a new iterator starting at the first element 794 | // 795 | // INITIALIZATION: 796 | // --------------- 797 | // We start at the first real element (sl.Head.Tower[0]) 798 | // NOT at the Head itself (which is just a sentinel) 799 | func (sl *SkipList) Iterator() *Iterator { 800 | return &Iterator{current: sl.Head.Tower[0]} 801 | } 802 | 803 | // HasNext checks if there are more elements to iterate 804 | // 805 | // LOGIC: 806 | // ------ 807 | // There are more elements if: 808 | // - current is not nil (we haven't fallen off the end), AND 809 | // - current.Tower[0] is not nil (there's a next element) 810 | // 811 | // Example states: 812 | // - HasNext() == true: current -> [next] -> ... 813 | // - HasNext() == false: current -> nil (at the last element) 814 | func (it *Iterator) HasNext() bool { 815 | return it.current != nil && it.current.Tower[0] != nil 816 | } 817 | 818 | // Next advances to and returns the next position 819 | // 820 | // PROCESS: 821 | // -------- 822 | // 1. Move to the next node 823 | // 2. If we've reached the end, return EOF 824 | // 3. 
Otherwise, return the current position 825 | // 826 | // IMPORTANT: 827 | // ---------- 828 | // Always check HasNext() before calling Next() to avoid 829 | // returning EOF unexpectedly! 830 | // 831 | // EXAMPLE USAGE: 832 | // -------------- 833 | // iter := skipList.Iterator() 834 | // 835 | // for iter.HasNext() { 836 | // pos := iter.Next() 837 | // fmt.Printf("Doc %d, Pos %d\n", pos.GetDocumentID(), pos.GetOffset()) 838 | // } 839 | func (it *Iterator) Next() Position { 840 | // Check if we're already at the end 841 | if it.current == nil { 842 | return EOFDocument 843 | } 844 | 845 | // Move to the next node 846 | it.current = it.current.Tower[0] 847 | 848 | // Check if we've reached the end after moving 849 | if it.current == nil { 850 | return EOFDocument 851 | } 852 | 853 | // Return the current position 854 | return it.current.Key 855 | } 856 | 857 | // ═══════════════════════════════════════════════════════════════════════════════ 858 | // SKIP LIST SUMMARY 859 | // ═══════════════════════════════════════════════════════════════════════════════ 860 | // 861 | // KEY CONCEPTS: 862 | // ------------- 863 | // 1. Multiple levels: Express lanes for faster searching 864 | // 2. Probabilistic balancing: Random heights keep it balanced on average 865 | // 3. Sorted order: Always maintains elements in sorted order 866 | // 4. O(log n) operations: Search, insert, delete all average O(log n) 867 | // 868 | // WHY IT'S PERFECT FOR SEARCH ENGINES: 869 | // ------------------------------------- 870 | // 1. Fast positional lookups: Find any document/position quickly 871 | // 2. Range queries: Find all positions in a document efficiently 872 | // 3. Sorted iteration: Process results in order 873 | // 4. Simple implementation: No complex tree rotations needed 874 | // 5. 
Good cache performance: Sequential access on level 0 875 | // 876 | // OPERATIONS SUMMARY: 877 | // ------------------- 878 | // - Search(key): Find exact key or where it would be → O(log n) 879 | // - Insert(key): Add new element → O(log n) 880 | // - Delete(key): Remove element → O(log n) 881 | // - Find(key): Check if key exists → O(log n) 882 | // - FindLessThan(key): Find predecessor → O(log n) 883 | // - FindGreaterThan(key): Find successor → O(log n) 884 | // - Last(): Find last element → O(n) worst case, O(1) with tail pointer 885 | // - Iterator(): Sequential traversal → O(n) for all elements 886 | // 887 | // SPACE COMPLEXITY: 888 | // ----------------- 889 | // - Average: O(n) where n is the number of elements 890 | // - Each node has ~2 pointers on average (geometric distribution) 891 | // - Worst case: O(n * MaxHeight) but extremely unlikely 892 | // 893 | // PERFORMANCE CHARACTERISTICS: 894 | // ----------------------------- 895 | // - Search: O(log n) expected, O(n) worst case (very rare) 896 | // - Insert: O(log n) expected, O(n) worst case (very rare) 897 | // - Delete: O(log n) expected, O(n) worst case (very rare) 898 | // - Space: O(n) expected, O(n * log n) worst case 899 | // 900 | // The "worst case" scenarios are so rare they're not practically relevant. 901 | // The randomization ensures good performance with extremely high probability. 902 | // 903 | // COMPARISON TO OTHER DATA STRUCTURES: 904 | // ------------------------------------- 905 | // vs. Balanced Trees (AVL, Red-Black): 906 | // + Simpler implementation (no rotations) 907 | // + Better constant factors in practice 908 | // + Lock-free variants easier to implement 909 | // - Slightly worse worst-case guarantees (probabilistic vs deterministic) 910 | // 911 | // vs. 
Hash Tables: 912 | // + Maintains sorted order (hash tables don't) 913 | // + Supports range queries efficiently 914 | // + No rehashing needed 915 | // - Slower than hash tables for exact lookups (O(log n) vs O(1)) 916 | // 917 | // vs. Arrays: 918 | // + Fast insertion/deletion (no shifting elements) 919 | // + Dynamic sizing (no reallocation) 920 | // - Slower random access (O(log n) vs O(1)) 921 | // - More memory overhead (pointers) 922 | // 923 | // REAL-WORLD APPLICATIONS: 924 | // ------------------------- 925 | // 1. Database indexes (LevelDB, RocksDB use skip lists) 926 | // 2. In-memory caches (Redis sorted sets use skip lists) 927 | // 3. Search engines (inverted indexes like this one!) 928 | // 4. Concurrent data structures (easier to make lock-free than trees) 929 | // 5. Time-series databases (sorted by timestamp) 930 | // 931 | // ═══════════════════════════════════════════════════════════════════════════════ 932 | --------------------------------------------------------------------------------