├── .gitignore
├── DDL.sql
├── LICENSE
├── MemoryVectorIndex
│   ├── MemoryVectorIndex.cs
│   └── MemoryVectorIndex.csproj
├── MempryVectorIndex.Tests
│   ├── MemoryVectorIndexTests.cs
│   └── MempryVectorIndex.Tests.csproj
├── README.md
├── VectorIndex.MainTest
│   ├── Program.cs
│   └── VectorIndex.MainTest.csproj
├── VectorIndex
│   ├── FileRangeStore.cs
│   ├── IRangeStore.cs
│   ├── IndexBuilder.cs
│   ├── MemoryRangeStore.cs
│   ├── RangeValue.cs
│   ├── Stats.cs
│   └── VectorIndex.csproj
└── vector-database.sln
/.gitignore:
--------------------------------------------------------------------------------
1 | dotnet/.config
2 |
3 | ## Ignore Visual Studio temporary files, build results, and
4 | ## files generated by popular Visual Studio add-ons.
5 | ##
6 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
7 |
8 | # User-specific files
9 | *.rsuser
10 | *.suo
11 | *.user
12 | *.userosscache
13 | *.sln.docstates
14 |
15 | # User-specific files (MonoDevelop/Xamarin Studio)
16 | *.userprefs
17 |
18 | # Mono auto generated files
19 | mono_crash.*
20 |
21 | # Build results
22 | [Dd]ebug/
23 | [Dd]ebugPublic/
24 | [Rr]elease/
25 | [Rr]eleases/
26 | x64/
27 | x86/
28 | [Ww][Ii][Nn]32/
29 | [Aa][Rr][Mm]/
30 | [Aa][Rr][Mm]64/
31 | bld/
32 | [Bb]in/
33 | [Oo]bj/
34 | [Ll]og/
35 | [Ll]ogs/
36 |
37 | # Visual Studio 2015/2017 cache/options directory
38 | .vs/
39 | # Uncomment if you have tasks that create the project's static files in wwwroot
40 | #wwwroot/
41 |
42 | # Visual Studio 2017 auto generated files
43 | Generated\ Files/
44 |
45 | # MSTest test Results
46 | [Tt]est[Rr]esult*/
47 | [Bb]uild[Ll]og.*
48 |
49 | # NUnit
50 | *.VisualState.xml
51 | TestResult.xml
52 | nunit-*.xml
53 |
54 | # Build Results of an ATL Project
55 | [Dd]ebugPS/
56 | [Rr]eleasePS/
57 | dlldata.c
58 |
59 | # Benchmark Results
60 | BenchmarkDotNet.Artifacts/
61 |
62 | # .NET Core
63 | project.lock.json
64 | project.fragment.lock.json
65 | artifacts/
66 |
67 | # ASP.NET Scaffolding
68 | ScaffoldingReadMe.txt
69 |
70 | # StyleCop
71 | StyleCopReport.xml
72 |
73 | # Files built by Visual Studio
74 | *_i.c
75 | *_p.c
76 | *_h.h
77 | *.ilk
78 | *.meta
79 | *.obj
80 | *.iobj
81 | *.pch
82 | *.pdb
83 | *.ipdb
84 | *.pgc
85 | *.pgd
86 | *.rsp
87 | *.sbr
88 | *.tlb
89 | *.tli
90 | *.tlh
91 | *.tmp
92 | *.tmp_proj
93 | *_wpftmp.csproj
94 | *.log
95 | *.tlog
96 | *.vspscc
97 | *.vssscc
98 | .builds
99 | *.pidb
100 | *.svclog
101 | *.scc
102 |
103 | # Chutzpah Test files
104 | _Chutzpah*
105 |
106 | # Visual C++ cache files
107 | ipch/
108 | *.aps
109 | *.ncb
110 | *.opendb
111 | *.opensdf
112 | *.sdf
113 | *.cachefile
114 | *.VC.db
115 | *.VC.VC.opendb
116 |
117 | # Visual Studio profiler
118 | *.psess
119 | *.vsp
120 | *.vspx
121 | *.sap
122 |
123 | # Visual Studio Trace Files
124 | *.e2e
125 |
126 | # TFS 2012 Local Workspace
127 | $tf/
128 |
129 | # Guidance Automation Toolkit
130 | *.gpState
131 |
132 | # ReSharper is a .NET coding add-in
133 | _ReSharper*/
134 | *.[Rr]e[Ss]harper
135 | *.DotSettings.user
136 |
137 | # TeamCity is a build add-in
138 | _TeamCity*
139 |
140 | # DotCover is a Code Coverage Tool
141 | *.dotCover
142 |
143 | # AxoCover is a Code Coverage Tool
144 | .axoCover/*
145 | !.axoCover/settings.json
146 |
147 | # Coverlet is a free, cross platform Code Coverage Tool
148 | coverage*.json
149 | coverage*.xml
150 | coverage*.info
151 |
152 | # Visual Studio code coverage results
153 | *.coverage
154 | *.coveragexml
155 |
156 | # NCrunch
157 | _NCrunch_*
158 | .*crunch*.local.xml
159 | nCrunchTemp_*
160 |
161 | # MightyMoose
162 | *.mm.*
163 | AutoTest.Net/
164 |
165 | # Web workbench (sass)
166 | .sass-cache/
167 |
168 | # Installshield output folder
169 | [Ee]xpress/
170 |
171 | # DocProject is a documentation generator add-in
172 | DocProject/buildhelp/
173 | DocProject/Help/*.HxT
174 | DocProject/Help/*.HxC
175 | DocProject/Help/*.hhc
176 | DocProject/Help/*.hhk
177 | DocProject/Help/*.hhp
178 | DocProject/Help/Html2
179 | DocProject/Help/html
180 |
181 | # Click-Once directory
182 | publish/
183 |
184 | # Publish Web Output
185 | *.[Pp]ublish.xml
186 | *.azurePubxml
187 | # Note: Comment the next line if you want to checkin your web deploy settings,
188 | # but database connection strings (with potential passwords) will be unencrypted
189 | *.pubxml
190 | *.publishproj
191 |
192 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
193 | # checkin your Azure Web App publish settings, but sensitive information contained
194 | # in these scripts will be unencrypted
195 | PublishScripts/
196 |
197 | # NuGet Packages
198 | *.nupkg
199 | # NuGet Symbol Packages
200 | *.snupkg
201 | # The packages folder can be ignored because of Package Restore
202 | **/[Pp]ackages/*
203 | # except build/, which is used as an MSBuild target.
204 | !**/[Pp]ackages/build/
205 | # Uncomment if necessary however generally it will be regenerated when needed
206 | #!**/[Pp]ackages/repositories.config
207 | # NuGet v3's project.json files produces more ignorable files
208 | *.nuget.props
209 | *.nuget.targets
210 |
211 | # Microsoft Azure Build Output
212 | csx/
213 | *.build.csdef
214 |
215 | # Microsoft Azure Emulator
216 | ecf/
217 | rcf/
218 |
219 | # Windows Store app package directories and files
220 | AppPackages/
221 | BundleArtifacts/
222 | Package.StoreAssociation.xml
223 | _pkginfo.txt
224 | *.appx
225 | *.appxbundle
226 | *.appxupload
227 |
228 | # Visual Studio cache files
229 | # files ending in .cache can be ignored
230 | *.[Cc]ache
231 | # but keep track of directories ending in .cache
232 | !?*.[Cc]ache/
233 |
234 | # Others
235 | ClientBin/
236 | ~$*
237 | *~
238 | *.dbmdl
239 | *.dbproj.schemaview
240 | *.jfm
241 | *.pfx
242 | *.publishsettings
243 | orleans.codegen.cs
244 |
245 | # Including strong name files can present a security risk
246 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
247 | #*.snk
248 |
249 | # Since there are multiple workflows, uncomment next line to ignore bower_components
250 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
251 | #bower_components/
252 |
253 | # RIA/Silverlight projects
254 | Generated_Code/
255 |
256 | # Backup & report files from converting an old project file
257 | # to a newer Visual Studio version. Backup files are not needed,
258 | # because we have git ;-)
259 | _UpgradeReport_Files/
260 | Backup*/
261 | UpgradeLog*.XML
262 | UpgradeLog*.htm
263 | ServiceFabricBackup/
264 | *.rptproj.bak
265 |
266 | # SQL Server files
267 | *.mdf
268 | *.ldf
269 | *.ndf
270 |
271 | # Business Intelligence projects
272 | *.rdl.data
273 | *.bim.layout
274 | *.bim_*.settings
275 | *.rptproj.rsuser
276 | *- [Bb]ackup.rdl
277 | *- [Bb]ackup ([0-9]).rdl
278 | *- [Bb]ackup ([0-9][0-9]).rdl
279 |
280 | # Microsoft Fakes
281 | FakesAssemblies/
282 |
283 | # GhostDoc plugin setting file
284 | *.GhostDoc.xml
285 |
286 | # Node.js Tools for Visual Studio
287 | .ntvs_analysis.dat
288 | node_modules/
289 |
290 | # Visual Studio 6 build log
291 | *.plg
292 |
293 | # Visual Studio 6 workspace options file
294 | *.opt
295 |
296 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
297 | *.vbw
298 |
299 | # Visual Studio 6 auto-generated project file (contains which files were open etc.)
300 | *.vbp
301 |
302 | # Visual Studio 6 workspace and project file (working project files containing files to include in project)
303 | *.dsw
304 | *.dsp
305 |
306 | # Visual Studio 6 technical files
307 | *.ncb
308 | *.aps
309 |
310 | # Visual Studio LightSwitch build output
311 | **/*.HTMLClient/GeneratedArtifacts
312 | **/*.DesktopClient/GeneratedArtifacts
313 | **/*.DesktopClient/ModelManifest.xml
314 | **/*.Server/GeneratedArtifacts
315 | **/*.Server/ModelManifest.xml
316 | _Pvt_Extensions
317 |
318 | # Paket dependency manager
319 | .paket/paket.exe
320 | paket-files/
321 |
322 | # FAKE - F# Make
323 | .fake/
324 |
325 | # CodeRush personal settings
326 | .cr/personal
327 |
328 | # Python Tools for Visual Studio (PTVS)
329 | __pycache__/
330 | *.pyc
331 |
332 | # Cake - Uncomment if you are using it
333 | # tools/**
334 | # !tools/packages.config
335 |
336 | # Tabs Studio
337 | *.tss
338 |
339 | # Telerik's JustMock configuration file
340 | *.jmconfig
341 |
342 | # BizTalk build output
343 | *.btp.cs
344 | *.btm.cs
345 | *.odx.cs
346 | *.xsd.cs
347 |
348 | # OpenCover UI analysis results
349 | OpenCover/
350 |
351 | # Azure Stream Analytics local run output
352 | ASALocalRun/
353 |
354 | # MSBuild Binary and Structured Log
355 | *.binlog
356 |
357 | # NVidia Nsight GPU debugger configuration file
358 | *.nvuser
359 |
360 | # MFractors (Xamarin productivity tool) working folder
361 | .mfractor/
362 |
363 | # Local History for Visual Studio
364 | .localhistory/
365 |
366 | # Visual Studio History (VSHistory) files
367 | .vshistory/
368 |
369 | # BeatPulse healthcheck temp database
370 | healthchecksdb
371 |
372 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
373 | MigrationBackup/
374 |
375 | # Ionide (cross platform F# VS Code tools) working folder
376 | .ionide/
377 |
378 | # Fody - auto-generated XML schema
379 | FodyWeavers.xsd
380 |
381 | # VS Code files for those working on multiple tools
382 | .vscode/*
383 | !.vscode/settings.json
384 | !.vscode/tasks.json
385 | !.vscode/launch.json
386 | !.vscode/extensions.json
387 | *.code-workspace
388 |
389 | # Local History for Visual Studio Code
390 | .history/
391 |
392 | # Windows Installer files from build outputs
393 | *.cab
394 | *.msi
395 | *.msix
396 | *.msm
397 | *.msp
398 |
399 | # JetBrains Rider
400 | *.sln.iml
401 | *.tmp
402 | *.log
403 | *.bck
404 | *.tgz
405 | *.tar
406 | *.zip
407 | *.cer
408 | *.crt
409 | *.key
410 | *.pem
411 |
412 | .env
413 | certs/
414 | launchSettings.json
415 | config.development.yaml
416 | *.development.config
417 | *.development.json
418 | .DS_Store
419 | .idea/
420 | node_modules/
421 | obj/
422 | bin/
423 | _dev/
424 | .dev/
425 | *.devis.*
426 | .vs/
427 | *.user
428 | **/.vscode/chrome
429 | **/.vscode/.ropeproject/objectdb
430 | *.pyc
431 | .ipynb_checkpoints
432 | .jython_cache/
433 | __pycache__/
434 | .mypy_cache/
435 | __pypackages__/
436 | .pdm.toml
437 | global.json
438 |
439 | # docfx
440 | **/DROP/
441 | **/TEMP/
442 | **/packages/
443 | **/bin/
444 | **/obj/
445 | _site
446 |
447 | # Yarn
448 | .yarn
449 | .yarnrc.yml
450 |
451 | # Python Environments
452 | .env
453 | .venv
454 | .myenv
455 | env/
456 | venv/
457 | myvenv/
458 | ENV/
459 |
460 | # Python dist
461 | dist/
462 |
463 | # Persistent storage
464 | data/qdrant
465 | data/chatstore*
466 |
467 | # Java build
468 | java/**/target
469 | java/.mvn/wrapper/maven-wrapper.jar
470 |
471 | # Java settings
472 | conf.properties
473 | /data
474 |
--------------------------------------------------------------------------------
/DDL.sql:
--------------------------------------------------------------------------------
1 | USE [Vectors]
2 | GO
3 | /****** Object: UserDefinedTableType [dbo].[PointType] Script Date: 28/06/2023 17:06:20 ******/
4 | CREATE TYPE [dbo].[PointType] AS TABLE(
5 | [ID] [bigint] NOT NULL,
6 | [Idx] [smallint] NOT NULL,
7 | [Value] [real] NULL,
8 | PRIMARY KEY CLUSTERED
9 | (
10 | [ID] ASC,
11 | [Idx] ASC
12 | )WITH (IGNORE_DUP_KEY = OFF)
13 | )
14 | GO
15 | /****** Object: UserDefinedTableType [dbo].[RangeType] Script Date: 28/06/2023 17:06:20 ******/
16 | CREATE TYPE [dbo].[RangeType] AS TABLE(
17 | [RangeID] [bigint] NOT NULL,
18 | [Dimension] [smallint] NULL,
19 | [Mid] [real] NULL,
20 | [LowRangeID] [bigint] NULL,
21 | [HighRangeID] [bigint] NULL,
22 | [ID] [bigint] NULL,
23 | PRIMARY KEY CLUSTERED
24 | (
25 | [RangeID] ASC
26 | )WITH (IGNORE_DUP_KEY = OFF)
27 | )
28 | GO
29 | /****** Object: UserDefinedFunction [dbo].[BuildIndex] Script Date: 28/06/2023 17:06:20 ******/
30 | SET ANSI_NULLS ON
31 | GO
32 | SET QUOTED_IDENTIFIER ON
33 | GO
34 |
43 | -- Builds a range index for points.
44 | create function [dbo].[BuildIndex]
45 | (
46 | -- A points table to build the range index.
47 | @points dbo.PointType readonly
48 | )
49 | returns @index table
50 | (
51 | RangeID bigint not null primary key,
52 | Dimension smallint null,
53 | Mid real null,
54 | LowRangeID bigint null,
55 | HighRangeID bigint null,
56 | ID bigint null
57 | )
58 | as
59 | begin
60 | declare @ranges table
61 | (
62 | ID bigint,
63 | RangeID bigint,
64 | primary key(RangeID, ID)
65 | );
66 |
67 | declare @stats table
68 | (
69 | RangeID bigint not null primary key,
70 | Idx smallint not null,
71 | Mean real not null,
72 | [Stdev] real,
73 | Count bigint not null,
74 | ID bigint not null
75 | );
76 |
77 | --raiserror(N'Level 0.', 0, 0) with nowait;
78 |
79 | insert into @stats(RangeID, Idx, Mean, Stdev, Count, ID)
80 | select top 1
81 | 0,
82 | Idx,
83 | avg(Value),
84 | isnull(stdev(Value), 0) Stdev,
85 | count_big(*),
86 | avg(ID)
87 | from
88 | @points
89 | group by
90 | Idx
91 | order by
92 | Stdev desc
93 |
94 | declare @next bit = @@rowcount;
95 |
96 | if (@next != 0)
97 | begin
98 | insert @ranges(RangeID, ID)
99 | select
100 | iif(S.Stdev = 0, iif(P.ID <= S.ID, 1, 2), iif(Value < Mean, 1, 2)),
101 | P.ID
102 | from
103 | @points P
104 | join
105 | @stats S
106 | on
107 | P.Idx = S.Idx and
108 | S.Count > 1;
109 |
110 | set @next = @@rowcount;
111 | declare @level bigint = 0;
112 | declare @i tinyint = 0;
113 |
114 | while(@next != 0)
115 | begin
116 | --raiserror(N'Level %i.', 0, 0, @level) with nowait;
117 |
118 | insert into @stats(RangeID, Idx, Mean, Stdev, Count, ID)
119 | select
120 | S.RangeID * 2 + N.I,
121 | R.Idx,
122 | R.Mean,
123 | R.Stdev,
124 | R.Count,
125 | R.ID
126 | from
127 | @stats S
128 | join
129 | (select 1 union all select 2) N(I)
130 | on
131 | S.RangeID >= @level and
132 | S.Count > 1
133 | cross apply
134 | (
135 | select top 1
136 | P.Idx,
137 | avg(P.Value) Mean,
138 | isnull(stdev(P.Value), 0) Stdev,
139 | count_big(*) Count,
140 | avg(P.ID) ID
141 | from
142 | @ranges R
143 | join
144 | @points P
145 | on
146 | P.ID = R.ID and
147 | R.RangeID = S.RangeID * 2 + N.I
148 | group by
149 | Idx
150 | order by
151 | iif(@level % 2 = 1, Stdev, -Stdev) desc
152 | ) R;
153 |
154 | set @level = @level * 2 + 1;
155 | set @i += 1;
156 |
157 | with R as
158 | (
159 | select
160 | R.*,
161 | R.RangeID * 2 +
162 | case
163 | when Value < Mean then 1
164 | when Value > Mean then 2
165 | when R.ID <= S.ID then 1
166 | else 2
167 | end NewRangeID
168 | from
169 | @ranges R
170 | join
171 | @stats S
172 | on
173 | S.RangeID = R.RangeID and
174 | S.RangeID >= @level and
175 | S.Count > 1
176 | join
177 | @points P
178 | on
179 | P.ID = R.ID and
180 | P.Idx = S.Idx
181 | )
182 | update R
183 | set
184 | RangeID = NewRangeID;
185 |
186 | set @next = @@rowcount;
187 | end;
188 | end;
189 |
190 | insert into @index(RangeID, Dimension, Mid, LowRangeID, HighRangeID, ID)
191 | select
192 | RangeID,
193 | iif(Stdev = 0, null, Idx) Dimension,
194 | iif(Stdev = 0, null, Mean) Mid,
195 | iif(Count = 1, null, RangeID * 2 + 1) LowRangeID,
196 | iif(Count = 1, null, RangeID * 2 + 2) HighRangeID,
197 | iif(Count = 1, ID, null) ID
198 | from
199 | @stats;
200 |
201 | return;
202 | end
203 | GO
204 | /****** Object: Table [dbo].[TextIndex] Script Date: 28/06/2023 17:06:20 ******/
205 | SET ANSI_NULLS ON
206 | GO
207 | SET QUOTED_IDENTIFIER ON
208 | GO
209 | CREATE TABLE [dbo].[TextIndex](
210 | [DocID] [bigint] NOT NULL,
211 | [RangeID] [bigint] NOT NULL,
212 | [Dimension] [smallint] NULL,
213 | [Mid] [real] NULL,
214 | [LowRangeID] [bigint] NULL,
215 | [HighRangeID] [bigint] NULL,
216 | [TextID] [bigint] NULL,
217 | CONSTRAINT [PK_TextIndex] PRIMARY KEY CLUSTERED
218 | (
219 | [RangeID] ASC,
220 | [DocID] ASC
221 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY],
222 | CONSTRAINT [IX_TextIndex] UNIQUE NONCLUSTERED
223 | (
224 | [DocID] ASC,
225 | [RangeID] ASC
226 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
227 | ) ON [PRIMARY]
228 | GO
229 | /****** Object: UserDefinedFunction [dbo].[Search] Script Date: 28/06/2023 17:06:20 ******/
230 | SET ANSI_NULLS ON
231 | GO
232 | SET QUOTED_IDENTIFIER ON
233 | GO
234 | CREATE FUNCTION [dbo].[Search]
235 | (
236 | -- A JSON array of the embedding vector.
237 | @point nvarchar(max),
238 | -- A search domain: the half-width of the match interval per dimension.
239 | @domain real,
240 | -- Optional doc id.
241 | @docId bigint = null
242 | )
243 | returns table
244 | as
245 | return
246 | with Vector as
247 | (
248 | select
249 | [key] Idx,
250 | value - @domain MinValue,
251 | value + @domain MaxValue
252 | from
253 | openjson(@point)
254 | ),
255 | Node as
256 | (
257 | select
258 | *
259 | from
260 | dbo.TextIndex
261 | where
262 | RangeID = 0 and
263 | (@docId is null or DocID = @docId)
264 | union all
265 | select
266 | I.*
267 | from
268 | dbo.TextIndex I
269 | inner join
270 | Node N
271 | on
272 | N.LowRangeID is not null and
273 | I.DocID = N.DocID and
274 | I.RangeID = N.LowRangeID and
275 | (
276 | N.Dimension is null or
277 | N.Mid >= (select MinValue from Vector where Idx = N.Dimension)
278 | )
279 | union all
280 | select
281 | I.*
282 | from
283 | dbo.TextIndex I
284 | inner join
285 | Node N
286 | on
287 | N.HighRangeID is not null and
288 | I.DocID = N.DocID and
289 | I.RangeID = N.HighRangeID and
290 | (
291 | N.Dimension is null or
292 | N.Mid <= (select MaxValue from Vector where Idx = N.Dimension)
293 | )
294 | )
295 | select DocID, TextID from Node where TextID is not null;
296 | GO
297 | /****** Object: Table [dbo].[Document] Script Date: 28/06/2023 17:06:20 ******/
298 | SET ANSI_NULLS ON
299 | GO
300 | SET QUOTED_IDENTIFIER ON
301 | GO
302 | CREATE TABLE [dbo].[Document](
303 | [DocID] [bigint] NOT NULL,
304 | [Name] [nvarchar](256) NULL,
305 | CONSTRAINT [PK_Documents] PRIMARY KEY CLUSTERED
306 | (
307 | [DocID] ASC
308 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
309 | ) ON [PRIMARY]
310 | GO
311 | /****** Object: Table [dbo].[Text] Script Date: 28/06/2023 17:06:20 ******/
312 | SET ANSI_NULLS ON
313 | GO
314 | SET QUOTED_IDENTIFIER ON
315 | GO
316 | CREATE TABLE [dbo].[Text](
317 | [DocID] [bigint] NOT NULL,
318 | [TextID] [bigint] NOT NULL,
319 | [Text] [nvarchar](max) NULL,
320 | [Vector] [nvarchar](max) NULL,
321 | CONSTRAINT [PK_Text] PRIMARY KEY CLUSTERED
322 | (
323 | [DocID] ASC,
324 | [TextID] ASC
325 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
326 | ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
327 | GO
328 | ALTER TABLE [dbo].[Document] ADD CONSTRAINT [DF_Document_DocID] DEFAULT (NEXT VALUE FOR [dbo].[DocumentID]) FOR [DocID]
329 | GO
330 | ALTER TABLE [dbo].[Text] ADD CONSTRAINT [DF_Text_TextID_1] DEFAULT (NEXT VALUE FOR [dbo].[TextID]) FOR [TextID]
331 | GO
332 | ALTER TABLE [dbo].[Text] WITH CHECK ADD CONSTRAINT [FK_Text_Document] FOREIGN KEY([DocID])
333 | REFERENCES [dbo].[Document] ([DocID])
334 | ON UPDATE CASCADE
335 | ON DELETE CASCADE
336 | GO
337 | ALTER TABLE [dbo].[Text] CHECK CONSTRAINT [FK_Text_Document]
338 | GO
339 | ALTER TABLE [dbo].[TextIndex] WITH CHECK ADD CONSTRAINT [FK_TextIndex_Document] FOREIGN KEY([DocID])
340 | REFERENCES [dbo].[Document] ([DocID])
341 | ON UPDATE CASCADE
342 | ON DELETE CASCADE
343 | GO
344 | ALTER TABLE [dbo].[TextIndex] CHECK CONSTRAINT [FK_TextIndex_Document]
345 | GO
346 | /****** Object: StoredProcedure [dbo].[IndexDocument] Script Date: 28/06/2023 17:06:20 ******/
347 | SET ANSI_NULLS ON
348 | GO
349 | SET QUOTED_IDENTIFIER ON
350 | GO
351 |
352 | CREATE procedure [dbo].[IndexDocument]
353 | @docID bigint
354 | as
355 | begin
356 | set nocount on;
357 |
358 | declare @points dbo.PointType;
359 | declare @index dbo.RangeType;
360 |
361 | --raiserror(N'Start loading points.', 0, 0, @timespan) with nowait;
362 |
363 | --set @start = current_timestamp;
364 |
365 | insert into @points(ID, Idx, Value)
366 | select
367 | TextID, [key], value
368 | from
369 | dbo.Text
370 | cross apply
371 | openjson(Vector)
372 | where
373 | DocID = @docID;
374 |
375 | --set @end = current_timestamp;
376 | --set @timespan = datediff(ms, @start, @end);
377 |
378 | --raiserror(N'Points loaded in %i milliseconds.', 0, 0, @timespan) with nowait;
379 |
380 | --raiserror(N'Start building index.', 0, 0, @timespan) with nowait;
381 |
382 | --set @start = current_timestamp;
387 |
388 | insert into @index
389 | select * from dbo.BuildIndex(@points);
390 |
391 | --set @end = current_timestamp;
392 | --set @timespan = datediff(ms, @start, @end);
393 |
394 | --raiserror(N'Index built in %i milliseconds.', 0, 0, @timespan) with nowait;
395 |
396 | -- Update index.
397 | delete from dbo.TextIndex where DocID = @docID;
398 |
399 | insert into dbo.TextIndex
400 | (
401 | DocID,
402 | RangeID,
403 | Dimension,
404 | Mid,
405 | LowRangeID,
406 | HighRangeID,
407 | TextID
408 | )
409 | select
410 | @docID,
411 | RangeID,
412 | Dimension,
413 | Mid,
414 | LowRangeID,
415 | HighRangeID,
416 | ID
417 | from
418 | @index
419 | end
420 | GO
421 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Nesterovsky Bros
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MemoryVectorIndex/MemoryVectorIndex.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Numerics;
3 |
4 | namespace NesterovskyBros.VectorIndex;
5 |
6 | /// <summary>
7 | /// A record index by normalized vectors, such
8 | /// that all their components lie in the range [-1, 1].
9 | /// All vectors must be of the same size.
10 | /// </summary>
11 | /// <typeparam name="R">A record type associated with a vector.</typeparam>
12 | public class MemoryVectorIndex<R>: IEnumerable<R>
13 | {
14 | /// <summary>
15 | /// Creates a vector index.
16 | /// </summary>
17 | /// <param name="vectorSelector">
18 | /// A function returning a vector for the record.
19 | /// </param>
20 | /// <param name="listThreshold">
21 | /// A threshold size to store records in list buckets.
22 | /// </param>
23 | public MemoryVectorIndex(
24 | Func<R, ReadOnlyMemory<float>> vectorSelector,
25 | int listThreshold = 10)
26 | {
27 | if (listThreshold <= 0)
28 | {
29 | throw new ArgumentException(
30 | "List threshold must be greater than zero.",
31 | nameof(listThreshold));
32 | }
33 |
34 | this.vectorSelector = vectorSelector;
35 | this.listThreshold = listThreshold;
36 | }
37 |
38 | /// <summary>
39 | /// Creates a vector index.
40 | /// </summary>
41 | /// <param name="records">Records to add to the index.</param>
42 | /// <param name="vectorSelector">
43 | /// A function returning a vector for the record.
44 | /// </param>
45 | /// <param name="listThreshold">
46 | /// A threshold size to store records in list buckets.
47 | /// </param>
48 | public MemoryVectorIndex(
49 | IEnumerable<R> records,
50 | Func<R, ReadOnlyMemory<float>> vectorSelector,
51 | int listThreshold = 10):
52 | this(vectorSelector, listThreshold)
53 | {
54 | foreach(var record in records)
55 | {
56 | Add(record);
57 | }
58 | }
59 |
60 | /// <summary>
61 | /// Number of records.
62 | /// </summary>
63 | public int Count { get; private set; }
64 |
65 | /// <inheritdoc/>
66 | public IEnumerator<R> GetEnumerator() =>
67 | records.Values.SelectMany(items => items).GetEnumerator();
68 |
69 | /// <inheritdoc/>
70 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
71 |
72 | /// <summary>
73 | /// Clears the index.
74 | /// </summary>
75 | public void Clear()
76 | {
77 | Count = 0;
78 | records.Clear();
79 | entries.Clear();
80 | }
81 |
82 | /// <summary>
83 | /// Adds a record to the index.
84 | /// </summary>
85 | /// <param name="record">A record to add.</param>
86 | public void Add(R record)
87 | {
88 | var vector = vectorSelector(record).Span;
89 |
90 | if (entries is [])
91 | {
92 | if (vector.Length == 0)
93 | {
94 | throw new ArgumentException("Invalid vector size.", nameof(record));
95 | }
96 |
97 | vectorSize = vector.Length;
98 | Count = 1;
99 | records[0] = [record];
100 | entries.Add((-1, -1));
101 |
102 | return;
103 | }
104 |
105 | if (vector.Length != vectorSize)
106 | {
107 | throw new ArgumentException("Invalid vector size.", nameof(record));
108 | }
109 |
110 | var index = 0;
111 | var step = 1f;
112 | var centers = new float[vector.Length];
113 |
114 | for(var depth = 0; depth < maxDepth; ++depth)
115 | {
116 | step /= 2;
117 |
118 | for(var i = 0; i < vector.Length; ++i)
119 | {
120 | var (low, high) = entries[index];
121 |
122 | if (vector[i] < centers[i])
123 | {
124 | if (low >= 0)
125 | {
126 | centers[i] -= step;
127 | index = low;
128 |
129 | continue;
130 | }
131 |
132 | if (high >= 0)
133 | {
134 | entries[index] = (entries.Count, high);
135 | records[entries.Count] = [record];
136 | entries.Add((-1, -1));
137 | ++Count;
138 |
139 | return;
140 | }
141 | }
142 | else
143 | {
144 | if (high >= 0)
145 | {
146 | centers[i] += step;
147 | index = high;
148 |
149 | continue;
150 | }
151 |
152 | if (low >= 0)
153 | {
154 | entries[index] = (low, entries.Count);
155 | records[entries.Count] = [record];
156 | entries.Add((-1, -1));
157 | ++Count;
158 |
159 | return;
160 | }
161 | }
162 |
163 | // This is a leaf.
164 | var list = records[index];
165 |
166 | list.Add(record);
167 | ++Count;
168 |
169 | if (list.Count <= listThreshold || depth >= maxDepth - 1)
170 | {
171 | return;
172 | }
173 |
174 | records.Remove(index);
175 |
176 | // Split the list.
177 | List<R> lowList = [];
178 |
179 | for(; depth < maxDepth; ++depth)
180 | {
181 | for(; i < vector.Length; ++i)
182 | {
183 | for(var j = list.Count; j-- > 0;)
184 | {
185 | var item = list[j];
186 |
187 | if (vectorSelector(item).Span[i] < centers[i])
188 | {
189 | lowList.Add(item);
190 | list.RemoveAt(j);
191 | }
192 | }
193 |
194 | if (lowList is [])
195 | {
196 | centers[i] += step;
197 | entries[index] = (-1, entries.Count);
198 | index = entries.Count;
199 | entries.Add((-1, -1));
200 | }
201 | else if (list is [])
202 | {
203 | centers[i] -= step;
204 | (lowList, list) = (list, lowList);
205 | entries[index] = (entries.Count, -1);
206 | index = entries.Count;
207 | entries.Add((-1, -1));
208 | }
209 | else
210 | {
211 | entries[index] = (entries.Count, entries.Count + 1);
212 | records[entries.Count] = lowList;
213 | records[entries.Count + 1] = list;
214 | entries.Add((-1, -1));
215 | entries.Add((-1, -1));
216 |
217 | return;
218 | }
219 | }
220 | }
221 |
222 | // Bad distribution, probably not normalized.
223 | records[index] = list;
224 |
225 | return;
226 | }
227 | }
228 | }
229 |
230 | /// <summary>
231 | /// Finds records in the index.
232 | /// </summary>
233 | /// <param name="vector">A vector for the neighborhood origin.</param>
234 | /// <param name="distance">A Euclidean distance for the match.</param>
235 | /// <param name="predicate">A filter predicate.</param>
236 | /// <returns>An enumeration of matched records.</returns>
237 | /// <remarks>
238 | /// The index searches records and discards those that are too far, yet the
239 | /// predicate may receive records that are still too far for the match,
240 | /// so the predicate should verify the match.
241 | /// </remarks>
242 | public IEnumerable<R> Find(
243 | ReadOnlyMemory<float> vector,
244 | float distance,
245 | Func<R, ReadOnlyMemory<float>, bool> predicate)
246 | {
247 | if (entries is [])
248 | {
249 | yield break;
250 | }
251 |
252 | if (vector.Length != vectorSize)
253 | {
254 | throw new ArgumentException("Invalid vector size.", nameof(vector));
255 | }
256 |
257 | var index = 0;
258 | var centers = new float[vector.Length];
259 | Stack<(int index, int i, float center, float step, float length)> state =
260 | [];
261 |
262 | state.Push((0, 0, 0, 1, distance * distance));
263 |
264 | while(state.TryPeek(out var item))
265 | {
266 | (var prev, index) = (index, item.index);
267 | var (i, center, step) = (item.i, item.center, item.step);
268 | var (low, high) = entries[index];
269 |
270 | centers[i] = center;
271 |
272 | if (prev == high)
273 | {
274 | state.Pop();
275 |
276 | continue;
277 | }
278 |
279 | var delta = vector.Span[i] - center;
280 | var prevDelta = Math.Max(Math.Abs(delta) - step, 0);
281 |
282 | if (prev != low && low != -1)
283 | {
284 | var length = delta <= 0 ? item.length :
285 | item.length + (prevDelta - delta) * (prevDelta + delta);
286 |
287 | if (length >= 0)
288 | {
289 | var half = step / 2;
290 |
291 | centers[i] -= half;
292 |
293 | if (++i == vectorSize)
294 | {
295 | i = 0;
296 | step = half;
297 | }
298 |
299 | state.Push((low, i, centers[i], step, length));
300 |
301 | continue;
302 | }
303 | }
304 |
305 | if (high != -1)
306 | {
307 | var length = delta >= 0 ? item.length :
308 | item.length + (prevDelta - delta) * (prevDelta + delta);
309 |
310 | if (length >= 0)
311 | {
312 | var half = step / 2;
313 |
314 | centers[i] += half;
315 |
316 | if (++i == vectorSize)
317 | {
318 | i = 0;
319 | step = half;
320 | }
321 |
322 | state.Push((high, i, centers[i], step, length));
323 | }
324 | else
325 | {
326 | state.Pop();
327 | }
328 |
329 | continue;
330 | }
331 |
332 | state.Pop();
333 |
334 | if (low == -1)
335 | {
336 | foreach(var record in records[index])
337 | {
338 | if (predicate(record, vector))
339 | {
340 | yield return record;
341 | }
342 | }
343 | }
344 | }
345 | }
346 |
347 | /// <summary>
348 | /// Removes records from the index.
349 | /// </summary>
350 | /// <param name="vector">A vector for the neighborhood origin.</param>
351 | /// <param name="distance">A Euclidean distance for the match.</param>
352 | /// <param name="predicate">A filter predicate.</param>
353 | /// <remarks>
354 | /// The index searches records and discards those that are too far, yet the
355 | /// predicate may receive records that are still too far for the match,
356 | /// so the predicate should verify the match.
357 | /// </remarks>
358 | public void Remove(
359 | ReadOnlyMemory<float> vector,
360 | float distance,
361 | Func<R, ReadOnlyMemory<float>, bool> predicate)
362 | {
363 | if (entries is [])
364 | {
365 | return;
366 | }
367 |
368 | if (vector.Length != vectorSize)
369 | {
370 | throw new ArgumentException("Invalid vector size.", nameof(vector));
371 | }
372 |
373 | var vectorSpan = vector.Span;
374 | var index = 0;
375 | var centers = new float[vector.Length];
376 | Stack<(int index, int i, float center, float step, float length)> state =
377 | [];
378 |
379 | state.Push((0, 0, 0, 1, distance * distance));
380 |
381 | while(state.TryPeek(out var item))
382 | {
383 | (var prev, index) = (index, item.index);
384 | var (i, center, step) = (item.i, item.center, item.step);
385 | var (low, high) = entries[index];
386 |
387 | centers[i] = center;
388 |
389 | if (prev == high)
390 | {
391 | state.Pop();
392 |
393 | continue;
394 | }
395 |
396 | var delta = vectorSpan[i] - center;
397 | var prevDelta = Math.Max(Math.Abs(delta) - step, 0);
398 |
399 | if (prev != low && low != -1)
400 | {
401 | var length = delta <= 0 ? item.length :
402 | item.length + (prevDelta - delta) * (prevDelta + delta);
403 |
404 | if (length >= 0)
405 | {
406 | var half = step / 2;
407 |
408 | centers[i] -= half;
409 |
410 | if (++i == vectorSize)
411 | {
412 | i = 0;
413 | step = half;
414 | }
415 |
416 | state.Push((low, i, centers[i], step, length));
417 |
418 | continue;
419 | }
420 | }
421 |
422 | if (high != -1)
423 | {
424 | var length = delta >= 0 ? item.length :
425 | item.length + (prevDelta - delta) * (prevDelta + delta);
426 |
427 | if (length >= 0)
428 | {
429 | var half = step / 2;
430 |
431 | centers[i] += half;
432 |
433 | if (++i == vectorSize)
434 | {
435 | i = 0;
436 | step = half;
437 | }
438 |
439 | state.Push((high, i, centers[i], step, length));
440 | }
441 | else
442 | {
443 | state.Pop();
444 | }
445 |
446 | continue;
447 | }
448 |
449 | state.Pop();
450 |
451 | if (low == -1)
452 | {
453 | var list = records[index];
454 |
455 | for(i = list.Count; i-- > 0;)
456 | {
457 | if (predicate(list[i], vector))
458 | {
459 | list.RemoveAt(i);
460 | }
461 | }
462 |
463 | if (list is [])
464 | {
465 | records.Remove(index);
466 |
467 | // NOTE: we do not consolidate lists here.
468 | while(state.TryPeek(out item))
469 | {
470 | (low, high) = entries[item.index];
471 |
472 | if (low == -1 || high == -1)
473 | {
474 | centers[i] = center;
475 | index = item.index;
476 | entries[item.index] = (-1, -1);
477 | state.Pop();
478 |
479 | continue;
480 | }
481 |
482 | entries[item.index] = low == index ? (-1, high) : (low, -1);
483 |
484 | break;
485 | }
486 | }
487 | }
488 | }
489 | }
490 |
491 | public IEnumerable<
492 | (
493 | int index,
494 | int parent,
495 | ReadOnlyMemory<float> center,
496 | IReadOnlyList<R>? records
497 | )> IndexHierarchy
498 | {
499 | get
500 | {
501 | if (entries is [])
502 | {
503 | yield break;
504 | }
505 |
506 | var index = 0;
508 | var centers = new float[vectorSize];
509 | Stack<(int index, int parent, int i, float center, float step)> state = [];
510 |
511 | state.Push((0, -1, 0, 0, 1));
512 |
513 | while(state.TryPeek(out var item))
514 | {
515 | var (i, step) = (item.i, item.step);
516 | (var prev, index) = (index, item.index);
517 | var (low, high) = entries[index];
518 |
519 | centers[i] = item.center;
520 |
521 | if (prev == high)
522 | {
523 | state.Pop();
524 |
525 | continue;
526 | }
527 |
528 | if (prev != low && low != -1)
529 | {
530 | var half = step / 2;
531 |
532 | centers[i] -= half;
533 |
534 | if (++i == vectorSize)
535 | {
536 | i = 0;
537 | step = half;
538 | }
539 |
540 | state.Push((low, index, i, centers[i], step));
541 |
542 | yield return (low, index, centers, null);
543 |
544 | continue;
545 | }
546 |
547 | if (high != -1)
548 | {
549 | var half = step / 2;
550 |
551 | centers[i] += half;
552 |
553 | if (++i == vectorSize)
554 | {
555 | i = 0;
556 | step = half;
557 | }
558 |
559 | state.Push((high, index, i, centers[i], step));
560 |
561 | yield return (high, index, centers, null);
562 |
563 | continue;
564 | }
565 |
566 | if (low == -1)
567 | {
568 | state.Pop();
569 |
570 | yield return (index, item.parent, centers, records[index]);
571 | }
572 | }
573 | }
574 | }
575 |
576 | /// <summary>
577 | /// A function returning a vector for a record.
578 | /// </summary>
579 | private readonly Func<R, ReadOnlyMemory<float>> vectorSelector;
580 |
581 | /// <summary>
582 | /// A threshold size to store records in list buckets.
583 | /// </summary>
584 | private readonly int listThreshold;
585 |
586 | /// <summary>
587 | /// A size of the vector.
588 | /// </summary>
589 | private int vectorSize;
590 |
591 | /// <summary>
592 | /// List of buckets.
593 | /// </summary>
594 | private readonly List<(int low, int high)> entries = [];
595 |
596 | /// <summary>
597 | /// Record lists by entries.
598 | /// </summary>
599 | private readonly Dictionary<int, List<R>> records = [];
600 |
601 | /// <summary>
602 | /// Max split depth (the significand bit length of float) before falling back to a list.
603 | /// </summary>
604 | private static readonly int maxDepth = ((IFloatingPoint<float>)0f).GetSignificandBitLength();
605 | }
606 |
--------------------------------------------------------------------------------
/MemoryVectorIndex/MemoryVectorIndex.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 |   <PropertyGroup>
4 |     <TargetFramework>net8.0</TargetFramework>
5 |     <ImplicitUsings>enable</ImplicitUsings>
6 |     <Nullable>enable</Nullable>
7 |   </PropertyGroup>
8 |
9 | </Project>
--------------------------------------------------------------------------------
/MempryVectorIndex.Tests/MemoryVectorIndexTests.cs:
--------------------------------------------------------------------------------
1 | using ArffTools;
2 |
3 | namespace NesterovskyBros.VectorIndex;
4 |
5 | [TestClass]
6 | public class MemoryVectorIndexTests
7 | {
8 | const string datasets = "https://raw.githubusercontent.com/nesterovsky-bros/clustering-benchmark/master/src/main/resources/datasets/";
9 |
10 | [TestMethod]
11 | public void Test_3_3()
12 | {
13 | List<Record> records = [];
14 |
15 | for(var i = 0; i < 3; ++i)
16 | {
17 | for(var j = 0; j < 3; ++j)
18 | {
19 | records.Add(new()
20 | {
21 | id = records.Count,
22 | tag = $"{i},{j}",
23 | vector = [i - 1, j - 1]
24 | });
25 | }
26 | }
27 |
28 | Test("Test_3_3", records, [.5f, .9f], .6f);
29 | }
30 |
31 | [TestMethod]
32 | public void Test_10_10()
33 | {
34 | List<Record> records = [];
35 |
36 | for(var i = 0; i < 10; ++i)
37 | {
38 | for(var j = 0; j < 10; ++j)
39 | {
40 | records.Add(new()
41 | {
42 | id = records.Count,
43 | tag = $"{i},{j}",
44 | vector = [(i - 4.5f) / 5, (j - 4.5f) / 5]
45 | });
46 | }
47 | }
48 |
49 | Test("Test_10_10", records, [.3f, .3f], .3f);
50 | }
51 |
52 | [TestMethod]
53 | public void Test_100_100()
54 | {
55 | List<Record> records = [];
56 |
57 | for(var i = 0; i < 100; ++i)
58 | {
59 | for(var j = 0; j < 100; ++j)
60 | {
61 | records.Add(new()
62 | {
63 | id = records.Count,
64 | tag = $"{i},{j}",
65 | vector = [(i - 49.5f) / 50, (j - 49.5f) / 50]
66 | });
67 | }
68 | }
69 |
70 | Test("Test_100_100", records, [.3f, .3f], .1f);
71 | }
72 |
73 | [TestMethod]
74 | public void Test_1000_1000()
75 | {
76 | List<Record> records = [];
77 |
78 | for(var i = 0; i < 1000; ++i)
79 | {
80 | for(var j = 0; j < 1000; ++j)
81 | {
82 | records.Add(new()
83 | {
84 | id = records.Count,
85 | tag = $"{i},{j}",
86 | vector = [(i - 499.5f) / 500, (j - 499.5f) / 500]
87 | });
88 | }
89 | }
90 |
91 | Test("Test_1000_1000", records, [.3f, .3f], .05f);
92 | }
93 |
94 | [TestMethod]
95 | public void Test_100_100_NotNormalizedVectors()
96 | {
97 | List<Record> records = [];
98 |
99 | for(var i = 0; i < 100; ++i)
100 | {
101 | for(var j = 0; j < 100; ++j)
102 | {
103 | records.Add(new()
104 | {
105 | id = records.Count,
106 | tag = $"{i},{j}",
107 | vector = [i - 1, j - 1]
108 | });
109 | }
110 | }
111 |
112 | Test("Test_100_100_NotNormalizedVectors", records, [.3f, .3f], .3f);
113 | }
114 |
115 | [TestMethod]
116 | public async Task Test_2d_10c()
117 | {
118 | var dataset = await Dataset.Read("artificial/2d-10c.arff");
119 | float[] point = [(73 - dataset.offsetX) / dataset.scale, (70 - dataset.offsetY) / dataset.scale];
120 | var distance = 10f / dataset.scale;
121 |
122 | var match = Test("Test_2d_10c", dataset.records, point, distance);
123 |
124 | var view = $"X, Y\n{string.Join(
125 | '\n',
126 | match.Select(record =>
127 | {
128 | var vector = dataset.Scale(record.vector);
129 |
130 | return $"{vector[0]}, {vector[1]}";
131 | }))}";
132 |
133 | Console.WriteLine(view);
134 | }
135 |
136 | private static List<Record> Test(
137 | string name,
138 | List<Record> records,
139 | float[] point,
140 | float distance)
141 | {
142 | var index = new MemoryVectorIndex<Record>(records, record => record.vector);
143 |
144 | //var view = System.Text.Json.JsonSerializer.Serialize(
145 | // index.IndexHierarchy.Select(item => new
146 | // {
147 | // item.index,
148 | // item.parent,
149 | // center = item.center.ToArray(),
150 | // records = item.records?.Select(item => item.vector).ToArray()
151 | // }),
152 | // new System.Text.Json.JsonSerializerOptions()
153 | // {
154 | // WriteIndented = true
155 | // });
156 |
157 | //Console.WriteLine(view);
158 |
159 | Assert.AreEqual(index.Count, records.Count);
160 |
161 | var plainMatch = records.
162 | Where(record => Distance(record.vector, point) <= distance).
163 | ToList();
164 |
165 | var testCount = 0;
166 |
167 | var match = index.
168 | Find(
169 | point,
170 | distance,
171 | (record, vector) =>
172 | {
173 | ++testCount;
174 |
175 | return Distance(record.vector, vector.Span) <= distance;
176 | }).
177 | ToList();
178 |
179 | var unmatch = records.
180 | ExceptBy(match.Select(record => record.id), record => record.id).
181 | ToList();
182 |
183 | var invalidMatch = match.
184 | Where(record => Distance(record.vector, point) > distance).
185 | ToList();
186 |
187 | var invalidUnmatch = unmatch.
188 | Where(record => Distance(record.vector, point) <= distance).
189 | ToList();
190 |
191 | Console.WriteLine($"{name
192 | }:\n records: {records.Count }, distance: {distance
193 | }\n matched: {match.Count} - {
194 | (float)match.Count / records.Count:P1}\n predicate calls: {
195 | testCount} - {(float)testCount / records.Count:P1}\n predicates per match: {
196 | (float)testCount / match.Count:N1}.");
197 |
198 | Assert.AreEqual(invalidMatch.Count, 0);
199 | Assert.AreEqual(invalidUnmatch.Count, 0);
200 | Assert.AreEqual(match.Count, plainMatch.Count);
201 |
202 | Assert.IsTrue(!match.
203 | ExceptBy(plainMatch.Select(record => record.id), record => record.id).
204 | Any());
205 |
206 | return match;
207 | }
208 |
209 | private static float Distance(
210 | ReadOnlySpan<float> a,
211 | ReadOnlySpan<float> b)
212 | {
213 | var x = a[0] - b[0];
214 | var y = a[1] - b[1];
215 |
216 | return MathF.Sqrt(x * x + y * y);
217 | }
218 |
219 | public record struct Record
220 | {
221 | public float X => vector?[0] ?? 0;
222 | public float Y => vector?[1] ?? 0;
223 |
224 | public int id;
225 | public string? tag;
226 | public float[] vector;
227 | }
228 |
229 | public record Dataset
230 | {
231 | public List<Record> records = null!;
232 | public float offsetX;
233 | public float offsetY;
234 | public float scale;
235 |
236 | public Dataset() { }
237 |
238 | public Dataset(List<Record> records, bool normalize = true)
239 | {
240 | this.records = records;
241 |
242 | if (!normalize)
243 | {
244 | scale = 1;
245 |
246 | return;
247 | }
248 |
249 | var minX = float.PositiveInfinity;
250 | var maxX = float.NegativeInfinity;
251 | var minY = float.PositiveInfinity;
252 | var maxY = float.NegativeInfinity;
253 |
254 | foreach(var record in records)
255 | {
256 | var x = record.vector[0];
257 | var y = record.vector[1];
258 |
259 | minX = Math.Min(minX, x);
260 | maxX = Math.Max(maxX, x);
261 | minY = Math.Min(minY, y);
262 | maxY = Math.Max(maxY, y);
263 | }
264 |
265 | if (minX >= -1 && maxX <= 1 && minY >= -1 && maxY <= 1)
266 | {
267 | scale = 1;
268 |
269 | return;
270 | }
271 |
272 | if (maxX - minX <= 2 && maxY - minY <= 2)
273 | {
274 | scale = 1;
275 | offsetX = minX >= -1 && maxX <= 1 ? 0 : (minX + maxX) / 2;
276 | offsetY = minY >= -1 && maxY <= 1 ? 0 : (minY + maxY) / 2;
277 |
278 | foreach(var record in records)
279 | {
280 | var vector = record.vector;
281 |
282 | vector[0] -= offsetX;
283 | vector[1] -= offsetY;
284 | }
285 |
286 | return;
287 | }
288 | else
289 | {
290 | scale = Math.Max(maxX - minX, maxY - minY) / 2;
291 | offsetX = minX >= -1 && maxX <= 1 ? 0 : (minX + maxX) / 2;
292 | offsetY = minY >= -1 && maxY <= 1 ? 0 : (minY + maxY) / 2;
293 |
294 | foreach(var record in records)
295 | {
296 | var vector = record.vector;
297 |
298 | vector[0] = (vector[0] - offsetX) / scale;
299 | vector[1] = (vector[1] - offsetY) / scale;
300 | }
301 |
302 | return;
303 | }
304 | }
305 |
306 | public static async Task<Dataset> Read(string path, bool normalize = true)
307 | {
308 | List<Record> records = [];
309 |
310 | {
311 | using var client = new HttpClient();
312 | using var reader =
313 | new ArffReader(await client.GetStreamAsync($"{datasets}/{path}"));
314 | var header = reader.ReadHeader();
315 |
316 | while(true)
317 | {
318 | var row = reader.ReadInstance();
319 |
320 | if (row == null)
321 | {
322 | break;
323 | }
324 |
325 | var x = Convert.ToSingle(row[0]);
326 | var y = Convert.ToSingle(row[1]);
327 | var tag = Convert.ToString(row[2]);
328 |
329 | records.Add(new()
330 | {
331 | id = records.Count,
332 | tag = tag,
333 | vector = [x, y]
334 | });
335 | }
336 | }
337 |
338 | return new Dataset(records, normalize);
339 | }
340 |
341 | public float[] Scale(float[] vector) =>
342 | [vector[0] * scale + offsetX, vector[1] * scale + offsetY];
343 | }
344 | }
--------------------------------------------------------------------------------
/MempryVectorIndex.Tests/MempryVectorIndex.Tests.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 |   <PropertyGroup>
4 |     <TargetFramework>net8.0</TargetFramework>
5 |     <ImplicitUsings>enable</ImplicitUsings>
6 |     <Nullable>enable</Nullable>
7 |
8 |     <IsPackable>false</IsPackable>
9 |     <IsTestProject>true</IsTestProject>
10 |   </PropertyGroup>
11 |
12 |   <ItemGroup>
13 |     <PackageReference Include="coverlet.collector">
14 |       <PrivateAssets>all</PrivateAssets>
15 |       <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
16 |     </PackageReference>
17 |     <PackageReference Include="ArffTools" />
18 |     <PackageReference Include="Microsoft.NET.Test.Sdk" />
19 |     <PackageReference Include="MSTest.TestAdapter" />
20 |     <PackageReference Include="MSTest.TestFramework" />
21 |   </ItemGroup>
22 |
23 |   <ItemGroup>
24 |     <ProjectReference Include="..\MemoryVectorIndex\MemoryVectorIndex.csproj" />
25 |   </ItemGroup>
26 |
27 | </Project>
32 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # vector-database
2 | Turn SQL Server into vector database
3 |
4 | # Turn Akinator into a vector database!
5 | Several years ago we showed how to turn SQL Server into an Akinator-like engine. See the [KB](https://github.com/nesterovsky-bros/KB) repository.
6 |
7 | At that time we did not know about vector databases.
8 | We just implemented a binary search index to identify an object from a set of objects by a set of properties inherent to that object.
9 |
10 | ## Briefly
11 | Assume you have a set of objects.
12 | Assume you have a set of boolean properties.
13 | We have a matrix of `[objects x properties]` with `true` or `false` in its cells.
14 |
15 | If we present all properties as a vector `[p1, p2, p3, ..., pn]`, then we turn the original task into the task of creating an index of objects by a vector of booleans.
16 |
17 | ## Present time vector database
18 | It is only half a step to extend a vector of booleans to a vector of floats. It is enough to say that a float can be represented as a set of bits (booleans), so all the ideas of the KB database apply to a vector database.
19 |
20 | ## Vector database
21 | ### Let's formulate the idea.
22 | 1. We have a set of vectors.
23 | 2. We want to build an index that allows us to efficiently find vectors in some vicinity of a given vector.
24 | 3. To achieve the goal we use "divide and conquer" method.
25 |
26 | 3.1. Split the whole vector space in two parts.
27 | There are multiple ways to do this, but we selected one of the simplest, readily available in SQL.
28 | We calculate a mean `avg()` and a standard deviation `stdev()` of all vectors for each dimension.
29 | For the split we select a dimension by its standard deviation (the implementation alternates between the highest and the lowest per tree level), and split at the mean point; the core query is shown after this list.
30 | This gives us two subsets of vectors of similar cardinality.
31 |
32 | 3.2. Repeat step 3.1 for each subset, until it contains exactly one vector.
33 |
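34 | The heart of step 3.1 is a single aggregation. Here is the dimension-selection query, slightly simplified from [DDL.sql](./DDL.sql), where `@points` is a table of `(ID, Idx, Value)` vector components:
35 |
36 | ```SQL
37 | -- Pick the split dimension by spread and remember its mean.
38 | select top 1
39 |   Idx,
40 |   avg(Value) Mean,
41 |   isnull(stdev(Value), 0) Stdev,
42 |   count_big(*) Count
43 | from
44 |   @points
45 | group by
46 |   Idx
47 | order by
48 |   Stdev desc;
49 | ```
50 |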
34 | The height of the tree that we build this way is proportional to `Log2(N)`, where `N` is the number of vectors in the set.
35 | Estimation shows that for a set of `N` vectors the number of operations required to build such a binary index is proportional to `N*Log2(N)`.
36 | Obviously, the complexity of the algorithm is also proportional to the dimension of the vectors. For example, for `N = 10^7` vectors the tree is only about `Log2(10^7) ≈ 23` levels deep.
37 |
38 | ### SQL Server
39 | SQL Server lets us store float vectors as JSON. Not the best storage format, but we go with it.
40 | Here is our vector table:
41 |
42 | ```SQL
43 | create table dbo.Text
44 | (
45 | TextID bigint not null primary key,
46 | Text nvarchar(max) null,
47 | Vector varchar(max) null
48 | );
49 | ```
50 |
51 | Please note that this table is used to bind `TextID` to `Vector` and to build the search index, but not for the search itself.
52 |
53 | Here is a structure of the binary index:
54 |
55 | ```SQL
56 | create table dbo.TextIndex
57 | (
58 | RangeID bigint not null primary key,
59 | Dimension smallint null,
60 | Mid real null,
61 | LowRangeID bigint null,
62 | HighRangeID bigint null,
63 | TextID bigint null
64 | );
65 | ```
66 |
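67 | For example, two one-dimensional vectors `[0.1]` (TextID 1) and `[0.5]` (TextID 2) of one document produce a root node that splits dimension 0 at the mean `0.3`, and two leaves:
68 |
69 | | RangeID | Dimension | Mid | LowRangeID | HighRangeID | TextID |
70 | |--------:|----------:|----:|-----------:|------------:|-------:|
71 | | 0 | 0 | 0.3 | 1 | 2 | |
72 | | 1 | | | | | 1 |
73 | | 2 | | | | | 2 |
74 |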
67 | The search starts from a given `vector` and a `proximity`.
68 | We start from the root `RangeID = 0`, and compare the `Dimension` component of the input `vector ± proximity` against `Mid`.
69 | Depending on the outcome we proceed to the low (`LowRangeID`), the high (`HighRangeID`), or both ranges.
70 | We repeat the previous step with the next ranges until we locate all matched vectors.
71 | For example, with `proximity = 0.1`, a node that splits dimension 2 at `Mid = 0.35` sends a query with `vector[2] = 0.3` to both children, since `0.3 - 0.1 <= 0.35 <= 0.3 + 0.1`.
72 |
73 | Estimation tells that we shall complete the search in at most `Log2(N)` steps.
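74 |
75 | To see how `proximity` turns into per-dimension search intervals, here is the `Vector` CTE from `dbo.Search` extracted as a standalone query:
76 |
77 | ```SQL
78 | -- Each element of the JSON vector becomes the interval
79 | -- [value - @domain, value + @domain] that prunes the tree walk.
80 | declare @point nvarchar(max) = N'[0.1, -0.2, 0.3]';
81 | declare @domain real = 0.05;
82 |
83 | select
84 |   [key] Idx,
85 |   value - @domain MinValue,
86 |   value + @domain MaxValue
87 | from
88 |   openjson(@point);
89 | ```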
73 |
74 | ## Implementation
75 | An implementation may be worth more than theories.
76 | So, you're welcome to see it in [DDL.sql](./DDL.sql).
77 |
78 | ## Use
79 | 1. Create a document: insert something into `dbo.Document`.
80 | 2. Populate vectors: insert something into `dbo.Text`. Note that `dbo.Text.Vector` should be a JSON array of floats.
81 | 3. Index the document: call the stored procedure `dbo.IndexDocument`.
82 | 4. Do the search: call the table-valued function `dbo.Search`, as sketched below.
83 |
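84 | A minimal end-to-end sketch (the document name and the three-dimensional vectors are purely illustrative):
85 |
86 | ```SQL
87 | -- 1. Create a document; DocID comes from the dbo.DocumentID sequence.
88 | insert into dbo.Document(Name) values (N'Sample');
89 |
90 | declare @docID bigint = (select DocID from dbo.Document where Name = N'Sample');
91 |
92 | -- 2. Populate vectors; TextID comes from the dbo.TextID sequence.
93 | insert into dbo.Text(DocID, Text, Vector)
94 | values (@docID, N'Hello world', N'[0.12, -0.34, 0.56]');
95 |
96 | -- 3. Index the document.
97 | exec dbo.IndexDocument @docID;
98 |
99 | -- 4. Search: texts whose vectors lie within ±0.1 per dimension of the query.
100 | select DocID, TextID from dbo.Search(N'[0.11, -0.33, 0.57]', 0.1, @docID);
101 | ```
102 |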
84 | That's all.
85 | Thank you for your attention.
86 |
87 | **P.S.:** In addition, we have implemented a similar [index builder algorithm](./VectorIndex/IndexBuilder.cs) in C#. Though it has the same asymptotic complexity `N*Log2(N)`, it works faster. So, in a more complex setup the index builder may be implemented in C#, and the search in pure SQL.
88 |
89 | **P.P.S.:** By the same token we can implement an efficient vector index in any SQL database that supports recursive CTEs (like SQLite), or in CosmosDB as a function.
90 |
91 | ## C#
92 |
93 | It turned out that our initial parallel C# implementation does not scale to relatively big datasets like deep-image-96-angular, containing ~10M vectors.
94 | Though it is parallel and has `O(N*Log2(N))` complexity, it works against process/CPU data locality and produces an enormous number of page faults.
95 | Alternative data storage like FasterKV turned out to be too slow.
96 |
97 | So, we refactored the code from a parallel tree-level processor into a sequential tree walker.
98 | It virtually follows steps 3.1 and 3.2 sequentially, one range at a time. See: https://github.com/nesterovsky-bros/vector-database/blob/deea9da842cb12e4edcde4e03a1e68014754d15b/VectorIndex/IndexBuilder.cs#L488.
99 |
100 | In this mode we are able to build the index on a laptop in just 3 minutes.
101 |
102 | We now want to implement benchmarks like those in https://qdrant.tech/benchmarks/, though we are not going to implement a client-server protocol at this point.
103 | Still, we think we can get an idea of where we stand compared to other vector engines.
104 |
105 |
--------------------------------------------------------------------------------
/VectorIndex.MainTest/Program.cs:
--------------------------------------------------------------------------------
1 | using System.Diagnostics;
2 |
3 | using HDF.PInvoke;
4 |
5 | using HDF5CSharp;
6 |
7 | using NesterovskyBros.VectorIndex;
8 |
9 | var randomInput = GetRandomDataset((int)DateTime.Now.Ticks, 10000, 1536);
10 |
11 | // Test2 memory
12 | if (true)
13 | {
14 | var stopwatch = new Stopwatch();
15 |
16 | stopwatch.Start();
17 |
18 | var index = new Dictionary<long, RangeValue>();
19 |
20 | await foreach(var (rangeId, range) in
21 | Test(
22 | randomInput,
23 | (_, _) => new MemoryRangeStore()))
24 | {
25 | index.Add(rangeId, range);
26 | }
27 |
28 | stopwatch.Stop();
29 |
30 | Console.WriteLine($"Build index: {stopwatch.Elapsed}");
31 | }
32 |
33 | // Crafted set.
34 | if (true)
35 | {
36 | var stopwatch = new Stopwatch();
37 |
38 | stopwatch.Start();
39 |
40 | var index = new Dictionary<long, RangeValue>();
41 |
42 | await foreach(var (rangeId, range) in
43 | Test(
44 | input(),
45 | (_, _) => new MemoryRangeStore()))
46 | {
47 | index.Add(rangeId, range);
48 | }
49 |
50 | stopwatch.Stop();
51 |
52 | Console.WriteLine($"Build index: {stopwatch.Elapsed}");
53 |
54 | async IAsyncEnumerable<(long, Memory<float>)> input()
55 | {
56 | var dimensions = 1536;
57 |
58 | for(var i = 0L; i < dimensions; ++i)
59 | {
60 | var vector = new float[dimensions];
61 |
62 | vector[i] = 1;
63 |
64 | yield return (i, vector);
65 | }
66 | }
67 | }
68 |
69 | // Test deep-image-96-angular.hdf5
70 | if (true)
71 | {
72 | var fileName = args.Length > 0 ? args[0] : null;
73 |
74 | if(fileName != null)
75 | {
76 | using var outputWriter = args.Length > 1 ? File.CreateText(args[1]) : null;
77 |
78 | if (outputWriter != null)
79 | {
80 | await outputWriter.WriteLineAsync("RangeID,Dimension,Mid,ID");
81 | }
82 |
83 | // /train, /test
84 | var (size, dimension) = GetHdf5DatasetSize(fileName, "/train");
85 | var datasetInput = GetHdf5Dataset(fileName, "/train", size, dimension);
86 | using var store = new FileRangeStore(size, dimension, 10000);
87 |
88 | var stopwatch = new Stopwatch();
89 |
90 | if(args.Length > 2)
91 | {
92 | var count = 0L;
93 | using var trainWriter = File.CreateText(args[2]);
94 |
95 | await trainWriter.WriteLineAsync("ID|Vector");
96 |
97 | await foreach(var (id, vector) in datasetInput)
98 | {
99 | await trainWriter.WriteLineAsync($"{id}|{string.Join(',', vector.ToArray())}");
100 |
101 | ++count;
102 |
103 | if(count % 100000 == 0)
104 | {
105 | Console.WriteLine($"Processed {count} records.");
106 | }
107 | }
108 | }
109 |
110 | if (args.Length > 3)
111 | {
112 | var count = 0L;
113 | var (testSize, testDimension) = GetHdf5DatasetSize(fileName, "/test");
114 | var testDataset = GetHdf5Dataset(fileName, "/test", testSize, testDimension);
115 |
116 | using var testWriter = File.CreateText(args[3]);
117 |
118 | await testWriter.WriteLineAsync("ID,Vector");
119 |
120 | await foreach(var (id, vector) in testDataset)
121 | {
122 | await testWriter.WriteLineAsync($"{id}|{string.Join(',', vector.ToArray())}");
123 |
124 | ++count;
125 |
126 | if(count % 100000 == 0)
127 | {
128 | Console.WriteLine($"Processed {count} records.");
129 | }
130 | }
131 | }
132 |
133 | stopwatch.Start();
134 |
135 | var index = new Dictionary<long, RangeValue>();
136 |
137 | await foreach(var (rangeId, range) in
138 | Test(
139 | datasetInput,
140 | //(_, _) => new MemoryRangeStore()).
141 | store.NextStore))
142 | {
143 | index.Add(rangeId, range);
144 |
145 | if (outputWriter != null)
146 | {
147 | await outputWriter.WriteLineAsync(
148 | $"{rangeId},{range.Dimension},{range.Mid},{range.Id}");
149 | }
150 | }
151 |
152 | stopwatch.Stop();
153 |
154 | Console.WriteLine($"Build index: {stopwatch.Elapsed}, ranges: {index.Count}");
155 | }
156 | }
157 |
158 | IAsyncEnumerable<(long rangeId, RangeValue range)> Test(
159 | IAsyncEnumerable<(long id, Memory<float> vector)> input,
160 | Func<long, long, IRangeStore> storeFactory) =>
161 | IndexBuilder.Build(input, storeFactory);
162 |
163 | async IAsyncEnumerable<(long id, Memory<float> vector)> GetRandomDataset(
164 | int seed,
165 | long count,
166 | short dimensions)
167 | {
168 | var random = new Random(seed);
169 |
170 | for(var i = 0L; i < count; ++i)
171 | {
172 | var vector = new float[dimensions];
173 |
174 | for(var j = 0; j < vector.Length; ++j)
175 | {
176 | vector[j] = random.NextSingle() * 2 - 1;
177 | }
178 |
179 | yield return (i, vector);
180 | }
181 | }
182 |
183 | (long count, short dimensions) GetHdf5DatasetSize(
184 | string fileName,
185 | string datasetName)
186 | {
187 | var fileId = Hdf5.OpenFile(fileName, true);
188 | var datasetId = H5D.open(fileId, Hdf5Utils.NormalizedName(datasetName));
189 |
190 | try
191 | {
192 | var spaceId = H5D.get_space(datasetId);
193 |
194 | try
195 | {
196 | int rank = H5S.get_simple_extent_ndims(spaceId);
197 |
198 | if(rank != 2)
199 | {
200 | throw new InvalidOperationException("Invalid rank.");
201 | }
202 |
203 | ulong[] maxDims = new ulong[rank];
204 | ulong[] dims = new ulong[rank];
205 |
206 | H5S.get_simple_extent_dims(spaceId, dims, maxDims);
207 |
208 | return (checked((long)maxDims[0]), checked((short)maxDims[1]));
209 | }
210 | finally
211 | {
212 | H5S.close(spaceId);
213 | }
214 | }
215 | finally
216 | {
217 | H5D.close(datasetId);
218 | }
219 | }
220 |
221 | async IAsyncEnumerable<(long id, Memory<float> vector)> GetHdf5Dataset(
222 | string fileName,
223 | string datasetName,
224 | long size,
225 | short dimension)
226 | {
227 | var index = 0L;
228 | var step = 100000;
229 | var fileId = Hdf5.OpenFile(fileName, true);
230 |
231 | try
232 | {
233 | while(index < size)
234 | {
235 | var rows = Hdf5.ReadDataset<float>(
236 | fileId,
237 | datasetName,
238 | checked((ulong)index),
239 | Math.Min(checked((ulong)(index + step - 1)), checked((ulong)(size - 1))));
240 |
241 | var count = rows.GetLength(0);
242 |
243 | for(var i = 0; i < count; i++)
244 | {
245 | var row = new float[dimension];
246 |
247 | for(var j = 0; j < row.Length; ++j)
248 | {
249 | row[j] = rows[i, j];
250 | }
251 |
252 | yield return (index++, row);
253 | }
254 | }
255 | }
256 | finally
257 | {
258 | Hdf5.CloseFile(fileId);
259 | }
260 | }
261 |
--------------------------------------------------------------------------------
/VectorIndex.MainTest/VectorIndex.MainTest.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 | <PropertyGroup>
4 | <OutputType>Exe</OutputType>
5 | <TargetFramework>net8.0</TargetFramework>
6 | <ImplicitUsings>enable</ImplicitUsings>
7 | <Nullable>enable</Nullable>
8 | </PropertyGroup>
9 |
10 | <!-- The two per-configuration groups below each carried a single "False" value; Optimize is an assumption. -->
11 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
12 | <Optimize>False</Optimize>
13 | </PropertyGroup>
14 |
15 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
16 | <Optimize>False</Optimize>
17 | </PropertyGroup>
18 |
19 | <ItemGroup>
20 | <!-- PackageReference items (the HDF5 bindings used by Program.cs) were lost in extraction. -->
21 | </ItemGroup>
22 |
23 | <ItemGroup>
24 | <ProjectReference Include="..\VectorIndex\VectorIndex.csproj" />
25 | </ItemGroup>
26 |
27 | </Project>
--------------------------------------------------------------------------------
/VectorIndex/FileRangeStore.cs:
--------------------------------------------------------------------------------
1 | using System.IO.MemoryMappedFiles;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | namespace NesterovskyBros.VectorIndex;
6 |
7 | /// <summary>
8 | /// A memory-mapped file as a store for building a vector index.
9 | /// </summary>
10 | public class FileRangeStore: IDisposable
11 | {
12 | /// <summary>
13 | /// Creates a <see cref="FileRangeStore"/> instance.
14 | /// </summary>
15 | /// <param name="count">Number of vectors.</param>
16 | /// <param name="dimensions">Dimension of vectors.</param>
17 | /// <param name="buffer">A buffer size.</param>
18 | public FileRangeStore(long count, short dimensions, int buffer = 10000)
19 | {
20 | this.dimensions = dimensions;
21 | this.buffer = buffer;
22 | capacity = checked(
23 | (Marshal.SizeOf<long>() + Marshal.SizeOf<float>() * dimensions) *
24 | 4 * count);
25 | highOffset = capacity / 2;
26 | file = MemoryMappedFile.CreateNew(null, capacity);
27 | }
28 |
29 | /// <summary>
30 | /// Releases resources.
31 | /// </summary>
32 | public void Dispose() => file.Dispose();
33 |
34 | /// <summary>
35 | /// Gets the next <see cref="IRangeStore"/> instance.
36 | /// </summary>
37 | /// <param name="rangeId">A range id.</param>
38 | /// <param name="capacity">A capacity.</param>
39 | /// <returns>The <see cref="IRangeStore"/> instance.</returns>
40 | public IRangeStore NextStore(
41 | long rangeId,
42 | long capacity) =>
43 | new RangeStore(this, rangeId, capacity);
44 |
45 | private class RangeStore: IRangeStore
46 | {
47 | public RangeStore(
48 | FileRangeStore container,
49 | long rangeId,
50 | long capacity)
51 | {
52 | this.container = container;
53 | this.rangeId = rangeId;
54 | this.capacity = capacity;
55 | }
56 |
57 | public ValueTask Add(long id, Memory<float> vector)
58 | {
59 | if (vector.Length != container.dimensions)
60 | {
61 | throw new ArgumentException(
62 | "Invalid length of vector.",
63 | nameof(vector));
64 | }
65 |
66 | if (data.Count >= container.buffer)
67 | {
68 | Flush();
69 | }
70 |
71 | data.Add((id, vector));
72 | ++count;
73 |
74 | return ValueTask.CompletedTask;
75 | }
76 |
77 | #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously
78 | public async IAsyncEnumerable<(long id, Memory<float> vector)> GetPoints()
79 | #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
80 | {
81 | if (stream != null)
82 | {
83 | stream.Position = 0;
84 |
85 | for(var i = 0L; i < count - data.Count; ++i)
86 | {
87 | var id = 0L;
88 | var vector = new float[container.dimensions];
89 |
90 | stream.Read(MemoryMarshal.CreateSpan(
91 | ref Unsafe.As<long, byte>(ref id),
92 | Marshal.SizeOf<long>()));
93 | stream.Read(
94 | MemoryMarshal.CreateSpan(
95 | ref Unsafe.As<float, byte>(ref vector[0]),
96 | Marshal.SizeOf<float>() * container.dimensions));
97 |
98 | yield return (id, vector);
99 | }
100 | }
101 |
102 | foreach(var item in data)
103 | {
104 | yield return item;
105 | }
106 | }
107 |
108 | public ValueTask DisposeAsync()
109 | {
110 | if (stream != null)
111 | {
112 | stream.Dispose();
113 |
114 | if ((rangeId & 1) != 0)
115 | {
116 | container.lowOffset = start;
117 | }
118 | else
119 | {
120 | container.highOffset = start;
121 | }
122 | }
123 |
124 | return ValueTask.CompletedTask;
125 | }
126 |
127 | private void Flush()
128 | {
129 | if (stream == null)
130 | {
131 | start =
132 | (rangeId & 1) != 0 ? container.lowOffset : container.highOffset;
133 |
134 | stream = container.file.CreateViewStream(
135 | start,
136 | (Marshal.SizeOf<long>() +
137 | Marshal.SizeOf<float>() * container.dimensions) * capacity);
138 | }
139 |
140 | foreach(var (id, vector) in data)
141 | {
142 | var idRef = id;
143 |
144 | stream.Write(MemoryMarshal.CreateSpan(
145 | ref Unsafe.As<long, byte>(ref idRef),
146 | Marshal.SizeOf<long>()));
147 | stream.Write(
148 | MemoryMarshal.CreateSpan(
149 | ref Unsafe.As<float, byte>(ref vector.Span[0]),
150 | Marshal.SizeOf<float>() * container.dimensions));
151 | }
152 |
153 | data.Clear();
154 |
155 | var offset = start + stream.Position;
156 |
157 | if ((rangeId & 1) != 0)
158 | {
159 | container.lowOffset = offset;
160 | }
161 | else
162 | {
163 | container.highOffset = offset;
164 | }
165 | }
166 |
167 | private readonly long rangeId;
168 | private readonly FileRangeStore container;
169 | private readonly List<(long id, Memory<float> vector)> data = new();
170 | private readonly long capacity;
171 | private long start;
172 | private long count;
173 | private Stream? stream;
174 | }
175 |
176 | private readonly int buffer;
177 | private readonly short dimensions;
178 | private readonly long capacity;
179 | private readonly MemoryMappedFile file;
180 | private long lowOffset;
181 | private long highOffset;
182 | }
183 |
--------------------------------------------------------------------------------
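
FileRangeStore reserves its whole memory-mapped file up front: a record is an 8-byte id followed by dimensions 4-byte floats, the file is sized at four times the dataset, and odd range ids grow from the low half while even ones grow from capacity / 2, so sibling stores never collide. A back-of-the-envelope check of that sizing (the numbers are illustrative, not from the source):

// Record layout: an 8-byte id followed by `dimensions` 4-byte floats.
short dimensions = 100;
long count = 1_000_000;

long recordSize = sizeof(long) + sizeof(float) * dimensions; // 408 bytes
long capacity = recordSize * 4 * count; // 1,632,000,000 bytes ~ 1.52 GiB
long highStart = capacity / 2; // Even range ids start writing here.

Console.WriteLine(
  $"record: {recordSize} B, file: {capacity / (1024.0 * 1024 * 1024):F2} GiB");
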
/VectorIndex/IRangeStore.cs:
--------------------------------------------------------------------------------
1 | namespace NesterovskyBros.VectorIndex;
2 |
3 | /// <summary>
4 | /// An interface encapsulating a range store.
5 | /// </summary>
6 | public interface IRangeStore: IAsyncDisposable
7 | {
8 | /// <summary>
9 | /// Adds a vector to the store.
10 | /// </summary>
11 | /// <param name="id">A vector id.</param>
12 | /// <param name="vector">A vector.</param>
13 | /// <returns>A value task.</returns>
14 | ValueTask Add(long id, Memory<float> vector);
15 |
16 | /// <summary>
17 | /// Gets an async enumerable of stored points.
18 | /// </summary>
19 | /// <returns>An enumerable of vectors in the range.</returns>
20 | IAsyncEnumerable<(long id, Memory<float> vector)>
21 | GetPoints();
22 | }
23 |
--------------------------------------------------------------------------------
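
The contract is deliberately small: a store accepts (id, vector) pairs, replays them on demand, and disposes asynchronously so file-backed implementations can release their resources. A minimal round trip, assuming the MemoryRangeStore implementation shown further below:

using NesterovskyBros.VectorIndex;

// Write two points into a store, then read them back.
await using IRangeStore store = new MemoryRangeStore();

await store.Add(1, new float[] { 0.5f, -0.25f });
await store.Add(2, new float[] { -1f, 1f });

await foreach(var (id, vector) in store.GetPoints())
{
  Console.WriteLine($"{id}: [{string.Join(", ", vector.ToArray())}]");
}
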
/VectorIndex/IndexBuilder.cs:
--------------------------------------------------------------------------------
1 | using System.Numerics;
6 |
7 | namespace NesterovskyBros.VectorIndex;
8 |
9 | /// <summary>
10 | /// An API to build a vector index.
11 | /// </summary>
12 | public partial class IndexBuilder
13 | {
14 | /// <summary>
15 | /// Gets range enumerations of points.
16 | /// </summary>
17 | /// <param name="points">A points enumeration.</param>
18 | /// <param name="storeFactory">
19 | /// A factory to create a temporary store of points. Called as:
20 | /// <c>storeFactory(rangeId, capacity)</c>.
21 | /// </param>
22 | /// <returns>An async enumeration of (rangeId, range) pairs.</returns>
23 | public static async IAsyncEnumerable<(long rangeId, RangeValue range)> Build(
24 | IAsyncEnumerable<(long id, Memory<float> vector)> points,
25 | Func<long, long, IRangeStore> storeFactory)
26 | {
27 | var iteration = 0L;
28 | var level = 0;
29 |
30 | Stats[]? stats = null;
31 | Stack<(long rangeId, IRangeStore store, bool max)> stack = new();
32 |
33 | stack.Push((0, new RangeStore { points = points }, true));
34 |
35 | try
36 | {
37 | while(stack.TryPop(out var item))
38 | {
39 | try
40 | {
41 | ++iteration;
42 |
43 | level = Math.Max(
44 | level,
45 | 64 - BitOperations.LeadingZeroCount((ulong)item.rangeId));
46 |
47 | if (iteration < 10 ||
48 | iteration < 1000 && iteration % 100 == 0 ||
49 | iteration < 10000 && iteration % 1000 == 0 ||
50 | iteration % 10000 == 0)
51 | {
52 | Console.WriteLine($"Process {iteration} ranges. Level {level}");
53 | }
54 |
55 | var count = 0L;
56 |
57 | await foreach(var (id, vector) in item.store.GetPoints())
58 | {
59 | if (count++ == 0)
60 | {
61 | stats ??= new Stats[vector.Length];
62 | InitStats(id, vector);
63 | }
64 | else
65 | {
66 | UpdateStats(id, vector);
67 | }
68 | }
69 |
70 | if (count == 0)
71 | {
72 | continue;
73 | }
74 |
75 | var max = item.max;
76 |
77 | var (match, index) = stats!.
78 | Select((stats, index) => (stats, index)).
79 | MaxBy(item => max ? item.stats.Stdev2N : -item.stats.Stdev2N);
80 |
81 | RangeValue range = count == 1 ?
82 | new() { Dimension = -1, Id = (long)stats![0].IdN } :
83 | new()
84 | {
85 | Dimension = index,
86 | Mid = match.Mean,
87 | Id = (long)(match.IdN / match.Count)
88 | };
89 |
90 | var rangeId = item.rangeId;
91 |
92 | yield return (rangeId, range);
93 |
94 | if (count == 1)
95 | {
96 | continue;
97 | }
98 |
99 | var lowRangeId = checked(rangeId * 2 + 1);
100 | var low = storeFactory(lowRangeId, count);
101 |
102 | try
103 | {
104 | var highRangeId = checked(rangeId * 2 + 2);
105 | var high = storeFactory(highRangeId, count);
106 |
107 | //var lowCount = 0L;
108 |
109 | try
110 | {
111 | await foreach(var (id, vector) in item.store.GetPoints())
112 | {
113 | var value = vector.Span[range.Dimension];
114 |
115 | if (value > range.Mid || value == range.Mid && id > range.Id)
116 | {
117 | await high.Add(id, vector);
118 | }
119 | else
120 | {
121 | await low.Add(id, vector);
122 | //++lowCount;
123 | }
124 | }
125 |
126 | //stack.Push((lowRangeId, low, lowCount < count * 0.70710678118654752440084436210485));
127 | //stack.Push((highRangeId, high, lowCount > count * (1 - 0.70710678118654752440084436210485)));
128 | stack.Push((lowRangeId, low, !max));
129 | stack.Push((highRangeId, high, !max));
130 | }
131 | catch
132 | {
133 | await high.DisposeAsync();
134 |
135 | throw;
136 | }
137 | }
138 | catch
139 | {
140 | await low.DisposeAsync();
141 |
142 | throw;
143 | }
144 | }
145 | finally
146 | {
147 | await item.store.DisposeAsync();
148 | }
149 | }
150 | }
151 | finally
152 | {
153 | while(stack.TryPop(out var item))
154 | {
155 | await item.store.DisposeAsync();
156 | }
157 | }
158 |
159 | void InitStats(long id, Memory<float> point)
160 | {
161 | var span = point.Span;
162 |
163 | for(var i = 0; i < span.Length; ++i)
164 | {
165 | stats![i] = new()
166 | {
167 | Mean = span[i],
168 | Stdev2N = 0,
169 | Count = 1,
170 | IdN = id
171 | };
172 | }
173 | }
174 |
175 | void UpdateStats(long id, Memory<float> point)
176 | {
177 | var span = point.Span;
178 |
179 | for(var i = 0; i < span.Length; ++i)
180 | {
181 | var value = span[i];
182 | ref var item = ref stats![i];
183 | var pa = item.Mean;
184 | var pq = item.Stdev2N;
185 | var count = item.Count + 1;
186 | var a = pa + (value - pa) / count;
187 | var q = pq + (value - pa) * (value - a);
188 |
189 | item = new()
190 | {
191 | Mean = a,
192 | Stdev2N = q,
193 | Count = count,
194 | IdN = item.IdN + id
195 | };
196 | }
197 | }
198 | }
199 |
200 | private class RangeStore: IRangeStore
201 | {
202 | public IAsyncEnumerable<(long id, Memory<float> vector)> points;
203 |
204 | public ValueTask DisposeAsync() => ValueTask.CompletedTask;
205 |
206 | public ValueTask Add(long id, Memory<float> vector)
207 | {
208 | throw new NotImplementedException();
209 | }
210 |
211 | public IAsyncEnumerable<(long id, Memory<float> vector)> GetPoints() => points;
212 | }
213 | }
214 |
--------------------------------------------------------------------------------
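
Build numbers its output as an implicit binary heap: range r splits into a low child 2*r + 1 and a high child 2*r + 2, a leaf is flagged with Dimension == -1 and carries a point id in Id, and a point goes to the high side when its value exceeds Mid (or equals it with an id above the range's mean id). A hedged sketch of descending the resulting dictionary for a query vector (not part of the source; it ignores the id tie-break, and a real nearest-neighbour search would also backtrack into sibling ranges):

using NesterovskyBros.VectorIndex;

// Walks from the root range down to the leaf whose region holds `query`.
static long FindLeaf(
  IReadOnlyDictionary<long, RangeValue> index,
  ReadOnlySpan<float> query)
{
  var rangeId = 0L;

  while(index.TryGetValue(rangeId, out var range))
  {
    if (range.Dimension == -1)
    {
      return range.Id; // A leaf: the stored point id.
    }

    // Mirrors the build-time partition: values above the mid go high.
    rangeId = query[range.Dimension] > range.Mid ?
      rangeId * 2 + 2 :
      rangeId * 2 + 1;
  }

  return -1; // An empty side: no point was stored on this path.
}
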
/VectorIndex/MemoryRangeStore.cs:
--------------------------------------------------------------------------------
1 | namespace NesterovskyBros.VectorIndex;
2 |
3 | /// <summary>
4 | /// A memory implementation of <see cref="IRangeStore"/>.
5 | /// Note that instances of this class are not thread safe.
6 | /// </summary>
7 | public class MemoryRangeStore : IRangeStore
8 | {
9 | /// <inheritdoc/>
10 | public ValueTask DisposeAsync() => ValueTask.CompletedTask;
11 |
12 | /// <inheritdoc/>
13 | public ValueTask Add(long id, Memory<float> vector)
14 | {
15 | data.Add((id, vector));
16 |
17 | return ValueTask.CompletedTask;
18 | }
19 |
20 | /// <inheritdoc/>
21 | #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously
22 | public async IAsyncEnumerable<(long id, Memory<float> vector)> GetPoints()
23 | #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
24 | {
25 | foreach(var item in data)
26 | {
27 | yield return item;
28 | }
29 | }
30 |
31 | private List<(long id, Memory<float> vector)> data = new();
32 | }
33 |
--------------------------------------------------------------------------------
/VectorIndex/RangeValue.cs:
--------------------------------------------------------------------------------
1 | namespace NesterovskyBros.VectorIndex;
2 |
3 | /// <summary>
4 | /// A range splitting space by a specified dimension into two subregions.
5 | /// </summary>
6 | public readonly record struct RangeValue
7 | {
8 | /// <summary>
9 | /// Index of the dimension being indexed.
10 | /// </summary>
11 | public int Dimension { get; init; }
12 |
13 | /// <summary>
14 | /// A middle point of the range.
15 | /// </summary>
16 | public float Mid { get; init; }
17 |
18 | /// <summary>
19 | /// An optional id of a point that falls into the range.
20 | /// </summary>
21 | public long Id { get; init; }
22 | }
23 |
--------------------------------------------------------------------------------
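
Two node kinds share this struct: a split node uses Dimension and Mid, with Id holding the mean point id that breaks ties at the boundary, while IndexBuilder encodes a leaf as Dimension = -1 with the point's id in Id. For illustration (the values are made up):

var split = new RangeValue { Dimension = 3, Mid = 0.12f, Id = 42 };
var leaf = new RangeValue { Dimension = -1, Id = 7 };

// A record struct prints its members by name.
Console.WriteLine(split);
Console.WriteLine(leaf.Dimension == -1 ? $"leaf -> point {leaf.Id}" : "split");
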
/VectorIndex/Stats.cs:
--------------------------------------------------------------------------------
1 | namespace NesterovskyBros.VectorIndex;
2 |
3 | /// <summary>
4 | /// Aggregated statistics.
5 | /// </summary>
6 | public readonly record struct Stats
7 | {
8 | /// <summary>
9 | /// A mean value.
10 | /// </summary>
11 | public float Mean { get; init; }
12 |
13 | /// <summary>
14 | /// A stdev^2*N value.
15 | /// </summary>
16 | public float Stdev2N { get; init; }
17 |
18 | /// <summary>
19 | /// Number of items collected.
20 | /// </summary>
21 | public long Count { get; init; }
22 |
23 | /// <summary>
24 | /// Sum of ids, used to get the mean id value.
25 | /// </summary>
26 | public Int128 IdN { get; init; }
27 | }
28 |
--------------------------------------------------------------------------------
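
These fields carry Welford's online update as applied by IndexBuilder.UpdateStats: a = a' + (x - a')/n and q = q' + (x - a')(x - a), so after n values Stdev2N equals n times the population variance. A quick standalone check of that invariant (not from the source):

var values = new float[] { 1f, 4f, 2f, 8f };
var mean = 0f;
var stdev2N = 0f;
var count = 0L;

foreach(var value in values)
{
  ++count;

  var previousMean = mean;

  // a = a' + (x - a') / n
  mean += (value - previousMean) / count;

  // q = q' + (x - a') * (x - a)
  stdev2N += (value - previousMean) * (value - mean);
}

// mean == 3.75; stdev2N == 28.75 == 4 * population variance (7.1875).
Console.WriteLine($"mean={mean}, stdev^2*N={stdev2N}, variance={stdev2N / count}");
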
/VectorIndex/VectorIndex.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 | <PropertyGroup>
4 | <TargetFramework>net8.0</TargetFramework>
5 | <ImplicitUsings>enable</ImplicitUsings>
6 | <Nullable>enable</Nullable>
7 | <RootNamespace>NesterovskyBros.VectorIndex</RootNamespace>
8 | <AssemblyName>VectorIndex</AssemblyName>
9 | </PropertyGroup>
10 |
11 | </Project>
--------------------------------------------------------------------------------
/vector-database.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.5.33530.505
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "VectorIndex", "VectorIndex\VectorIndex.csproj", "{B41A62A3-7E59-4030-8777-37AA72790EDB}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "VectorIndex.MainTest", "VectorIndex.MainTest\VectorIndex.MainTest.csproj", "{04DC6990-8492-41A6-9C0A-19C39B315919}"
9 | EndProject
10 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Files", "Files", "{D7D7DEE3-EFCB-486D-B61F-8E2DAD0466E4}"
11 | ProjectSection(SolutionItems) = preProject
12 | DDL.sql = DDL.sql
13 | LICENSE = LICENSE
14 | README.md = README.md
15 | EndProjectSection
16 | EndProject
17 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MemoryVectorIndex", "MemoryVectorIndex\MemoryVectorIndex.csproj", "{C6F45BC6-752D-4B54-86E8-4842254FA110}"
18 | EndProject
19 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MempryVectorIndex.Tests", "MempryVectorIndex.Tests\MempryVectorIndex.Tests.csproj", "{746E60D8-757E-4C27-908C-AA42F8E4A458}"
20 | EndProject
21 | Global
22 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
23 | Debug|Any CPU = Debug|Any CPU
24 | Release|Any CPU = Release|Any CPU
25 | EndGlobalSection
26 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
27 | {B41A62A3-7E59-4030-8777-37AA72790EDB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
28 | {B41A62A3-7E59-4030-8777-37AA72790EDB}.Debug|Any CPU.Build.0 = Debug|Any CPU
29 | {B41A62A3-7E59-4030-8777-37AA72790EDB}.Release|Any CPU.ActiveCfg = Release|Any CPU
30 | {B41A62A3-7E59-4030-8777-37AA72790EDB}.Release|Any CPU.Build.0 = Release|Any CPU
31 | {04DC6990-8492-41A6-9C0A-19C39B315919}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
32 | {04DC6990-8492-41A6-9C0A-19C39B315919}.Debug|Any CPU.Build.0 = Debug|Any CPU
33 | {04DC6990-8492-41A6-9C0A-19C39B315919}.Release|Any CPU.ActiveCfg = Release|Any CPU
34 | {04DC6990-8492-41A6-9C0A-19C39B315919}.Release|Any CPU.Build.0 = Release|Any CPU
35 | {C6F45BC6-752D-4B54-86E8-4842254FA110}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
36 | {C6F45BC6-752D-4B54-86E8-4842254FA110}.Debug|Any CPU.Build.0 = Debug|Any CPU
37 | {C6F45BC6-752D-4B54-86E8-4842254FA110}.Release|Any CPU.ActiveCfg = Release|Any CPU
38 | {C6F45BC6-752D-4B54-86E8-4842254FA110}.Release|Any CPU.Build.0 = Release|Any CPU
39 | {746E60D8-757E-4C27-908C-AA42F8E4A458}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
40 | {746E60D8-757E-4C27-908C-AA42F8E4A458}.Debug|Any CPU.Build.0 = Debug|Any CPU
41 | {746E60D8-757E-4C27-908C-AA42F8E4A458}.Release|Any CPU.ActiveCfg = Release|Any CPU
42 | {746E60D8-757E-4C27-908C-AA42F8E4A458}.Release|Any CPU.Build.0 = Release|Any CPU
43 | EndGlobalSection
44 | GlobalSection(SolutionProperties) = preSolution
45 | HideSolutionNode = FALSE
46 | EndGlobalSection
47 | GlobalSection(ExtensibilityGlobals) = postSolution
48 | SolutionGuid = {CF23CC1E-AFB8-4517-B126-F03D7F2B5311}
49 | EndGlobalSection
50 | EndGlobal
51 |
--------------------------------------------------------------------------------