├── .bumpversion.cfg ├── .clang-format ├── .github ├── ISSUE_TEMPLATE │ └── user-story.md ├── hooks │ └── pre-commit │ │ └── clang-format.hook └── workflows │ ├── build_and_test_cmake.yaml │ ├── build_and_upload_conda.yaml │ ├── build_develop_docs.yaml │ └── build_release_docs.yaml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CMakePresets.json ├── LICENSE ├── Makefile ├── README.md ├── benchmarks ├── .gitattributes ├── .gitignore ├── CMakeLists.txt ├── bench_lintdb.cpp ├── bench_lintdb.py ├── common.py ├── lotte │ ├── common.py │ ├── compare_clustering.py │ ├── debug_colbert.py │ ├── indexing_two.py │ └── main.py ├── pixi.lock ├── pixi.toml ├── poetry.lock ├── pyproject.toml ├── run_colbert.py ├── run_lintdb.py ├── vidore │ └── main.py └── xtr │ └── main.py ├── cmake ├── FindMKL.cmake └── lintdb-config.cmake.in ├── conda ├── benchmark_env.yaml ├── conda_build_config.yaml ├── environment.yaml └── lintdb │ ├── build-lib-arm64.sh │ ├── build-lib-osx.sh │ ├── build-lib.bat │ ├── build-lib.sh │ ├── build-pkg-arm64.sh │ ├── build-pkg-osx.sh │ ├── build-pkg.bat │ ├── build-pkg.sh │ └── meta.yaml ├── docker └── Dockerfile.conda.build ├── docs ├── Makefile ├── development.md ├── environment.yaml ├── examples.md ├── getting-started.md ├── icon.svg ├── index.md ├── installation.md ├── make.bat ├── nav.md ├── pyproject.toml ├── reference.md └── requirements.txt ├── icon.svg ├── lintdb ├── CMakeLists.txt ├── SearchOptions.h ├── SearchResult.h ├── api.h ├── assert.h ├── cf.h ├── constants.h ├── env.h ├── exception.h ├── index.cpp ├── index.h ├── invlists │ ├── ContextIterator.h │ ├── EncodedDocument.cpp │ ├── EncodedDocument.h │ ├── ForwardIndexIterator.cpp │ ├── ForwardIndexIterator.h │ ├── IndexWriter.cpp │ ├── IndexWriter.h │ ├── InvertedIterator.cpp │ ├── InvertedIterator.h │ ├── InvertedList.h │ ├── Iterator.h │ ├── KeyBuilder.h │ ├── PostingData.h │ ├── RocksdbForwardIndex.cpp │ ├── RocksdbForwardIndex.h │ ├── RocksdbInvertedList.cpp │ └── RocksdbInvertedList.h ├── python │ ├── CMakeLists.txt │ ├── pylintdb.cpp │ ├── tests │ │ └── test_index.py │ └── version.txt ├── quantizers │ ├── Binarizer.cpp │ ├── Binarizer.h │ ├── CoarseQuantizer.cpp │ ├── CoarseQuantizer.h │ ├── IdentityQuantizer.cpp │ ├── IdentityQuantizer.h │ ├── InvertedListScanner.cpp │ ├── InvertedListScanner.h │ ├── PQDistanceTables.cpp │ ├── PQDistanceTables.h │ ├── ProductEncoder.cpp │ ├── ProductEncoder.h │ ├── Quantizer.h │ ├── impl │ │ ├── kmeans.cpp │ │ ├── kmeans.h │ │ └── product_quantizer.h │ ├── io.cpp │ └── io.h ├── query │ ├── DocIterator.cpp │ ├── DocIterator.h │ ├── DocValue.h │ ├── KnnNearestCentroids.cpp │ ├── KnnNearestCentroids.h │ ├── Query.cpp │ ├── Query.h │ ├── QueryContext.h │ ├── QueryExecutor.cpp │ ├── QueryExecutor.h │ ├── QueryNode.cpp │ ├── QueryNode.h │ ├── decode.cpp │ └── decode.h ├── schema │ ├── DataTypes.h │ ├── DocEncoder.cpp │ ├── DocEncoder.h │ ├── DocProcessor.cpp │ ├── DocProcessor.h │ ├── Document.h │ ├── FieldMapper.cpp │ ├── FieldMapper.h │ ├── ProcessedData.h │ ├── Schema.cpp │ └── Schema.h ├── scoring │ ├── ContextCollector.cpp │ ├── ContextCollector.h │ ├── ScoredDocument.h │ ├── Scorer.cpp │ ├── Scorer.h │ ├── plaid.cpp │ ├── plaid.h │ ├── scoring_methods.cpp │ └── scoring_methods.h ├── server │ ├── CMakeLists.txt │ ├── api_tests.py │ ├── controllers │ │ └── v1 │ │ │ ├── Index.cpp │ │ │ ├── Index.h │ │ │ ├── query_node_translator.h │ │ │ └── result_translator.h │ ├── main.cpp │ └── openapi.yaml ├── util.cpp ├── util.h ├── utils │ ├── endian.h │ ├── half.h │ └── progress_bar.h └── 
version.h ├── mkdocs.yml ├── ports ├── bitsery │ ├── portfile.cmake │ └── vcpkg.json ├── faiss │ ├── faiss.patch │ ├── fix-dependencies.patch │ ├── portfile.cmake │ └── vcpkg.json ├── intel-mkl │ ├── copy-from-dmg.cmake │ ├── portfile.cmake │ ├── usage │ └── vcpkg.json ├── onnxruntime │ ├── portfile.cmake │ └── vcpkg.json └── rocksdb │ ├── 0001-fix-dependencies.patch │ ├── portfile.cmake │ └── vcpkg.json ├── pyproject.toml ├── tests ├── CMakeLists.txt ├── __init__.py ├── binarizer_test.cpp ├── coarse_quantizer_test.cpp ├── colbert_test.cpp ├── data │ ├── colbert.ranking.tsv │ ├── colbert_test.db │ │ ├── 000008.sst │ │ ├── 000009.sst │ │ ├── 000010.sst │ │ ├── 000011.sst │ │ ├── 000176.log │ │ ├── CURRENT │ │ ├── IDENTITY │ │ ├── LOCK │ │ ├── LOG │ │ ├── LOG.old.1722800135830059 │ │ ├── LOG.old.1722800216294444 │ │ ├── LOG.old.1722800296371779 │ │ ├── LOG.old.1722800437955832 │ │ ├── LOG.old.1722800493660601 │ │ ├── LOG.old.1722917400856599 │ │ ├── MANIFEST-000177 │ │ ├── OPTIONS-000175 │ │ ├── OPTIONS-000179 │ │ ├── _field_mapper.json │ │ ├── _lintdb_metadata.json │ │ ├── _schema.json │ │ ├── colbert_coarse_quantizer │ │ └── colbert_quantizer │ └── query.txt ├── doc_encoder_test.cpp ├── doc_iterator_test.cpp ├── doc_processor_test.cpp ├── index_test.cpp ├── inverted_list_test.cpp ├── keys_test.cpp ├── mocks.h ├── plaid_test.cpp ├── product_quantizer_test.cpp └── util.h ├── vcpkg-configuration.json ├── vcpkg.json └── version.txt /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.5.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:version.txt] 7 | search = {current_version} 8 | replace = {new_version} 9 | 10 | [bumpversion:file:vcpkg.json] 11 | search = "version-string": "{current_version}" 12 | replace = "version-string": "{new_version}" 13 | 14 | [bumpversion:file:lintdb/version.h] 15 | search = #define LINTDB_VERSION_STRING "{current_version}" 16 | replace = #define LINTDB_VERSION_STRING "{new_version}" 17 | 18 | [bumpversion:file:pyproject.toml] 19 | search = version = "{current_version}" 20 | replace = version = "{new_version}" 21 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AccessModifierOffset: -1 3 | AlignAfterOpenBracket: AlwaysBreak 4 | AlignConsecutiveAssignments: false 5 | AlignConsecutiveDeclarations: false 6 | AlignEscapedNewlinesLeft: true 7 | AlignOperands: false 8 | AlignTrailingComments: true 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: Empty 13 | AllowShortIfStatementsOnASingleLine: false 14 | AllowShortLoopsOnASingleLine: false 15 | AlwaysBreakAfterReturnType: None 16 | AlwaysBreakBeforeMultilineStrings: true 17 | AlwaysBreakTemplateDeclarations: true 18 | BinPackArguments: false # at some point, set this to true 19 | BinPackParameters: false # at some point, set this to true 20 | BraceWrapping: 21 | AfterClass: false 22 | AfterControlStatement: false 23 | AfterEnum: false 24 | AfterFunction: false 25 | AfterNamespace: false 26 | AfterObjCDeclaration: false 27 | AfterStruct: false 28 | AfterUnion: false 29 | BeforeCatch: false 30 | BeforeElse: false 31 | IndentBraces: false 32 | BreakBeforeBinaryOperators: None 33 | BreakBeforeBraces: Attach 34 | BreakBeforeTernaryOperators: true 35 | 
BreakConstructorInitializersBeforeComma: false
36 | BreakAfterJavaFieldAnnotations: false
37 | BreakStringLiterals: false
38 | ColumnLimit: 80
39 | CommentPragmas: '^ IWYU pragma:'
40 | CompactNamespaces: false
41 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
42 | ConstructorInitializerIndentWidth: 8
43 | ContinuationIndentWidth: 8
44 | Cpp11BracedListStyle: true
45 | DerivePointerAlignment: false
46 | DisableFormat: false
47 | ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
48 | IncludeCategories:
49 |   - Regex: '^<.*\.h(pp)?>'
50 |     Priority: 1
51 |   - Regex: '^<.*'
52 |     Priority: 2
53 |   - Regex: '.*'
54 |     Priority: 3
55 | IndentCaseLabels: true
56 | IndentWidth: 4
57 | IndentWrappedFunctionNames: false
58 | KeepEmptyLinesAtTheStartOfBlocks: false
59 | MacroBlockBegin: ''
60 | MacroBlockEnd: ''
61 | MaxEmptyLinesToKeep: 1
62 | NamespaceIndentation: None
63 | ObjCBlockIndentWidth: 4
64 | ObjCSpaceAfterProperty: false
65 | ObjCSpaceBeforeProtocolList: false
66 | PenaltyBreakBeforeFirstCallParameter: 1
67 | PenaltyBreakComment: 300
68 | PenaltyBreakFirstLessLess: 120
69 | PenaltyBreakString: 1000
70 | PenaltyExcessCharacter: 1000000
71 | PenaltyReturnTypeOnItsOwnLine: 2000000
72 | PointerAlignment: Left
73 | ReflowComments: true
74 | SortIncludes: CaseInsensitive
75 | SpaceAfterCStyleCast: false
76 | SpaceBeforeAssignmentOperators: true
77 | SpaceBeforeParens: ControlStatements
78 | SpaceInEmptyParentheses: false
79 | SpacesBeforeTrailingComments: 1
80 | SpacesInAngles: false
81 | SpacesInContainerLiterals: true
82 | SpacesInCStyleCastParentheses: false
83 | SpacesInParentheses: false
84 | SpacesInSquareBrackets: false
85 | Standard: c++17
86 | TabWidth: 4
87 | UseTab: Never
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/user-story.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: User Story
3 | about: Issue for User Stories
4 | title: ''
5 | labels: Story
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | [
11 | The user story should have a reason to exist: what do I need as the user described in the summary?
12 | This section captures any details that the summary could not convey.
13 | ]
14 | 
15 | As a [user concerned by the story]
16 | I want [goal of the story]
17 | so that [reason for the story]
18 | 
19 | 
20 | ### Acceptance Criteria
21 | 
22 | 1. [If I do A.]
23 | 1. [B should happen.]
24 | 
25 | [
26 | Also, here are a few points that need to be addressed:
27 | 
28 | 1. Constraint 1;
29 | 1. Constraint 2;
30 | 1. Constraint 3.
31 | ] 32 | 33 | 34 | ### Resources: 35 | 36 | * Mockups: [Here goes a URL to or the name of the mockup(s) in inVision]; 37 | * Testing URL: [Here goes a URL to the testing branch or IP]; 38 | * Staging URL: [Here goes a URL to the feature on staging]; 39 | 40 | 41 | ### Notes 42 | 43 | [Some complementary notes if necessary:] 44 | 45 | * > Here goes a quote from an email 46 | * Here goes whatever useful information can exist… 47 | -------------------------------------------------------------------------------- /.github/hooks/pre-commit/clang-format.hook: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | STYLE=$(git config --get hooks.clangformat.style) 4 | if [ -n "${STYLE}" ] ; then 5 | STYLEARG="-style=${STYLE}" 6 | else 7 | STYLEARG="" 8 | fi 9 | 10 | format_file() { 11 | file="${1}" 12 | if [ -f $file ]; then 13 | clang-format -i ${STYLEARG} ${1} 14 | git add ${1} 15 | fi 16 | } 17 | 18 | case "${1}" in 19 | --about ) 20 | echo "Runs clang-format on source files" 21 | ;; 22 | * ) 23 | for file in `git diff-index --cached --name-only HEAD | grep -iE '\.(cpp|cc|h|hpp)$' ` ; do 24 | format_file "${file}" 25 | done 26 | ;; 27 | esac -------------------------------------------------------------------------------- /.github/workflows/build_and_test_cmake.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Test Cmake 2 | on: 3 | - pull_request 4 | 5 | jobs: 6 | cmake-build: 7 | name: Run Cmake 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: true 11 | 12 | steps: 13 | - name: install clang 14 | run: | 15 | wget https://apt.llvm.org/llvm.sh 16 | chmod +x llvm.sh 17 | sudo ./llvm.sh 18 all 18 | - uses: actions/checkout@v3 19 | with: 20 | submodules: recursive 21 | # This is useful to avoid https://github.com/microsoft/vcpkg/issues/25349 22 | # which is caused by missing Git history on the vcpkg submodule which ports 23 | # try to access. 24 | # Do not use if not needed, since it slows down the checkout of sources. 25 | fetch-depth: 1 26 | - name: submodule init 27 | run: | 28 | git submodule update --init --recursive 29 | - uses: lukka/get-cmake@latest 30 | - name: Setup vcpkg 31 | uses: lukka/run-vcpkg@v11 32 | id: runvcpkg 33 | with: 34 | # This one is not needed, as it is the default value anyway. 35 | vcpkgDirectory: '${{ github.workspace }}/tools/vcpkg' 36 | vcpkgJsonGlob: '**/cmakepresets/vcpkg.json' 37 | 38 | - name: Prints output of run-vcpkg's action. 
39 | run: echo "root='${{ steps.runvcpkg.outputs.RUNVCPKG_VCPKG_ROOT_OUT }}', triplet='${{ steps.runvcpkg.outputs.RUNVCPKG_VCPKG_DEFAULT_TRIPLET_OUT }}' " 40 | - name: Run CMake+vcpkg+Ninja 41 | uses: lukka/run-cmake@v10 42 | id: runcmake 43 | env: 44 | CC: clang-18 45 | CXX: clang++-18 46 | CMAKE_C_COMPILER: clang-18 47 | CMAKE_CXX_COMPILER: clang++-18 48 | MKLROOT: ${{ github.workspace }}/builds/debug/vcpkg_installed/x64-linux/lib/intel64 49 | with: 50 | cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt' 51 | configurePresetAdditionalArgs: "['-DOpenMP_CXX_FLAGS=-fopenmp=libiomp5', '-DOpenMP_CXX_LIB_NAMES=libiomp5', '-DOpenMP_libiomp5_LIBRARY=${{ github.workspace }}/builds/debug/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so']" 52 | configurePreset: 'debug' 53 | buildPreset: 'debug' 54 | testPreset: 'debug' 55 | testPresetCmdString: "['lintdb-tests', '--test-dir', 'builds/debug', '--output-on-failure']" 56 | 57 | - uses: actions/setup-python@v4 58 | with: 59 | python-version: 3.10.6 60 | 61 | - name: Install Dependencies 62 | env: 63 | CC: clang-18 64 | CXX: clang++-18 65 | CMAKE_C_COMPILER: clang-18 66 | CMAKE_CXX_COMPILER: clang++-18 67 | run: | 68 | echo "CXX=${CXX}" 69 | pip install pytest numpy 70 | 71 | sudo apt-get remove clang-14 clang-15 72 | sudo rm /usr/bin/clang++ 73 | sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++ 74 | pip install . 75 | - name: Run Tests 76 | run: | 77 | pytest lintdb/python/tests 78 | 79 | -------------------------------------------------------------------------------- /.github/workflows/build_develop_docs.yaml: -------------------------------------------------------------------------------- 1 | name: Build/Publish Develop Docs 2 | on: 3 | push: 4 | branches: 5 | - main 6 | permissions: 7 | contents: write 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: install clang 13 | run: | 14 | wget https://apt.llvm.org/llvm.sh 15 | chmod +x llvm.sh 16 | sudo ./llvm.sh 18 all 17 | - uses: actions/checkout@v3 18 | with: 19 | fetch-depth: 0 20 | - name: submodule init 21 | run: | 22 | git submodule update --init --recursive 23 | - uses: lukka/get-cmake@latest 24 | - uses: actions/setup-python@v4 25 | with: 26 | python-version: 3.10.6 27 | - name: Install Dependencies 28 | env: 29 | CC: clang-18 30 | CXX: clang++-18 31 | CMAKE_C_COMPILER: clang-18 32 | CMAKE_CXX_COMPILER: clang++-18 33 | run: | 34 | pip install mkdocs-material 35 | pip install mkdocstrings[python] markdown-callouts mkdocs-literate-nav mike 36 | 37 | sudo apt-get remove clang-14 clang-15 38 | sudo rm /usr/bin/clang++ 39 | sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++ 40 | pip install . 
41 | - name: Setup Docs Deploy 42 | run: | 43 | git config --global user.name "Docs Deploy" 44 | git config --global user.email "docs.deploy@example.co.uk" 45 | - name: Build Docs Website 46 | run: mike deploy --push develop -------------------------------------------------------------------------------- /.github/workflows/build_release_docs.yaml: -------------------------------------------------------------------------------- 1 | name: Build/Publish Release Docs 2 | on: 3 | release: 4 | types: [published] 5 | permissions: 6 | contents: write 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: install clang 12 | run: | 13 | wget https://apt.llvm.org/llvm.sh 14 | chmod +x llvm.sh 15 | sudo ./llvm.sh 18 all 16 | - uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 19 | - name: submodule init 20 | run: | 21 | git submodule update --init --recursive 22 | - uses: lukka/get-cmake@latest 23 | - uses: actions/setup-python@v4 24 | with: 25 | python-version: 3.10.6 26 | - name: Install Dependencies 27 | env: 28 | CC: clang-18 29 | CXX: clang++-18 30 | CMAKE_C_COMPILER: clang-18 31 | CMAKE_CXX_COMPILER: clang++-18 32 | run: | 33 | pip install mkdocs-material 34 | pip install mkdocstrings[python] markdown-callouts mkdocs-literate-nav mike 35 | 36 | sudo apt-get remove clang-14 clang-15 37 | sudo rm /usr/bin/clang++ 38 | sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++ 39 | pip install . 40 | - name: Setup Docs Deploy 41 | run: | 42 | git config --global user.name "Docs Deploy" 43 | git config --global user.email "docs.deploy@example.co.uk" 44 | - name: Build Docs Website 45 | run: mike deploy --push --update-aliases ${{ github.event.release.tag_name }} latest -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | .idea 4 | debug/ 5 | target/ 6 | assets/ 7 | cmake-build-debug* 8 | benchmarks/local_db.index* 9 | 10 | .DS_Store 11 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 12 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 13 | Cargo.lock 14 | 15 | # These are backup files generated by rustfmt 16 | **/*.rs.bk 17 | 18 | # MSVC Windows builds of rustc generate these, which store debugging information 19 | *.pdb 20 | 21 | 22 | # Added by cargo 23 | 24 | /target 25 | 26 | # Prerequisites 27 | *.d 28 | 29 | # Compiled Object files 30 | *.slo 31 | *.lo 32 | *.o 33 | *.obj 34 | 35 | # Precompiled Headers 36 | *.gch 37 | *.pch 38 | 39 | # Compiled Dynamic libraries 40 | *.so 41 | *.dylib 42 | *.dll 43 | 44 | # Fortran module files 45 | *.mod 46 | *.smod 47 | 48 | # Compiled Static libraries 49 | *.lai 50 | *.la 51 | *.a 52 | *.lib 53 | 54 | # Executables 55 | *.exe 56 | *.out 57 | *.app 58 | 59 | vcpkg_installed 60 | build 61 | builds 62 | .vscode 63 | 64 | models 65 | tests/__pycache__ 66 | experiments 67 | build_benchmarks 68 | _build_python_ 69 | # Byte-compiled / optimized / DLL files 70 | __pycache__/ 71 | *.py[cod] 72 | *$py.class 73 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tools/vcpkg"] 2 | path = tools/vcpkg 3 | url = https://github.com/Microsoft/vcpkg.git 4 | ignore = dirty 5 | [submodule "third_party/tokenizers-cpp"] 6 | path = 
third_party/tokenizers-cpp 7 | url = https://github.com/DeployQL/tokenizers-cpp.git 8 | [submodule "third_party/nanobind"] 9 | path = third_party/nanobind 10 | url = https://github.com/wjakob/nanobind 11 | [submodule "third_party/dkm"] 12 | path = third_party/dkm 13 | url = https://github.com/genbattle/dkm.git 14 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.25) 2 | 3 | include(CMakeFindDependencyMacro) 4 | 5 | # allow faiss to build on m1 mac even though it's listed as unsupported. 6 | set(VCPKG_INSTALL_OPTIONS "--allow-unsupported") 7 | 8 | # Setup vcpkg script with CMake (note: should be placed before project() call) 9 | set(CMAKE_TOOLCHAIN_FILE 10 | ${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake 11 | CACHE STRING "Vcpkg toolchain file") 12 | 13 | file(READ "version.txt" version) 14 | 15 | project( 16 | lintdb 17 | VERSION ${version} 18 | DESCRIPTION "A multi-vector database for late interaction retrieval" 19 | LANGUAGES CXX) 20 | set(LINTDB_VERSION ${version}) 21 | 22 | include(GNUInstallDirs) 23 | 24 | set(CMAKE_CXX_STANDARD 17) 25 | 26 | set(CMAKE_CXX_FLAGS 27 | "${CMAKE_CXX_FLAGS} -std=c++17 -fPIC -O3 -D_LIBCPP_DISABLE_AVAILABILITY" 28 | ) 29 | 30 | if(MSVC OR LINUX) 31 | set(BLA_VENDOR "Intel10_64lp") 32 | else() 33 | set(BLA_VENDOR "OpenBLAS") 34 | endif() 35 | 36 | # the below is caused by github actions failing to build flatbuffers. therefore, 37 | # we set this value so that we use a higher sdk version to build it. 38 | set(CMAKE_OSX_DEPLOYMENT_TARGET 10.13) 39 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) 40 | 41 | # https://conda-forge.org/docs/maintainer/knowledge_base/#newer-c-features-with-old-sdk 42 | # if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(CMAKE_CXX_FLAGS 43 | # "${CMAKE_CXX_FLAGS} ") endif() 44 | 45 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") 46 | 47 | if(SKBUILD) 48 | message("Building with scikit-build") 49 | cmake_path(GET CMAKE_CURRENT_BINARY_DIR PARENT_PATH BUILD_PARENT_DIR) 50 | set(ENV{MKLROOT} 51 | "${BUILD_PARENT_DIR}/vcpkg_installed/x64-linux/lib/intel64") 52 | set(OpenMP_libiomp5_LIBRARY 53 | "${BUILD_PARENT_DIR}/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so" 54 | ) 55 | set(CMAKE_BUILD_TYPE Release) 56 | endif() 57 | 58 | set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) 59 | 60 | option(ENABLE_PYTHON "Build Python extension." ON) 61 | option(ENABLE_BENCHMARKS "Build benchmarks." ON) 62 | option(ENABLE_SERVER "Build the server." 
OFF)
63 | 
64 | add_subdirectory(lintdb)
65 | 
66 | if(ENABLE_PYTHON)
67 |     add_subdirectory(lintdb/python)
68 | endif()
69 | 
70 | IF(ENABLE_SERVER)
71 |     add_subdirectory(lintdb/server)
72 | endif()
73 | 
74 | include(CTest)
75 | if(BUILD_TESTING)
76 |     add_subdirectory(tests)
77 | endif()
78 | 
79 | 
80 | if(ENABLE_BENCHMARKS)
81 |     add_subdirectory(benchmarks)
82 | endif()
83 | 
--------------------------------------------------------------------------------
/benchmarks/.gitattributes:
--------------------------------------------------------------------------------
1 | # GitHub syntax highlighting
2 | pixi.lock linguist-language=YAML linguist-generated=true
3 | 
--------------------------------------------------------------------------------
/benchmarks/.gitignore:
--------------------------------------------------------------------------------
1 | # pixi environments
2 | .pixi
3 | *.egg-info
4 | 
--------------------------------------------------------------------------------
/benchmarks/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | 
2 | message(STATUS "Building lintdb benchmark")
3 | 
4 | add_executable(bench_lintdb EXCLUDE_FROM_ALL bench_lintdb.cpp)
5 | target_link_libraries(bench_lintdb PRIVATE lintdb_lib)
6 | 
7 | find_package(benchmark CONFIG REQUIRED)
8 | target_link_libraries(bench_lintdb PRIVATE benchmark::benchmark benchmark::benchmark_main)
9 | 
10 | 
--------------------------------------------------------------------------------
/benchmarks/bench_lintdb.cpp:
--------------------------------------------------------------------------------
1 | #include <benchmark/benchmark.h>
2 | #include <filesystem>
3 | #include <random>
4 | #include <sstream>
5 | #include <vector>
6 | #include "lintdb/index.h"
7 | #include "lintdb/schema/DataTypes.h"
8 | #include "lintdb/schema/Schema.h"
9 | #include "lintdb/quantizers/Quantizer.h"
10 | #include "lintdb/query/Query.h"
11 | #include "lintdb/query/QueryNode.h"
12 | 
13 | lintdb::Document create_document(size_t num_tokens, size_t dim){
14 |     std::vector<float> vector;
15 |     for (size_t j = 0; j < num_tokens; j++) {
16 |         std::vector<float> data(dim, j);
17 |         vector.insert(vector.end(), data.begin(), data.end());
18 |     }
19 |     lintdb::FieldValue fv("colbert", vector, num_tokens);
20 |     std::vector<lintdb::FieldValue> fields = {fv};
21 | 
22 |     lintdb::Document doc(0, fields );
23 |     return doc;
24 | }
25 | 
26 | inline std::filesystem::path create_temporary_directory(
27 |         unsigned long long max_tries = 1000) {
28 |     auto tmp_dir = std::filesystem::temp_directory_path();
29 |     unsigned long long i = 0;
30 |     std::random_device dev;
31 |     std::mt19937 prng(dev());
32 |     std::uniform_int_distribution<unsigned long long> rand(0);
33 |     std::filesystem::path path;
34 |     while (true) {
35 |         std::stringstream ss;
36 |         ss << std::hex << rand(prng);
37 |         path = tmp_dir / ss.str();
38 |         // true if the directory was created.
39 |         if (std::filesystem::create_directory(path)) {
40 |             break;
41 |         }
42 |         if (i == max_tries) {
43 |             throw std::runtime_error("could not find non-existing directory");
44 |         }
45 |         i++;
46 |     }
47 |     return path;
48 | }
49 | 
50 | 
51 | static void BM_lintdb_add(benchmark::State& state) {
52 |     lintdb::Schema schema;
53 | 
54 |     lintdb::Field colbert;
55 |     colbert.name = "colbert";
56 |     colbert.data_type = lintdb::DataType::TENSOR;
57 |     colbert.field_types = {lintdb::FieldType::Colbert};
58 |     lintdb::FieldParameters fp;
59 |     fp.dimensions = 128;
60 |     fp.num_centroids = 10;
61 |     fp.num_iterations = 2;
62 |     fp.quantization = lintdb::QuantizerType::BINARIZER;
63 |     fp.nbits = 1;
64 |     colbert.parameters = fp;
65 | 
66 |     schema.add_field(colbert);
67 | 
68 |     auto temp_db = create_temporary_directory();
69 | 
70 |     lintdb::Configuration config;
71 |     lintdb::IndexIVF index(
72 |             temp_db.string(), schema, config);
73 | 
74 |     std::vector<lintdb::Document> docs;
75 |     for (size_t i = 0; i < 50; i++) {
76 |         docs.push_back(create_document(120, 128));
77 |     }
78 |     index.train(docs);
79 | 
80 |     auto doc = create_document(120, 128);
81 | 
82 |     for(auto _ : state) {
83 |         index.add(0, {doc});
84 |     }
85 | }
86 | 
87 | static void BM_lintdb_search(benchmark::State& state) {
88 |     lintdb::IndexIVF index = lintdb::IndexIVF("/home/matt/deployql/LintDB/benchmarks/lintdb-lifestyle-40k");
89 | 
90 |     lintdb::FieldValue fv("colbert", std::vector<float>(1280, 1), 10);
91 |     std::unique_ptr<lintdb::QueryNode> root = std::make_unique<lintdb::VectorQueryNode>(fv);
92 |     lintdb::Query query(std::move(root));
93 | 
94 |     lintdb::SearchOptions opts;
95 |     opts.n_probe = 32;
96 |     opts.k_top_centroids = 2;
97 | 
98 |     for(auto _ : state) {
99 |         index.search(0, query, 10, opts);
100 |     }
101 | }
102 | 
103 | //BENCHMARK(BM_lintdb_add)->Unit(benchmark::kMillisecond);
104 | BENCHMARK(BM_lintdb_search)->Unit(benchmark::kMillisecond);
105 | 
106 | BENCHMARK_MAIN();
--------------------------------------------------------------------------------
/benchmarks/common.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import psutil
3 | import os
4 | 
5 | def get_memory_usage():
6 |     process = psutil.Process(os.getpid())
7 |     return process.memory_info().rss / (1024 * 1024)  # resident set size in MB; callers print this with an "MB" label
--------------------------------------------------------------------------------
/benchmarks/lotte/compare_clustering.py:
--------------------------------------------------------------------------------
1 | import lintdb
2 | from datasets import load_dataset
3 | from collections import namedtuple
4 | from colbert import Indexer, Searcher
5 | from colbert.infra import Run, RunConfig, ColBERTConfig
6 | from colbert.data import Queries, Collection
7 | import os
8 | import sys
9 | import jsonlines
10 | from collections import defaultdict
11 | from tqdm import tqdm
12 | import time
13 | import numpy as np
14 | 
15 | LoTTeDataset = namedtuple('LoTTeDataset', ['collection', 'queries', 'qids', 'dids'])
16 | 
17 | def load_lotte(dataset, split, max_id=500000):
18 |     collection_dataset = load_dataset("colbertv2/lotte_passages", dataset)
19 |     collection = [x['text'] for x in collection_dataset[split + '_collection']]
20 |     dids = [x['doc_id'] for x in collection_dataset[split + '_collection']]
21 | 
22 |     queries_dataset = load_dataset("colbertv2/lotte", dataset)
23 |     queries = [x['query'] for x in queries_dataset['search_' + split]]
24 |     qids = [x['qid'] for x in queries_dataset['search_' + split]]
25 | 
26 |     f'Loaded {len(queries)} queries and {len(collection):,} passages'
27 | 
28 |     answer_pids = [x['answers']['answer_pids'] for x in
queries_dataset['search_' + split]] 29 | filtered_queries = [q for q, apids in zip(queries, answer_pids) if any(x < max_id for x in apids)] 30 | filtered_qids = [i for i,(q, apids) in enumerate(zip(queries, answer_pids)) if any(x < max_id for x in apids)] 31 | filtered_dids = [x for x in dids if x < max_id] 32 | f'Filtered down to {len(filtered_queries)} queries' 33 | 34 | return LoTTeDataset(collection[:max_id], filtered_queries, filtered_qids, filtered_dids) 35 | 36 | def compare_clustering(experiment, lintdb_path, data): 37 | from colbert.modeling.checkpoint import Checkpoint 38 | from colbert import Searcher 39 | 40 | with Run().context(RunConfig(nranks=1, experiment=experiment)): 41 | # config = ColBERTConfig( 42 | # nbits=nbits, 43 | # kmeans_niters=4, 44 | # root=exp_path, 45 | # ) 46 | checkpoint_config = ColBERTConfig.load_from_checkpoint("colbert-ir/colbertv2.0") 47 | config = ColBERTConfig.from_existing(checkpoint_config, None) 48 | 49 | from colbert.modeling.checkpoint import Checkpoint 50 | from colbert import Searcher 51 | # checkpoint = Checkpoint("colbert-ir/colbertv2.0", config) 52 | 53 | searcher = Searcher(index=experiment, config=config, collection=data.collection) 54 | 55 | index = lintdb.IndexIVF(lintdb_path) 56 | 57 | for i in range(16384): 58 | pids, cell_lengths = searcher.ranker.ivf.lookup([i]) 59 | 60 | lintdb_pids = index.lookup_pids(i) 61 | 62 | diff = set([x.item() for x in pids]) - set(lintdb_pids) 63 | if diff: 64 | print( 65 | f"centroid {i} comparison:", 66 | f"colbert: {len(pids)}", 67 | f"lintdb: {len(lintdb_pids)}", 68 | f"difference: {len(diff)}", 69 | f"pid difference: {diff}", 70 | ) 71 | for pid_values in failures.values(): 72 | for pid in pid_values: 73 | if pid in diff: 74 | print(f"centroid {i} has a failure at pid {pid}") 75 | 76 | 77 | 78 | if __name__ == '__main__': 79 | dataset = 'lifestyle' 80 | datasplit = 'dev' 81 | 82 | experiment = 'colbert' 83 | 84 | failures = { 85 | 5: [5462], 86 | 11: [7767], 87 | 13: [4176, 4185, 5814, 4174], 88 | 15: [1925], 89 | 16: [3701, 3060, 3051, 3437], 90 | 19: [5619] 91 | } 92 | 93 | data = load_lotte(dataset, datasplit) 94 | 95 | compare_clustering(experiment, f"/tmp/py_index_bench_{experiment}", data) -------------------------------------------------------------------------------- /benchmarks/lotte/debug_colbert.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from collections import namedtuple 3 | from colbert import Indexer, Searcher 4 | from colbert.infra import Run, RunConfig, ColBERTConfig 5 | from colbert.data import Queries, Collection 6 | import os 7 | import sys 8 | import jsonlines 9 | from collections import defaultdict 10 | from tqdm import tqdm 11 | import time 12 | import numpy as np 13 | import typer 14 | import torch 15 | import random 16 | from typing import List, Annotated 17 | from common import load_lotte, lintdb_indexing, evaluate_dataset 18 | import tempfile 19 | 20 | app = typer.Typer() 21 | 22 | @app.command() 23 | def debug(): 24 | torch.set_printoptions(threshold=10_000) 25 | 26 | d = load_lotte('lifestyle', 'dev', filter=True, start=5400, stop=5500) 27 | print(f"Loaded {len(d.queries)} queries and {len(d.collection):,} passages") 28 | assert(len(d.collection) == 100) 29 | 30 | with Run().context(RunConfig(nranks=1, experiment='colbert-debug')): 31 | config = ColBERTConfig.load_from_checkpoint("colbert-ir/colbertv2.0") 32 | config.kmeans_niters=4 33 | indexer = 
Indexer(checkpoint="colbert-ir/colbertv2.0", config=config)
34 | 
35 |         doc = None
36 |         for i in range(len(d.collection)):
37 |             if d.dids[i] == 5462:
38 |                 doc = d.collection[i]
39 |                 break
40 |         if doc is None:
41 |             print("doc not found")
42 |             return
43 | 
44 |         # indexer trains, so needs a larger collection.
45 |         indexer.index(name='colbert-debug', collection=d.collection, overwrite=True)
46 |         # indexer = Indexer(checkpoint=checkpoint, config=config)
47 |         # indexer.index(name=experiment, collection=dataset.collection) # "/path/to/MSMARCO/collection.tsv"
48 |         from colbert.modeling.checkpoint import Checkpoint
49 |         from colbert import Searcher
50 |         searcher = Searcher(index='colbert-debug', config=config, collection=d.collection)
51 | 
52 |         # spot check this doc
53 | 
54 |         # doc_len = searcher.ranker.doclens[5462]
55 |         # print(f"doc len: {doc_len}")
56 | 
57 |         checkpoint = Checkpoint("colbert-ir/colbertv2.0", config)
58 |         doclens_ = checkpoint.docFromText([doc])
59 |         print(f"embedding size: {doclens_.shape}")
60 | 
61 |         embs_, doclens_ = checkpoint.docFromText([doc], bsize=1, keep_dims='flatten')
62 |         print(f"embedding size: {doclens_}")
63 | 
64 |         dddd = searcher.ranker.lookup_pids([5462-5400])
65 |         print(f"shape of searcher's doc: {dddd[0].shape}")
66 | 
67 |         return
68 | 
69 | if __name__ == "__main__":
70 |     app()
--------------------------------------------------------------------------------
/benchmarks/pixi.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "benchmarks"
3 | version = "0.1.0"
4 | description = "Add a short description here"
5 | channels = ["conda-forge"]
6 | platforms = ["linux-64"]
7 | 
8 | [tasks]
9 | 
10 | [dependencies]
11 | python = ">=3.10,<3.12"
12 | clang-17 = ">=17.0.6,<17.1"
13 | faiss = ">=1.7.4,<1.8"
14 | 
15 | [pypi-dependencies]
16 | typer = "*"
17 | colbert-ai = "*"
18 | jsonlines = "*"
19 | tqdm = "*"
20 | #lintdb = { path = "../.", editable = false }
21 | setuptools = "*"
22 | numpy = { version = "==1.26.4" }
23 | torch = "*"
24 | datasets = "*"
25 | transformers = "*"
26 | fsspec = { version = "==2023.9.2" }
27 | 
--------------------------------------------------------------------------------
/benchmarks/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "benchmarks"
3 | version = "0.1.0"
4 | description = "benchmark LintDB"
5 | authors = ["Matt Barta "]
6 | readme = "README.md"
7 | 
8 | [tool.poetry.dependencies]
9 | python = "^3.9"
10 | typer = "^0.9.0"
11 | colbert-ai = "^0.2.14"
12 | torch = "*"
13 | faiss-cpu = "*"
14 | colpali-engine = { git = "https://github.com/illuin-tech/colpali" }
15 | 
16 | [build-system]
17 | requires = ["poetry-core"]
18 | build-backend = "poetry.core.masonry.api"
19 | 
20 | [tool.pyperformance]
21 | manifest = "MANIFEST"
--------------------------------------------------------------------------------
/benchmarks/run_colbert.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from collections import namedtuple
3 | from colbert import Indexer, Searcher
4 | from colbert.infra import Run, RunConfig, ColBERTConfig
5 | from colbert.data import Queries, Collection
6 | from colbert.modeling.checkpoint import Checkpoint
7 | import os
8 | import sys
9 | import jsonlines
10 | from collections import defaultdict
11 | from tqdm import tqdm
12 | import time
13 | import numpy as np
14 | import typer
15 | import random
16 | from typing import List, Annotated
17 | from lotte.common
import load_lotte, _evaluate_dataset 18 | from common import get_memory_usage 19 | try: 20 | from valgrind import callgrind_start_instrumentation, callgrind_stop_instrumentation, callgrind_dump_stats 21 | except ImportError: 22 | print("didn't find valgrind") 23 | def callgrind_stop_instrumentation(): 24 | pass 25 | 26 | def callgrind_start_instrumentation(): 27 | pass 28 | 29 | def callgrind_dump_stats(path:str): 30 | pass 31 | 32 | 33 | app = typer.Typer() 34 | 35 | 36 | @app.command() 37 | def single_search(experiment='colbert-lifestyle-40k-benchmark', dataset:str='lifestyle', split:str='dev', profile=True, checkpoint:str='colbert-ir/colbertv2.0', index_path:str='indexes/lifestyle'): 38 | d = load_lotte(dataset, split, stop=40000) 39 | latencies = [] 40 | memory = [] 41 | 42 | with Run().context(RunConfig(nranks=1, experiment=experiment)): 43 | config = ColBERTConfig.load_from_checkpoint(checkpoint) 44 | config.kmeans_niters=4 45 | config.ncells = 2 46 | # model = Checkpoint(checkpoint, config) 47 | 48 | # indexer = Indexer(checkpoint=checkpoint, config=config) 49 | # indexer.index(name=experiment, collection=dataset.collection) # "/path/to/MSMARCO/collection.tsv" 50 | 51 | searcher = Searcher(index=experiment, config=config, collection=d.collection) 52 | rankings = {} 53 | 54 | for id, query in zip(d.qids, d.queries): 55 | embeddings = searcher.encode([query]) 56 | 57 | start = time.perf_counter() 58 | if profile: 59 | callgrind_start_instrumentation() 60 | 61 | results = searcher._search_all_Q(Queries.cast({1: query}), embeddings, k=100) 62 | latencies.append(time.perf_counter() - start) 63 | if profile: 64 | callgrind_stop_instrumentation() 65 | callgrind_dump_stats("callgrind.out.single_search") 66 | memory.append(get_memory_usage()) 67 | 68 | for k, v in results.todict().items(): 69 | rankings[id] = [x[0] for x in v] 70 | 71 | _evaluate_dataset(rankings, dataset, 'search', k=5) 72 | 73 | 74 | print(f"Average search latency: {np.mean(latencies):.2f}s") 75 | print(f"Median search latency: {np.median(latencies):.2f}s") 76 | print(f"95th percentile search latency: {np.percentile(latencies, 95):.2f}s") 77 | print(f"99th percentile search latency: {np.percentile(latencies, 99):.2f}s") 78 | 79 | print(f"Average memory usage: {np.mean(memory):.2f}MB") 80 | print(f"Median memory usage: {np.median(memory):.2f}MB") 81 | print(f"95th percentile memory usage: {np.percentile(memory, 95):.2f}MB") 82 | print(f"99th percentile memory usage: {np.percentile(memory, 99):.2f}MB") 83 | 84 | 85 | if __name__ == "__main__": 86 | app() -------------------------------------------------------------------------------- /benchmarks/run_lintdb.py: -------------------------------------------------------------------------------- 1 | import lintdb as ldb 2 | from datasets import load_dataset 3 | from collections import namedtuple 4 | from colbert import Indexer, Searcher 5 | from colbert.infra import Run, RunConfig, ColBERTConfig 6 | from colbert.data import Queries, Collection 7 | import os 8 | import sys 9 | import jsonlines 10 | from collections import defaultdict 11 | from tqdm import tqdm 12 | import time 13 | import numpy as np 14 | import typer 15 | import random 16 | from typing import List, Annotated 17 | from common import get_memory_usage 18 | from lotte.common import _evaluate_dataset, load_lotte 19 | 20 | try: 21 | from valgrind import callgrind_start_instrumentation, callgrind_stop_instrumentation, callgrind_dump_stats 22 | except ImportError: 23 | print("didn't find valgrind") 24 | def 
callgrind_stop_instrumentation(): 25 | pass 26 | 27 | def callgrind_start_instrumentation(): 28 | pass 29 | 30 | def callgrind_dump_stats(path:str): 31 | pass 32 | 33 | 34 | app = typer.Typer() 35 | 36 | @app.command() 37 | def single_search(dataset:str='lifestyle', split:str='dev',profile=False, checkpoint:str='colbert-ir/colbertv2.0', index_path:str='experiments/py_index_bench_test-collection-xtr'): 38 | checkpoint_config = ColBERTConfig.load_from_checkpoint(checkpoint) 39 | config = ColBERTConfig.from_existing(checkpoint_config, None) 40 | 41 | from colbert.modeling.checkpoint import Checkpoint 42 | from colbert import Searcher 43 | checkpoint = Checkpoint(checkpoint, config) 44 | 45 | d = load_lotte(dataset, split, stop=40000) 46 | latencies = [] 47 | memory = [] 48 | 49 | print(f"using index at {index_path}") 50 | index = ldb.IndexIVF(index_path) 51 | rankings = {} 52 | 53 | count=0 54 | for id, query in zip(d.qids, d.queries): 55 | embeddings = checkpoint.queryFromText([query]) 56 | converted = np.squeeze(embeddings.numpy().astype('float32')) 57 | 58 | if profile: 59 | callgrind_start_instrumentation() 60 | start = time.perf_counter() 61 | opts = ldb.SearchOptions() 62 | results = index.search( 63 | 0, 64 | converted, 65 | 32, # nprobe 66 | 100, # k to return 67 | opts 68 | ) 69 | latencies.append((time.perf_counter() - start)*1000) 70 | if profile: 71 | callgrind_stop_instrumentation() 72 | callgrind_dump_stats("callgrind.out.single_search") 73 | memory.append(get_memory_usage()) 74 | rankings[id] = [x.id for x in results] 75 | count+=1 76 | if count == 212: 77 | break 78 | 79 | # Stats(pr).strip_dirs().sort_stats(SortKey.TIME).print_stats(10) 80 | _evaluate_dataset(rankings, dataset, 'search', k=5) 81 | 82 | 83 | print(f"Average search latency: {np.mean(latencies):.2f}ms") 84 | print(f"Median search latency: {np.median(latencies):.2f}ms") 85 | print(f"95th percentile search latency: {np.percentile(latencies, 95):.2f}ms") 86 | print(f"99th percentile search latency: {np.percentile(latencies, 99):.2f}ms") 87 | 88 | print(f"Average memory usage: {np.mean(memory):.2f}MB") 89 | print(f"Median memory usage: {np.median(memory):.2f}MB") 90 | print(f"95th percentile memory usage: {np.percentile(memory, 95):.2f}MB") 91 | print(f"99th percentile memory usage: {np.percentile(memory, 99):.2f}MB") 92 | 93 | 94 | if __name__ == "__main__": 95 | app() -------------------------------------------------------------------------------- /cmake/lintdb-config.cmake.in: -------------------------------------------------------------------------------- 1 | include("${CMAKE_CURRENT_LIST_DIR}/lintdb-targets.cmake") -------------------------------------------------------------------------------- /conda/benchmark_env.yaml: -------------------------------------------------------------------------------- 1 | name: lintdb-benchmark 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - conda-build 8 | - anaconda-client 9 | - faiss-cpu 10 | - pytorch::pytorch 11 | - conda-forge::numpy 12 | - conda-forge::onnxruntime-cpp==1.17.3 13 | - pip: 14 | - chardet 15 | - typer 16 | - jsonlines 17 | - colbert-ai 18 | - datasets 19 | - valgrind -------------------------------------------------------------------------------- /conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.9 3 | - 3.10 4 | - 3.11 5 | - 3.12 6 | MACOSX_SDK_VERSION: # [osx and x86_64] 7 | - "10.13" # [osx and x86_64] 8 | MACOSX_DEPLOYMENT_TARGET: # [osx 
and x86_64] 9 | - "10.13" # [osx and x86_64] 10 | c_compiler: 11 | - clang 12 | c_compiler_version: 13 | - 18 14 | cxx_compiler: 15 | - clangxx 16 | cxx_compiler_version: 17 | - 18 -------------------------------------------------------------------------------- /conda/environment.yaml: -------------------------------------------------------------------------------- 1 | name: lintdb-build 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - conda-build 8 | - anaconda-client 9 | - faiss-cpu 10 | - pytorch::pytorch 11 | - conda-forge::numpy 12 | - pip: 13 | - chardet -------------------------------------------------------------------------------- /conda/lintdb/build-lib-arm64.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | # do we want to specify the build arch explicitly? -DCMAKE_OSX_ARCHITECTURES=arm64 \ 4 | CXXFLAGS="${CXXFLAGS} -D_LIBCPP_DISABLE_AVAILABILITY" cmake -B _build \ 5 | -DBUILD_SHARED_LIBS=ON \ 6 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 7 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 8 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ 9 | -DCMAKE_INSTALL_LIBDIR=lib \ 10 | -DPython_EXECUTABLE=$PYTHON \ 11 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") \ 12 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \ 13 | -DCMAKE_BUILD_TYPE=Release . 14 | 15 | make -C _build -j$(nproc) lintdb 16 | 17 | cmake --install _build --prefix $PREFIX 18 | cmake --install _build --prefix _liblintdb_stage/ -------------------------------------------------------------------------------- /conda/lintdb/build-lib-osx.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | 4 | cmake -B _build \ 5 | -DBUILD_SHARED_LIBS=ON \ 6 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 7 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 8 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ 9 | -DCMAKE_INSTALL_LIBDIR=lib \ 10 | -DPython_EXECUTABLE=$PYTHON \ 11 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") \ 12 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \ 13 | -DCMAKE_BUILD_TYPE=Release . 14 | 15 | make -C _build -j$(nproc) lintdb 16 | 17 | cmake --install _build --prefix $PREFIX 18 | cmake --install _build --prefix _liblintdb_stage/ -------------------------------------------------------------------------------- /conda/lintdb/build-lib.bat: -------------------------------------------------------------------------------- 1 | cmake -B _build ^ 2 | -T v141 ^ 3 | -A x64 ^ 4 | -G "Visual Studio 16 2019" ^ 5 | . 
6 | if %errorlevel% neq 0 exit /b %errorlevel% 7 | 8 | cmake --build _build --config Release -j %CPU_COUNT% 9 | if %errorlevel% neq 0 exit /b %errorlevel% 10 | 11 | cmake --install _build --config Release --prefix %PREFIX% 12 | if %errorlevel% neq 0 exit /b %errorlevel% -------------------------------------------------------------------------------- /conda/lintdb/build-lib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | MKLROOT=_build/vcpkg_installed/x64-linux/lib/intel64 cmake -B _build \ 6 | -DBUILD_SHARED_LIBS=ON \ 7 | -DBUILD_TESTING=OFF \ 8 | -DENABLE_PYTHON=OFF \ 9 | -DCMAKE_INSTALL_LIBDIR=lib \ 10 | -DBLA_VENDOR=Intel10_64lp \ 11 | -DCMAKE_BUILD_TYPE=Release \ 12 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 13 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 14 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.so \ 15 | . 16 | 17 | MKLROOT=_build/vcpkg_installed/x64-linux/lib/intel64 make -C _build -j$(nproc) lintdb 18 | 19 | cmake --install _build --prefix $PREFIX 20 | cmake --install _build --prefix _liblintdb_stage/ -------------------------------------------------------------------------------- /conda/lintdb/build-pkg-arm64.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | 4 | CXXFLAGS="${CXXFLAGS} -D_LIBCPP_DISABLE_AVAILABILITY" cmake -B _build_python_${PY_VER} \ 5 | -Dlintdb_ROOT=_liblintdb_stage/ \ 6 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 7 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 8 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ 9 | -DCMAKE_BUILD_TYPE=Release \ 10 | -DPython_EXECUTABLE=$PYTHON \ 11 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") \ 12 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \ 13 | . 14 | 15 | make -C _build_python_${PY_VER} -j$(nproc) pylintdb 16 | 17 | # Build actual python module. 18 | cd _build_python_${PY_VER}/lintdb/python 19 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX -------------------------------------------------------------------------------- /conda/lintdb/build-pkg-osx.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | 4 | cmake -B _build_python_${PY_VER} \ 5 | -Dlintdb_ROOT=_liblintdb_stage/ \ 6 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 7 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 8 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ 9 | -DCMAKE_BUILD_TYPE=Release \ 10 | -DPython_EXECUTABLE=$PYTHON \ 11 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") \ 12 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \ 13 | . 14 | 15 | make -C _build_python_${PY_VER} -j$(nproc) pylintdb 16 | 17 | # Build actual python module. 18 | cd _build_python_${PY_VER}/lintdb/python 19 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX -------------------------------------------------------------------------------- /conda/lintdb/build-pkg.bat: -------------------------------------------------------------------------------- 1 | :: Copyright (c) Facebook, Inc. and its affiliates. 2 | :: 3 | :: This source code is licensed under the MIT license found in the 4 | :: LICENSE file in the root directory of this source tree. 5 | 6 | :: Build vanilla version (no avx). 
7 | cmake -B _build_python_%PY_VER% ^ 8 | -T v141 ^ 9 | -A x64 ^ 10 | -G "Visual Studio 16 2019" ^ 11 | -DPython_EXECUTABLE=%PYTHON% ^ 12 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") ^ 13 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") ^ 14 | lintdb/python 15 | if %errorlevel% neq 0 exit /b %errorlevel% 16 | 17 | cmake --build _build_python_%PY_VER% --config Release -j %CPU_COUNT% 18 | if %errorlevel% neq 0 exit /b %errorlevel% 19 | 20 | 21 | :: Build actual python module. 22 | cd _build_python_%PY_VER%/ 23 | %PYTHON% setup.py install --single-version-externally-managed --record=record.txt --prefix=%PREFIX% 24 | if %errorlevel% neq 0 exit /b %errorlevel% -------------------------------------------------------------------------------- /conda/lintdb/build-pkg.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | $PYTHON -m pip install . -vv -------------------------------------------------------------------------------- /conda/lintdb/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.1').lstrip('v') %} 2 | {% set suffix = "_nightly" if environ.get('PACKAGE_TYPE') == 'nightly' else "" %} 3 | {% set number = environ.get('GIT_DESCRIBE_NUMBER', '0') %} 4 | 5 | package: 6 | name: lintdb-pkg 7 | version: {{ version }} 8 | 9 | build: 10 | number: {{ number }} 11 | 12 | about: 13 | home: https://github.com/DeployQL/lintdb 14 | license: Apache 2 15 | license_family: Apache 16 | license_file: LICENSE 17 | summary: A multi-vector database for token level interaction. 18 | description: | 19 | LintDB is a multi-vector database meant for Gen AI. LintDB natively supports late interaction like colBERT and PLAID. 
20 | 
21 | source:
22 |   git_url: ../../
23 | 
24 | outputs:
25 |   - name: lintdb
26 |     script: build-pkg.sh
27 |     build:
28 |       string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cpu{{ suffix }}"
29 |     requirements:
30 |       build:
31 |         - {{ compiler('cxx') }}
32 |         - {{ compiler('fortran')}}
33 |         - sysroot_linux-64 =2.17 # [linux64]
34 |         - numpy==1.26.4
35 |         - scikit-build-core
36 |         - cmake >=3.25
37 |         - make # [not win]
38 |         - mkl-devel =2023.0.0 # [x86_64]
39 |         - openblas # [not x86_64]
40 |         - python {{ python }}
41 |       host:
42 |         - python {{ python }}
43 |         - numpy==1.26.4
44 |         - scikit-build-core
45 |         - conda-forge::llvm-openmp # [x86_64]
46 |         - mkl =2023.0.0 # [x86_64]
47 |         - openblas # [not x86_64]
48 |       run:
49 |         - python {{ python }}
50 |         - numpy==1.26.4
51 |         - mkl =2023.0.0 # [x86_64]
52 |         - openblas # [not x86_64]
53 |         - packaging
54 |         - __osx >={{ MACOSX_DEPLOYMENT_TARGET|default("10.13") }} # [osx and x86_64]
55 |     test:
56 |       requires:
57 |         - pytest
58 |         - numpy==1.26.4
59 |       commands:
60 |         - pytest lintdb/python/tests
61 |       source_files:
62 |         - lintdb/python/tests
--------------------------------------------------------------------------------
/docker/Dockerfile.conda.build:
--------------------------------------------------------------------------------
1 | FROM ubuntu
2 | 
3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
4 | ENV PATH /opt/miniconda/bin:$PATH
5 | 
6 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \
7 |     build-essential curl \
8 |     git
9 | 
10 | # Get Rust
11 | RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
12 | ENV PATH="/root/.cargo/bin:${PATH}"
13 | 
14 | # Install Miniconda
15 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
16 |     /bin/bash /tmp/miniconda.sh -b -p /opt/miniconda && \
17 |     rm /tmp/miniconda.sh
18 | 
19 | # Add Miniconda to the path
20 | ENV PATH="/opt/miniconda/bin:$PATH"
21 | 
22 | # Update conda
23 | RUN conda update -n base -c defaults conda -y
24 | 
25 | # Create and activate a new conda environment
26 | RUN conda create -y -n build_env python=3.11
27 | SHELL ["conda", "run", "-n", "build_env", "/bin/bash", "-c"]
28 | RUN echo "source activate build_env" > ~/.bashrc
29 | ENV PATH /opt/miniconda/envs/build_env/bin:$PATH
30 | 
31 | # Install conda-build
32 | RUN conda install -y conda-build
33 | 
34 | RUN conda config --append channels conda-forge
35 | 
36 | CMD [ "conda", "build", "/lintdb/conda/lintdb" ]
37 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
1 | # Development
2 | 
3 | ## LintDB C++ Libraries
4 | To develop on LintDB, there are a few dependencies that you need to install. The below instructions are for Ubuntu.
5 | 
6 | ### [vcpkg](https://learn.microsoft.com/en-us/vcpkg/get_started/overview)
7 | ```bash
8 | git clone https://github.com/microsoft/vcpkg.git
9 | cd vcpkg && ./bootstrap-vcpkg.sh
10 | ```
11 | 
12 | ### [clang](https://apt.llvm.org/)
13 | We expect clang as the compiler. This helps align with our expectations of the MKL libraries detailed below.
14 | ```bash
15 | wget https://apt.llvm.org/llvm.sh
16 | chmod +x llvm.sh
17 | sudo ./llvm.sh all
18 | ```
19 | 
20 | ### [miniforge](https://github.com/conda-forge/miniforge) (recommended)
21 | Miniforge is a minimal conda installer that is preconfigured to use conda-forge packages.
22 | 
23 | We can create an isolated environment for LintDB development.
24 | ```bash
25 | conda create -n lintdb python=3.10
26 | conda activate lintdb
27 | ```
28 | 
29 | ### Recommended Python Libraries
30 | There are a few helpful Python libraries that are used in profiling and testing LintDB.
31 | ```bash
32 | pip install gprof2dot
33 | ```
34 | ---
35 | # Python LintDB
36 | 
37 | In addition to the above, developing with the Python LintDB library requires a few more dependencies.
38 | 
39 | LintDB uses nanobind to create Python bindings. nanobind also ships a helpful CLI tool for generating Python stubs.
40 | 
41 | ```bash
42 | pip install nanobind
43 | ```
44 | 
45 | ### creating python stubs
46 | ```bash
47 | python -m nanobind.stubgen -m lintdb.core -M py.typed -o core.pyi
48 | ```
49 | 
50 | ---
51 | 
52 | # Makefile commands
53 | 
54 | The Makefile at the root of the repository has a few commands that can help you get started.
55 | CMakePresets.json is used to configure the build system.
56 | 
57 | ```bash
58 | # build a debug target with tests.
59 | make build-debug
60 | 
61 | # build a release target
62 | make build-release
63 | 
64 | # run tests
65 | make tests
66 | 
67 | # run benchmarks
68 | make benchmarks
69 | 
70 | # profile LintDB (note some variables need to change in the Makefile)
71 | make callgrind
72 | ```
73 | 
74 | ---
75 | 
76 | You'll notice that each target is statically linked. However, we dynamically depend on finding either MKL or OpenBLAS at runtime.
77 | 
78 | ## MKL vs OpenBLAS
79 | 
80 | LintDB currently uses either MKL or OpenBLAS for linear algebra operations. By default, we use MKL on Windows and Ubuntu. On macOS, we use OpenBLAS.
81 | 
82 | It should be noted that MKL doesn't always play well with OpenMP. We specify linking against Intel's version of OpenMP, but
83 | at runtime, it's possible we find a different version. This can lead to performance issues.
84 | 
85 | It can be helpful to refer to [Intel's threading layer documentation](https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2023-0/dynamic-select-the-interface-and-threading-layer.html) and
86 | try setting the threading layer to `INTEL` or `GNU`. Running `ldd path/to/liblintdb_lib.so` will show which libraries are linked at runtime so you can verify whether there
87 | are issues.
--------------------------------------------------------------------------------
/docs/environment.yaml:
--------------------------------------------------------------------------------
1 | name: lintdb-docs
2 | channels:
3 |   - conda-forge
4 |   - defaults
5 | dependencies:
6 |   - mkdocs-material
7 |   - mike
8 |   - mkdocstrings[python]
9 |   - markdown-callouts
10 |   - mkdocs-literate-nav
--------------------------------------------------------------------------------
/docs/icon.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | --8<-- "README.md"
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 | 
3 | LintDB requires Python 3.10 or later.
4 | 
5 | ## Installing using conda (recommended)
6 | 
7 | We highly recommend using conda to install LintDB.
8 | 
9 | ```bash
10 | conda install -c DeployQL lintdb
11 | ```
12 | 
13 | If you don't have conda, you can install it from [here](https://docs.conda.io/en/latest/miniconda.html).
14 | 
15 | 
16 | [//]: # (## Installing using pip)
17 | 
18 | [//]: # ()
19 | [//]: # (If you don't want to use conda, you can install LintDB using pip.)
20 | 
21 | [//]: # ()
22 | [//]: # (LintDB expects that you have openBLAS or MKL installed.)
23 | 
24 | [//]: # ()
25 | [//]: # (```bash)
26 | 
27 | [//]: # (pip install mkl lintdb)
28 | 
29 | [//]: # (```)
30 | 
31 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
-------------------------------------------------------------------------------- /docs/nav.md: --------------------------------------------------------------------------------
1 | * [Introduction](index.md)
2 | * [Getting Started](getting-started.md)
3 | * [Installation](installation.md)
4 | * [Examples](examples.md)
5 | * [Development](development.md)
6 | * [Reference](reference.md)
-------------------------------------------------------------------------------- /docs/pyproject.toml: --------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "lintdb-docs"
3 | version = "0.1.0"
4 | description = "documentation for LintDB"
5 | authors = ["DeployQL"]
6 | readme = "README.md"
7 | packages = [{include = "lintdb_docs"}]
8 | 
9 | [tool.poetry.dependencies]
10 | python = "^3.10"
11 | chardet = "^5.2.0"
12 | sphinx-immaterial = "^0.11.11"
13 | 
14 | 
15 | [build-system]
16 | requires = ["poetry-core"]
17 | build-backend = "poetry.core.masonry.api"
18 | 
-------------------------------------------------------------------------------- /docs/reference.md: --------------------------------------------------------------------------------
1 | # Reference
2 | 
3 | ::: lintdb.core
4 | 
-------------------------------------------------------------------------------- /docs/requirements.txt: --------------------------------------------------------------------------------
1 | chardet>=5.2.0,<6
2 | sphinx-immaterial>=0.11.11,<0.12
3 | myst-parser
-------------------------------------------------------------------------------- /icon.svg: --------------------------------------------------------------------------------
1 | (SVG image data omitted)
-------------------------------------------------------------------------------- /lintdb/SearchOptions.h: --------------------------------------------------------------------------------
1 | #ifndef LINTDB_SEARCH_OPTIONS_H
2 | #define LINTDB_SEARCH_OPTIONS_H
3 | 
4 | #include 
5 | #include 
6 | #include 
7 | #include "lintdb/api.h"
8 | 
9 | namespace lintdb {
10 | 
11 | /**
12 | * SearchOptions enables custom searching behavior.
13 | * 
14 | * These options expose ways to trade off recall and latency at different levels
15 | * of retrieval. Searching more centroids:
16 | * - decrease centroid_score_threshold and increase k_top_centroids.
17 | * - increase n_probe in search()
18 | * 
19 | * Decreasing latency:
20 | * - increase centroid_score_threshold and decrease k_top_centroids.
21 | * - decrease n_probe in search()
22 | */
23 | struct SearchOptions {
24 | idx_t expected_id = -1; /// expects a document id in the return result;
25 | /// prints additional information during execution.
26 | /// useful for debugging.
27 | float centroid_score_threshold =
28 | 0.45; /// the threshold for centroid scores.
29 | size_t k_top_centroids =
30 | 2; /// the number of top centroids to consider per token.
31 | size_t num_second_pass =
32 | 1024; /// the number of second pass candidates to consider.
33 | size_t n_probe = 32; /// the number of centroids to search overall.
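// An illustrative recall-leaning configuration (example values only, not
// recommended defaults):
//   SearchOptions opts;
//   opts.centroid_score_threshold = 0.3; // keep lower-scoring centroids
//   opts.k_top_centroids = 4;            // consider more centroids per token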
34 | size_t nearest_tokens_to_fetch = 35 | 100; /// the number of nearest tokens to fetch in XTR. 36 | std::string colbert_field = "colbert"; 37 | 38 | SearchOptions() : expected_id(-1){}; 39 | }; 40 | } // namespace lintdb 41 | 42 | #endif -------------------------------------------------------------------------------- /lintdb/SearchResult.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_SEARCH_RESULT_H 2 | #define LINTDB_SEARCH_RESULT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lintdb/api.h" 8 | #include "lintdb/schema/DataTypes.h" 9 | 10 | namespace lintdb { 11 | 12 | /** 13 | * SearchResult is a simple struct to hold the results of a search. 14 | * 15 | */ 16 | struct SearchResult { 17 | idx_t id; /// the document's id. 18 | float score; /// the final score as determined by the database. 19 | std::map 20 | metadata; /// Optionally, metadata that was indexed for the 21 | /// document. 22 | 23 | SearchResult() = default; 24 | 25 | bool operator<(const SearchResult& other) const { 26 | return score < other.score; 27 | } 28 | bool operator>(const SearchResult& other) const { 29 | return score > other.score; 30 | } 31 | }; 32 | 33 | } // namespace lintdb 34 | 35 | #endif -------------------------------------------------------------------------------- /lintdb/api.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_API_H 2 | #define LINTDB_API_H 3 | 4 | #include 5 | #include 6 | 7 | typedef int64_t idx_t; 8 | 9 | // the codes used to save the centroid for each token vector. 10 | // each code is treated as an index, which is defined above. 11 | typedef idx_t code_t; 12 | typedef uint8_t residual_t; // the residual codes saved for each token vector. 13 | 14 | typedef uint16_t float16; 15 | typedef uint16_t bfloat16; 16 | 17 | #endif -------------------------------------------------------------------------------- /lintdb/assert.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef LINTDB_ASSERT_H 3 | #define LINTDB_ASSERT_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "lintdb/exception.h" 10 | 11 | // #define __PRETTY_FUNCTION__ __FUNCSIG__ 12 | 13 | #define LINTDB_THROW_FMT(FMT, ...) \ 14 | do { \ 15 | std::string __s; \ 16 | int __size = snprintf(nullptr, 0, FMT, __VA_ARGS__); \ 17 | __s.resize(__size + 1); \ 18 | snprintf(&__s[0], __s.size(), FMT, __VA_ARGS__); \ 19 | throw lintdb::LintDBException( \ 20 | __s, __PRETTY_FUNCTION__, __FILE__, __LINE__); \ 21 | } while (false) 22 | 23 | /// 24 | /// Exceptions thrown upon a conditional failure 25 | /// 26 | 27 | #define LINTDB_THROW_IF_NOT(X) \ 28 | do { \ 29 | if (!(X)) { \ 30 | LINTDB_THROW_FMT("Error: '%s' failed", #X); \ 31 | } \ 32 | } while (false) 33 | 34 | #define LINTDB_THROW_IF_NOT_MSG(X, MSG) \ 35 | do { \ 36 | if (!(X)) { \ 37 | LINTDB_THROW_FMT("Error: '%s' failed: " MSG, #X); \ 38 | } \ 39 | } while (false) 40 | 41 | #define LINTDB_THROW_IF_NOT_FMT(X, FMT, ...) 
\ 42 | do { \ 43 | if (!(X)) { \ 44 | LINTDB_THROW_FMT("Error: '%s' failed: " FMT, #X, __VA_ARGS__); \ 45 | } \ 46 | } while (false) 47 | 48 | #endif -------------------------------------------------------------------------------- /lintdb/cf.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_CF_H 2 | #define LINTDB_CF_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/constants.h" 9 | 10 | namespace lintdb { 11 | namespace { 12 | rocksdb::ColumnFamilyOptions create_index_table_options() { 13 | rocksdb::ColumnFamilyOptions index_options; 14 | rocksdb::BlockBasedTableOptions table_options; 15 | table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); 16 | index_options.table_factory.reset( 17 | rocksdb::NewBlockBasedTableFactory(table_options)); 18 | 19 | // the inverted index uses 8 bytes for the tenant, and 4 bytes for the 20 | // inverted list id. 21 | index_options.prefix_extractor.reset(rocksdb::NewCappedPrefixTransform(12)); 22 | 23 | return index_options; 24 | }; 25 | } // namespace 26 | inline std::vector create_column_families() { 27 | return {rocksdb::ColumnFamilyDescriptor( 28 | rocksdb::kDefaultColumnFamilyName, 29 | rocksdb::ColumnFamilyOptions()), 30 | rocksdb::ColumnFamilyDescriptor( 31 | kIndexColumnFamily, create_index_table_options()), 32 | rocksdb::ColumnFamilyDescriptor( 33 | kForwardColumnFamily, rocksdb::ColumnFamilyOptions()), 34 | rocksdb::ColumnFamilyDescriptor( 35 | kCodesColumnFamily, rocksdb::ColumnFamilyOptions()), 36 | rocksdb::ColumnFamilyDescriptor( 37 | kResidualsColumnFamily, rocksdb::ColumnFamilyOptions()), 38 | rocksdb::ColumnFamilyDescriptor( 39 | kMappingColumnFamily, rocksdb::ColumnFamilyOptions()), 40 | rocksdb::ColumnFamilyDescriptor( 41 | kDocColumnFamily, rocksdb::ColumnFamilyOptions())}; 42 | } 43 | 44 | } // namespace lintdb 45 | 46 | #endif -------------------------------------------------------------------------------- /lintdb/constants.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_CONSTANTS_H 2 | #define LINTDB_CONSTANTS_H 3 | 4 | #include 5 | #include 6 | #include "lintdb/api.h" 7 | 8 | namespace lintdb { 9 | using std::string; 10 | static const string kIndexColumnFamily = "index"; 11 | static const string kCodesColumnFamily = "codes"; 12 | static const string kResidualsColumnFamily = "residuals"; 13 | static const string kForwardColumnFamily = "forward"; 14 | static const string kMappingColumnFamily = "mapping"; 15 | static const string kDocColumnFamily = "doc"; 16 | 17 | typedef idx_t column_index_t; 18 | static const column_index_t kIndexColumnIndex = 1; 19 | static const column_index_t kForwardColumnIndex = 2; 20 | static const column_index_t kCodesColumnIndex = 3; 21 | static const column_index_t kResidualsColumnIndex = 4; 22 | static const column_index_t kMappingColumnIndex = 5; 23 | static const column_index_t kDocColumnIndex = 6; 24 | 25 | // default tenant is used in testing. 26 | static const uint64_t kDefaultTenant = 0; 27 | } // namespace lintdb 28 | 29 | #endif -------------------------------------------------------------------------------- /lintdb/env.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_ENV_H 2 | #define LINTDB_ENV_H 3 | 4 | namespace lintdb { 5 | // environment variables we use to set the number of threads. 
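// Usage sketch (illustrative values; consumers of these names read them from
// the process environment):
//   LINTDB_INTER_NUM_THREADS=1 LINTDB_INTRA_NUM_THREADS=8 ./your_app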
6 | static const char* ONNX_INTER_THREADS = "LINTDB_INTER_NUM_THREADS";
7 | static const char* ONNX_INTRA_THREADS = "LINTDB_INTRA_NUM_THREADS";
8 | } // namespace lintdb
9 | 
10 | #endif // LINTDB_ENV_H
11 | 
-------------------------------------------------------------------------------- /lintdb/exception.h: --------------------------------------------------------------------------------
1 | #ifndef LINTDB_EXCEPTION_H
2 | #define LINTDB_EXCEPTION_H
3 | 
4 | #include <cstdio>
5 | #include <exception>
6 | #include <stdexcept>
7 | #include <string>
8 | 
9 | namespace lintdb {
10 | class LintDBException : public std::exception {
11 | public:
12 | explicit LintDBException(const std::string& message) : message(message){};
13 | 
14 | LintDBException(
15 | const std::string& m,
16 | const char* funcName,
17 | const char* file,
18 | int line) {
19 | int size = snprintf(
20 | nullptr,
21 | 0,
22 | "Error in %s at %s:%d: %s",
23 | funcName,
24 | file,
25 | line,
26 | m.c_str());
27 | message.resize(size + 1);
28 | snprintf(
29 | &message[0],
30 | message.size(),
31 | "Error in %s at %s:%d: %s",
32 | funcName,
33 | file,
34 | line,
35 | m.c_str());
36 | }
37 | 
38 | const char* what() const noexcept override {
39 | return message.c_str();
40 | };
41 | 
42 | private:
43 | std::string message;
44 | };
45 | } // namespace lintdb
46 | 
47 | #endif
-------------------------------------------------------------------------------- /lintdb/invlists/ContextIterator.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <memory>
4 | #include <string>
5 | #include <rocksdb/db.h>
6 | #include "lintdb/constants.h"
7 | #include "lintdb/invlists/Iterator.h"
8 | #include "lintdb/invlists/KeyBuilder.h"
9 | 
10 | namespace lintdb {
11 | class ContextIterator {
12 | public:
13 | ContextIterator(
14 | const std::shared_ptr<rocksdb::DB> db,
15 | rocksdb::ColumnFamilyHandle* column_family,
16 | const uint64_t tenant,
17 | const uint8_t field)
18 | : tenant(tenant), field(field) {
19 | if (!column_family) {
20 | throw std::runtime_error("Column family not found");
21 | }
22 | cf = column_family->GetID();
23 | KeyBuilder kb;
24 | prefix = kb.add(tenant).add(field).build();
25 | 
26 | prefix_slice = rocksdb::Slice(this->prefix);
27 | auto options = rocksdb::ReadOptions();
28 | 
29 | this->it = std::unique_ptr<rocksdb::Iterator>(
30 | db->NewIterator(options, column_family));
31 | it->Seek(this->prefix);
32 | }
33 | 
34 | bool is_valid() {
35 | if (!has_read_key) {
36 | bool is_valid = it->Valid();
37 | if (!is_valid) {
38 | return false;
39 | }
40 | 
41 | auto key = it->key();
42 | std::string key_str = key.ToString();
43 | if (key_str.compare(0, prefix.size(), prefix) != 0) {
44 | return false;
45 | }
46 | this->current_key = ContextKey(key_str);
47 | }
48 | 
49 | has_read_key = true;
50 | return true;
51 | }
52 | 
53 | void advance(const idx_t doc_id) {
54 | KeyBuilder kb;
55 | 
56 | std::string expected_key =
57 | kb.add(tenant).add(field).add(doc_id).build();
58 | it->Seek(rocksdb::Slice(expected_key));
59 | has_read_key = false;
60 | }
61 | 
62 | void next() {
63 | it->Next();
64 | has_read_key = false;
65 | }
66 | 
67 | ContextKey get_key() const {
68 | return current_key;
69 | }
70 | 
71 | std::string get_value() const {
72 | return it->value().ToString();
73 | }
74 | 
75 | std::unique_ptr<rocksdb::Iterator> it;
76 | 
77 | protected:
78 | lintdb::column_index_t cf;
79 | string prefix;
80 | string end_key;
81 | rocksdb::Slice prefix_slice;
82 | ContextKey current_key;
83 | 
84 | bool has_read_key = false;
85 | const uint64_t tenant;
86 | const uint8_t field;
87 | };
88 | 
89 | } // namespace lintdb
90 | 
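// Usage sketch (illustrative; assumes an open rocksdb::DB and a handle to the
// context column family):
//   lintdb::ContextIterator it(db, context_cf, /*tenant=*/0, /*field=*/1);
//   for (; it.is_valid(); it.next()) {
//       auto key = it.get_key();     // ContextKey of the current entry
//       auto value = it.get_value(); // serialized context payload
//   }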
-------------------------------------------------------------------------------- /lintdb/invlists/ForwardIndexIterator.cpp: -------------------------------------------------------------------------------- 1 | #include "ForwardIndexIterator.h" 2 | #include 3 | 4 | namespace lintdb { 5 | ForwardIndexIterator::ForwardIndexIterator( 6 | std::shared_ptr db, 7 | rocksdb::ColumnFamilyHandle* column_family, 8 | const uint64_t tenant) 9 | : tenant(tenant) { 10 | cf = column_family->GetID(); 11 | KeyBuilder kb; 12 | 13 | prefix = kb.add(tenant).build(); 14 | 15 | prefix_slice = rocksdb::Slice(this->prefix); 16 | auto options = rocksdb::ReadOptions(); 17 | 18 | this->it = std::unique_ptr( 19 | db->NewIterator(options, column_family)); 20 | it->Seek(this->prefix); 21 | } 22 | 23 | bool ForwardIndexIterator::has_next() { 24 | bool is_valid = it->Valid(); 25 | if (!is_valid) { 26 | return false; 27 | } 28 | auto key = it->key().ToString(); 29 | this->current_key = ForwardIndexKey(key); 30 | 31 | if (current_key.tenant() != tenant) { 32 | return false; 33 | } 34 | 35 | return true; 36 | } 37 | 38 | void ForwardIndexIterator::next() { 39 | it->Next(); 40 | } 41 | 42 | ForwardIndexKey ForwardIndexIterator::get_key() const { 43 | return current_key; 44 | } 45 | 46 | std::string ForwardIndexIterator::get_value() const { 47 | return it->value().ToString(); 48 | } 49 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/invlists/ForwardIndexIterator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "lintdb/constants.h" 5 | #include "lintdb/invlists/KeyBuilder.h" 6 | #include "rocksdb/db.h" 7 | 8 | namespace lintdb { 9 | 10 | /** 11 | * ForwardIndexIterator is an iterator over the forward index. 12 | * 13 | * This is somewhat coupled to key format so that we can control 14 | * iteration. Note that while RocksDB has start and stop option, 15 | * it was not working as expected. So we are doing it manually. 16 | */ 17 | struct ForwardIndexIterator { 18 | ForwardIndexIterator( 19 | std::shared_ptr db, 20 | rocksdb::ColumnFamilyHandle* column_family, 21 | const uint64_t tenant); 22 | 23 | bool has_next(); 24 | 25 | void next(); 26 | 27 | ForwardIndexKey get_key() const; 28 | 29 | std::string get_value() const; 30 | 31 | std::unique_ptr it; 32 | 33 | protected: 34 | lintdb::column_index_t cf; 35 | string prefix; 36 | string end_key; 37 | rocksdb::Slice prefix_slice; 38 | ForwardIndexKey current_key; 39 | 40 | const idx_t tenant; 41 | }; 42 | 43 | } // namespace lintdb 44 | -------------------------------------------------------------------------------- /lintdb/invlists/IndexWriter.cpp: -------------------------------------------------------------------------------- 1 | #include "IndexWriter.h" 2 | #include 3 | #include 4 | #include 5 | #include "lintdb/api.h" 6 | #include "lintdb/assert.h" 7 | #include "lintdb/constants.h" 8 | #include "lintdb/invlists/PostingData.h" 9 | 10 | namespace lintdb { 11 | IndexWriter::IndexWriter( 12 | std::shared_ptr db, 13 | std::vector& column_families, 14 | const Version& version) 15 | : db(db), column_families(column_families), version(version) {} 16 | 17 | /** 18 | * Write will batch write all document data to the database. 
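 * All puts are applied through a single rocksdb::WriteBatch, so a document's
 * inverted, mapping, forward, and context entries are committed atomically.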
19 | * @param batch_posting_data 20 | */ 21 | void IndexWriter::write(const BatchPostingData& batch_posting_data) { 22 | rocksdb::WriteBatch batch; 23 | 24 | // write all inverted index data 25 | for (const auto& posting : batch_posting_data.inverted) { 26 | batch.Put( 27 | column_families[kIndexColumnIndex], 28 | rocksdb::Slice(posting.key), 29 | rocksdb::Slice(posting.value)); 30 | } 31 | 32 | // write all mappings 33 | for (const auto& posting : batch_posting_data.inverted_mapping) { 34 | batch.Put( 35 | column_families[kMappingColumnIndex], 36 | rocksdb::Slice(posting.key), 37 | rocksdb::Slice(posting.value)); 38 | } 39 | 40 | // write all document data 41 | batch.Put( 42 | column_families[kDocColumnIndex], 43 | rocksdb::Slice(batch_posting_data.forward.key), 44 | rocksdb::Slice(batch_posting_data.forward.value)); 45 | 46 | // write all context data 47 | for (const auto& posting : batch_posting_data.context) { 48 | batch.Put( 49 | column_families[kCodesColumnIndex], 50 | rocksdb::Slice(posting.key), 51 | rocksdb::Slice(posting.value)); 52 | } 53 | 54 | auto status = db->Write(rocksdb::WriteOptions(), &batch); 55 | assert(status.ok()); 56 | 57 | LINTDB_THROW_IF_NOT(status.ok()); 58 | } 59 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/invlists/IndexWriter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/invlists/PostingData.h" 7 | #include "lintdb/version.h" 8 | 9 | namespace lintdb { 10 | 11 | class IIndexWriter { 12 | public: 13 | virtual void write(const BatchPostingData& batch_posting_data) = 0; 14 | 15 | virtual ~IIndexWriter() = default; 16 | }; 17 | 18 | class IndexWriter : public IIndexWriter { 19 | private: 20 | std::shared_ptr db; 21 | std::vector& column_families; 22 | const Version& version; 23 | 24 | public: 25 | IndexWriter( 26 | std::shared_ptr db, 27 | std::vector& column_families, 28 | const Version& version); 29 | 30 | void write(const BatchPostingData& batch_posting_data) override; 31 | }; 32 | 33 | } // namespace lintdb 34 | -------------------------------------------------------------------------------- /lintdb/invlists/InvertedIterator.cpp: -------------------------------------------------------------------------------- 1 | #include "InvertedIterator.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/constants.h" 7 | #include "lintdb/invlists/ContextIterator.h" 8 | 9 | lintdb::RocksDBIterator::RocksDBIterator( 10 | std::shared_ptr db, 11 | rocksdb::ColumnFamilyHandle* column_family, 12 | const std::string& prefix) 13 | : Iterator(), prefix(prefix), has_read_key(false) { 14 | cf = column_family->GetID(); 15 | 16 | prefix_slice = rocksdb::Slice(this->prefix); 17 | auto options = rocksdb::ReadOptions(); 18 | 19 | this->it = std::unique_ptr( 20 | db->NewIterator(options, column_family)); 21 | it->Seek(this->prefix); 22 | } -------------------------------------------------------------------------------- /lintdb/invlists/InvertedIterator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "lintdb/constants.h" 11 | #include "lintdb/exception.h" 12 | #include "lintdb/invlists/ContextIterator.h" 13 | #include "lintdb/invlists/InvertedList.h" 14 | #include "lintdb/invlists/Iterator.h" 15 | #include 
"lintdb/invlists/KeyBuilder.h" 16 | #include "lintdb/version.h" 17 | 18 | namespace lintdb { 19 | 20 | struct RocksDBIterator : public lintdb::Iterator { 21 | RocksDBIterator( 22 | std::shared_ptr db, 23 | rocksdb::ColumnFamilyHandle* column_family, 24 | const std::string& prefix); 25 | 26 | bool is_valid() override { 27 | if (!has_read_key) { 28 | bool is_valid = it->Valid(); 29 | if (!is_valid) { 30 | return false; 31 | } 32 | 33 | auto key = it->key(); 34 | std::string key_str = key.ToString(); 35 | if (key_str.compare(0, prefix.size(), prefix) != 0) { 36 | return false; 37 | } 38 | 39 | current_key = InvertedIndexKey(key_str); 40 | } 41 | 42 | has_read_key = true; 43 | return true; 44 | } 45 | 46 | void next() override { 47 | it->Next(); 48 | has_read_key = false; 49 | } 50 | 51 | InvertedIndexKey get_key() const override { 52 | return current_key; 53 | } 54 | 55 | string get_value() const override { 56 | return it->value().ToString(); 57 | } 58 | 59 | std::unique_ptr it; 60 | 61 | protected: 62 | lintdb::column_index_t cf; 63 | string prefix; 64 | string end_key; 65 | rocksdb::Slice prefix_slice; 66 | InvertedIndexKey current_key; 67 | 68 | bool has_read_key; 69 | }; 70 | 71 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/invlists/InvertedList.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_INVLISTS_INVERTED_LIST_H 2 | #define LINTDB_INVLISTS_INVERTED_LIST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lintdb/api.h" 8 | #include "lintdb/constants.h" 9 | #include "lintdb/invlists/ContextIterator.h" 10 | #include "lintdb/invlists/EncodedDocument.h" 11 | #include "lintdb/invlists/ForwardIndexIterator.h" 12 | #include "lintdb/invlists/Iterator.h" 13 | #include "lintdb/schema/Schema.h" 14 | 15 | namespace lintdb { 16 | /** 17 | * InvertedList manages the storage of centroid -> codes mappping. 18 | * 19 | * InvertedLists are expected to be fast. The more data stored in the 20 | * invertedList, the slower it will become. 21 | * 22 | * We also expect the InvertedList to manage a mapping of document -> centroids 23 | * to facilitate deletion. 24 | */ 25 | struct InvertedList { 26 | virtual void remove( 27 | const uint64_t tenant, 28 | std::vector ids, 29 | const uint8_t field, 30 | const DataType data_type, 31 | const std::vector field_types) = 0; 32 | virtual void merge( 33 | rocksdb::DB* db, 34 | std::vector& cfs) = 0; 35 | 36 | virtual std::unique_ptr get_iterator( 37 | const std::string& prefix) const = 0; 38 | 39 | virtual std::unique_ptr get_context_iterator( 40 | const uint64_t tenant, 41 | const uint8_t field_id) const = 0; 42 | 43 | virtual std::vector get_mapping(const uint64_t tenant, idx_t id) 44 | const = 0; 45 | 46 | virtual ~InvertedList() = default; 47 | }; 48 | 49 | /** 50 | * ForwardIndex helps retrieve document data from the index. 
51 | */ 52 | struct ForwardIndex { 53 | virtual std::vector> get_metadata( 54 | const uint64_t tenant, 55 | const std::vector& ids) const = 0; 56 | 57 | virtual void remove(const uint64_t tenant, std::vector ids) = 0; 58 | 59 | virtual void merge( 60 | rocksdb::DB* db, 61 | std::vector& cfs) = 0; 62 | 63 | virtual std::unique_ptr get_iterator( 64 | const uint64_t tenant, 65 | const idx_t inverted_list) const = 0; 66 | 67 | virtual ~ForwardIndex() = default; 68 | }; 69 | } // namespace lintdb 70 | 71 | #endif -------------------------------------------------------------------------------- /lintdb/invlists/Iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_INVLISTS_ITERATOR_H 2 | #define LINTDB_INVLISTS_ITERATOR_H 3 | 4 | #include 5 | #include "lintdb/invlists/KeyBuilder.h" 6 | 7 | namespace lintdb { 8 | struct Iterator { 9 | virtual bool is_valid() = 0; 10 | virtual void next() = 0; 11 | 12 | virtual InvertedIndexKey get_key() const = 0; 13 | virtual std::string get_value() const = 0; 14 | 15 | virtual ~Iterator() = default; 16 | }; 17 | } // namespace lintdb 18 | 19 | #endif -------------------------------------------------------------------------------- /lintdb/invlists/PostingData.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace lintdb { 7 | struct PostingData { 8 | std::string key; 9 | std::string value; 10 | }; 11 | 12 | struct BatchPostingData { 13 | std::vector inverted; 14 | PostingData forward; /// A single document has one entry in forward index 15 | std::vector context; 16 | std::vector inverted_mapping; 17 | }; 18 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/invlists/RocksdbForwardIndex.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_ROCKSDB_LIST_H 2 | #define LINTDB_ROCKSDB_LIST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "lintdb/constants.h" 12 | #include "lintdb/invlists/ForwardIndexIterator.h" 13 | #include "lintdb/invlists/InvertedList.h" 14 | #include "lintdb/invlists/Iterator.h" 15 | #include "lintdb/version.h" 16 | 17 | namespace lintdb { 18 | 19 | struct RocksdbForwardIndex : public ForwardIndex { 20 | RocksdbForwardIndex( 21 | std::shared_ptr db, 22 | std::vector& column_families, 23 | const Version& version); 24 | 25 | void remove(const uint64_t tenant, std::vector ids) override; 26 | 27 | void merge(rocksdb::DB* db, std::vector& cfs) 28 | override; 29 | 30 | std::vector> get_metadata( 31 | const uint64_t tenant, 32 | const std::vector& ids) const override; 33 | 34 | std::unique_ptr get_iterator( 35 | const uint64_t tenant, 36 | idx_t column_index) const override; 37 | 38 | protected: 39 | Version version; 40 | std::shared_ptr db_; 41 | std::vector& column_families; 42 | }; 43 | 44 | } // namespace lintdb 45 | 46 | #endif -------------------------------------------------------------------------------- /lintdb/invlists/RocksdbInvertedList.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_ROCKSDBINVERTEDLIST_H 2 | #define LINTDB_ROCKSDBINVERTEDLIST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "lintdb/constants.h" 12 | #include "lintdb/exception.h" 13 | #include "lintdb/invlists/ContextIterator.h" 14 | 
#include "lintdb/invlists/InvertedList.h" 15 | #include "lintdb/invlists/Iterator.h" 16 | #include "lintdb/version.h" 17 | 18 | namespace lintdb { 19 | 20 | /** 21 | * RocksdbInvertedList stores a slim version of the inverted list. There is no 22 | * data associated with each token, only the document id as part of the key. 23 | * 24 | * This inverted list is only capable of telling us what documents are 25 | * associated with what centroids. 26 | */ 27 | struct RocksdbInvertedList : public InvertedList { 28 | RocksdbInvertedList( 29 | std::shared_ptr db, 30 | std::vector& column_families, 31 | const Version& version); 32 | 33 | void remove( 34 | const uint64_t tenant, 35 | std::vector ids, 36 | const uint8_t field, 37 | const DataType data_type, 38 | const std::vector field_types) override; 39 | void merge(rocksdb::DB* db, std::vector& cfs) 40 | override; 41 | 42 | std::vector get_mapping(const uint64_t tenant, idx_t id) 43 | const override; 44 | 45 | [[nodiscard]] std::unique_ptr get_iterator( 46 | const std::string& prefix) const override; 47 | 48 | std::unique_ptr get_context_iterator( 49 | const uint64_t tenant, 50 | const uint8_t field_id) const override; 51 | 52 | protected: 53 | Version version; 54 | std::shared_ptr db_; 55 | std::vector& column_families; 56 | }; 57 | 58 | } // namespace lintdb 59 | 60 | #endif // LINTDB_ROCKSDBINVERTEDLIST_H 61 | -------------------------------------------------------------------------------- /lintdb/python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | cmake_minimum_required(VERSION 3.21 FATAL_ERROR) 4 | 5 | project(lintdb 6 | LANGUAGES CXX 7 | ) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | 11 | find_package(Python 12 | REQUIRED COMPONENTS Interpreter Development.Module 13 | OPTIONAL_COMPONENTS Development.SABIModule 14 | ) 15 | 16 | message("=== Selected Python Variables ===") 17 | message(STATUS "Python3_STDLIB: " ${Python_STDLIB}) 18 | message(STATUS "Python3_SITELIB: " ${Python_SITELIB}) 19 | message(STATUS "Python3_VERSION: " ${Python_VERSION}) 20 | message(STATUS "Python3_EXECUTABLE: " ${Python_EXECUTABLE}) 21 | 22 | if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 23 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) 24 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 25 | endif() 26 | 27 | add_subdirectory(${PROJECT_SOURCE_DIR}/../../third_party/nanobind nanobind EXCLUDE_FROM_ALL) 28 | find_package(nanobind CONFIG REQUIRED) 29 | 30 | nanobind_add_module( 31 | core 32 | STABLE_ABI 33 | pylintdb.cpp 34 | ) 35 | 36 | target_link_libraries(core PRIVATE 37 | lintdb_lib 38 | ) 39 | 40 | install(TARGETS core LIBRARY DESTINATION lintdb) 41 | 42 | # this doesn't work because python looks for typing_extensions and doesn't find it on py3.10 43 | #nanobind_add_stub( 44 | # lintdb_stub 45 | # INSTALL_TIME 46 | # MODULE core 47 | # OUTPUT core.pyi 48 | # PYTHON_PATH $ 49 | # DEPENDS core 50 | # VERBOSE 51 | #) 52 | # 53 | #install(FILES "core.pyi" DESTINATION lintdb) 54 | #install(FILES "py.typed" DESTINATION lintdb) -------------------------------------------------------------------------------- /lintdb/python/version.txt: -------------------------------------------------------------------------------- 1 | 0.2.0 -------------------------------------------------------------------------------- /lintdb/quantizers/Binarizer.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_BINARIZER_H 2 | #define LINTDB_BINARIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/api.h" 9 | #include "lintdb/quantizers/PQDistanceTables.h" 10 | #include "lintdb/quantizers/Quantizer.h" 11 | 12 | namespace lintdb { 13 | struct Binarizer : public Quantizer { 14 | std::vector bucket_cutoffs; 15 | std::vector bucket_weights; 16 | float avg_residual; 17 | size_t nbits; 18 | size_t dim; 19 | std::vector reverse_bitmap; 20 | std::vector decompression_lut; 21 | 22 | Binarizer(size_t nbits, size_t dim); 23 | 24 | Binarizer( 25 | const std::vector& bucket_cutoffs, 26 | const std::vector& bucket_weights, 27 | const float avg_residual, 28 | const size_t nbits, 29 | const size_t dim); 30 | 31 | // copy constructor 32 | Binarizer(const Binarizer& other); 33 | 34 | Binarizer& operator=(Binarizer other) { 35 | swap(*this, other); 36 | return *this; 37 | } 38 | 39 | std::vector binarize(const std::vector& residuals); 40 | void train(const size_t n, const float* x, const size_t dim) override; 41 | void save(const std::string path) override; 42 | 43 | void sa_encode(size_t n, const float* x, residual_t* codes) override; 44 | void sa_decode(size_t n, const residual_t* codes, float* x) override; 45 | size_t code_size() override; 46 | 47 | size_t get_nbits() override { 48 | return nbits; 49 | } 50 | 51 | static std::unique_ptr load(std::string path); 52 | 53 | QuantizerType get_type() override; 54 | 55 | friend void swap(Binarizer& first, Binarizer& second) { 56 | std::swap(first.bucket_cutoffs, second.bucket_cutoffs); 57 | std::swap(first.bucket_weights, second.bucket_weights); 58 | std::swap(first.avg_residual, second.avg_residual); 59 | std::swap(first.nbits, second.nbits); 60 | std::swap(first.dim, second.dim); 61 | std::swap(first.reverse_bitmap, second.reverse_bitmap); 62 | std::swap(first.decompression_lut, second.decompression_lut); 63 | } 64 | 65 | private: 66 | void calculate_quantiles(const std::vector& heldoout_residual); 67 | 68 | std::vector bucketize(const std::vector& residuals); 69 | std::vector packbits(const std::vector& binarized); 70 | std::vector unpackbits( 71 | const std::vector& packed, 72 | size_t dim, 73 | size_t nbits); 74 | // binarize takes in the residuals as floats, bucketizes them, and 
75 | // then returns the binarized version of the residuals.
76 | // the returned vector is of size dim * nbits.
77 | 
78 | std::vector create_reverse_bitmap();
79 | std::vector create_decompression_lut();
80 | };
81 | } // namespace lintdb
82 | 
83 | #endif
-------------------------------------------------------------------------------- /lintdb/quantizers/IdentityQuantizer.cpp: --------------------------------------------------------------------------------
1 | #include "IdentityQuantizer.h"
2 | #include <cstring>
3 | 
4 | namespace lintdb {
5 | void IdentityQuantizer::train(
6 | const size_t n,
7 | const float* x,
8 | const size_t dim) {}
9 | 
10 | void IdentityQuantizer::save(const std::string path) {}
11 | 
12 | void IdentityQuantizer::sa_encode(size_t n, const float* x, residual_t* codes) {
13 | // identity encoding: the codes are the raw float bytes. assigning the
14 | // parameter pointer itself is not visible to the caller, so copy the
15 | // bytes into the caller-provided buffer.
16 | std::memcpy(codes, x, n * dim * sizeof(float));
17 | }
18 | 
19 | void IdentityQuantizer::sa_decode(size_t n, const residual_t* codes, float* x) {
20 | // identity decoding: copy the stored bytes back out as floats.
21 | std::memcpy(x, codes, n * dim * sizeof(float));
22 | }
23 | 
24 | size_t IdentityQuantizer::code_size() {
25 | return dim * sizeof(float);
26 | }
27 | 
28 | size_t IdentityQuantizer::get_nbits() {
29 | return dim * sizeof(float);
30 | }
31 | 
32 | QuantizerType IdentityQuantizer::get_type() {
33 | return NONE;
34 | }
35 | } // namespace lintdb
-------------------------------------------------------------------------------- /lintdb/quantizers/IdentityQuantizer.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "lintdb/quantizers/Quantizer.h"
4 | 
5 | namespace lintdb {
6 | 
7 | class IdentityQuantizer : public Quantizer {
8 | public:
9 | IdentityQuantizer(size_t dim) : dim(dim) {}
10 | 
11 | void train(const size_t n, const float* x, const size_t dim) override;
12 | 
13 | void save(const std::string path) override;
14 | 
15 | void sa_encode(size_t n, const float* x, residual_t* codes) override;
16 | 
17 | void sa_decode(size_t n, const residual_t* codes, float* x) override;
18 | 
19 | size_t code_size() override;
20 | 
21 | size_t get_nbits() override;
22 | 
23 | QuantizerType get_type() override;
24 | 
25 | private:
26 | size_t dim;
27 | };
28 | 
29 | } // namespace lintdb
30 | 
-------------------------------------------------------------------------------- /lintdb/quantizers/InvertedListScanner.cpp: --------------------------------------------------------------------------------
1 | #include "InvertedListScanner.h"
2 | #include <glog/logging.h>
3 | #include <algorithm>
4 | #include <memory>
5 | #include <string>
6 | #include <vector>
7 | 
8 | namespace lintdb {
9 | InvertedListScanner::InvertedListScanner(
10 | std::shared_ptr& quantizer,
11 | const float* query_data,
12 | size_t num_tokens)
13 | : quantizer(quantizer), code_size(quantizer->code_size()) {
14 | distance_tables = quantizer->get_distance_tables(query_data, num_tokens);
15 | }
16 | 
17 | std::vector<ScoredPartialDocumentCodes> InvertedListScanner::scan(
18 | const idx_t key,
19 | const std::unique_ptr list_iterator,
20 | const std::vector<QueryTokenCentroidScore>& query_tokens_to_score) {
21 | std::vector<idx_t> query_token_ids;
22 | query_token_ids.reserve(query_tokens_to_score.size());
23 | for (const auto& q : query_tokens_to_score) {
24 | query_token_ids.push_back(q.query_token);
25 | }
26 | 
27 | std::vector<float> precomputed_distances;
28 | precomputed_distances.reserve(query_tokens_to_score.size());
29 | for (const auto& q : query_tokens_to_score) {
30 | precomputed_distances.push_back(q.distance);
31 | }
32 | 
33 | std::vector<ScoredPartialDocumentCodes> results;
34 | for (; list_iterator->has_next(); list_iterator->next()) {
35 | auto partial_codes = list_iterator->get_value();
36 | size_t num_tokens = partial_codes.partial_residuals.size() / code_size;
37 | if (num_tokens != 1) {
38 | LOG(WARNING)
39 | << "Codes found in inverted index are the wrong size. residual size: "
40 | << partial_codes.partial_residuals.size()
41 | << " code size: " << code_size;
42 | }
43 | 
44 | ScoredPartialDocumentCodes doc_results;
45 | auto token_key = list_iterator->get_key();
46 | doc_results.doc_id = token_key.doc_id;
47 | doc_results.doc_token_id = token_key.token_id;
48 | 
49 | auto scores = distance_tables->calculate_query_distances(
50 | query_token_ids,
51 | precomputed_distances,
52 | partial_codes.partial_residuals);
53 | 
54 | for (idx_t i = 0; i < scores.size(); i++) {
55 | doc_results.query_token_id = query_token_ids[i];
56 | doc_results.score = scores[i];
57 | // record one entry per query token for this document token.
58 | results.push_back(doc_results);
59 | }
60 | }
61 | 
62 | return results;
63 | }
64 | 
65 | } // namespace lintdb
-------------------------------------------------------------------------------- /lintdb/quantizers/InvertedListScanner.h: --------------------------------------------------------------------------------
1 | #ifndef LINTDB_INVERTEDLISTSCANNER_H
2 | #define LINTDB_INVERTEDLISTSCANNER_H
3 | 
4 | #include <memory>
5 | #include <string>
6 | #include <vector>
7 | #include "lintdb/api.h"
8 | #include "lintdb/invlists/EncodedDocument.h"
9 | #include "lintdb/invlists/Iterator.h"
10 | #include "lintdb/quantizers/PQDistanceTables.h"
11 | #include "lintdb/quantizers/Quantizer.h"
12 | #include "ProductEncoder.h"
13 | 
14 | namespace lintdb {
15 | 
16 | /**
17 | * ScoredPartialDocumentCodes holds per-token scores to help calculate
18 | * sum-of-max scores.
19 | * 
20 | * Each token in a document is scored across the query tokens, and we want to
21 | * keep the max score per query token.
22 | */
23 | struct ScoredPartialDocumentCodes {
24 | idx_t doc_id;
25 | idx_t doc_token_id;
26 | idx_t query_token_id;
27 | float score;
28 | 
29 | ScoredPartialDocumentCodes() = default;
30 | 
31 | ScoredPartialDocumentCodes(
32 | idx_t doc_id,
33 | idx_t doc_token_id,
34 | idx_t query_token_id,
35 | float score)
36 | : doc_id(doc_id),
37 | doc_token_id(doc_token_id),
38 | query_token_id(query_token_id),
39 | score(score) {}
40 | };
41 | 
42 | /**
43 | * QueryTokenCentroidScore holds the distance between a query token and a
44 | * centroid.
45 | *
46 | * This is passed to scan to help calculate the score of a token.
47 | *
48 | */
49 | struct QueryTokenCentroidScore {
50 | idx_t query_token;
51 | idx_t centroid_id;
52 | float distance;
53 | };
54 | 
55 | /**
56 | * InvertedListScanner helps us scan through an inverted list and score the
57 | * results.
58 | * 
59 | * The score is going to be a calculation between the stored codes, the
60 | * centroid, and the query.
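 * Concretely, each returned score is the precomputed query-token-to-centroid
 * distance plus a PQ table lookup for the stored residual codes (see
 * PQDistanceTables).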
61 | */ 62 | class InvertedListScanner { 63 | public: 64 | InvertedListScanner( 65 | std::shared_ptr& quantizer, 66 | const float* query_data, 67 | size_t num_tokens); 68 | 69 | std::vector scan( 70 | idx_t key, 71 | const std::unique_ptr list_iterator, 72 | const std::vector& query_tokens_to_score); 73 | 74 | private: 75 | std::unique_ptr distance_tables; 76 | std::shared_ptr quantizer; 77 | size_t code_size; 78 | }; 79 | 80 | } // namespace lintdb 81 | 82 | #endif // LINTDB_INVERTEDLISTSCANNER_H 83 | -------------------------------------------------------------------------------- /lintdb/quantizers/PQDistanceTables.cpp: -------------------------------------------------------------------------------- 1 | #include "PQDistanceTables.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "lintdb/assert.h" 8 | 9 | namespace lintdb { 10 | PQDistanceTables::PQDistanceTables( 11 | const float* query_data, 12 | size_t num_tokens, 13 | size_t dim, 14 | const std::shared_ptr ipq, 15 | bool is_ip) 16 | : ipq(ipq), is_ip(is_ip), dim(dim) { 17 | // right now, we only support IP. 18 | LINTDB_THROW_IF_NOT(ipq->metric_type == faiss::METRIC_INNER_PRODUCT); 19 | 20 | for (size_t i = 0; i < num_tokens; i++) { 21 | std::vector distance_table(ipq->pq.M * ipq->pq.ksub); 22 | ipq->pq.compute_inner_prod_table( 23 | query_data + i * dim, distance_table.data()); 24 | distance_tables.push_back(distance_table); 25 | } 26 | } 27 | 28 | std::vector PQDistanceTables::calculate_query_distances( 29 | const std::vector& query_tokens_to_score, 30 | const std::vector& precomputed_distances, 31 | const std::vector& codes) { 32 | std::vector results(precomputed_distances); 33 | // use the distance to the centroid as a precomputed distance. 34 | // we'll then add the distance from the centroid to the document code. 35 | for (int j = 0; j < query_tokens_to_score.size(); j++) { 36 | auto query_token_id = query_tokens_to_score[j]; 37 | auto sim_table = distance_tables[query_token_id]; 38 | float score = faiss::distance_single_code( 39 | ipq->pq, sim_table.data(), codes.data()); 40 | results[j] += score; 41 | } 42 | return results; 43 | } 44 | 45 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/PQDistanceTables.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_PQDISTANCETABLES_H 2 | #define LINTDB_PQDISTANCETABLES_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/api.h" 9 | 10 | namespace faiss { 11 | struct IndexPQ; 12 | } 13 | 14 | namespace lintdb { 15 | 16 | /** 17 | * PQDistanceTables calculates scores for a given query token and doc token. 18 | * 19 | * This class holds all of the compute logic and returns the pieces of the 20 | * calculation to InvertedListScanner. 21 | * 22 | * This class also knows about the IndexPQ internals, and should be owned by 23 | * the quantizer. In the future, this will move inside ProductEncoder. 24 | */ 25 | class PQDistanceTables { 26 | public: 27 | PQDistanceTables( 28 | const float* query_data, 29 | size_t num_tokens, 30 | size_t dim, 31 | std::shared_ptr ipq, 32 | bool is_ip = true); 33 | 34 | /** 35 | * precompute_list_tables precomputes the distance to the list's centroid 36 | * using the quantizer. We store the initial distance to each query token. 
37 | */ 38 | // std::vector precompute_list_tables(const std::vector& 39 | // query_token_ids); 40 | 41 | std::vector calculate_query_distances( 42 | const std::vector& query_tokens_to_score, 43 | const std::vector& precomputed_distances, 44 | const std::vector& codes); 45 | 46 | private: 47 | std::vector> distance_tables; 48 | std::shared_ptr ipq; 49 | bool is_ip; 50 | size_t dim; 51 | }; 52 | 53 | } // namespace lintdb 54 | 55 | #endif // LINTDB_PQDISTANCETABLES_H 56 | -------------------------------------------------------------------------------- /lintdb/quantizers/ProductEncoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "lintdb/quantizers/PQDistanceTables.h" 5 | #include "lintdb/quantizers/Quantizer.h" 6 | 7 | namespace faiss { 8 | struct IndexPQ; 9 | } 10 | 11 | namespace lintdb { 12 | struct ProductEncoder : public Quantizer{ 13 | std::shared_ptr pq; 14 | size_t nbits; // number of bits used in binarizing the residuals. 15 | size_t dim; // number of dimensions per embedding. 16 | size_t dsub; // dimensionality of each subvector; 17 | size_t ksub; // number of centroids per subquantizer. 18 | size_t num_subquantizers; 19 | 20 | /// This table is used to precompute the inner product between the centroids 21 | /// of the PQ quantizer. 22 | std::vector precomputed_table; 23 | 24 | ProductEncoder(size_t dim, size_t nbits, size_t num_subquantizers); 25 | 26 | ProductEncoder(const ProductEncoder& other); 27 | 28 | friend void swap(ProductEncoder& lhs, ProductEncoder& rhs); 29 | 30 | ProductEncoder& operator=(ProductEncoder& other) { 31 | swap(*this, other); 32 | return *this; 33 | } 34 | 35 | bool is_trained = false; 36 | 37 | void sa_encode(size_t n, const float* x, residual_t* codes) override; 38 | void sa_decode(size_t n, const residual_t* codes, float* x) override; 39 | size_t code_size() override; 40 | 41 | size_t get_nbits() override { 42 | return nbits; 43 | } 44 | 45 | // Compute the inner product table for the given embeddings. 46 | // This currently wraps the underlying faiss PQ index. 47 | std::unique_ptr get_distance_tables( 48 | const float* query_data, 49 | size_t num_tokens) const; 50 | 51 | void save(const std::string path) override; 52 | 53 | static std::unique_ptr load( 54 | std::string path, 55 | QuantizerConfig& config); 56 | 57 | void train(const size_t n, const float* embeddings, const size_t dim) 58 | override; 59 | 60 | QuantizerType get_type() override; 61 | }; 62 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/Quantizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "lintdb/api.h" 6 | #include "lintdb/quantizers/PQDistanceTables.h" 7 | 8 | namespace lintdb { 9 | static const std::string QUANTIZER_FILENAME = "_residual_quantizer.bin"; 10 | static const std::string LEGACY_QUANTIZER_FILENAME = "_binarizer.bin"; 11 | 12 | enum QuantizerType { 13 | UNKNOWN, 14 | NONE, 15 | BINARIZER, 16 | PRODUCT_ENCODER, 17 | }; 18 | 19 | struct QuantizerConfig { 20 | size_t nbits; 21 | size_t dim; 22 | size_t num_subquantizers; // used in ProductEncoder 23 | }; 24 | /** 25 | * Quantizer is responsible for vector encoding. Unlike the Encoder, this isn't 26 | * responsible for IVF assignment. 
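 * The typical lifecycle is: train() once over sample vectors, then
 * sa_encode() to produce codes at write time and sa_decode() to
 * (approximately) reconstruct vectors at read time.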
27 | */ 28 | struct Quantizer { 29 | virtual void train(const size_t n, const float* x, const size_t dim) = 0; 30 | virtual void save(const std::string path) = 0; 31 | 32 | virtual void sa_encode(size_t n, const float* x, residual_t* codes) = 0; 33 | virtual void sa_decode(size_t n, const residual_t* codes, float* x) = 0; 34 | virtual size_t code_size() = 0; 35 | 36 | virtual size_t get_nbits() = 0; 37 | 38 | virtual QuantizerType get_type() = 0; 39 | 40 | virtual ~Quantizer() = default; 41 | }; 42 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/impl/kmeans.cpp: -------------------------------------------------------------------------------- 1 | #include "lintdb/quantizers/impl/kmeans.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/assert.h" 9 | 10 | namespace lintdb { 11 | std::vector kmeans( 12 | const float* data, 13 | size_t n, 14 | size_t dim, 15 | size_t k, 16 | Metric metric, 17 | int iterations) { 18 | LINTDB_THROW_IF_NOT_MSG( 19 | n > k, 20 | "Number of data points must be greater than the number of clusters."); 21 | 22 | LOG(INFO) << "clustering " << n << " points in " << dim 23 | << " dimensions into " << k << " clusters."; 24 | 25 | faiss::IndexFlatIP index(dim); 26 | faiss::ClusteringParameters cp; 27 | cp.niter = iterations; 28 | cp.nredo = 1; 29 | cp.verbose = true; 30 | faiss::Clustering clus(dim, k, cp); 31 | 32 | clus.train(n, data, index); 33 | 34 | return std::vector(index.get_xb(), index.get_xb() + k * dim); 35 | } 36 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/impl/kmeans.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_KMEANS_H 2 | #define LINTDB_KMEANS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace lintdb { 10 | 11 | enum class Metric { EUCLIDEAN, INNER_PRODUCT }; 12 | // Helper function for Euclidean distance 13 | inline float euclidean_distance( 14 | gsl::span a, 15 | gsl::span b) { 16 | float sum = 0.0f; 17 | for (size_t i = 0; i < a.size(); ++i) { 18 | float diff = a[i] - b[i]; 19 | sum += diff * diff; 20 | } 21 | return std::sqrt(sum); 22 | } 23 | 24 | // Helper function for inner product 25 | inline float inner_product(gsl::span a, gsl::span b) { 26 | size_t size = a.size(); 27 | size_t i = 0; 28 | float result = 0.0f; 29 | 30 | // Use manual loop unrolling for better performance 31 | for (; i + 4 <= size; i += 4) { 32 | result += a[i] * b[i]; 33 | result += a[i + 1] * b[i + 1]; 34 | result += a[i + 2] * b[i + 2]; 35 | result += a[i + 3] * b[i + 3]; 36 | } 37 | 38 | // Process remaining elements 39 | for (; i < size; ++i) { 40 | result += a[i] * b[i]; 41 | } 42 | 43 | return result; 44 | } 45 | 46 | inline float inner_product(std::vector& a, std::vector& b) { 47 | size_t size = a.size(); 48 | size_t i = 0; 49 | float result = 0.0f; 50 | 51 | // Use manual loop unrolling for better performance 52 | for (; i + 4 <= size; i += 4) { 53 | result += a[i] * b[i]; 54 | result += a[i + 1] * b[i + 1]; 55 | result += a[i + 2] * b[i + 2]; 56 | result += a[i + 3] * b[i + 3]; 57 | } 58 | 59 | // Process remaining elements 60 | for (; i < size; ++i) { 61 | result += a[i] * b[i]; 62 | } 63 | 64 | return result; 65 | } 66 | 67 | // K-means clustering for a single sub-vector 68 | std::vector kmeans( 69 | const float* data, 70 | size_t n, 71 | size_t dim, 72 | size_t k, 73 | Metric metric, 
74 | int iterations = 100); 75 | 76 | } // namespace lintdb 77 | 78 | #endif // LINTDB_KMEANS_H 79 | -------------------------------------------------------------------------------- /lintdb/quantizers/io.cpp: -------------------------------------------------------------------------------- 1 | #include "lintdb/quantizers/io.h" 2 | #include 3 | 4 | namespace lintdb { 5 | std::unique_ptr load_quantizer( 6 | std::string path, 7 | QuantizerType type, 8 | QuantizerConfig& config) { 9 | if (type == QuantizerType::NONE) { 10 | // the file won't exist, so we check NONE first. 11 | return std::make_unique(config.dim); 12 | } 13 | 14 | if (FILE* file = fopen((path).c_str(), "r")) { 15 | fclose(file); 16 | switch (type) { 17 | case QuantizerType::NONE: 18 | return std::make_unique(config.dim); 19 | case QuantizerType::BINARIZER: 20 | return Binarizer::load(path); 21 | 22 | case QuantizerType::PRODUCT_ENCODER: 23 | return ProductEncoder::load(path, config); 24 | 25 | default: 26 | throw LintDBException("Quantizer type not valid."); 27 | } 28 | return ProductEncoder::load(path, config); 29 | } else { 30 | throw LintDBException("Quantizer not found at path: " + path); 31 | } 32 | } 33 | 34 | void save_quantizer(std::string path, Quantizer* quantizer) { 35 | if (quantizer == nullptr) { 36 | return; 37 | } 38 | 39 | switch (quantizer->get_type()) { 40 | case QuantizerType::NONE: 41 | break; 42 | case QuantizerType::BINARIZER: 43 | quantizer->save(path); 44 | break; 45 | 46 | case QuantizerType::PRODUCT_ENCODER: 47 | quantizer->save(path); 48 | break; 49 | 50 | default: 51 | throw LintDBException("Quantizer type not valid."); 52 | } 53 | } 54 | 55 | std::unique_ptr create_quantizer( 56 | QuantizerType type, 57 | QuantizerConfig& config) { 58 | switch (type) { 59 | case QuantizerType::NONE: 60 | return std::make_unique(config.dim); 61 | ; 62 | 63 | case QuantizerType::BINARIZER: 64 | return std::make_unique(config.nbits, config.dim); 65 | 66 | case QuantizerType::PRODUCT_ENCODER: 67 | return std::make_unique( 68 | config.dim, config.nbits, config.num_subquantizers); 69 | 70 | default: 71 | throw LintDBException("Quantizer type not valid."); 72 | } 73 | } 74 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/io.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_QUANTIZERS_IO_H 2 | #define LINTDB_QUANTIZERS_IO_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lintdb/exception.h" 8 | #include "lintdb/quantizers/Binarizer.h" 9 | #include "lintdb/quantizers/IdentityQuantizer.h" 10 | #include "lintdb/quantizers/ProductEncoder.h" 11 | #include "lintdb/quantizers/Quantizer.h" 12 | #include "lintdb/SearchOptions.h" 13 | 14 | namespace lintdb { 15 | std::unique_ptr load_quantizer( 16 | std::string path, 17 | QuantizerType type, 18 | QuantizerConfig& config); 19 | 20 | void save_quantizer(std::string path, Quantizer* quantizer); 21 | 22 | std::unique_ptr create_quantizer( 23 | QuantizerType type, 24 | QuantizerConfig& config); 25 | } // namespace lintdb 26 | 27 | #endif -------------------------------------------------------------------------------- /lintdb/query/DocValue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "lintdb/api.h" 5 | #include "lintdb/invlists/Iterator.h" 6 | #include "lintdb/schema/DataTypes.h" 7 | 8 | namespace lintdb { 9 | /** 10 | * DocValue is a simple struct that holds a field 
value and the field id. 11 | * 12 | * It is the job of the caller to ensure that the field is valid, because this 13 | * class has no concept of what the field should look like. 14 | */ 15 | struct DocValue { 16 | lintdb::SupportedTypes value; 17 | uint8_t field_id; 18 | DataType type; 19 | bool unread_value = 20 | false; /// ColBERT fields do not have their values decoded from the 21 | /// index. We check this flag so that 22 | /// we can throw an exception if the user tries to access the value. 23 | 24 | DocValue(SupportedTypes value, uint8_t field_id, DataType type) 25 | : value(std::move(value)), field_id(field_id), type(type) {} 26 | 27 | SupportedTypes get_value() const { 28 | if (unread_value) { 29 | throw LintDBException( 30 | "Document's value was not decoded from the index. This is likely because a ColBERT field was read"); 31 | } 32 | return value; 33 | } 34 | }; 35 | 36 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/query/KnnNearestCentroids.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/assert.h" 7 | #include "lintdb/quantizers/CoarseQuantizer.h" 8 | 9 | namespace lintdb { 10 | 11 | struct QueryTensor { 12 | const std::vector& query; 13 | size_t num_query_tokens; 14 | }; 15 | 16 | class KnnNearestCentroids { 17 | public: 18 | KnnNearestCentroids() = default; 19 | void calculate( 20 | std::vector& query, 21 | const size_t num_query_tokens, 22 | const std::shared_ptr quantizer, 23 | const size_t total_centroids_to_calculate); 24 | 25 | std::vector> get_top_centroids( 26 | const size_t k_top_centroids, /// k centroids per token to consider. 27 | const size_t n_probe /// overall number of centroids to return. 28 | ) const; 29 | 30 | inline std::vector get_distances() const { 31 | return distances; 32 | } 33 | inline std::vector get_indices() const { 34 | return coarse_idx; 35 | } 36 | 37 | inline size_t get_num_centroids() const { 38 | return num_centroids; 39 | } 40 | 41 | /// Returns the top centroid id for the idx-th token. 42 | inline idx_t get_assigned_centroid(size_t idx) const { 43 | return coarse_idx[idx * total_centroids_to_calculate]; 44 | } 45 | 46 | inline const std::vector& get_reordered_distances() const { 47 | return reordered_distances; 48 | } 49 | 50 | inline bool is_valid() const { 51 | // this works because we don't set num_centroids until we have 52 | // calculated them. 53 | return num_centroids > 0; 54 | } 55 | 56 | inline QueryTensor get_query_tensor() const { 57 | LINTDB_THROW_IF_NOT_MSG(!query.empty(), "query is empty"); 58 | return {query, num_query_tokens}; 59 | } 60 | 61 | private: 62 | std::vector query; 63 | size_t num_query_tokens; 64 | size_t num_centroids; 65 | size_t total_centroids_to_calculate; 66 | std::vector> top_centroids; 67 | std::vector distances; 68 | std::vector coarse_idx; 69 | std::vector reordered_distances; /// distances that match the 70 | /// centroid id position. 
71 | }; 72 | 73 | } // namespace lintdb 74 | -------------------------------------------------------------------------------- /lintdb/query/Query.cpp: -------------------------------------------------------------------------------- 1 | #include "Query.h" 2 | 3 | namespace lintdb { 4 | 5 | Query::Query(std::unique_ptr root) : root(std::move(root)) {} 6 | 7 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/query/Query.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "QueryNode.h" 4 | 5 | namespace lintdb { 6 | struct Query { 7 | public: 8 | Query(std::unique_ptr root); 9 | 10 | std::unique_ptr root; 11 | }; 12 | 13 | } // namespace lintdb 14 | -------------------------------------------------------------------------------- /lintdb/query/QueryContext.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "lintdb/invlists/InvertedList.h" 6 | #include "lintdb/quantizers/CoarseQuantizer.h" 7 | #include "lintdb/quantizers/Quantizer.h" 8 | #include "lintdb/query/KnnNearestCentroids.h" 9 | #include "lintdb/schema/FieldMapper.h" 10 | 11 | namespace lintdb { 12 | 13 | class QueryContext { 14 | public: 15 | const std::string colbert_context; 16 | 17 | explicit QueryContext( 18 | const uint64_t tenant, 19 | const std::string colbert_field, 20 | const std::shared_ptr invertedList, 21 | const std::shared_ptr fieldMapper, 22 | const std::unordered_map< 23 | std::string, 24 | std::shared_ptr>& coarse_quantizer_map, 25 | const std::unordered_map>& 26 | quantizer_map) 27 | : colbert_context(colbert_field), 28 | tenant(tenant), 29 | db_(invertedList), 30 | fieldMapper_(fieldMapper), 31 | coarse_quantizer_map(coarse_quantizer_map), 32 | quantizer_map(quantizer_map) {} 33 | 34 | inline std::shared_ptr getFieldMapper() const { 35 | return fieldMapper_; 36 | } 37 | 38 | inline std::shared_ptr getIndex() const { 39 | return db_; 40 | } 41 | 42 | inline uint64_t getTenant() const { 43 | return tenant; 44 | } 45 | 46 | inline std::shared_ptr getCoarseQuantizer( 47 | const std::string& field) const { 48 | return coarse_quantizer_map.at(field); 49 | } 50 | 51 | inline std::shared_ptr getQuantizer( 52 | const std::string& field) const { 53 | return quantizer_map.at(field); 54 | } 55 | 56 | inline std::shared_ptr getOrCreateNearestCentroids( 57 | const std::string& field) { 58 | if (knnNearestCentroidsMap.find(field) == 59 | knnNearestCentroidsMap.end()) { 60 | auto knnNearestCentroids = std::make_shared(); 61 | knnNearestCentroidsMap.insert( 62 | {field, std::move(knnNearestCentroids)}); 63 | } 64 | return knnNearestCentroidsMap.at(field); 65 | } 66 | 67 | inline void setNearestCentroids( 68 | const std::string& field, 69 | std::shared_ptr knnNearestCentroids) { 70 | knnNearestCentroidsMap.insert({field, knnNearestCentroids}); 71 | } 72 | 73 | private: 74 | const uint64_t tenant; 75 | const std::shared_ptr db_; 76 | const std::shared_ptr fieldMapper_; 77 | const std::unordered_map>& 78 | coarse_quantizer_map; 79 | const std::unordered_map>& 80 | quantizer_map; 81 | std::unordered_map> 82 | knnNearestCentroidsMap; 83 | }; 84 | 85 | } // namespace lintdb 86 | -------------------------------------------------------------------------------- /lintdb/query/QueryExecutor.cpp: -------------------------------------------------------------------------------- 1 | #include "QueryExecutor.h" 2 | #include 3 | 
 3 | #include <glog/logging.h>
 4 | #include "decode.h"
 5 | #include "DocIterator.h"
 6 | #include "DocValue.h"
 7 | #include "lintdb/query/KnnNearestCentroids.h"
 8 | #include "lintdb/scoring/ContextCollector.h"
 9 | #include "lintdb/scoring/ScoredDocument.h"
10 |
11 | namespace lintdb {
12 | QueryExecutor::QueryExecutor(Scorer& ranker) : ranker(ranker) {}
13 |
14 | std::vector<ScoredDocument> QueryExecutor::execute(
15 |         QueryContext& context,
16 |         const Query& query,
17 |         const size_t num_results,
18 |         const SearchOptions& opts) {
19 |     std::unique_ptr<DocIterator> doc_it = query.root->process(context, opts);
20 |
21 |     std::vector<std::pair<idx_t, std::vector<DocValue>>> documents;
22 |     for (; doc_it->is_valid(); doc_it->advance()) {
23 |         std::vector<DocValue> dvs = doc_it->fields();
24 |
25 |         documents.emplace_back(doc_it->doc_id(), dvs);
26 |     }
27 |
28 |     std::vector<ScoredDocument> results(documents.size());
29 | #pragma omp parallel for if (documents.size() > 100)
30 |     for (int i = 0; i < static_cast<int>(documents.size()); i++) {
31 |         auto doc = documents[i];
32 |         // for (auto& dv : doc.second) {
33 |         //     // ColBERT is a special case where we don't have a value to
34 |         //     // decode.
35 |         //     if (dv.unread_value) {
36 |         //         continue;
37 |         //     }
38 |         //     dv = decode_vectors(context, dv);
39 |         // }
40 |         ScoredDocument scored = doc_it->score(doc.second);
41 |         scored.doc_id = doc.first;
42 |
43 |         if (opts.expected_id != -1 && doc.first == opts.expected_id) {
44 |             LOG(INFO) << "\tscore: " << scored.score;
45 |         }
46 |
47 |         results[i] = scored;
48 |     } // end for
49 |
50 |     std::sort(results.begin(), results.end(), std::greater<>());
51 |
52 |     size_t num_to_rank = std::min(results.size(), opts.num_second_pass);
53 |
54 |     std::vector<ScoredDocument> top_results_ranked(num_to_rank);
55 |     for (size_t i = 0; i < num_to_rank; i++) {
56 |         top_results_ranked[i] = ranker.score(
57 |                 context, results[i].doc_id, results[i].values);
58 |     }
59 |
60 |     std::sort(
61 |             top_results_ranked.begin(),
62 |             top_results_ranked.end(),
63 |             std::greater<>());
64 |
65 |     // return num_results from top_results_ranked
66 |     std::vector<ScoredDocument> final_results;
67 |     for (size_t i = 0; i < num_results && i < top_results_ranked.size(); i++) {
68 |         final_results.push_back(top_results_ranked[i]);
69 |     }
70 |
71 |     return final_results;
72 | }
73 |
74 | } // namespace lintdb
--------------------------------------------------------------------------------
/lintdb/query/QueryExecutor.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 |
 3 | #include "lintdb/invlists/RocksdbInvertedList.h"
 4 | #include "lintdb/query/DocValue.h"
 5 | #include "lintdb/schema/FieldMapper.h"
 6 | #include "lintdb/scoring/Scorer.h"
 7 | #include "lintdb/SearchOptions.h"
 8 | #include "lintdb/SearchResult.h"
 9 | #include "Query.h"
10 | #include "QueryContext.h"
11 | #include "lintdb/scoring/ScoredDocument.h"
12 |
13 | namespace lintdb {
14 | /**
15 |  * QueryExecutor helps manage the execution of queries.
16 |  *
17 |  * The basic flow of retrieval is:
18 |  * 1. Optimize the query.
19 |  * 2. Translate the query into a series of document iterators.
20 |  * 3. Scan those iterators to retrieve the right documents.
21 |  * 4. Score the documents.
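 * 5. Re-rank the best candidates with the second-pass scorer.
 *
 * A sketch of driving the executor (index and context setup are assumed):
 *
 *   PlaidScorer ranker(context);
 *   QueryExecutor executor(ranker);
 *   std::vector<ScoredDocument> results =
 *           executor.execute(context, query, /*num_results=*/10, opts);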
22 | * 23 | */ 24 | class QueryExecutor { 25 | public: 26 | QueryExecutor(Scorer& ranker); 27 | 28 | std::vector execute( 29 | QueryContext& context, 30 | const Query& query, 31 | const size_t num_results, 32 | const SearchOptions& opts); 33 | 34 | private: 35 | Scorer& ranker; 36 | }; 37 | 38 | } // namespace lintdb 39 | -------------------------------------------------------------------------------- /lintdb/query/decode.cpp: -------------------------------------------------------------------------------- 1 | #include "decode.h" 2 | #include 3 | #include 4 | #include "DocValue.h" 5 | #include "lintdb/schema/DataTypes.h" 6 | 7 | namespace lintdb { 8 | DocValue decode_vectors( 9 | const lintdb::QueryContext& context, 10 | const lintdb::DocValue& doc_value) { 11 | if (doc_value.unread_value) { 12 | return doc_value; 13 | } 14 | switch (doc_value.type) { 15 | case lintdb::QUANTIZED_TENSOR: { 16 | std::string field = 17 | context.getFieldMapper()->getFieldName(doc_value.field_id); 18 | // check if field has a quantizer. 19 | if (!context.getQuantizer(field)) { 20 | return doc_value; 21 | } 22 | 23 | auto quantizer = context.getQuantizer(field); 24 | 25 | std::vector quantized = 26 | std::get(doc_value.get_value()); 27 | size_t dim = context.getFieldMapper()->getFieldDimensions( 28 | doc_value.field_id); 29 | size_t num_vectors = quantized.size() / quantizer->code_size(); 30 | 31 | std::vector tensor(num_vectors * dim, 0); 32 | quantizer->sa_decode(num_vectors, quantized.data(), tensor.data()); 33 | 34 | return {tensor, doc_value.field_id, lintdb::TENSOR}; 35 | } 36 | default: 37 | return doc_value; 38 | } 39 | } 40 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/query/decode.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DocValue.h" 4 | #include "QueryContext.h" 5 | 6 | namespace lintdb { 7 | /** 8 | * decode_vectors manages the decoding of vectors from the doc_value. All 9 | * tensors become QuantizedTensor values going into the index, and we need to 10 | * decode any tensors that have an associated quantizer. 
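 *
 * A sketch (assumes `dv` came from a doc iterator over a quantized field):
 *
 *   DocValue decoded = decode_vectors(context, dv);
 *   // decoded.type is TENSOR if the field had a quantizer; otherwise the
 *   // value is returned unchanged.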
11 | * 12 | * @param context 13 | * @param doc_value 14 | * @return 15 | */ 16 | DocValue decode_vectors(const QueryContext& context, const DocValue& doc_value); 17 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/schema/DocEncoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/invlists/PostingData.h" 7 | #include "lintdb/schema/DataTypes.h" 8 | #include "lintdb/schema/ProcessedData.h" 9 | #include "lintdb/api.h" 10 | 11 | namespace lintdb { 12 | 13 | class DocEncoder { 14 | public: 15 | static std::vector encode_inverted_data( 16 | const ProcessedData& data, 17 | size_t code_size); 18 | 19 | static PostingData encode_forward_data( 20 | const std::vector& data); 21 | 22 | static PostingData encode_context_data(const ProcessedData& data); 23 | 24 | static std::vector encode_inverted_mapping_data( 25 | const ProcessedData& data); 26 | 27 | static SupportedTypes decode_supported_types(std::string& data); 28 | 29 | static std::map decode_forward_data( 30 | std::string& data); 31 | 32 | static std::vector decode_inverted_mapping_data(std::string& data); 33 | }; 34 | 35 | } // namespace lintdb 36 | -------------------------------------------------------------------------------- /lintdb/schema/DocProcessor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/invlists/IndexWriter.h" 7 | #include "lintdb/quantizers/CoarseQuantizer.h" 8 | #include "lintdb/quantizers/Quantizer.h" 9 | #include "lintdb/schema/DataTypes.h" 10 | #include "lintdb/schema/Document.h" 11 | #include "lintdb/schema/FieldMapper.h" 12 | #include "lintdb/schema/ProcessedData.h" 13 | #include "lintdb/schema/Schema.h" 14 | 15 | namespace lintdb { 16 | 17 | class DocumentProcessor { 18 | public: 19 | DocumentProcessor( 20 | const Schema& schema, 21 | const std::unordered_map>& 22 | quantizer_map, 23 | const std::unordered_map< 24 | std::string, 25 | std::shared_ptr>& coarse_quantizer_map, 26 | const std::shared_ptr field_mapper, 27 | std::unique_ptr index_writer); 28 | void processDocument(const uint64_t tenant, const Document& document); 29 | 30 | private: 31 | static void validateField(const Field& field, const FieldValue& value); 32 | FieldValue quantizeField(const Field& field, const FieldValue& value); 33 | std::vector assignIVFCentroids( 34 | const Field& field, 35 | const FieldValue& value); 36 | 37 | Schema schema; 38 | std::unordered_map field_map; 39 | const std::shared_ptr field_mapper; 40 | // each tensor/tensor_array field has a quantizer 41 | const std::unordered_map>& 42 | quantizer_map; 43 | const std::unordered_map>& 44 | coarse_quantizer_map; 45 | 46 | std::unique_ptr index_writer; 47 | }; 48 | 49 | } // namespace lintdb 50 | -------------------------------------------------------------------------------- /lintdb/schema/Document.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "DataTypes.h" 5 | 6 | namespace lintdb { 7 | /** 8 | * Documents hold data as they are passed into the database from the user. 9 | * 10 | * Each Document must have a unique id. 
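 *
 * JSON round-trip sketch (the tensor-style FieldValue constructor used
 * elsewhere in this repo is assumed; values are illustrative):
 *
 *   Tensor embedding(128, 0.1f);
 *   Document doc(1, {FieldValue("colbert", embedding, 1)});
 *   Json::Value j = doc.toJson();
 *   Document restored = Document::fromJson(j);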
11 | */ 12 | struct Document { 13 | std::vector fields; 14 | idx_t id; /// the unique id of the document 15 | 16 | Document(idx_t id, const std::vector& fields) 17 | : fields(fields), id(id) {} 18 | 19 | Json::Value toJson() const { 20 | Json::Value root; 21 | root["id"] = static_cast(id); 22 | 23 | Json::Value fieldsArray(Json::arrayValue); 24 | for (const auto &field : fields) { 25 | fieldsArray.append(field.toJson()); 26 | } 27 | root["fields"] = fieldsArray; 28 | 29 | return root; 30 | } 31 | 32 | static Document fromJson(const Json::Value &json) { 33 | idx_t id = json["id"].asInt64(); 34 | 35 | std::vector fields; 36 | const Json::Value &fieldsArray = json["fields"]; 37 | for (const auto &fieldJson : fieldsArray) { 38 | fields.push_back(FieldValue::fromJson(fieldJson)); 39 | } 40 | 41 | return Document(id, fields); 42 | } 43 | }; 44 | 45 | } // namespace lintdb 46 | -------------------------------------------------------------------------------- /lintdb/schema/FieldMapper.cpp: -------------------------------------------------------------------------------- 1 | #include "FieldMapper.h" 2 | #include 3 | 4 | namespace lintdb { 5 | std::shared_ptr FieldMapper::fromJson(const Json::Value& json) { 6 | std::shared_ptr mapper = std::make_shared(); 7 | int highest_id = 0; 8 | for (const auto& member : json["nameToID"].getMemberNames()) { 9 | mapper->nameToID[member] = json["nameToID"][member].asInt(); 10 | if (mapper->nameToID[member] > highest_id) { 11 | highest_id = mapper->nameToID[member]; 12 | } 13 | } 14 | 15 | for (const auto& field : json["idToField"]) { 16 | mapper->idToField[field["id"].asInt()] = Field::fromJson(field); 17 | } 18 | 19 | mapper->fieldID = highest_id + 1; 20 | return mapper; 21 | } 22 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/schema/FieldMapper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/schema/DataTypes.h" 9 | #include "lintdb/schema/Schema.h" 10 | 11 | namespace lintdb { 12 | 13 | class FieldMapper { 14 | public: 15 | FieldMapper() = default; 16 | // copy constructor 17 | FieldMapper(const FieldMapper& other) { 18 | nameToID = other.nameToID; 19 | fieldID = other.fieldID; 20 | idToField = other.idToField; 21 | } 22 | 23 | // copy assignment operator 24 | // using copy and swap idiom. 
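// Taking `other` by value makes the copy; swapping into *this installs the
// new state, and the old state is released when `other` goes out of scope.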
25 |     FieldMapper& operator=(FieldMapper other) {
26 |         std::swap(nameToID, other.nameToID);
27 |         std::swap(fieldID, other.fieldID);
28 |         std::swap(idToField, other.idToField);
29 |         return *this;
30 |     }
31 |
32 |     inline void addSchema(const Schema& schema) {
33 |         for (const auto& field : schema.fields) {
34 |             addMapping(field);
35 |         }
36 |     }
37 |
38 |     inline DataType getDataType(const uint8_t field_id) const {
39 |         return idToField.at(field_id).data_type;
40 |     }
41 |
42 |     inline std::vector<FieldType> getFieldTypes(const uint8_t field_id) const {
43 |         return idToField.at(field_id).field_types;
44 |     }
45 |
46 |     inline int getFieldID(const std::string& fieldName) const {
47 |         auto it = nameToID.find(fieldName);
48 |         if (it != nameToID.end()) {
49 |             return it->second;
50 |         }
51 |         throw std::runtime_error("Field name not found: " + fieldName);
52 |     }
53 |
54 |     inline std::string getFieldName(int fieldID) const {
55 |         auto it = idToField.find(fieldID);
56 |         if (it != idToField.end()) {
57 |             return it->second.name;
58 |         }
59 |         throw std::runtime_error("Field ID not found");
60 |     }
61 |
62 |     inline size_t getFieldDimensions(int field_id) const {
63 |         return idToField.at(field_id).parameters.dimensions;
64 |     }
65 |
66 |     inline Json::Value toJson() const {
67 |         Json::Value json;
68 |         for (const auto& pair : nameToID) {
69 |             json["nameToID"][pair.first] = pair.second;
70 |         }
71 |         for (const auto& pair : idToField) {
72 |             json["idToField"][pair.first] = pair.second.toJson();
73 |         }
74 |         return json;
75 |     }
76 |
77 |     static std::shared_ptr<FieldMapper> fromJson(const Json::Value& json);
78 |
79 |    private:
80 |     std::unordered_map<int, Field> idToField;
81 |     std::unordered_map<std::string, int> nameToID;
82 |     int fieldID = 0;
83 |
84 |     inline void addMapping(const Field& field) {
85 |         if (nameToID.find(field.name) != nameToID.end()) {
86 |             throw std::runtime_error(
87 |                     "Field name already exists: " + field.name);
88 |         }
89 |         nameToID[field.name] = fieldID;
90 |         idToField[fieldID] = field;
91 |
92 |         fieldID++;
93 |     }
94 | };
95 | } // namespace lintdb
--------------------------------------------------------------------------------
/lintdb/schema/ProcessedData.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 |
 3 | #include "lintdb/schema/DataTypes.h"
 4 |
 5 | namespace lintdb {
 6 | /**
 7 |  * ProcessedData is the result of inverting a single field of a document.
 8 |  *
 9 |  * Once the document processor has assigned codes to a tensor, we can invert it
10 |  * into the index.
11 | * 12 | * inverted index: 13 | * key => tenant, field, IVF centroid id, doc_id 14 | * value => codes assigned to this centroid 15 | * context index: 16 | * key => tenant, field, doc_id 17 | * value => values of the field 18 | * forward index: 19 | * key => tenant, doc_id 20 | * value => all stored data of the document 21 | */ 22 | struct ProcessedData { 23 | uint64_t tenant; 24 | uint8_t field; 25 | std::vector centroid_ids; 26 | idx_t doc_id; 27 | 28 | FieldValue value; 29 | }; 30 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/schema/Schema.cpp: -------------------------------------------------------------------------------- 1 | #include "Schema.h" 2 | 3 | namespace lintdb { 4 | Json::Value Field::toJson() const { 5 | Json::Value json; 6 | json["name"] = name; 7 | json["data_type"] = static_cast(data_type); 8 | 9 | Json::Value fieldTypesJson(Json::arrayValue); 10 | for (const auto& fieldType : field_types) { 11 | fieldTypesJson.append(static_cast(fieldType)); 12 | } 13 | json["field_types"] = fieldTypesJson; 14 | 15 | Json::Value params; 16 | params["dimensions"] = static_cast(parameters.dimensions); 17 | params["analyzer"] = parameters.analyzer; 18 | params["quantization"] = static_cast(parameters.quantization); 19 | params["num_centroids"] = static_cast(parameters.num_centroids); 20 | params["num_iterations"] = static_cast(parameters.num_iterations); 21 | params["num_subquantizers"] = static_cast(parameters.num_subquantizers); 22 | params["nbits"] = static_cast(parameters.nbits); 23 | json["parameters"] = params; 24 | 25 | return json; 26 | } 27 | 28 | Field Field::fromJson(const Json::Value& json) { 29 | Field field; 30 | field.name = json["name"].asString(); 31 | field.data_type = static_cast(json["data_type"].asInt()); 32 | 33 | const Json::Value& fieldTypesJson = json["field_types"]; 34 | for (const auto& fieldTypeJson : fieldTypesJson) { 35 | field.field_types.push_back( 36 | static_cast(fieldTypeJson.asInt())); 37 | } 38 | 39 | const Json::Value& params = json["parameters"]; 40 | field.parameters.dimensions = params["dimensions"].asUInt(); 41 | field.parameters.analyzer = params["analyzer"].asString(); 42 | field.parameters.quantization = 43 | static_cast(params["quantization"].asInt()); 44 | field.parameters.num_centroids = params["num_centroids"].asUInt(); 45 | field.parameters.num_iterations = params["num_iterations"].asUInt(); 46 | field.parameters.num_subquantizers = params["num_subquantizers"].asUInt(); 47 | field.parameters.nbits = params["nbits"].asUInt(); 48 | 49 | return field; 50 | } 51 | 52 | Json::Value Schema::toJson() const { 53 | Json::Value json; 54 | for (const auto& field : fields) { 55 | json["fields"].append(field.toJson()); 56 | } 57 | return json; 58 | } 59 | 60 | Schema Schema::fromJson(const Json::Value& json) { 61 | Schema schema; 62 | for (const auto& jsonField : json["fields"]) { 63 | schema.fields.push_back(Field::fromJson(jsonField)); 64 | } 65 | return schema; 66 | } 67 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/schema/Schema.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/quantizers/Quantizer.h" 9 | #include "lintdb/schema/DataTypes.h" 10 | 11 | namespace lintdb { 12 | 13 | enum class FieldType { Indexed, Context, Stored, Colbert }; 14 | 15 | struct FieldParameters { 16 | 
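/// A hypothetical ColBERT-style configuration, for illustration only (these
/// are not defaults and the quantizer name is assumed): dimensions = 128,
/// quantization = PQ, num_centroids = 16384, num_subquantizers = 16, nbits = 4.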
size_t dimensions = 0; 17 | std::string analyzer = ""; 18 | QuantizerType quantization = QuantizerType::UNKNOWN; 19 | size_t num_centroids = 0; 20 | size_t num_iterations = 10; 21 | size_t num_subquantizers = 0; // used for PQ quantizer 22 | size_t nbits = 1; // used for PQ quantizer 23 | }; 24 | 25 | /** 26 | * A Schema is made up of multiple fields. 27 | */ 28 | struct Field { 29 | std::string name; /// the name of the field 30 | DataType data_type; /// the data type. e.g. int, float, string, embedding. 31 | std::vector field_types; /// the field types. e.g. indexed or 32 | /// stored in the database. 33 | FieldParameters parameters; /// parameters for the field. 34 | 35 | Field() = default; 36 | Field(const std::string& name, 37 | const DataType data_type, 38 | const std::vector& field_types, 39 | const FieldParameters& parameters) 40 | : name(name), 41 | data_type(data_type), 42 | field_types(field_types), 43 | parameters(parameters) {} 44 | 45 | Json::Value toJson() const; 46 | static Field fromJson(const Json::Value& json); 47 | 48 | void add_field_type(FieldType field_type) { 49 | field_types.push_back(field_type); 50 | } 51 | }; 52 | 53 | struct IndexedField : public Field { 54 | IndexedField( 55 | const std::string& name, 56 | const DataType data_type, 57 | const FieldParameters& parameters) 58 | : Field(name, data_type, {FieldType::Indexed}, parameters) {} 59 | }; 60 | 61 | struct ContextField : public Field { 62 | ContextField( 63 | const std::string& name, 64 | const DataType data_type, 65 | const FieldParameters& parameters) 66 | : Field(name, data_type, {FieldType::Context}, parameters) {} 67 | }; 68 | 69 | struct StoredField : public Field { 70 | StoredField( 71 | const std::string& name, 72 | const DataType data_type, 73 | const FieldParameters& parameters) 74 | : Field(name, data_type, {FieldType::Stored}, parameters) {} 75 | }; 76 | 77 | struct ColbertField : public Field { 78 | ColbertField( 79 | const std::string& name, 80 | const DataType data_type, 81 | const FieldParameters& parameters) 82 | : Field(name, data_type, {FieldType::Colbert}, parameters) {} 83 | }; 84 | 85 | /** 86 | * A schema dictates what data is stored, how it is stored, and the way we are 87 | * able to interact with the data. 88 | */ 89 | struct Schema { 90 | std::vector fields; 91 | 92 | Schema() = default; 93 | explicit Schema(const std::vector& fields) : fields(fields) {} 94 | 95 | Json::Value toJson() const; 96 | static Schema fromJson(const Json::Value& json); 97 | 98 | inline void add_field(Field& field) { 99 | fields.push_back(field); 100 | } 101 | }; 102 | 103 | } // namespace lintdb 104 | -------------------------------------------------------------------------------- /lintdb/scoring/ContextCollector.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) 2024 ${ORGANIZATION_NAME}. All rights reserved. 
3 | // 4 | 5 | #include "ContextCollector.h" 6 | 7 | namespace lintdb {} // namespace lintdb -------------------------------------------------------------------------------- /lintdb/scoring/ContextCollector.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/query/QueryContext.h" 7 | #include "lintdb/query/DocValue.h" 8 | #include "lintdb/invlists/ContextIterator.h" 9 | #include "lintdb/schema/DocEncoder.h" 10 | #include 11 | #include 12 | 13 | namespace lintdb { 14 | 15 | class ContextCollector { 16 | public: 17 | ContextCollector() = default; 18 | 19 | void add_field(const QueryContext& context, const std::string& field) { 20 | context_fields.push_back(field); 21 | 22 | uint8_t colbert_field_id = 23 | context.getFieldMapper()->getFieldID(context.colbert_context); 24 | context_field_ids.push_back(colbert_field_id); 25 | 26 | bool is_colbert = false; 27 | auto field_types = context.getFieldMapper()->getFieldTypes(colbert_field_id); 28 | /** 29 | * This is a pretty big hack because we modify the ColBERT fields internally. A user passes in 30 | * a tensor data type, and we process it distinctly for colbert and reset it to be datatype::colbert. 31 | * 32 | * A solution is to stop modifying datatypes internally, or we could expose ColBERT 33 | * as a datatype. However, our colbert storage is meant to be internal. 34 | */ 35 | if (std::find(field_types.begin(), field_types.end(), FieldType::Colbert) != field_types.end()) { 36 | is_colbert = true; 37 | } 38 | if (!is_colbert) { 39 | context_data_types.push_back(context.getFieldMapper()->getDataType(colbert_field_id)); 40 | } else { 41 | context_data_types.push_back(DataType::COLBERT); 42 | } 43 | 44 | auto it = context.getIndex()->get_context_iterator( 45 | context.getTenant(), colbert_field_id); 46 | 47 | context_iterators.push_back(std::move(it)); 48 | } 49 | 50 | std::vector get_context_values(const idx_t doc_id) const { 51 | std::vector results; 52 | results.reserve(context_iterators.size()); 53 | 54 | for(int i=0; i < context_iterators.size(); i++) { 55 | auto it = context_iterators[i].get(); 56 | it->advance(doc_id); 57 | 58 | if(it->is_valid() && it->get_key().doc_id() == doc_id) { 59 | std::string context_str = it->get_value(); 60 | SupportedTypes colbert_context = 61 | DocEncoder::decode_supported_types(context_str); 62 | 63 | // create DocValues for the context info. 
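// The data type recorded in add_field is carried along here, so scorers can
// branch on DataType::COLBERT without consulting the field mapper again.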
64 | uint8_t colbert_field_id = context_field_ids[i]; 65 | results.emplace_back(colbert_context, colbert_field_id, context_data_types[i]); 66 | } else { 67 | LOG(WARNING) << "No context found for doc_id: " << doc_id << " field: " << context_fields[i]; 68 | } 69 | } 70 | 71 | return results; 72 | } 73 | 74 | 75 | private: 76 | std::vector context_fields; 77 | std::vector context_field_ids; 78 | std::vector context_data_types; 79 | std::vector> context_iterators; 80 | 81 | }; 82 | 83 | } // namespace lintdb 84 | -------------------------------------------------------------------------------- /lintdb/scoring/ScoredDocument.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "lintdb/query/DocValue.h" 5 | 6 | namespace lintdb { 7 | struct ScoredDocument { 8 | double score = 0; 9 | idx_t doc_id = -1; 10 | std::vector 11 | values; /// ScoredDocument takes ownership of the values, because 12 | /// we assume we are iterating over a DocIterator and the values are only 13 | /// valid for the duration of the iteration. 14 | 15 | ScoredDocument() = default; 16 | 17 | ScoredDocument(float score, idx_t doc_id, std::vector values) 18 | : score(score), doc_id(doc_id), values(std::move(values)) {} 19 | 20 | bool operator<(const ScoredDocument& other) const { 21 | return score < other.score; 22 | } 23 | 24 | bool operator>(const ScoredDocument& other) const { 25 | return score > other.score; 26 | } 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /lintdb/scoring/Scorer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "lintdb/invlists/ContextIterator.h" 6 | #include "lintdb/invlists/InvertedList.h" 7 | #include "lintdb/quantizers/Quantizer.h" 8 | #include "lintdb/query/DocIterator.h" 9 | #include "lintdb/query/DocValue.h" 10 | #include "lintdb/query/QueryContext.h" 11 | #include "lintdb/schema/DataTypes.h" 12 | #include "lintdb/scoring/plaid.h" 13 | #include "ScoredDocument.h" 14 | 15 | namespace lintdb { 16 | 17 | /** 18 | * Scorer is an interface for scoring documents. 19 | * 20 | * Scorers will iterate over a DocIterator and score each document. 21 | * The caller of Scorer.score() will be responsible for keeping the scores in 22 | * order. 23 | * 24 | * Additionally, different scorers can retrieve different context from fast 25 | * fields. For example, ColBERT will use a context field to retrieve all 26 | * document codes during scoring. 
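 *
 * A minimal sketch of invoking a scorer (context and document setup are
 * assumed):
 *
 *   PlaidScorer scorer(context);
 *   ScoredDocument scored = scorer.score(context, doc_id, doc_values);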
27 | */ 28 | class Scorer { 29 | public: 30 | virtual ~Scorer() = default; 31 | virtual ScoredDocument score( 32 | QueryContext& context, 33 | idx_t doc_id, 34 | std::vector& fvs) const = 0; 35 | }; 36 | 37 | class PlaidScorer : public Scorer { 38 | public: 39 | explicit PlaidScorer(const QueryContext& context); 40 | ScoredDocument score( 41 | QueryContext& context, 42 | idx_t doc_id, 43 | std::vector& fvs) const override; 44 | ~PlaidScorer() override = default; 45 | 46 | }; 47 | 48 | class ColBERTScorer : public Scorer { 49 | public: 50 | explicit ColBERTScorer(const QueryContext& context); 51 | ScoredDocument score( 52 | QueryContext& context, 53 | idx_t doc_id, 54 | std::vector& fvs) const override; 55 | ~ColBERTScorer() override = default; 56 | 57 | }; 58 | 59 | // class XTRScorer: public Scorer { 60 | // double score(idx_t doc_id, std::vector& fvs) const override; 61 | // }; 62 | 63 | } // namespace lintdb 64 | -------------------------------------------------------------------------------- /lintdb/scoring/plaid.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_PLAID_H 2 | #define LINTDB_PLAID_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/api.h" 9 | 10 | namespace lintdb { 11 | /** 12 | * score_documents_by_codes aggregates a document score based on each token's 13 | * code and how well it matches the query. 14 | * 15 | * We return the list of scores for each centroid. 16 | */ 17 | float score_documents_by_codes( 18 | const gsl::span 19 | max_scores_by_centroid, // the max score per centroid across the 20 | // query tokens. 21 | const std::vector& doc_codes, 22 | const float centroid_score_threshold, 23 | const idx_t expected_id = -1); 24 | 25 | std::vector max_score_by_centroid( 26 | gsl::span coarse_idx, 27 | gsl::span distances, 28 | size_t k_per_token, 29 | size_t num_tokens, 30 | size_t num_centroids); 31 | 32 | float colbert_centroid_score( 33 | const std::vector& doc_codes, /// codes from the document. each 34 | /// token is assigned a code. 35 | const std::vector& 36 | centroid_scores, /// the score of those codes to the query. 37 | const size_t nquery_vectors, /// the number of query vectors. 38 | const size_t n_centroids, /// how many centroids there are. this may 39 | /// change based on how many scores we choose 40 | /// to calculate. 
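/// expected_id is a document id to emit extra debug logging for;
/// pass -1 to disable it.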
41 |         const idx_t expected_id);
42 |
43 | struct DocumentScore {
44 |     float score;
45 |     std::vector<float> tokens;
46 | };
47 |
48 | DocumentScore score_document_by_residuals(
49 |         const gsl::span<const float>
50 |                 query_vectors, // size: (num_query_tokens, num_dim)
51 |         const size_t num_query_tokens,
52 |         float* doc_residuals, // size: (num_doc_tokens, num_dim)
53 |         const size_t num_doc_tokens,
54 |         const size_t dim,
55 |         const idx_t doc_id,
56 |         bool normalize = true);
57 |
58 | } // namespace lintdb
59 |
60 | #endif
--------------------------------------------------------------------------------
/lintdb/scoring/scoring_methods.cpp:
--------------------------------------------------------------------------------
  1 | #include "scoring_methods.h"
  2 |
  3 | namespace lintdb {
  4 | score_t score_one(const std::vector<DocValue>& values) {
  5 |     return 1.0;
  6 | }
  7 |
  8 | score_t plaid_similarity(
  9 |         const std::vector<DocValue>& values,
 10 |         std::shared_ptr<KnnNearestCentroids> knn) {
 11 |     int colbert_idx = -1;
 12 |     for (size_t i = 0; i < values.size(); i++) {
 13 |         if (values[i].type == DataType::COLBERT) {
 14 |             colbert_idx = static_cast<int>(i);
 15 |             break;
 16 |         }
 17 |     }
 18 |
 19 |     if (colbert_idx == -1) {
 20 |         LOG(WARNING) << "plaid context field not found for doc_id";
 21 |         return 0.0;
 22 |     }
 23 |
 24 |     // rank phase 1: use the codes to score the document using the centroid
 25 |     // scores.
 26 |     auto reordered_distances = knn->get_reordered_distances();
 27 |
 28 |     // gives us a potentially quantized vector
 29 |     SupportedTypes colbert_context = values[colbert_idx].value;
 30 |     ColBERTContextData codes = std::get<ColBERTContextData>(colbert_context);
 31 |
 32 |     QueryTensor query = knn->get_query_tensor();
 33 |     float score = colbert_centroid_score(
 34 |             codes.doc_codes,
 35 |             reordered_distances,
 36 |             query.num_query_tokens,
 37 |             knn->get_num_centroids(),
 38 |             -1);
 39 |
 40 |     return score;
 41 | }
 42 |
 43 | UnaryScoringMethodFunction unary_scoring_methods[] = {
 44 |         score_one,
 45 | };
 46 |
 47 | score_t score(const UnaryScoringMethod method, const std::vector<DocValue>& values) {
 48 |     int scoring_type = static_cast<int>(method);
 49 |     return unary_scoring_methods[scoring_type](values);
 50 | }
 51 |
 52 | EmbeddingScoringMethodFunction embedding_scoring_methods[] = {
 53 |         plaid_similarity,
 54 | };
 55 |
 56 | score_t score_embeddings(
 57 |         const EmbeddingScoringMethod method,
 58 |         const std::vector<DocValue>& values,
 59 |         std::shared_ptr<KnnNearestCentroids> knn) {
 60 |     // EmbeddingScoringMethod values start at 1, so shift down by one to index
 61 |     // the function table. Indexing with the raw enum value would read past
 62 |     // the end of the array.
 63 |     int scoring_type = static_cast<int>(method) - 1;
 64 |     return embedding_scoring_methods[scoring_type](values, knn);
 65 | }
 66 |
 67 | score_t sum(const std::vector<score_t>& values) {
 68 |     score_t sum = 0;
 69 |     for (const score_t value : values) {
 70 |         sum += value;
 71 |     }
 72 |     return sum;
 73 | }
 74 |
 75 | score_t reduce(const std::vector<score_t>& values) {
 76 |     score_t product = 1;
 77 |     for (const score_t value : values) {
 78 |         product *= value;
 79 |     }
 80 |     return product;
 81 | }
 82 |
 83 | score_t max(const std::vector<score_t>& values) {
 84 |     score_t max = values[0];
 85 |     for (const score_t value : values) {
 86 |         if (value > max) {
 87 |             max = value;
 88 |         }
 89 |     }
 90 |     return max;
 91 | }
 92 |
 93 | NaryScoringMethodFunction nary_scoring_methods[] = {
 94 |         sum,
 95 |         reduce,
 96 |         max,
 97 | };
 98 |
 99 | score_t score(const NaryScoringMethod method, const std::vector<score_t>& values) {
100 |     int scoring_type = static_cast<int>(method);
101 |     return nary_scoring_methods[scoring_type](values);
102 | }
103 |
104 | } // namespace lintdb
--------------------------------------------------------------------------------
/lintdb/scoring/scoring_methods.h:
--------------------------------------------------------------------------------
1 | #pragma once
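// Scoring methods are dispatched by casting the enum to an index into a
// function table (see scoring_methods.cpp). A small illustration with
// made-up values:
//
//   std::vector<score_t> parts = {0.2, 0.5, 0.3};
//   score_t total = score(NaryScoringMethod::SUM, parts); // 1.0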
2 | 3 | #include "lintdb/schema/DataTypes.h" 4 | #include "lintdb/query/DocValue.h" 5 | #include "lintdb/query/KnnNearestCentroids.h" 6 | #include "lintdb/scoring/plaid.h" 7 | #include 8 | #include 9 | #include 10 | 11 | namespace lintdb { 12 | 13 | typedef double score_t; 14 | typedef score_t (*UnaryScoringMethodFunction)(const std::vector& values); 15 | typedef score_t (*NaryScoringMethodFunction)(const std::vector& values); 16 | typedef score_t (*EmbeddingScoringMethodFunction)(const std::vector& values, std::shared_ptr knn); 17 | 18 | score_t score_one(const std::vector& values); 19 | 20 | score_t plaid_similarity(const std::vector& values, std::shared_ptr knn); 21 | 22 | 23 | enum class UnaryScoringMethod { 24 | ONE = 0, 25 | }; 26 | 27 | score_t score(const UnaryScoringMethod method, const std::vector& values); 28 | 29 | enum class EmbeddingScoringMethod { 30 | PLAID = 1, 31 | COLBERT = 2 32 | }; 33 | 34 | score_t score_embeddings(const EmbeddingScoringMethod method, const std::vector& values, std::shared_ptr knn); 35 | 36 | score_t sum(const std::vector& values); 37 | 38 | score_t reduce(const std::vector& values); 39 | 40 | score_t max(const std::vector& values); 41 | 42 | enum class NaryScoringMethod { 43 | SUM = 0, 44 | REDUCE = 1, 45 | MAX = 2, 46 | }; 47 | score_t score(const NaryScoringMethod method, const std::vector& values); 48 | 49 | } -------------------------------------------------------------------------------- /lintdb/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LINTDB_SERVER_SRCS 2 | main.cpp 3 | controllers/v1/query_node_translator.h 4 | controllers/v1/result_translator.h 5 | ) 6 | add_executable(lintdb-server ${LINTDB_SERVER_SRCS}) 7 | 8 | find_package(Drogon CONFIG REQUIRED) 9 | target_link_libraries(lintdb-server PRIVATE Drogon::Drogon) 10 | 11 | find_package(args CONFIG REQUIRED) 12 | target_link_libraries(lintdb-server PRIVATE taywee::args) 13 | 14 | target_link_libraries(lintdb-server PRIVATE lintdb_lib) 15 | 16 | 17 | install( 18 | TARGETS lintdb-server 19 | EXPORT lintdb-targets 20 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 21 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} 22 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 23 | INCLUDES 24 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -------------------------------------------------------------------------------- /lintdb/server/api_tests.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | def test_search(): 4 | tensor = [0.1] * 128 * 32 5 | request = { 6 | "query": { 7 | "type": "TENSOR", 8 | "name": "colbert", 9 | "value": tensor, 10 | "num_tensors": 1 11 | }, 12 | "options": { 13 | "colbert_field": "colbert" 14 | }, 15 | "k": 10 16 | } 17 | 18 | resp = requests.post("http://0.0.0.0:8080/v1/Index/search/0", json=request) 19 | assert resp.status_code == 200 20 | data = resp.json() 21 | 22 | assert('results' in data), "Results not found in response" 23 | 24 | print("search test passed") 25 | 26 | def test_add(): 27 | tensor = [0.1] * 128 * 32 28 | request = { 29 | "documents": [ 30 | { 31 | "id": 50001, 32 | "fields": [ 33 | { 34 | "name": "colbert", 35 | "data_type": "TENSOR", 36 | "value": tensor, 37 | } 38 | ] 39 | } 40 | ] 41 | } 42 | 43 | resp = requests.post("http://0.0.0.0:8080/v1/Index/add/0", json=request) 44 | assert resp.status_code == 200 45 | data = resp.json() 46 | 47 | assert('ok' in data) 48 | 49 | print("add test passed") 50 | 51 | def test_update(): 52 | tensor = 
[0.2] * 128 * 32 53 | request = { 54 | "documents": [ 55 | { 56 | "id": 50001, 57 | "fields": [ 58 | { 59 | "name": "colbert", 60 | "data_type": "TENSOR", 61 | "value": tensor, 62 | } 63 | ] 64 | } 65 | ] 66 | } 67 | 68 | resp = requests.post("http://0.0.0.0:8080/v1/Index/update/0", json=request) 69 | assert resp.status_code == 200 70 | data = resp.json() 71 | 72 | assert('ok' in data) 73 | 74 | print("update test passed") 75 | 76 | def test_remove(): 77 | request = { 78 | 'ids': [50001] 79 | } 80 | resp = requests.post("http://0.0.0.0:8080/v1/Index/remove/0", json=request) 81 | assert resp.status_code == 200 82 | data = resp.json() 83 | 84 | assert('ok' in data) 85 | 86 | print("remove test passed") 87 | 88 | 89 | if __name__ == "__main__": 90 | test_search() 91 | test_add() 92 | test_update() 93 | test_remove() -------------------------------------------------------------------------------- /lintdb/server/controllers/v1/Index.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) 2024 ${ORGANIZATION_NAME}. All rights reserved. 3 | // 4 | 5 | #include "Index.h" 6 | -------------------------------------------------------------------------------- /lintdb/server/controllers/v1/query_node_translator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "lintdb/query/QueryNode.h" 4 | #include "lintdb/schema/DataTypes.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace server { 10 | class QueryNodeJsonTranslator { 11 | public: 12 | static std::unique_ptr fromJson(const Json::Value& json) { 13 | std::string type_string = json["type"].asString(); 14 | 15 | lintdb::QueryNodeType type; 16 | if(type_string == "TERM") { 17 | type = lintdb::QueryNodeType::TERM; 18 | } else if (type_string == "TENSOR") { 19 | type = lintdb::QueryNodeType::VECTOR; 20 | } else if (type_string == "AND") { 21 | type = lintdb::QueryNodeType::AND; 22 | } else { 23 | throw std::runtime_error("unknown QueryNodeType"); 24 | } 25 | 26 | switch (type) { 27 | case lintdb::QueryNodeType::TERM: { 28 | lintdb::FieldValue value = lintdb::FieldValue::fromJson(json["value"]); 29 | return std::make_unique(value); 30 | } 31 | case lintdb::QueryNodeType::VECTOR: { 32 | lintdb::Tensor value; 33 | for(auto& v : json["value"]) { 34 | value.push_back(v.asFloat()); 35 | } 36 | uint64_t num_tensors = json["num_tensors"].asUInt64(); 37 | std::string field = json["name"].asString(); 38 | lintdb::FieldValue fv = lintdb::FieldValue(field, value, size_t(num_tensors)); 39 | return std::make_unique(fv); 40 | } 41 | case lintdb::QueryNodeType::AND: { 42 | std::vector> children; 43 | for (const auto& childJson : json["children"]) { 44 | children.push_back(fromJson(childJson)); 45 | } 46 | return std::make_unique(std::move(children)); 47 | } 48 | default: 49 | throw std::runtime_error("Unknown QueryNodeType"); 50 | } 51 | } 52 | }; 53 | } -------------------------------------------------------------------------------- /lintdb/server/controllers/v1/result_translator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/SearchResult.h" 7 | #include "lintdb/schema/DataTypes.h" 8 | 9 | namespace server { 10 | class SearchResultJsonTranslator { 11 | public: 12 | static Json::Value toJson(const lintdb::SearchResult& result) { 13 | Json::Value root; 14 | root["id"] = static_cast(result.id); 15 | root["score"] = result.score; 
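// Illustrative shape of the JSON built here (values are made up):
//   { "id": 50001, "score": 0.87, "metadata": { "title": "..." } }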
16 | 17 | Json::Value metadataJson; 18 | for (const auto& [key, value] : result.metadata) { 19 | 20 | metadataJson[key] = lintdb::supportedTypeToJSON(value); 21 | } 22 | root["metadata"] = metadataJson; 23 | 24 | return root; 25 | } 26 | }; 27 | } -------------------------------------------------------------------------------- /lintdb/server/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "controllers/v1/Index.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace drogon; 8 | 9 | int main(int argc, char**argv) 10 | { 11 | args::ArgumentParser parser("LintDB Server."); 12 | args::HelpFlag help(parser, "help", "Display this help menu", {'h', "help"}); 13 | 14 | args::ValueFlag path(parser, "path", "Set the path to the database", {'p', "path"}); 15 | args::Flag read_only(parser, "read-only", "Set the database to read-only mode", {'r', "read-only"}); 16 | 17 | try 18 | { 19 | parser.ParseCLI(argc, argv); 20 | } 21 | catch (args::Help) 22 | { 23 | std::cout << parser; 24 | return 0; 25 | } 26 | catch (args::ParseError e) 27 | { 28 | std::cerr << e.what() << std::endl; 29 | std::cerr << parser; 30 | return 1; 31 | } 32 | catch (args::ValidationError e) 33 | { 34 | std::cerr << e.what() << std::endl; 35 | std::cerr << parser; 36 | return 1; 37 | } 38 | 39 | std::string p = args::get(path); 40 | auto indexController = std::make_shared(p, !!read_only); 41 | 42 | app().setLogPath("./", "lintdb-server.log") 43 | .setLogLevel(trantor::Logger::kDebug) 44 | .addListener("0.0.0.0", 8080) 45 | .setThreadNum(12) 46 | .registerController(indexController) 47 | // .enableRunAsDaemon() 48 | .run(); 49 | } -------------------------------------------------------------------------------- /lintdb/util.cpp: -------------------------------------------------------------------------------- 1 | #include "lintdb/util.h" 2 | #include 3 | #include 4 | #include 5 | #include "lintdb/api.h" 6 | #include "lintdb/exception.h" 7 | #include "lintdb/SearchOptions.h" 8 | 9 | namespace lintdb { 10 | extern "C" { 11 | // this is to keep the clang syntax checker happy 12 | #ifndef FINTEGER 13 | #define FINTEGER int 14 | #endif 15 | 16 | /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ 17 | 18 | float cblas_snrm2(FINTEGER n, const float* x, FINTEGER incx); 19 | 20 | int cblas_sscal(FINTEGER n, const float alpha, float* x, FINTEGER incx); 21 | } 22 | 23 | void normalize_vector( 24 | float* doc_residuals, 25 | const size_t num_doc_tokens, 26 | const size_t dim) { 27 | float mod = 0.0; 28 | 29 | int dim2 = dim; 30 | 31 | for (size_t i = 0; i < num_doc_tokens; i++) { 32 | mod = cblas_snrm2(dim2, doc_residuals + i * dim2, 1); 33 | if (mod == 1.0) { 34 | continue; 35 | } 36 | 37 | int dim2 = dim; 38 | float mod2 = 1.0 / mod; 39 | int incx = 1; 40 | // auto adjusted = std::max(mod, 1e-12f); 41 | cblas_sscal(dim2, mod2, doc_residuals + i * dim, incx); 42 | } 43 | } 44 | 45 | Json::Value loadJson(const std::string& path) { 46 | Json::Value root; 47 | std::ifstream in(path); 48 | Json::CharReaderBuilder readerBuilder; 49 | std::string errs; 50 | if (in.is_open()) { 51 | if (!Json::parseFromStream(readerBuilder, in, &root, &errs)) { 52 | LOG(ERROR) << "Failed to parse JSON from file: " << path 53 | << ", Error: " << errs; 54 | } 55 | in.close(); 56 | } else { 57 | LOG(ERROR) << "Unable to open file for reading: " << path; 58 | } 59 | 60 | return root; 61 | } 62 | } // namespace lintdb 
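// Usage sketch for normalize_vector (illustrative): two 2-d rows are scaled
// to unit L2 norm in place, {3, 4} -> {0.6, 0.8} and {0, 1} stays {0, 1}.
//
//   float vecs[4] = {3.f, 4.f, 0.f, 1.f};
//   lintdb::normalize_vector(vecs, /*num_doc_tokens=*/2, /*dim=*/2);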
-------------------------------------------------------------------------------- /lintdb/util.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_UTIL_H 2 | #define LINTDB_UTIL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "lintdb/SearchOptions.h" 13 | 14 | namespace lintdb { 15 | /** 16 | * Normalize vector normalizes vectors in place. 17 | * 18 | * do i need to consider simd instructions for optimizations? 19 | * https://stackoverflow.com/questions/57469359/how-to-efficiently-normalize-vector-c 20 | */ 21 | void normalize_vector( 22 | float* doc_residuals, 23 | const size_t num_doc_tokens, 24 | const size_t dim); 25 | 26 | template 27 | void product_helper( 28 | const std::vector>& pools, 29 | std::vector& result, 30 | size_t index, 31 | std::vector& current) { 32 | if (index == pools.size()) { 33 | for (const auto& elem : current) { 34 | result.push_back(elem); 35 | } 36 | return; 37 | } 38 | for (const auto& element : pools[index]) { 39 | current.push_back(element); 40 | product_helper(pools, result, index + 1, current); 41 | current.pop_back(); 42 | } 43 | } 44 | 45 | /** 46 | * product creates the cartesian product of a range of elements. Similar to 47 | * python, it enables us to repeat the input a certain amount of times. 48 | */ 49 | template 50 | std::vector product( 51 | const std::vector>& args, 52 | size_t repeat = 1) { 53 | std::vector> pools; 54 | for (const auto& arg : args) { 55 | pools.insert(pools.end(), repeat, arg); 56 | } 57 | std::vector result; 58 | std::vector current; 59 | product_helper(pools, result, 0, current); 60 | return result; 61 | } 62 | 63 | Json::Value loadJson(const std::string& path); 64 | 65 | inline std::vector subsample(const size_t total, const size_t sample) { 66 | std::mt19937 rng; 67 | std::seed_seq seed{1234}; 68 | 69 | rng.seed(seed); 70 | 71 | std::uniform_int_distribution dist(0, total - 1); 72 | std::vector indices; 73 | for (size_t i = 0; i < sample; i++) { 74 | indices.push_back(dist(rng)); 75 | } 76 | 77 | return indices; 78 | } 79 | 80 | } // namespace lintdb 81 | 82 | #endif -------------------------------------------------------------------------------- /lintdb/utils/endian.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace lintdb { 7 | template 8 | T load_bigendian(void const* bytes) { 9 | T num = 0; 10 | for (size_t i = 0; i < sizeof(T); ++i) { 11 | num |= static_cast(static_cast(bytes)[i]) 12 | << (8 * (sizeof(T) - i - 1)); 13 | } 14 | return num; 15 | } 16 | 17 | template 18 | void store_bigendian(T num, std::vector& bigEndian) { 19 | for (int i = sizeof(T) - 1; i >= 0; i--) { 20 | bigEndian.push_back((num >> (8 * i)) & 0xff); 21 | } 22 | } 23 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #define LINTDB_VERSION_STRING "0.5.1" 6 | 7 | namespace lintdb { 8 | struct Version { 9 | Version(std::string versionStr = LINTDB_VERSION_STRING) { 10 | sscanf(versionStr.c_str(), "%d.%d.%d", &major, &minor, &revision); 11 | metadata_enabled = major >= 0 && minor >= 3 && revision >= 0; 12 | } 13 | 14 | bool operator==(const Version& otherVersion) const { 15 | return major == otherVersion.major && minor == otherVersion.minor 
&& 16 | revision == otherVersion.revision; 17 | } 18 | 19 | bool operator<(const Version& otherVersion) { 20 | if (major < otherVersion.major) 21 | return true; 22 | if (minor < otherVersion.minor) 23 | return true; 24 | if (revision < otherVersion.revision) 25 | return true; 26 | return false; 27 | } 28 | 29 | bool metadata_enabled; 30 | 31 | int major, minor, revision, build; 32 | }; 33 | 34 | static const Version LINTDB_VERSION(LINTDB_VERSION_STRING); 35 | } // namespace lintdb 36 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "LintDB" 2 | site_description: "A vector database for multi vector representations and late interaction scoring." 3 | site_url: "https://deployql.github.io/LintDB/" 4 | repo_url: "https://github.com/DeployQL/lintdb" 5 | repo_name: "DeployQL/lintdb" 6 | edit_uri: "edit/main/docs/" 7 | 8 | extra: 9 | version: 10 | provider: mike 11 | 12 | theme: 13 | name: "material" 14 | logo: icon.svg 15 | features: 16 | - announce.dismiss 17 | - content.action.edit 18 | - content.action.view 19 | - content.code.annotate 20 | - content.code.copy 21 | - content.tooltips 22 | - navigation.footer 23 | - navigation.indexes 24 | - search.highlight 25 | - search.suggest 26 | - toc.follow 27 | palette: 28 | - media: "(prefers-color-scheme)" 29 | toggle: 30 | icon: material/link 31 | name: Switch to light mode 32 | - media: "(prefers-color-scheme: light)" 33 | scheme: default 34 | primary: indigo 35 | accent: indigo 36 | toggle: 37 | icon: material/toggle-switch 38 | name: Switch to dark mode 39 | - media: "(prefers-color-scheme: dark)" 40 | scheme: slate 41 | primary: indigo 42 | accent: black 43 | toggle: 44 | icon: material/toggle-switch-off 45 | name: Switch to system preference 46 | 47 | markdown_extensions: 48 | - attr_list 49 | - admonition 50 | - callouts 51 | - footnotes 52 | - pymdownx.details 53 | - pymdownx.emoji: 54 | emoji_index: !!python/name:material.extensions.emoji.twemoji 55 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 56 | - pymdownx.highlight: 57 | pygments_lang_class: true 58 | - pymdownx.magiclink 59 | - pymdownx.snippets: 60 | base_path: [ !relative $config_dir ] 61 | check_paths: true 62 | - pymdownx.superfences 63 | - pymdownx.tabbed: 64 | alternate_style: true 65 | slugify: !!python/object/apply:pymdownx.slugs.slugify 66 | kwds: 67 | case: lower 68 | - pymdownx.tasklist: 69 | custom_checkbox: true 70 | - pymdownx.tilde 71 | - toc: 72 | permalink: "¤" 73 | 74 | plugins: 75 | - search 76 | - literate-nav: 77 | nav_file: "nav.md" 78 | - mkdocstrings: 79 | handlers: 80 | python: 81 | options: 82 | find_stubs_package: true 83 | docstring_options: 84 | ignore_init_summary: true 85 | docstring_section_style: list 86 | filters: [ "!^_" ] 87 | heading_level: 2 88 | inherited_members: true 89 | merge_init_into_class: true 90 | parameter_headings: true 91 | separate_signature: true 92 | show_root_heading: true 93 | show_root_full_path: false 94 | show_signature: true 95 | show_signature_annotations: true 96 | show_symbol_type_heading: true 97 | show_symbol_type_toc: true 98 | signature_crossrefs: true 99 | summary: true 100 | - mike: 101 | # These fields are all optional; the defaults are as below... 
102 | alias_type: symlink 103 | redirect_template: null 104 | deploy_prefix: '' 105 | canonical_version: latest 106 | version_selector: true 107 | css_dir: css 108 | javascript_dir: js -------------------------------------------------------------------------------- /ports/bitsery/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO fraillt/bitsery 4 | REF "v${VERSION}" 5 | SHA512 26e525d799d1777e182753c6c970765be8695a557e0fef35224ab8f4629a094c04fd8d7e456da369938d74acb0ca84084f394f212ae1343fa62a27256dba971f 6 | HEAD_REF master 7 | ) 8 | 9 | vcpkg_cmake_configure( 10 | SOURCE_PATH "${SOURCE_PATH}" 11 | ) 12 | 13 | vcpkg_cmake_install() 14 | 15 | vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/${PORT}) 16 | 17 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/lib") 18 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug") 19 | 20 | file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) 21 | -------------------------------------------------------------------------------- /ports/bitsery/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bitsery", 3 | "version": "5.2.4", 4 | "description": "Header only C++ binary serialization library", 5 | "homepage": "https://github.com/fraillt/bitsery", 6 | "dependencies": [ 7 | { 8 | "name": "vcpkg-cmake", 9 | "host": true 10 | }, 11 | { 12 | "name": "vcpkg-cmake-config", 13 | "host": true 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /ports/faiss/faiss.patch: -------------------------------------------------------------------------------- 1 | diff --git a/cmake/faiss-config.cmake.in b/cmake/faiss-config.cmake.in 2 | index 43ea9d4c..a7beff69 100644 3 | --- a/cmake/faiss-config.cmake.in 4 | +++ b/cmake/faiss-config.cmake.in 5 | @@ -4,4 +4,6 @@ 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 
8 | 9 | +find_dependency(MKL REQUIRED) 10 | + 11 | include("${CMAKE_CURRENT_LIST_DIR}/faiss-targets.cmake") 12 | diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt 13 | index 1fea676c..8723be27 100644 14 | --- a/faiss/CMakeLists.txt 15 | +++ b/faiss/CMakeLists.txt 16 | @@ -269,16 +269,16 @@ target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX) 17 | 18 | find_package(MKL) 19 | if(MKL_FOUND) 20 | - target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES}) 21 | - target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES}) 22 | + target_link_libraries(faiss PRIVATE MKL::MKL) 23 | + target_link_libraries(faiss_avx2 PRIVATE MKL::MKL) 24 | else() 25 | find_package(BLAS REQUIRED) 26 | - target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES}) 27 | - target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES}) 28 | + target_link_libraries(faiss PRIVATE BLAS::BLAS) 29 | + target_link_libraries(faiss_avx2 PRIVATE BLAS::BLAS) 30 | 31 | find_package(LAPACK REQUIRED) 32 | - target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES}) 33 | - target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES}) 34 | + target_link_libraries(faiss PRIVATE lAPACK::LAPACK) 35 | + target_link_libraries(faiss_avx2 PRIVATE LAPACK::LAPACK) 36 | endif() 37 | 38 | install(TARGETS faiss 39 | -------------------------------------------------------------------------------- /ports/faiss/fix-dependencies.patch: -------------------------------------------------------------------------------- 1 | diff --git a/cmake/faiss-config.cmake.in b/cmake/faiss-config.cmake.in 2 | index 43ea9d4..437a7f8 100644 3 | --- a/cmake/faiss-config.cmake.in 4 | +++ b/cmake/faiss-config.cmake.in 5 | @@ -4,4 +4,7 @@ 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 
8 | 9 | +find_dependency(OpenMP REQUIRED) 10 | +find_dependency(BLAS REQUIRED) 11 | +find_dependency(LAPACK REQUIRED) 12 | include("${CMAKE_CURRENT_LIST_DIR}/faiss-targets.cmake") 13 | diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt 14 | index 30d573f..9af8baf 100644 15 | --- a/faiss/CMakeLists.txt 16 | +++ b/faiss/CMakeLists.txt 17 | @@ -212,17 +212,17 @@ target_link_libraries(faiss PRIVATE OpenMP::OpenMP_CXX) 18 | target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX) 19 | 20 | find_package(MKL) 21 | -if(MKL_FOUND) 22 | +if(MKL_FOUND) 23 | target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES}) 24 | target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES}) 25 | else() 26 | find_package(BLAS REQUIRED) 27 | - target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES}) 28 | - target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES}) 29 | + target_link_libraries(faiss PRIVATE BLAS::BLAS) 30 | + target_link_libraries(faiss_avx2 PRIVATE BLAS::BLAS) 31 | 32 | find_package(LAPACK REQUIRED) 33 | - target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES}) 34 | - target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES}) 35 | + target_link_libraries(faiss PRIVATE LAPACK::LAPACK) 36 | + target_link_libraries(faiss_avx2 PRIVATE LAPACK::LAPACK) 37 | endif() 38 | 39 | install(TARGETS faiss 40 | -------------------------------------------------------------------------------- /ports/faiss/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO facebookresearch/faiss 4 | # REF v1.8.0 5 | # SHA512 38d4215e3e019915d8b367ff0e8d14901b1495f6f45b835e9248276567a422b0370baab6bd887045442dd1e268b7fe7c347107162e66bb3ec6b1a53be4b2e441 6 | REF v1.7.4 7 | SHA512 9622fb989cb2e1879450c2ad257cb55d0c0c639f54f0815e4781f4e4b2ae2f01779f5c8c0738ae9a29fde7e418587e6a92e91240d36c1ca051a6228bfb777638 8 | HEAD_REF master 9 | ) 10 | 11 | vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS 12 | FEATURES 13 | gpu FAISS_ENABLE_GPU 14 | ) 15 | 16 | if ("${FAISS_ENABLE_GPU}") 17 | if (NOT VCPKG_CMAKE_SYSTEM_NAME AND NOT ENV{CUDACXX}) 18 | set(ENV{CUDACXX} "$ENV{CUDA_PATH}/bin/nvcc.exe") 19 | endif() 20 | endif() 21 | 22 | 23 | vcpkg_cmake_configure( 24 | SOURCE_PATH "${SOURCE_PATH}" 25 | OPTIONS 26 | ${FEATURE_OPTIONS} 27 | -DFAISS_ENABLE_PYTHON=OFF # Requires SWIG 28 | -DBUILD_TESTING=OFF 29 | -DCMAKE_BUILD_TYPE=Release 30 | # -DBLA_VENDOR=Intel10_64lp 31 | # -DCMAKE_TOOLCHAIN_FILE="${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake" 32 | ) 33 | 34 | # # Setup vcpkg script with CMake (note: should be placed before project() call) 35 | # set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake CACHE STRING "Vcpkg toolchain file") 36 | 37 | 38 | vcpkg_cmake_install() 39 | 40 | vcpkg_cmake_config_fixup() 41 | 42 | vcpkg_copy_pdbs() 43 | 44 | file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) 45 | 46 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") 47 | -------------------------------------------------------------------------------- /ports/faiss/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "faiss", 3 | "version": "1.7.4", 4 | "description": "Faiss is a library for efficient similarity search and clustering of dense vectors.", 5 | "homepage": "https://github.com/facebookresearch/faiss", 6 | "license": "MIT", 7 | "supports": 
"!uwp & !osx & !x86", 8 | "dependencies": [ 9 | "lapack", 10 | "openblas", 11 | { 12 | "name": "vcpkg-cmake", 13 | "host": true 14 | }, 15 | { 16 | "name": "vcpkg-cmake-config", 17 | "host": true 18 | } 19 | ], 20 | "features": { 21 | "gpu": { 22 | "description": "Whether to enable GPU support", 23 | "dependencies": [ 24 | "cuda" 25 | ] 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /ports/intel-mkl/copy-from-dmg.cmake: -------------------------------------------------------------------------------- 1 | find_program(HDIUTIL NAMES hdiutil REQUIRED) 2 | set(dmg_path "NOTFOUND" CACHE FILEPATH "Where to find the DMG") 3 | set(output_dir "output_dir" CACHE FILEPATH "Where to put the packages") 4 | 5 | if(NOT EXISTS "${dmg_path}") 6 | message(FATAL_ERROR "'dmg_path' (${dmg_path}) does not exist.") 7 | endif() 8 | if(NOT IS_DIRECTORY "${output_dir}") 9 | message(FATAL_ERROR "'output_dir' (${output_dir}) is not a directory.") 10 | endif() 11 | 12 | execute_process( 13 | COMMAND mktemp -d 14 | RESULT_VARIABLE mktemp_result 15 | OUTPUT_VARIABLE mount_point 16 | OUTPUT_STRIP_TRAILING_WHITESPACE 17 | ) 18 | if(NOT mktemp_result STREQUAL "0") 19 | message(FATAL_ERROR "mktemp -d failed: ${mktemp_result}") 20 | elseif(NOT IS_DIRECTORY "${mount_point}") 21 | message(FATAL_ERROR "'mount_point' (${mount_point}) is not a directory.") 22 | endif() 23 | 24 | execute_process( 25 | COMMAND "${HDIUTIL}" attach "${dmg_path}" -mountpoint "${mount_point}" -readonly 26 | RESULT_VARIABLE mount_result 27 | ) 28 | if(mount_result STREQUAL "0") 29 | set(dmg_packages_dir "${mount_point}/bootstrapper.app/Contents/Resources/packages") 30 | file(GLOB packages 31 | "${dmg_packages_dir}/intel.oneapi.mac.mkl.devel,*" 32 | "${dmg_packages_dir}/intel.oneapi.mac.mkl.runtime,*" 33 | "${dmg_packages_dir}/intel.oneapi.mac.mkl.product,*" 34 | "${dmg_packages_dir}/intel.oneapi.mac.openmp,*" 35 | ) 36 | # Using execute_process to avoid direct errors 37 | execute_process( 38 | COMMAND cp -R ${packages} "${output_dir}/" 39 | RESULT_VARIABLE copy_result 40 | ) 41 | endif() 42 | execute_process( 43 | COMMAND "${HDIUTIL}" detach "${mount_point}" 44 | RESULT_VARIABLE unmount_result 45 | ) 46 | 47 | if(NOT mount_result STREQUAL "0") 48 | message(FATAL_ERROR "Mounting ${dmg_path} failed: ${mount_result}") 49 | elseif(NOT copy_result STREQUAL "0") 50 | message(FATAL_ERROR "Coyping packages failed: ${copy_result}") 51 | elseif(NOT unmount_result STREQUAL "0") 52 | message(FATAL_ERROR "Unounting ${dmg_path} failed: ${unmount_result}") 53 | endif() 54 | -------------------------------------------------------------------------------- /ports/intel-mkl/usage: -------------------------------------------------------------------------------- 1 | intel-mkl provides CMake targets: 2 | 3 | find_package(MKL CONFIG REQUIRED) 4 | target_link_libraries(main PRIVATE MKL::MKL) 5 | -------------------------------------------------------------------------------- /ports/intel-mkl/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "intel-mkl", 3 | "version": "2023.0.0", 4 | "port-version": 3, 5 | "description": "Intel® Math Kernel Library (Intel® MKL) accelerates math processing routines, increases application performance, and reduces development time on Intel® processors.", 6 | "homepage": "https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html", 7 | "license": null, 8 | "supports": "(windows | linux | osx) & x64", 9 | 
"dependencies": [ 10 | { 11 | "name": "vcpkg-tool-lessmsi", 12 | "host": true, 13 | "platform": "windows" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /ports/onnxruntime/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "onnxruntime", 3 | "version": "1.17.3", 4 | "description": "onnxruntime", 5 | "homepage": "https://github.com/microsoft/onnxruntime", 6 | "license": "MIT", 7 | "supports": "(windows | linux | osx)" 8 | } 9 | -------------------------------------------------------------------------------- /ports/rocksdb/0001-fix-dependencies.patch: -------------------------------------------------------------------------------- 1 | CMakeLists.txt | 33 +++++++++++++++------------------ 2 | cmake/RocksDBConfig.cmake.in | 11 ++++++++--- 3 | 2 files changed, 23 insertions(+), 21 deletions(-) 4 | 5 | diff --git a/CMakeLists.txt b/CMakeLists.txt 6 | index 23a4014bc..045f5a36d 100644 7 | --- a/CMakeLists.txt 8 | +++ b/CMakeLists.txt 9 | @@ -87,7 +87,7 @@ endif() 10 | 11 | include(CMakeDependentOption) 12 | 13 | -if(MSVC) 14 | +if(0) 15 | option(WITH_GFLAGS "build with GFlags" OFF) 16 | option(WITH_XPRESS "build with windows built in compression" OFF) 17 | option(ROCKSDB_SKIP_THIRDPARTY "skip thirdparty.inc" OFF) 18 | @@ -136,10 +136,7 @@ else() 19 | endif() 20 | 21 | if(WITH_SNAPPY) 22 | - find_package(Snappy CONFIG) 23 | - if(NOT Snappy_FOUND) 24 | - find_package(Snappy REQUIRED) 25 | - endif() 26 | + find_package(Snappy CONFIG REQUIRED) 27 | add_definitions(-DSNAPPY) 28 | list(APPEND THIRDPARTY_LIBS Snappy::snappy) 29 | endif() 30 | @@ -163,16 +160,19 @@ else() 31 | endif() 32 | 33 | if(WITH_LZ4) 34 | - find_package(lz4 REQUIRED) 35 | + find_package(lz4 CONFIG REQUIRED) 36 | add_definitions(-DLZ4) 37 | list(APPEND THIRDPARTY_LIBS lz4::lz4) 38 | endif() 39 | 40 | if(WITH_ZSTD) 41 | - find_package(zstd REQUIRED) 42 | + find_package(zstd CONFIG REQUIRED) 43 | add_definitions(-DZSTD) 44 | - include_directories(${ZSTD_INCLUDE_DIR}) 45 | - list(APPEND THIRDPARTY_LIBS zstd::zstd) 46 | + if(TARGET zstd::libzstd_shared) 47 | + list(APPEND THIRDPARTY_LIBS zstd::libzstd_shared) 48 | + elseif(TARGET zstd::libzstd_static) 49 | + list(APPEND THIRDPARTY_LIBS zstd::libzstd_static) 50 | + endif() 51 | endif() 52 | endif() 53 | 54 | @@ -312,11 +312,10 @@ int main() { 55 | endif() 56 | 57 | if (WITH_LIBURING) 58 | - find_package(uring) 59 | - if (uring_FOUND) 60 | - add_definitions(-DROCKSDB_IOURING_PRESENT) 61 | - list(APPEND THIRDPARTY_LIBS uring::uring) 62 | - endif() 63 | + find_package(PkgConfig) 64 | + pkg_check_modules(liburing REQUIRED IMPORTED_TARGET GLOBAL liburing>=2.0) 65 | + add_definitions(-DROCKSDB_IOURING_PRESENT) 66 | + list(APPEND THIRDPARTY_LIBS PkgConfig::liburing) 67 | endif() 68 | 69 | # Reset the required flags 70 | @@ -382,9 +381,9 @@ endif() 71 | 72 | option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF) 73 | if(WITH_TBB) 74 | - find_package(TBB REQUIRED) 75 | + find_package(TBB CONFIG REQUIRED) 76 | add_definitions(-DTBB) 77 | - list(APPEND THIRDPARTY_LIBS TBB::TBB) 78 | + list(APPEND THIRDPARTY_LIBS TBB::tbb) 79 | endif() 80 | 81 | # Stall notifications eat some performance from inserts 82 | @@ -1202,8 +1201,6 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) 83 | endforeach() 84 | endforeach() 85 | 86 | - install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" COMPONENT devel DESTINATION ${package_config_destination}) 87 | - 88 | install( 89 | TARGETS 
${ROCKSDB_STATIC_LIB} 90 | EXPORT RocksDBTargets 91 | diff --git a/cmake/RocksDBConfig.cmake.in b/cmake/RocksDBConfig.cmake.in 92 | index 0bd14be11..a420d8bfe 100644 93 | --- a/cmake/RocksDBConfig.cmake.in 94 | +++ b/cmake/RocksDBConfig.cmake.in 95 | @@ -33,11 +33,11 @@ if(@WITH_BZ2@) 96 | endif() 97 | 98 | if(@WITH_LZ4@) 99 | - find_dependency(lz4) 100 | + find_dependency(lz4 CONFIG) 101 | endif() 102 | 103 | if(@WITH_ZSTD@) 104 | - find_dependency(zstd) 105 | + find_dependency(zstd CONFIG) 106 | endif() 107 | 108 | if(@WITH_NUMA@) 109 | @@ -45,7 +45,12 @@ if(@WITH_NUMA@) 110 | endif() 111 | 112 | if(@WITH_TBB@) 113 | - find_dependency(TBB) 114 | + find_dependency(TBB CONFIG) 115 | +endif() 116 | + 117 | +if(@WITH_LIBURING@) 118 | + find_dependency(PkgConfig) 119 | + pkg_check_modules(liburing REQUIRED IMPORTED_TARGET GLOBAL liburing>=2.0) 120 | endif() 121 | 122 | find_dependency(Threads) 123 | -------------------------------------------------------------------------------- /ports/rocksdb/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO facebook/rocksdb 4 | REF "v${VERSION}" 5 | SHA512 524e3e70ed2b1d2e6c61a7b401946e50473cc95684ce4efc6250062f5bc945e443e96f7907fcc3ee1ab98c71179a8b56a654383cf2c0bbe1bb20907ab1ac7523 6 | HEAD_REF main 7 | PATCHES 8 | 0001-fix-dependencies.patch 9 | ) 10 | 11 | string(COMPARE EQUAL "${VCPKG_CRT_LINKAGE}" "dynamic" WITH_MD_LIBRARY) 12 | string(COMPARE EQUAL "${VCPKG_LIBRARY_LINKAGE}" "dynamic" ROCKSDB_BUILD_SHARED) 13 | 14 | vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS 15 | FEATURES 16 | "liburing" WITH_LIBURING 17 | "snappy" WITH_SNAPPY 18 | "lz4" WITH_LZ4 19 | "zlib" WITH_ZLIB 20 | "zstd" WITH_ZSTD 21 | "bzip2" WITH_BZ2 22 | "numa" WITH_NUMA 23 | "tbb" WITH_TBB 24 | ) 25 | 26 | vcpkg_cmake_configure( 27 | SOURCE_PATH "${SOURCE_PATH}" 28 | OPTIONS 29 | -DWITH_GFLAGS=OFF 30 | -DWITH_TESTS=OFF 31 | -DWITH_BENCHMARK_TOOLS=OFF 32 | -DWITH_TOOLS=OFF 33 | -DUSE_RTTI=ON 34 | -DROCKSDB_INSTALL_ON_WINDOWS=ON 35 | -DFAIL_ON_WARNINGS=OFF 36 | -DWITH_MD_LIBRARY=${WITH_MD_LIBRARY} 37 | -DPORTABLE=1 # Minimum CPU arch to support, or 0 = current CPU, 1 = baseline CPU 38 | -DROCKSDB_BUILD_SHARED=${ROCKSDB_BUILD_SHARED} 39 | -DCMAKE_DISABLE_FIND_PACKAGE_Git=TRUE 40 | ${FEATURE_OPTIONS} 41 | OPTIONS_DEBUG 42 | -DCMAKE_DEBUG_POSTFIX=d 43 | -DWITH_RUNTIME_DEBUG=ON 44 | OPTIONS_RELEASE 45 | -DWITH_RUNTIME_DEBUG=OFF 46 | ) 47 | 48 | vcpkg_cmake_install() 49 | 50 | vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/rocksdb) 51 | 52 | vcpkg_copy_pdbs() 53 | 54 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") 55 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share") 56 | 57 | vcpkg_fixup_pkgconfig() 58 | 59 | vcpkg_install_copyright(COMMENT [[ 60 | RocksDB is dual-licensed under both the GPLv2 (found in COPYING) 61 | and Apache 2.0 License (found in LICENSE.Apache). You may select, 62 | at your option, one of the above-listed licenses. 
63 | ]] 64 | FILE_LIST 65 | "${SOURCE_PATH}/LICENSE.leveldb" 66 | "${SOURCE_PATH}/LICENSE.Apache" 67 | "${SOURCE_PATH}/COPYING" 68 | ) 69 | -------------------------------------------------------------------------------- /ports/rocksdb/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rocksdb", 3 | "version": "8.10.0", 4 | "description": "A library that provides an embeddable, persistent key-value store for fast storage", 5 | "homepage": "https://github.com/facebook/rocksdb", 6 | "license": "GPL-2.0-only OR Apache-2.0", 7 | "supports": "!uwp & !(arm & !arm64 & android)", 8 | "dependencies": [ 9 | { 10 | "name": "vcpkg-cmake", 11 | "host": true 12 | }, 13 | { 14 | "name": "vcpkg-cmake-config", 15 | "host": true 16 | } 17 | ], 18 | "default-features": [ 19 | "zlib" 20 | ], 21 | "features": { 22 | "bzip2": { 23 | "description": "build with bzip2", 24 | "dependencies": [ 25 | "bzip2" 26 | ] 27 | }, 28 | "liburing": { 29 | "description": "build with liburing", 30 | "supports": "linux", 31 | "dependencies": [ 32 | { 33 | "name": "liburing", 34 | "platform": "linux" 35 | } 36 | ] 37 | }, 38 | "lz4": { 39 | "description": "build with lz4", 40 | "dependencies": [ 41 | "lz4" 42 | ] 43 | }, 44 | "numa": { 45 | "description": "build with NUMA policy support", 46 | "supports": "linux" 47 | }, 48 | "snappy": { 49 | "description": "build with SNAPPY", 50 | "dependencies": [ 51 | "snappy" 52 | ] 53 | }, 54 | "tbb": { 55 | "description": "build with Threading Building Blocks (TBB)", 56 | "dependencies": [ 57 | "tbb" 58 | ] 59 | }, 60 | "zlib": { 61 | "description": "build with zlib", 62 | "dependencies": [ 63 | "zlib" 64 | ] 65 | }, 66 | "zstd": { 67 | "description": "build with zstd", 68 | "dependencies": [ 69 | "zstd" 70 | ] 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core >=0.4.3", "nanobind >=1.3.2"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "lintdb" 7 | version = "0.5.1" 8 | description = "Python library for LintDB, a vector database for token embeddings and late interaction." 
9 | readme = "README.md" 10 | license = {text = "Apache-2.0 License"} 11 | requires-python = ">=3.9" 12 | authors = [ 13 | { name = "Matt Barta", email = "matt@deployql.com" }, 14 | ] 15 | classifiers = [ 16 | "License :: OSI Approved :: Apache-2.0 License", 17 | ] 18 | dependencies = [ 19 | "typing_extensions", 20 | ] 21 | 22 | [project.urls] 23 | Homepage = "https://github.com/deployQL/lintdb" 24 | 25 | [tool.pytest.ini_options] 26 | #pythonpath = [ "./builds/python/lintdb/python/Release", "lintdb/python" ] 27 | testpaths = [ 28 | "lintdb/python/tests" 29 | ] 30 | 31 | [tool.scikit-build] 32 | # Protect the configuration against future changes in scikit-build-core 33 | minimum-version = "0.4" 34 | 35 | # Setuptools-style build caching in a local directory 36 | build-dir = "builds/{wheel_tag}" 37 | 38 | # Build stable ABI wheels for CPython 3.12+ 39 | #wheel.py-api = "cp310" 40 | 41 | cmake.build-type = "Release" 42 | 43 | sdist.cmake = true 44 | wheel.packages = ["lintdb/python"] 45 | cmake.targets = ['lintdb_lib', 'core'] 46 | 47 | [tool.scikit-build.cmake.define] 48 | # to use a shared library, we need to jump through some hoops for python: https://github.com/scikit-build/scikit-build/issues/272 49 | # SO link: https://stackoverflow.com/questions/70044257/packaging-executable-shared-library-and-python-bindings-not-finding-library 50 | BUILD_SHARED_LIBS = "OFF" 51 | CMAKE_CXX_COMPILER = "clang++" 52 | #CMAKE_INSTALL_LIBDIR = "lib" 53 | CMAKE_VERBOSE_MAKEFILE = "OFF" 54 | ENABLE_PYTHON = "ON" 55 | BUILD_TESTING = "OFF" 56 | #BLA_VENDOR = "Intel10_64lp" -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LINT_DB_TESTS 2 | util.h 3 | coarse_quantizer_test.cpp 4 | doc_iterator_test.cpp 5 | index_test.cpp 6 | mocks.h 7 | keys_test.cpp 8 | doc_encoder_test.cpp 9 | colbert_test.cpp 10 | plaid_test.cpp 11 | binarizer_test.cpp 12 | inverted_list_test.cpp 13 | doc_processor_test.cpp 14 | product_quantizer_test.cpp) 15 | 16 | add_executable(lintdb-tests ${LINT_DB_TESTS}) 17 | 18 | target_link_libraries(lintdb-tests PRIVATE lintdb_lib) 19 | 20 | find_package(Bitsery CONFIG REQUIRED) 21 | target_link_libraries(lintdb-tests PRIVATE Bitsery::bitsery) 22 | 23 | enable_testing() 24 | 25 | find_package(GTest CONFIG REQUIRED) 26 | 27 | include(FetchContent) 28 | set(BUILD_GMOCK CACHE BOOL OFF) 29 | set(INSTALL_GTEST CACHE BOOL OFF) 30 | FetchContent_Declare( 31 | googletest 32 | URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip 33 | ) 34 | 35 | # target_include_directories(lintdb-tests PUBLIC 36 | # $) 37 | 38 | target_link_libraries(lintdb-tests PRIVATE GTest::gtest GTest::gtest_main 39 | GTest::gmock GTest::gmock_main) 40 | 41 | include(GoogleTest) 42 | gtest_discover_tests(lintdb-tests 43 | WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/tests") 44 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/__init__.py -------------------------------------------------------------------------------- /tests/colbert_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/index.h" 7 | #include 
"lintdb/query/Query.h" 8 | #include "lintdb/query/QueryNode.h" 9 | #include "lintdb/schema/DataTypes.h" 10 | #include 11 | #include 12 | #include 13 | 14 | #define DATABASE_PATH "data/colbert_test.db" 15 | #define QUERY_EMBEDDING_PATH "data/query.txt" 16 | #define EXPECTED_RESULTS_PATH "data/colbert.ranking.tsv" 17 | 18 | using namespace std; 19 | /** 20 | * This test uses 1 query from LoTTE lifestyle and 1,000 documents. 21 | * 22 | * This is a fairly relaxed test. We ensure that the top doc ids are correct, but don't 23 | * enforce the order or score. 24 | * 25 | * We can notice scores change slightly between any given indexing run. 26 | */ 27 | TEST(ColBertTests, ScoresCorrectly) { 28 | auto index = lintdb::IndexIVF(DATABASE_PATH); 29 | 30 | // read query embeddings 31 | std::ifstream queryFile; 32 | queryFile.open(QUERY_EMBEDDING_PATH); 33 | std::string line; 34 | std::vector embeddings; 35 | 36 | while(std::getline(queryFile, line)) { 37 | std::stringstream buf(line); 38 | std::string tmp; 39 | while(getline(buf, tmp, ' ')) { 40 | float f = std::stof(tmp); 41 | embeddings.push_back(f); 42 | } 43 | } 44 | // we save a padded query, which should be 32 tokens long. 45 | ASSERT_EQ(embeddings.size(), 32 * 128); 46 | 47 | lintdb::SearchOptions searchOpts; 48 | searchOpts.k_top_centroids = 32; 49 | 50 | lintdb::FieldValue fv("colbert", embeddings, 32); 51 | std::unique_ptr root = std::make_unique(fv); 52 | lintdb::Query query(std::move(root)); 53 | 54 | std::vector results = index.search(0, query, 4, searchOpts); 55 | 56 | // print result ids and score 57 | for (auto& result : results) { 58 | std::cout << result.id << " " << result.score << std::endl; 59 | } 60 | 61 | ifstream dataFile; 62 | dataFile.open(EXPECTED_RESULTS_PATH); 63 | 64 | // read each line. 65 | std::unordered_set doc_ids; 66 | int count = 0; 67 | while(!dataFile.eof() && count < 4) { 68 | std::string str; 69 | std::getline( dataFile, str); 70 | std::stringstream buffer(str); 71 | std::string tmp; 72 | 73 | // read each column 74 | int doc_id; 75 | float doc_score; 76 | int ranking = 0; 77 | 78 | int i = 0; 79 | while( getline( buffer, tmp, '\t') ) { 80 | if (i == 1) { 81 | // doc id 82 | doc_id = std::stoi(tmp); 83 | } 84 | if (i==2) { 85 | ranking = std::stoi(tmp); 86 | } 87 | if (i==3) { 88 | doc_score = std::stof(tmp); 89 | } 90 | i++; 91 | } 92 | doc_ids.insert(doc_id); 93 | count++; 94 | } 95 | 96 | // check if the top 10 doc ids are in the expected results. 
97 | for (auto& result : results) { 98 | ASSERT_TRUE(doc_ids.find(result.id) != doc_ids.end()) << "Doc id " << result.id << " not found in expected results"; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /tests/data/colbert.ranking.tsv: -------------------------------------------------------------------------------- 1 | 1 509 1 15.1640625 2 | 1 619 2 14.296875 3 | 1 795 3 14.2734375 4 | 1 637 4 14.046875 5 | 1 716 5 14.0078125 6 | 1 55 6 13.546875 7 | 1 311 7 13.3203125 8 | 1 513 8 12.6875 9 | 1 313 9 12.2265625 10 | 1 787 10 11.5390625 11 | 1 323 11 11.3125 12 | 1 960 12 11.28125 13 | 1 686 13 11.2421875 14 | 1 767 14 11.1640625 15 | 1 33 15 11.1171875 16 | 1 267 16 11.09375 17 | 1 629 17 11.0625 18 | 1 451 18 10.984375 19 | 1 321 19 10.9375 20 | 1 682 20 10.6953125 21 | 1 237 21 10.390625 22 | 1 644 22 10.3828125 23 | 1 683 23 10.328125 24 | 1 937 24 10.2734375 25 | 1 362 25 10.1796875 26 | 1 25 26 10.078125 27 | 1 45 27 10.078125 28 | 1 886 28 9.96875 29 | 1 727 29 9.828125 30 | 1 609 30 9.6796875 31 | 1 478 31 9.6484375 32 | 1 602 32 9.5703125 33 | 1 338 33 9.484375 34 | 1 514 34 9.40625 35 | 1 608 35 9.359375 36 | 1 390 36 9.3046875 37 | 1 822 37 9.28125 38 | 1 598 38 9.2734375 39 | 1 755 39 9.265625 40 | 1 835 40 9.1484375 41 | 1 701 41 9.140625 42 | 1 229 42 9.1328125 43 | 1 456 43 9.125 44 | 1 621 44 9.0390625 45 | 1 786 45 9.0234375 46 | 1 914 46 9.0234375 47 | 1 403 47 9.015625 48 | 1 620 48 9.0 49 | 1 24 49 8.9453125 50 | 1 42 50 8.9375 51 | 1 356 51 8.9375 52 | 1 373 52 8.9140625 53 | 1 500 53 8.8828125 54 | 1 687 54 8.8828125 55 | 1 344 55 8.84375 56 | 1 924 56 8.828125 57 | 1 988 57 8.828125 58 | 1 197 58 8.75 59 | 1 912 59 8.71875 60 | 1 925 60 8.7109375 61 | 1 378 61 8.6875 62 | 1 867 62 8.6875 63 | 1 417 63 8.65625 64 | 1 695 64 8.65625 65 | 1 601 65 8.6484375 66 | 1 436 66 8.640625 67 | 1 699 67 8.640625 68 | 1 473 68 8.609375 69 | 1 603 69 8.609375 70 | 1 111 70 8.59375 71 | 1 315 71 8.5859375 72 | 1 455 72 8.5859375 73 | 1 582 73 8.5 74 | 1 148 74 8.4921875 75 | 1 492 75 8.4921875 76 | 1 352 76 8.46875 77 | 1 384 77 8.4609375 78 | 1 814 78 8.4609375 79 | 1 249 79 8.375 80 | 1 260 80 8.3671875 81 | 1 864 81 8.359375 82 | 1 333 82 8.3359375 83 | 1 944 83 8.3203125 84 | 1 196 84 8.3125 85 | 1 604 85 8.3125 86 | 1 471 86 8.2890625 87 | 1 409 87 8.2734375 88 | 1 553 88 8.2734375 89 | 1 992 89 8.2734375 90 | 1 275 90 8.25 91 | 1 804 91 8.2265625 92 | 1 673 92 8.21875 93 | 1 948 93 8.15625 94 | 1 995 94 8.15625 95 | 1 309 95 8.1328125 96 | 1 281 96 8.09375 97 | 1 950 97 8.0546875 98 | 1 649 98 8.0234375 99 | 1 357 99 8.015625 100 | 1 73 100 8.0 101 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000008.sst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000008.sst -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000009.sst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000009.sst -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000010.sst: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000010.sst -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000011.sst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000011.sst -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000176.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000176.log -------------------------------------------------------------------------------- /tests/data/colbert_test.db/CURRENT: -------------------------------------------------------------------------------- 1 | MANIFEST-000177 2 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/IDENTITY: -------------------------------------------------------------------------------- 1 | ca27e180-c0ac-40a6-8ba1-abc5931d9ca6 -------------------------------------------------------------------------------- /tests/data/colbert_test.db/LOCK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/LOCK -------------------------------------------------------------------------------- /tests/data/colbert_test.db/MANIFEST-000177: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/MANIFEST-000177 -------------------------------------------------------------------------------- /tests/data/colbert_test.db/_field_mapper.json: -------------------------------------------------------------------------------- 1 | { 2 | "idToField" : [ 3 | { 4 | "data_type" : 0, 5 | "field_types" : [ 3 ], 6 | "name" : "colbert", 7 | "parameters" : { 8 | "analyzer" : "", 9 | "dimensions" : 128, 10 | "nbits" : 1, 11 | "num_centroids" : 32768, 12 | "num_iterations" : 10, 13 | "num_subquantizers" : 0, 14 | "quantization" : 2 15 | } 16 | } 17 | ], 18 | "nameToID" : { 19 | "colbert" : 0 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/_lintdb_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "lintdb_version" : "0.4.1" 3 | } 4 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields" : [ 3 | { 4 | "data_type" : 0, 5 | "field_types" : [ 3 ], 6 | "name" : "colbert", 7 | "parameters" : { 8 | "analyzer" : "", 9 | "dimensions" : 128, 10 | "nbits" : 1, 11 | "num_centroids" : 32768, 12 | "num_iterations" : 10, 13 | "num_subquantizers" : 0, 14 | "quantization" : 2 15 | } 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/colbert_coarse_quantizer: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/colbert_coarse_quantizer -------------------------------------------------------------------------------- /tests/doc_encoder_test.cpp: -------------------------------------------------------------------------------- 1 | #include <gtest/gtest.h> 2 | #include "lintdb/schema/DocEncoder.h" 3 | #include "bitsery/bitsery.h" 4 | #include "bitsery/adapter/buffer.h" 5 | #include "lintdb/schema/DocEncoder.h" 6 | #include "lintdb/schema/ProcessedData.h" 7 | 8 | TEST(DocEncoder, EncodeInvertedDataForTensorDataType) { 9 | lintdb::DocEncoder encoder; 10 | lintdb::ProcessedData data; 11 | data.value.data_type = lintdb::DataType::TENSOR; 12 | data.value.num_tensors = 2; 13 | data.centroid_ids = {1, 2}; 14 | data.tenant = 0; 15 | data.field = 1; 16 | data.doc_id = 1; 17 | data.value.value = lintdb::Tensor{1.0f, 2.0f, 3.0f, 4.0f}; 18 | 19 | auto result = encoder.encode_inverted_data(data, 2); 20 | 21 | EXPECT_EQ(result.size(), 2); 22 | } 23 | 24 | TEST(DocEncoder, EncodeInvertedDataForNonTensorDataType) { 25 | lintdb::DocEncoder encoder; 26 | lintdb::ProcessedData data; 27 | data.value.data_type = lintdb::DataType::INTEGER; 28 | data.value.value = 10; 29 | data.tenant = 0; 30 | data.field = 1; 31 | data.doc_id = 1; 32 | 33 | auto result = encoder.encode_inverted_data(data, 2); 34 | 35 | EXPECT_EQ(result.size(), 1); 36 | } 37 | 38 | TEST(DocEncoder, EncodeInvertedMappingData) { 39 | lintdb::DocEncoder encoder; 40 | lintdb::ProcessedData data; 41 | data.tenant = 0; 42 | data.field = 1; 43 | data.doc_id = 1; 44 | data.centroid_ids = {1, 2, 3}; 45 | 46 | auto result = encoder.encode_inverted_mapping_data(data); 47 | 48 | EXPECT_EQ(result.size(), 1); 49 | } 50 | 51 | 52 | TEST(DocEncoder, EncodeContextData) { 53 | lintdb::DocEncoder encoder; 54 | lintdb::ProcessedData data; 55 | data.tenant = 0; 56 | data.field = 1; 57 | data.doc_id = 1; 58 | data.value.value = "context"; 59 | 60 | auto result = encoder.encode_context_data(data); 61 | 62 | EXPECT_FALSE(result.key.empty()); 63 | EXPECT_FALSE(result.value.empty()); 64 | } -------------------------------------------------------------------------------- /tests/inverted_list_test.cpp: -------------------------------------------------------------------------------- 1 | #include <gtest/gtest.h> 2 | #include <gmock/gmock.h> 3 | #include <rocksdb/db.h> 4 | #include <rocksdb/options.h> 5 | #include <cassert> 6 | #include <filesystem> 7 | #include <memory> 8 | #include <string> 9 | #include <vector> 10 | #include "lintdb/cf.h" 11 | #include "lintdb/index.h" 12 | #include "lintdb/version.h" 13 | #include "util.h" 14 | #include "lintdb/invlists/KeyBuilder.h" 15 | 16 | using ::testing::Test; 17 | using ::testing::Values; 18 | 19 | class InvertedListTest : public Test { 20 | public: 21 | ~InvertedListTest() override {} 22 | void SetUp() override { 23 | version = lintdb::Version(); 24 | temp_db = create_temporary_directory(); 25 | rocksdb::Options options; 26 | options.create_if_missing = true; 27 | options.create_missing_column_families = true; 28 | 29 | auto cfs = lintdb::create_column_families(); 30 | 31 | rocksdb::DB* ptr; 32 | rocksdb::Status s = rocksdb::DB::Open( 33 | options, temp_db, cfs, &column_families, &ptr); 34 | 35 | assert(s.ok()); 36 | this->db = std::shared_ptr<rocksdb::DB>(ptr); 37 | } 38 | void TearDown() override { 39 | for (auto cf : column_families) { 40 | db->DestroyColumnFamilyHandle(cf); 41 | } 42 | std::filesystem::remove_all(temp_db); 43 | } 44 | 45 | protected: 46 | lintdb::Version version; 47 | std::filesystem::path temp_db; 48 | std::shared_ptr<rocksdb::DB> db; 49 | std::vector<rocksdb::ColumnFamilyHandle*>
column_families; 50 | }; 51 | 52 | TEST_F(InvertedListTest, StoresCodesCorrectly) { 53 | lintdb::RocksdbInvertedList invlist(db, column_families, version); 54 | 55 | 56 | auto one = lintdb::create_index_id(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 1, 555); 57 | auto two = lintdb::create_index_id(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 1, 556); 58 | auto three = lintdb::create_index_id(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 3, 555); 59 | rocksdb::WriteOptions wo; 60 | this->db->Put(wo, column_families[lintdb::kIndexColumnIndex], one, "value"); 61 | this->db->Put(wo, column_families[lintdb::kIndexColumnIndex], two, "value"); 62 | this->db->Put(wo, column_families[lintdb::kIndexColumnIndex], three, "value"); 63 | 64 | std::string prefix = lintdb::create_index_prefix(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 1); 65 | auto it1 = invlist.get_iterator(prefix); 66 | 67 | // inverted list should have 2 entries 68 | EXPECT_TRUE(it1->is_valid()); 69 | auto key = it1->get_key(); 70 | ASSERT_EQ(key.doc_id(), 555); 71 | 72 | std::string val = it1->get_value(); 73 | ASSERT_EQ(val, "value"); 74 | 75 | it1->next(); 76 | 77 | EXPECT_TRUE(it1->is_valid()); 78 | key = it1->get_key(); 79 | ASSERT_EQ(key.doc_id(), 556); 80 | 81 | val = it1->get_value(); 82 | ASSERT_EQ(val, "value"); 83 | 84 | // only two documents. 85 | it1->next(); 86 | EXPECT_FALSE(it1->is_valid()); 87 | 88 | 89 | std::string prefix_three = lintdb::create_index_prefix(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 3); 90 | auto it3 = invlist.get_iterator(prefix_three); 91 | 92 | EXPECT_TRUE(it3->is_valid()); 93 | 94 | auto key_three = it3->get_key(); 95 | ASSERT_EQ(key_three.doc_id(), 555); 96 | 97 | std::string val_three = it3->get_value(); 98 | ASSERT_EQ(val_three, "value"); 99 | 100 | // only one document. 
101 | it3->next(); 102 | EXPECT_FALSE(it3->is_valid()); 103 | 104 | } -------------------------------------------------------------------------------- /tests/keys_test.cpp: -------------------------------------------------------------------------------- 1 | #include <gtest/gtest.h> 2 | #include "lintdb/invlists/KeyBuilder.h" 3 | #include "lintdb/schema/DataTypes.h" 4 | #include <chrono> 5 | #include <iostream> 6 | 7 | class KeySerializationTests : public ::testing::Test { 8 | protected: 9 | lintdb::KeyBuilder builder; 10 | }; 11 | 12 | TEST_F(KeySerializationTests, SerializeAndDeserializeInvertedIndexKey_IntegerType) { 13 | std::string expectedKey = builder.add(static_cast<uint64_t>(1)) // tenant 14 | .add(static_cast<uint8_t>(2)) // field 15 | .add(lintdb::DataType::INTEGER) // field_type 16 | .add(static_cast<idx_t>(3)) // inverted_list 17 | .add(static_cast<idx_t>(4)) // doc_id 18 | .build(); 19 | lintdb::InvertedIndexKey key(expectedKey); 20 | ASSERT_EQ(key.field(), uint8_t(2)); 21 | idx_t actual = std::get<idx_t>(key.field_value()); 22 | ASSERT_EQ(actual, 3); 23 | ASSERT_EQ(key.doc_id(), 4); 24 | } 25 | 26 | TEST_F(KeySerializationTests, SerializeAndDeserializeInvertedIndexKey_StringType) { 27 | std::string expectedKey = lintdb::create_index_id(1, 2, lintdb::DataType::TEXT, "some value", 123); 28 | lintdb::InvertedIndexKey key(expectedKey); 29 | ASSERT_EQ(key.field(), uint8_t(2)); 30 | auto actual = std::get<std::string>(key.field_value()); 31 | ASSERT_EQ(actual, "some value"); 32 | ASSERT_EQ(key.doc_id(), 123); 33 | } 34 | 35 | TEST_F(KeySerializationTests, SerializeAndDeserializeInvertedIndexKey_DateType) { 36 | lintdb::DateTime now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()); 37 | std::string expectedKey = lintdb::create_index_id(1, 2, lintdb::DataType::DATETIME, lintdb::DateTime(now), 123); 38 | lintdb::InvertedIndexKey key(expectedKey); 39 | ASSERT_EQ(key.field(), uint8_t(2)); 40 | 41 | std::visit([](auto&& arg) { 42 | using T = std::decay_t<decltype(arg)>; 43 | if constexpr (std::is_same_v<T, lintdb::DateTime>) { 44 | // Handle DateTime 45 | std::cout << "DateTime with ms: " << arg.time_since_epoch().count() << std::endl; 46 | } else { 47 | // Handle other types 48 | std::cout << "Not a DateTime" << std::endl; 49 | } 50 | }, key.field_value()); 51 | 52 | auto actual = std::get<lintdb::DateTime>(key.field_value()); 53 | ASSERT_EQ(actual, now); 54 | ASSERT_EQ(key.doc_id(), 123); 55 | } 56 | 57 | TEST_F(KeySerializationTests, SerializeAndDeserializeContextKey) { 58 | std::string expectedKey = builder.add(static_cast<uint64_t>(1)) // tenant 59 | .add(static_cast<uint8_t>(2)) // field 60 | .add(static_cast<idx_t>(3)) // doc_id 61 | .build(); 62 | lintdb::ContextKey key(expectedKey); 63 | ASSERT_EQ(key.doc_id(), 3); 64 | } 65 | 66 | TEST_F(KeySerializationTests, SerializeAndDeserializeForwardIndexKey) { 67 | std::string expectedKey = builder.add(static_cast<uint64_t>(1)) // tenant 68 | .add(static_cast<idx_t>(2)) // doc_id 69 | .build(); 70 | lintdb::ForwardIndexKey key(expectedKey); 71 | ASSERT_EQ(key.doc_id(), 2); 72 | } -------------------------------------------------------------------------------- /tests/mocks.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_MOCKS_H 2 | #define LINTDB_MOCKS_H 3 | 4 | #include "lintdb/invlists/InvertedList.h" 5 | #include "lintdb/invlists/Iterator.h" 6 | #include "lintdb/quantizers/ProductEncoder.h" 7 | #include "lintdb/quantizers/Quantizer.h" 8 | #include "lintdb/invlists/IndexWriter.h" 9 | #include "lintdb/quantizers/CoarseQuantizer.h" 10 | #include <gmock/gmock.h> 11 | #include <gtest/gtest.h> 12 | #include <memory> 13 | #include <string> 14 | 15 | 16 | class MockIndexWriter : public
lintdb::IIndexWriter { 17 | public: 18 | MOCK_METHOD(void, write, (const lintdb::BatchPostingData& batch_posting_data), (override)); 19 | }; 20 | 21 | class MockQuantizer : public lintdb::Quantizer { 22 | public: 23 | MOCK_METHOD(void, train, (const size_t n, const float* x, const size_t dim), (override)); 24 | MOCK_METHOD(void, save, (const std::string path), (override)); 25 | MOCK_METHOD(void, sa_encode, (size_t n, const float* x, residual_t* codes), (override)); 26 | MOCK_METHOD(void, sa_decode, (size_t n, const residual_t* codes, float* x), (override)); 27 | MOCK_METHOD(size_t, code_size, (), (override)); 28 | MOCK_METHOD(size_t, get_nbits, (), (override)); 29 | MOCK_METHOD(lintdb::QuantizerType, get_type, (), (override)); 30 | }; 31 | 32 | class MockCoarseQuantizer : public lintdb::ICoarseQuantizer { 33 | public: 34 | MOCK_METHOD(void, train, (const size_t n, const float* x, size_t k, size_t num_iter), (override)); 35 | MOCK_METHOD(void, save, (const std::string& path), (override)); 36 | MOCK_METHOD(void, assign, (size_t n, const float* x, idx_t* codes), (override)); 37 | MOCK_METHOD(void, sa_decode, (size_t n, const idx_t* codes, float* x), (override)); 38 | MOCK_METHOD(void, compute_residual, (const float* vec, float* residual, idx_t centroid_id), (override)); 39 | MOCK_METHOD(void, compute_residual_n, (int n, const float* vec, float* residual, idx_t* centroid_ids), (override)); 40 | MOCK_METHOD(void, reconstruct, (idx_t centroid_id, float* embedding), (override)); 41 | MOCK_METHOD(void, search, (size_t num_query_tok, const float* data, size_t k_top_centroids, float* distances, idx_t* coarse_idx), (override)); 42 | MOCK_METHOD(void, reset, (), (override)); 43 | MOCK_METHOD(void, add, (int n, float* data), (override)); 44 | MOCK_METHOD(size_t, code_size, (), (override)); 45 | MOCK_METHOD(size_t, num_centroids, (), (override)); 46 | MOCK_METHOD(float*, get_xb, (), (override)); 47 | MOCK_METHOD(void, serialize, (const std::string& filename), (const, override)); 48 | MOCK_METHOD(bool, is_trained, (), (const, override)); 49 | 50 | }; 51 | 52 | #endif // LINTDB_MOCKS_H 53 | -------------------------------------------------------------------------------- /tests/util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <cstdint> 3 | #include <filesystem> 4 | #include <random> 5 | #include <sstream> 6 | #include <stdexcept> 7 | 8 | #include <string> 9 | 10 | inline std::filesystem::path create_temporary_directory( 11 | unsigned long long max_tries = 1000) { 12 | auto tmp_dir = std::filesystem::temp_directory_path(); 13 | unsigned long long i = 0; 14 | std::random_device dev; 15 | std::mt19937 prng(dev()); 16 | std::uniform_int_distribution<uint64_t> rand(0); 17 | std::filesystem::path path; 18 | while (true) { 19 | std::stringstream ss; 20 | ss << std::hex << rand(prng); 21 | path = tmp_dir / ss.str(); 22 | // true if the directory was created.
23 | if (std::filesystem::create_directory(path)) { 24 | break; 25 | } 26 | if (i == max_tries) { 27 | throw std::runtime_error("could not find non-existing directory"); 28 | } 29 | i++; 30 | } 31 | return path; 32 | } 33 | -------------------------------------------------------------------------------- /vcpkg-configuration.json: -------------------------------------------------------------------------------- 1 | { 2 | "overlay-ports": [ 3 | "./ports" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lintdb", 3 | "version-string": "0.5.1", 4 | "license": "MIT", 5 | "dependencies": [ 6 | "faiss", 7 | "rocksdb", 8 | "flatbuffers", 9 | "gtest", 10 | "glog", 11 | "jsoncpp", 12 | "ms-gsl", 13 | "benchmark", 14 | "intel-mkl", 15 | "openblas", 16 | "bitsery", 17 | "drogon", 18 | "args" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.5.1 --------------------------------------------------------------------------------
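A note on how the manifests above fit together (a minimal sketch, not a file from this repository): vcpkg-configuration.json registers ./ports as an overlay, so the patched ports earlier in this tree (faiss, rocksdb, intel-mkl, onnxruntime) take precedence over the default vcpkg registry for any dependency in vcpkg.json that resolves to them. Assuming a local vcpkg checkout at $VCPKG_ROOT (that variable name is an assumption, not something this repository defines), a manifest-mode configure and build would look like:

# Point CMake at the vcpkg toolchain; the dependencies listed in vcpkg.json
# are built and installed automatically, and the overlay ports in ./ports
# are picked up through vcpkg-configuration.json.
cmake -B build -S . -DCMAKE_TOOLCHAIN_FILE="$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake"
cmake --build build

The CMakePresets.json at the repository root is the other natural entry point; the sketch above shows only the bare toolchain wiring.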