├── .bumpversion.cfg ├── .clang-format ├── .github ├── ISSUE_TEMPLATE │ └── user-story.md ├── hooks │ └── pre-commit │ │ └── clang-format.hook └── workflows │ ├── build_and_test_cmake.yaml │ ├── build_and_upload_conda.yaml │ ├── build_develop_docs.yaml │ └── build_release_docs.yaml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CMakePresets.json ├── LICENSE ├── Makefile ├── README.md ├── benchmarks ├── .gitattributes ├── .gitignore ├── CMakeLists.txt ├── bench_lintdb.cpp ├── bench_lintdb.py ├── common.py ├── lotte │ ├── common.py │ ├── compare_clustering.py │ ├── debug_colbert.py │ ├── indexing_two.py │ └── main.py ├── pixi.lock ├── pixi.toml ├── poetry.lock ├── pyproject.toml ├── run_colbert.py ├── run_lintdb.py ├── vidore │ └── main.py └── xtr │ └── main.py ├── cmake ├── FindMKL.cmake └── lintdb-config.cmake.in ├── conda ├── benchmark_env.yaml ├── conda_build_config.yaml ├── environment.yaml └── lintdb │ ├── build-lib-arm64.sh │ ├── build-lib-osx.sh │ ├── build-lib.bat │ ├── build-lib.sh │ ├── build-pkg-arm64.sh │ ├── build-pkg-osx.sh │ ├── build-pkg.bat │ ├── build-pkg.sh │ └── meta.yaml ├── docker └── Dockerfile.conda.build ├── docs ├── Makefile ├── development.md ├── environment.yaml ├── examples.md ├── getting-started.md ├── icon.svg ├── index.md ├── installation.md ├── make.bat ├── nav.md ├── pyproject.toml ├── reference.md └── requirements.txt ├── icon.svg ├── lintdb ├── CMakeLists.txt ├── SearchOptions.h ├── SearchResult.h ├── api.h ├── assert.h ├── cf.h ├── constants.h ├── env.h ├── exception.h ├── index.cpp ├── index.h ├── invlists │ ├── ContextIterator.h │ ├── EncodedDocument.cpp │ ├── EncodedDocument.h │ ├── ForwardIndexIterator.cpp │ ├── ForwardIndexIterator.h │ ├── IndexWriter.cpp │ ├── IndexWriter.h │ ├── InvertedIterator.cpp │ ├── InvertedIterator.h │ ├── InvertedList.h │ ├── Iterator.h │ ├── KeyBuilder.h │ ├── PostingData.h │ ├── RocksdbForwardIndex.cpp │ ├── RocksdbForwardIndex.h │ ├── RocksdbInvertedList.cpp │ └── RocksdbInvertedList.h ├── python │ ├── CMakeLists.txt │ ├── pylintdb.cpp │ ├── tests │ │ └── test_index.py │ └── version.txt ├── quantizers │ ├── Binarizer.cpp │ ├── Binarizer.h │ ├── CoarseQuantizer.cpp │ ├── CoarseQuantizer.h │ ├── IdentityQuantizer.cpp │ ├── IdentityQuantizer.h │ ├── InvertedListScanner.cpp │ ├── InvertedListScanner.h │ ├── PQDistanceTables.cpp │ ├── PQDistanceTables.h │ ├── ProductEncoder.cpp │ ├── ProductEncoder.h │ ├── Quantizer.h │ ├── impl │ │ ├── kmeans.cpp │ │ ├── kmeans.h │ │ └── product_quantizer.h │ ├── io.cpp │ └── io.h ├── query │ ├── DocIterator.cpp │ ├── DocIterator.h │ ├── DocValue.h │ ├── KnnNearestCentroids.cpp │ ├── KnnNearestCentroids.h │ ├── Query.cpp │ ├── Query.h │ ├── QueryContext.h │ ├── QueryExecutor.cpp │ ├── QueryExecutor.h │ ├── QueryNode.cpp │ ├── QueryNode.h │ ├── decode.cpp │ └── decode.h ├── schema │ ├── DataTypes.h │ ├── DocEncoder.cpp │ ├── DocEncoder.h │ ├── DocProcessor.cpp │ ├── DocProcessor.h │ ├── Document.h │ ├── FieldMapper.cpp │ ├── FieldMapper.h │ ├── ProcessedData.h │ ├── Schema.cpp │ └── Schema.h ├── scoring │ ├── ContextCollector.cpp │ ├── ContextCollector.h │ ├── ScoredDocument.h │ ├── Scorer.cpp │ ├── Scorer.h │ ├── plaid.cpp │ ├── plaid.h │ ├── scoring_methods.cpp │ └── scoring_methods.h ├── server │ ├── CMakeLists.txt │ ├── api_tests.py │ ├── controllers │ │ └── v1 │ │ │ ├── Index.cpp │ │ │ ├── Index.h │ │ │ ├── query_node_translator.h │ │ │ └── result_translator.h │ ├── main.cpp │ └── openapi.yaml ├── util.cpp ├── util.h ├── utils │ ├── endian.h │ ├── half.h │ └── progress_bar.h └── 
version.h ├── mkdocs.yml ├── ports ├── bitsery │ ├── portfile.cmake │ └── vcpkg.json ├── faiss │ ├── faiss.patch │ ├── fix-dependencies.patch │ ├── portfile.cmake │ └── vcpkg.json ├── intel-mkl │ ├── copy-from-dmg.cmake │ ├── portfile.cmake │ ├── usage │ └── vcpkg.json ├── onnxruntime │ ├── portfile.cmake │ └── vcpkg.json └── rocksdb │ ├── 0001-fix-dependencies.patch │ ├── portfile.cmake │ └── vcpkg.json ├── pyproject.toml ├── tests ├── CMakeLists.txt ├── __init__.py ├── binarizer_test.cpp ├── coarse_quantizer_test.cpp ├── colbert_test.cpp ├── data │ ├── colbert.ranking.tsv │ ├── colbert_test.db │ │ ├── 000008.sst │ │ ├── 000009.sst │ │ ├── 000010.sst │ │ ├── 000011.sst │ │ ├── 000176.log │ │ ├── CURRENT │ │ ├── IDENTITY │ │ ├── LOCK │ │ ├── LOG │ │ ├── LOG.old.1722800135830059 │ │ ├── LOG.old.1722800216294444 │ │ ├── LOG.old.1722800296371779 │ │ ├── LOG.old.1722800437955832 │ │ ├── LOG.old.1722800493660601 │ │ ├── LOG.old.1722917400856599 │ │ ├── MANIFEST-000177 │ │ ├── OPTIONS-000175 │ │ ├── OPTIONS-000179 │ │ ├── _field_mapper.json │ │ ├── _lintdb_metadata.json │ │ ├── _schema.json │ │ ├── colbert_coarse_quantizer │ │ └── colbert_quantizer │ └── query.txt ├── doc_encoder_test.cpp ├── doc_iterator_test.cpp ├── doc_processor_test.cpp ├── index_test.cpp ├── inverted_list_test.cpp ├── keys_test.cpp ├── mocks.h ├── plaid_test.cpp ├── product_quantizer_test.cpp └── util.h ├── vcpkg-configuration.json ├── vcpkg.json └── version.txt /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.5.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:version.txt] 7 | search = {current_version} 8 | replace = {new_version} 9 | 10 | [bumpversion:file:vcpkg.json] 11 | search = "version-string": "{current_version}" 12 | replace = "version-string": "{new_version}" 13 | 14 | [bumpversion:file:lintdb/version.h] 15 | search = #define LINTDB_VERSION_STRING "{current_version}" 16 | replace = #define LINTDB_VERSION_STRING "{new_version}" 17 | 18 | [bumpversion:file:pyproject.toml] 19 | search = version = "{current_version}" 20 | replace = version = "{new_version}" 21 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AccessModifierOffset: -1 3 | AlignAfterOpenBracket: AlwaysBreak 4 | AlignConsecutiveAssignments: false 5 | AlignConsecutiveDeclarations: false 6 | AlignEscapedNewlinesLeft: true 7 | AlignOperands: false 8 | AlignTrailingComments: true 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: Empty 13 | AllowShortIfStatementsOnASingleLine: false 14 | AllowShortLoopsOnASingleLine: false 15 | AlwaysBreakAfterReturnType: None 16 | AlwaysBreakBeforeMultilineStrings: true 17 | AlwaysBreakTemplateDeclarations: true 18 | BinPackArguments: false # at some point, set this to true 19 | BinPackParameters: false # at some point, set this to true 20 | BraceWrapping: 21 | AfterClass: false 22 | AfterControlStatement: false 23 | AfterEnum: false 24 | AfterFunction: false 25 | AfterNamespace: false 26 | AfterObjCDeclaration: false 27 | AfterStruct: false 28 | AfterUnion: false 29 | BeforeCatch: false 30 | BeforeElse: false 31 | IndentBraces: false 32 | BreakBeforeBinaryOperators: None 33 | BreakBeforeBraces: Attach 34 | BreakBeforeTernaryOperators: true 35 | 
BreakConstructorInitializersBeforeComma: false
36 | BreakAfterJavaFieldAnnotations: false
37 | BreakStringLiterals: false
38 | ColumnLimit: 80
39 | CommentPragmas: '^ IWYU pragma:'
40 | CompactNamespaces: false
41 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
42 | ConstructorInitializerIndentWidth: 8
43 | ContinuationIndentWidth: 8
44 | Cpp11BracedListStyle: true
45 | DerivePointerAlignment: false
46 | DisableFormat: false
47 | ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
48 | IncludeCategories:
49 |   - Regex: '^<.*\.h(pp)?>'
50 |     Priority: 1
51 |   - Regex: '^<.*'
52 |     Priority: 2
53 |   - Regex: '.*'
54 |     Priority: 3
55 | IndentCaseLabels: true
56 | IndentWidth: 4
57 | IndentWrappedFunctionNames: false
58 | KeepEmptyLinesAtTheStartOfBlocks: false
59 | MacroBlockBegin: ''
60 | MacroBlockEnd: ''
61 | MaxEmptyLinesToKeep: 1
62 | NamespaceIndentation: None
63 | ObjCBlockIndentWidth: 4
64 | ObjCSpaceAfterProperty: false
65 | ObjCSpaceBeforeProtocolList: false
66 | PenaltyBreakBeforeFirstCallParameter: 1
67 | PenaltyBreakComment: 300
68 | PenaltyBreakFirstLessLess: 120
69 | PenaltyBreakString: 1000
70 | PenaltyExcessCharacter: 1000000
71 | PenaltyReturnTypeOnItsOwnLine: 2000000
72 | PointerAlignment: Left
73 | ReflowComments: true
74 | SortIncludes: CaseInsensitive
75 | SpaceAfterCStyleCast: false
76 | SpaceBeforeAssignmentOperators: true
77 | SpaceBeforeParens: ControlStatements
78 | SpaceInEmptyParentheses: false
79 | SpacesBeforeTrailingComments: 1
80 | SpacesInAngles: false
81 | SpacesInContainerLiterals: true
82 | SpacesInCStyleCastParentheses: false
83 | SpacesInParentheses: false
84 | SpacesInSquareBrackets: false
85 | Standard: c++17
86 | TabWidth: 4
87 | UseTab: Never
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/user-story.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: User Story
3 | about: Issue for User Stories
4 | title: ''
5 | labels: Story
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | [
11 | The user story should have a reason to exist: what do I need as the user described in the summary?
12 | This section captures any details that the summary could not convey.
13 | ]
14 | 
15 | As a [user concerned by the story]
16 | I want [goal of the story]
17 | so that [reason for the story]
18 | 
19 | 
20 | ### Acceptance Criteria
21 | 
22 | 1. [If I do A.]
23 | 1. [B should happen.]
24 | 
25 | [
26 | Also, here are a few points that need to be addressed:
27 | 
28 | 1. Constraint 1;
29 | 1. Constraint 2;
30 | 1. Constraint 3.
31 | ] 32 | 33 | 34 | ### Resources: 35 | 36 | * Mockups: [Here goes a URL to or the name of the mockup(s) in inVision]; 37 | * Testing URL: [Here goes a URL to the testing branch or IP]; 38 | * Staging URL: [Here goes a URL to the feature on staging]; 39 | 40 | 41 | ### Notes 42 | 43 | [Some complementary notes if necessary:] 44 | 45 | * > Here goes a quote from an email 46 | * Here goes whatever useful information can exist… 47 | -------------------------------------------------------------------------------- /.github/hooks/pre-commit/clang-format.hook: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | STYLE=$(git config --get hooks.clangformat.style) 4 | if [ -n "${STYLE}" ] ; then 5 | STYLEARG="-style=${STYLE}" 6 | else 7 | STYLEARG="" 8 | fi 9 | 10 | format_file() { 11 | file="${1}" 12 | if [ -f $file ]; then 13 | clang-format -i ${STYLEARG} ${1} 14 | git add ${1} 15 | fi 16 | } 17 | 18 | case "${1}" in 19 | --about ) 20 | echo "Runs clang-format on source files" 21 | ;; 22 | * ) 23 | for file in `git diff-index --cached --name-only HEAD | grep -iE '\.(cpp|cc|h|hpp)$' ` ; do 24 | format_file "${file}" 25 | done 26 | ;; 27 | esac -------------------------------------------------------------------------------- /.github/workflows/build_and_test_cmake.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Test Cmake 2 | on: 3 | - pull_request 4 | 5 | jobs: 6 | cmake-build: 7 | name: Run Cmake 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: true 11 | 12 | steps: 13 | - name: install clang 14 | run: | 15 | wget https://apt.llvm.org/llvm.sh 16 | chmod +x llvm.sh 17 | sudo ./llvm.sh 18 all 18 | - uses: actions/checkout@v3 19 | with: 20 | submodules: recursive 21 | # This is useful to avoid https://github.com/microsoft/vcpkg/issues/25349 22 | # which is caused by missing Git history on the vcpkg submodule which ports 23 | # try to access. 24 | # Do not use if not needed, since it slows down the checkout of sources. 25 | fetch-depth: 1 26 | - name: submodule init 27 | run: | 28 | git submodule update --init --recursive 29 | - uses: lukka/get-cmake@latest 30 | - name: Setup vcpkg 31 | uses: lukka/run-vcpkg@v11 32 | id: runvcpkg 33 | with: 34 | # This one is not needed, as it is the default value anyway. 35 | vcpkgDirectory: '${{ github.workspace }}/tools/vcpkg' 36 | vcpkgJsonGlob: '**/cmakepresets/vcpkg.json' 37 | 38 | - name: Prints output of run-vcpkg's action. 
39 | run: echo "root='${{ steps.runvcpkg.outputs.RUNVCPKG_VCPKG_ROOT_OUT }}', triplet='${{ steps.runvcpkg.outputs.RUNVCPKG_VCPKG_DEFAULT_TRIPLET_OUT }}' " 40 | - name: Run CMake+vcpkg+Ninja 41 | uses: lukka/run-cmake@v10 42 | id: runcmake 43 | env: 44 | CC: clang-18 45 | CXX: clang++-18 46 | CMAKE_C_COMPILER: clang-18 47 | CMAKE_CXX_COMPILER: clang++-18 48 | MKLROOT: ${{ github.workspace }}/builds/debug/vcpkg_installed/x64-linux/lib/intel64 49 | with: 50 | cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt' 51 | configurePresetAdditionalArgs: "['-DOpenMP_CXX_FLAGS=-fopenmp=libiomp5', '-DOpenMP_CXX_LIB_NAMES=libiomp5', '-DOpenMP_libiomp5_LIBRARY=${{ github.workspace }}/builds/debug/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so']" 52 | configurePreset: 'debug' 53 | buildPreset: 'debug' 54 | testPreset: 'debug' 55 | testPresetCmdString: "['lintdb-tests', '--test-dir', 'builds/debug', '--output-on-failure']" 56 | 57 | - uses: actions/setup-python@v4 58 | with: 59 | python-version: 3.10.6 60 | 61 | - name: Install Dependencies 62 | env: 63 | CC: clang-18 64 | CXX: clang++-18 65 | CMAKE_C_COMPILER: clang-18 66 | CMAKE_CXX_COMPILER: clang++-18 67 | run: | 68 | echo "CXX=${CXX}" 69 | pip install pytest numpy 70 | 71 | sudo apt-get remove clang-14 clang-15 72 | sudo rm /usr/bin/clang++ 73 | sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++ 74 | pip install . 75 | - name: Run Tests 76 | run: | 77 | pytest lintdb/python/tests 78 | 79 | -------------------------------------------------------------------------------- /.github/workflows/build_develop_docs.yaml: -------------------------------------------------------------------------------- 1 | name: Build/Publish Develop Docs 2 | on: 3 | push: 4 | branches: 5 | - main 6 | permissions: 7 | contents: write 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: install clang 13 | run: | 14 | wget https://apt.llvm.org/llvm.sh 15 | chmod +x llvm.sh 16 | sudo ./llvm.sh 18 all 17 | - uses: actions/checkout@v3 18 | with: 19 | fetch-depth: 0 20 | - name: submodule init 21 | run: | 22 | git submodule update --init --recursive 23 | - uses: lukka/get-cmake@latest 24 | - uses: actions/setup-python@v4 25 | with: 26 | python-version: 3.10.6 27 | - name: Install Dependencies 28 | env: 29 | CC: clang-18 30 | CXX: clang++-18 31 | CMAKE_C_COMPILER: clang-18 32 | CMAKE_CXX_COMPILER: clang++-18 33 | run: | 34 | pip install mkdocs-material 35 | pip install mkdocstrings[python] markdown-callouts mkdocs-literate-nav mike 36 | 37 | sudo apt-get remove clang-14 clang-15 38 | sudo rm /usr/bin/clang++ 39 | sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++ 40 | pip install . 
41 | - name: Setup Docs Deploy 42 | run: | 43 | git config --global user.name "Docs Deploy" 44 | git config --global user.email "docs.deploy@example.co.uk" 45 | - name: Build Docs Website 46 | run: mike deploy --push develop -------------------------------------------------------------------------------- /.github/workflows/build_release_docs.yaml: -------------------------------------------------------------------------------- 1 | name: Build/Publish Release Docs 2 | on: 3 | release: 4 | types: [published] 5 | permissions: 6 | contents: write 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: install clang 12 | run: | 13 | wget https://apt.llvm.org/llvm.sh 14 | chmod +x llvm.sh 15 | sudo ./llvm.sh 18 all 16 | - uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 19 | - name: submodule init 20 | run: | 21 | git submodule update --init --recursive 22 | - uses: lukka/get-cmake@latest 23 | - uses: actions/setup-python@v4 24 | with: 25 | python-version: 3.10.6 26 | - name: Install Dependencies 27 | env: 28 | CC: clang-18 29 | CXX: clang++-18 30 | CMAKE_C_COMPILER: clang-18 31 | CMAKE_CXX_COMPILER: clang++-18 32 | run: | 33 | pip install mkdocs-material 34 | pip install mkdocstrings[python] markdown-callouts mkdocs-literate-nav mike 35 | 36 | sudo apt-get remove clang-14 clang-15 37 | sudo rm /usr/bin/clang++ 38 | sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++ 39 | pip install . 40 | - name: Setup Docs Deploy 41 | run: | 42 | git config --global user.name "Docs Deploy" 43 | git config --global user.email "docs.deploy@example.co.uk" 44 | - name: Build Docs Website 45 | run: mike deploy --push --update-aliases ${{ github.event.release.tag_name }} latest -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | .idea 4 | debug/ 5 | target/ 6 | assets/ 7 | cmake-build-debug* 8 | benchmarks/local_db.index* 9 | 10 | .DS_Store 11 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 12 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 13 | Cargo.lock 14 | 15 | # These are backup files generated by rustfmt 16 | **/*.rs.bk 17 | 18 | # MSVC Windows builds of rustc generate these, which store debugging information 19 | *.pdb 20 | 21 | 22 | # Added by cargo 23 | 24 | /target 25 | 26 | # Prerequisites 27 | *.d 28 | 29 | # Compiled Object files 30 | *.slo 31 | *.lo 32 | *.o 33 | *.obj 34 | 35 | # Precompiled Headers 36 | *.gch 37 | *.pch 38 | 39 | # Compiled Dynamic libraries 40 | *.so 41 | *.dylib 42 | *.dll 43 | 44 | # Fortran module files 45 | *.mod 46 | *.smod 47 | 48 | # Compiled Static libraries 49 | *.lai 50 | *.la 51 | *.a 52 | *.lib 53 | 54 | # Executables 55 | *.exe 56 | *.out 57 | *.app 58 | 59 | vcpkg_installed 60 | build 61 | builds 62 | .vscode 63 | 64 | models 65 | tests/__pycache__ 66 | experiments 67 | build_benchmarks 68 | _build_python_ 69 | # Byte-compiled / optimized / DLL files 70 | __pycache__/ 71 | *.py[cod] 72 | *$py.class 73 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tools/vcpkg"] 2 | path = tools/vcpkg 3 | url = https://github.com/Microsoft/vcpkg.git 4 | ignore = dirty 5 | [submodule "third_party/tokenizers-cpp"] 6 | path = 
third_party/tokenizers-cpp 7 | url = https://github.com/DeployQL/tokenizers-cpp.git 8 | [submodule "third_party/nanobind"] 9 | path = third_party/nanobind 10 | url = https://github.com/wjakob/nanobind 11 | [submodule "third_party/dkm"] 12 | path = third_party/dkm 13 | url = https://github.com/genbattle/dkm.git 14 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.25) 2 | 3 | include(CMakeFindDependencyMacro) 4 | 5 | # allow faiss to build on m1 mac even though it's listed as unsupported. 6 | set(VCPKG_INSTALL_OPTIONS "--allow-unsupported") 7 | 8 | # Setup vcpkg script with CMake (note: should be placed before project() call) 9 | set(CMAKE_TOOLCHAIN_FILE 10 | ${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake 11 | CACHE STRING "Vcpkg toolchain file") 12 | 13 | file(READ "version.txt" version) 14 | 15 | project( 16 | lintdb 17 | VERSION ${version} 18 | DESCRIPTION "A multi-vector database for late interaction retrieval" 19 | LANGUAGES CXX) 20 | set(LINTDB_VERSION ${version}) 21 | 22 | include(GNUInstallDirs) 23 | 24 | set(CMAKE_CXX_STANDARD 17) 25 | 26 | set(CMAKE_CXX_FLAGS 27 | "${CMAKE_CXX_FLAGS} -std=c++17 -fPIC -O3 -D_LIBCPP_DISABLE_AVAILABILITY" 28 | ) 29 | 30 | if(MSVC OR LINUX) 31 | set(BLA_VENDOR "Intel10_64lp") 32 | else() 33 | set(BLA_VENDOR "OpenBLAS") 34 | endif() 35 | 36 | # the below is caused by github actions failing to build flatbuffers. therefore, 37 | # we set this value so that we use a higher sdk version to build it. 38 | set(CMAKE_OSX_DEPLOYMENT_TARGET 10.13) 39 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) 40 | 41 | # https://conda-forge.org/docs/maintainer/knowledge_base/#newer-c-features-with-old-sdk 42 | # if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(CMAKE_CXX_FLAGS 43 | # "${CMAKE_CXX_FLAGS} ") endif() 44 | 45 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") 46 | 47 | if(SKBUILD) 48 | message("Building with scikit-build") 49 | cmake_path(GET CMAKE_CURRENT_BINARY_DIR PARENT_PATH BUILD_PARENT_DIR) 50 | set(ENV{MKLROOT} 51 | "${BUILD_PARENT_DIR}/vcpkg_installed/x64-linux/lib/intel64") 52 | set(OpenMP_libiomp5_LIBRARY 53 | "${BUILD_PARENT_DIR}/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so" 54 | ) 55 | set(CMAKE_BUILD_TYPE Release) 56 | endif() 57 | 58 | set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) 59 | 60 | option(ENABLE_PYTHON "Build Python extension." ON) 61 | option(ENABLE_BENCHMARKS "Build benchmarks." ON) 62 | option(ENABLE_SERVER "Build the server." 
OFF)
63 | 
64 | add_subdirectory(lintdb)
65 | 
66 | if(ENABLE_PYTHON)
67 |     add_subdirectory(lintdb/python)
68 | endif()
69 | 
70 | IF(ENABLE_SERVER)
71 |     add_subdirectory(lintdb/server)
72 | endif()
73 | 
74 | include(CTest)
75 | if(BUILD_TESTING)
76 |     add_subdirectory(tests)
77 | endif()
78 | 
79 | 
80 | if(ENABLE_BENCHMARKS)
81 |     add_subdirectory(benchmarks)
82 | endif()
83 | 
--------------------------------------------------------------------------------
/benchmarks/.gitattributes:
--------------------------------------------------------------------------------
1 | # GitHub syntax highlighting
2 | pixi.lock linguist-language=YAML linguist-generated=true
3 | 
--------------------------------------------------------------------------------
/benchmarks/.gitignore:
--------------------------------------------------------------------------------
1 | # pixi environments
2 | .pixi
3 | *.egg-info
4 | 
--------------------------------------------------------------------------------
/benchmarks/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | 
2 | message(STATUS "Building lintdb benchmark")
3 | 
4 | add_executable(bench_lintdb EXCLUDE_FROM_ALL bench_lintdb.cpp)
5 | target_link_libraries(bench_lintdb PRIVATE lintdb_lib)
6 | 
7 | find_package(benchmark CONFIG REQUIRED)
8 | target_link_libraries(bench_lintdb PRIVATE benchmark::benchmark benchmark::benchmark_main)
9 | 
10 | 
--------------------------------------------------------------------------------
/benchmarks/bench_lintdb.cpp:
--------------------------------------------------------------------------------
1 | #include <benchmark/benchmark.h>
2 | #include <filesystem>
3 | #include <random>
4 | #include <sstream>
5 | #include <vector>
6 | #include "lintdb/index.h"
7 | #include "lintdb/schema/DataTypes.h"
8 | #include "lintdb/schema/Schema.h"
9 | #include "lintdb/quantizers/Quantizer.h"
10 | #include "lintdb/query/Query.h"
11 | #include "lintdb/query/QueryNode.h"
12 | 
13 | lintdb::Document create_document(size_t num_tokens, size_t dim){
14 |     std::vector<float> vector;
15 |     for (size_t j = 0; j < num_tokens; j++) {
16 |         std::vector<float> data(dim, j);
17 |         vector.insert(vector.end(), data.begin(), data.end());
18 |     }
19 |     lintdb::FieldValue fv("colbert", vector, num_tokens);
20 |     std::vector<lintdb::FieldValue> fields = {fv};
21 | 
22 |     lintdb::Document doc(0, fields );
23 |     return doc;
24 | }
25 | 
26 | inline std::filesystem::path create_temporary_directory(
27 |         unsigned long long max_tries = 1000) {
28 |     auto tmp_dir = std::filesystem::temp_directory_path();
29 |     unsigned long long i = 0;
30 |     std::random_device dev;
31 |     std::mt19937 prng(dev());
32 |     std::uniform_int_distribution<unsigned long long> rand(0);
33 |     std::filesystem::path path;
34 |     while (true) {
35 |         std::stringstream ss;
36 |         ss << std::hex << rand(prng);
37 |         path = tmp_dir / ss.str();
38 |         // true if the directory was created.
39 |         if (std::filesystem::create_directory(path)) {
40 |             break;
41 |         }
42 |         if (i == max_tries) {
43 |             throw std::runtime_error("could not find non-existing directory");
44 |         }
45 |         i++;
46 |     }
47 |     return path;
48 | }
49 | 
50 | 
51 | static void BM_lintdb_add(benchmark::State& state) {
52 |     lintdb::Schema schema;
53 | 
54 |     lintdb::Field colbert;
55 |     colbert.name = "colbert";
56 |     colbert.data_type = lintdb::DataType::TENSOR;
57 |     colbert.field_types = {lintdb::FieldType::Colbert};
58 |     lintdb::FieldParameters fp;
59 |     fp.dimensions = 128;
60 |     fp.num_centroids = 10;
61 |     fp.num_iterations = 2;
62 |     fp.quantization = lintdb::QuantizerType::BINARIZER;
63 |     fp.nbits = 1;
64 |     colbert.parameters = fp;
65 | 
66 |     schema.add_field(colbert);
67 | 
68 |     auto temp_db = create_temporary_directory();
69 | 
70 |     lintdb::Configuration config;
71 |     lintdb::IndexIVF index(
72 |             temp_db.string(), schema, config);
73 | 
74 |     std::vector<lintdb::Document> docs;
75 |     for (size_t i = 0; i < 50; i++) {
76 |         docs.push_back(create_document(120, 128));
77 |     }
78 |     index.train(docs);
79 | 
80 |     auto doc = create_document(120, 128);
81 | 
82 |     for(auto _ : state) {
83 |         index.add(0, {doc});
84 |     }
85 | }
86 | 
87 | static void BM_lintdb_search(benchmark::State& state) {
88 |     lintdb::IndexIVF index = lintdb::IndexIVF("/home/matt/deployql/LintDB/benchmarks/lintdb-lifestyle-40k");
89 | 
90 |     lintdb::FieldValue fv("colbert", std::vector<float>(1280, 1), 10);
91 |     std::unique_ptr<lintdb::QueryNode> root = std::make_unique<lintdb::VectorQueryNode>(fv);
92 |     lintdb::Query query(std::move(root));
93 | 
94 |     lintdb::SearchOptions opts;
95 |     opts.n_probe = 32;
96 |     opts.k_top_centroids = 2;
97 | 
98 |     for(auto _ : state) {
99 |         index.search(0, query, 10, opts);
100 |     }
101 | }
102 | 
103 | //BENCHMARK(BM_lintdb_add)->Unit(benchmark::kMillisecond);
104 | BENCHMARK(BM_lintdb_search)->Unit(benchmark::kMillisecond);
105 | 
106 | BENCHMARK_MAIN();
--------------------------------------------------------------------------------
/benchmarks/common.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import psutil
3 | import os
4 | 
5 | def get_memory_usage():
6 |     process = psutil.Process(os.getpid())
7 |     return process.memory_info().rss / (1024 * 1024)  # resident set size in MB; callers print this with an "MB" label
--------------------------------------------------------------------------------
/benchmarks/lotte/compare_clustering.py:
--------------------------------------------------------------------------------
1 | import lintdb
2 | from datasets import load_dataset
3 | from collections import namedtuple
4 | from colbert import Indexer, Searcher
5 | from colbert.infra import Run, RunConfig, ColBERTConfig
6 | from colbert.data import Queries, Collection
7 | import os
8 | import sys
9 | import jsonlines
10 | from collections import defaultdict
11 | from tqdm import tqdm
12 | import time
13 | import numpy as np
14 | 
15 | LoTTeDataset = namedtuple('LoTTeDataset', ['collection', 'queries', 'qids', 'dids'])
16 | 
17 | def load_lotte(dataset, split, max_id=500000):
18 |     collection_dataset = load_dataset("colbertv2/lotte_passages", dataset)
19 |     collection = [x['text'] for x in collection_dataset[split + '_collection']]
20 |     dids = [x['doc_id'] for x in collection_dataset[split + '_collection']]
21 | 
22 |     queries_dataset = load_dataset("colbertv2/lotte", dataset)
23 |     queries = [x['query'] for x in queries_dataset['search_' + split]]
24 |     qids = [x['qid'] for x in queries_dataset['search_' + split]]
25 | 
26 |     f'Loaded {len(queries)} queries and {len(collection):,} passages'
27 | 
28 |     answer_pids = [x['answers']['answer_pids'] for x in
queries_dataset['search_' + split]] 29 | filtered_queries = [q for q, apids in zip(queries, answer_pids) if any(x < max_id for x in apids)] 30 | filtered_qids = [i for i,(q, apids) in enumerate(zip(queries, answer_pids)) if any(x < max_id for x in apids)] 31 | filtered_dids = [x for x in dids if x < max_id] 32 | f'Filtered down to {len(filtered_queries)} queries' 33 | 34 | return LoTTeDataset(collection[:max_id], filtered_queries, filtered_qids, filtered_dids) 35 | 36 | def compare_clustering(experiment, lintdb_path, data): 37 | from colbert.modeling.checkpoint import Checkpoint 38 | from colbert import Searcher 39 | 40 | with Run().context(RunConfig(nranks=1, experiment=experiment)): 41 | # config = ColBERTConfig( 42 | # nbits=nbits, 43 | # kmeans_niters=4, 44 | # root=exp_path, 45 | # ) 46 | checkpoint_config = ColBERTConfig.load_from_checkpoint("colbert-ir/colbertv2.0") 47 | config = ColBERTConfig.from_existing(checkpoint_config, None) 48 | 49 | from colbert.modeling.checkpoint import Checkpoint 50 | from colbert import Searcher 51 | # checkpoint = Checkpoint("colbert-ir/colbertv2.0", config) 52 | 53 | searcher = Searcher(index=experiment, config=config, collection=data.collection) 54 | 55 | index = lintdb.IndexIVF(lintdb_path) 56 | 57 | for i in range(16384): 58 | pids, cell_lengths = searcher.ranker.ivf.lookup([i]) 59 | 60 | lintdb_pids = index.lookup_pids(i) 61 | 62 | diff = set([x.item() for x in pids]) - set(lintdb_pids) 63 | if diff: 64 | print( 65 | f"centroid {i} comparison:", 66 | f"colbert: {len(pids)}", 67 | f"lintdb: {len(lintdb_pids)}", 68 | f"difference: {len(diff)}", 69 | f"pid difference: {diff}", 70 | ) 71 | for pid_values in failures.values(): 72 | for pid in pid_values: 73 | if pid in diff: 74 | print(f"centroid {i} has a failure at pid {pid}") 75 | 76 | 77 | 78 | if __name__ == '__main__': 79 | dataset = 'lifestyle' 80 | datasplit = 'dev' 81 | 82 | experiment = 'colbert' 83 | 84 | failures = { 85 | 5: [5462], 86 | 11: [7767], 87 | 13: [4176, 4185, 5814, 4174], 88 | 15: [1925], 89 | 16: [3701, 3060, 3051, 3437], 90 | 19: [5619] 91 | } 92 | 93 | data = load_lotte(dataset, datasplit) 94 | 95 | compare_clustering(experiment, f"/tmp/py_index_bench_{experiment}", data) -------------------------------------------------------------------------------- /benchmarks/lotte/debug_colbert.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from collections import namedtuple 3 | from colbert import Indexer, Searcher 4 | from colbert.infra import Run, RunConfig, ColBERTConfig 5 | from colbert.data import Queries, Collection 6 | import os 7 | import sys 8 | import jsonlines 9 | from collections import defaultdict 10 | from tqdm import tqdm 11 | import time 12 | import numpy as np 13 | import typer 14 | import torch 15 | import random 16 | from typing import List, Annotated 17 | from common import load_lotte, lintdb_indexing, evaluate_dataset 18 | import tempfile 19 | 20 | app = typer.Typer() 21 | 22 | @app.command() 23 | def debug(): 24 | torch.set_printoptions(threshold=10_000) 25 | 26 | d = load_lotte('lifestyle', 'dev', filter=True, start=5400, stop=5500) 27 | print(f"Loaded {len(d.queries)} queries and {len(d.collection):,} passages") 28 | assert(len(d.collection) == 100) 29 | 30 | with Run().context(RunConfig(nranks=1, experiment='colbert-debug')): 31 | config = ColBERTConfig.load_from_checkpoint("colbert-ir/colbertv2.0") 32 | config.kmeans_niters=4 33 | indexer = 
Indexer(checkpoint="colbert-ir/colbertv2.0", config=config)
34 | 
35 |         doc = None
36 |         for i in range(len(d.collection)):
37 |             if d.dids[i] == 5462:
38 |                 doc = d.collection[i]
39 |                 break
40 |         if doc is None:
41 |             print("doc not found")
42 |             return
43 | 
44 |         # indexer trains, so needs a larger collection.
45 |         indexer.index(name='colbert-debug', collection=d.collection, overwrite=True)
46 |         # indexer = Indexer(checkpoint=checkpoint, config=config)
47 |         # indexer.index(name=experiment, collection=dataset.collection) # "/path/to/MSMARCO/collection.tsv"
48 |         from colbert.modeling.checkpoint import Checkpoint
49 |         from colbert import Searcher
50 |         searcher = Searcher(index='colbert-debug', config=config, collection=d.collection)
51 | 
52 |         # spot check this doc
53 | 
54 |         # doc_len = searcher.ranker.doclens[5462]
55 |         # print(f"doc len: {doc_len}")
56 | 
57 |         checkpoint = Checkpoint("colbert-ir/colbertv2.0", config)
58 |         doclens_ = checkpoint.docFromText([doc])
59 |         print(f"embedding size: {doclens_.shape}")
60 | 
61 |         embs_, doclens_ = checkpoint.docFromText([doc], bsize=1, keep_dims='flatten')
62 |         print(f"embedding size: {doclens_}")
63 | 
64 |         dddd = searcher.ranker.lookup_pids([5462-5400])
65 |         print(f"shape of searcher's doc: {dddd[0].shape}")
66 | 
67 |         return
68 | 
69 | if __name__ == "__main__":
70 |     app()
--------------------------------------------------------------------------------
/benchmarks/pixi.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "benchmarks"
3 | version = "0.1.0"
4 | description = "Add a short description here"
5 | channels = ["conda-forge"]
6 | platforms = ["linux-64"]
7 | 
8 | [tasks]
9 | 
10 | [dependencies]
11 | python = ">=3.10,<3.12"
12 | clang-17 = ">=17.0.6,<17.1"
13 | faiss = ">=1.7.4,<1.8"
14 | 
15 | [pypi-dependencies]
16 | typer = "*"
17 | colbert-ai = "*"
18 | jsonlines = "*"
19 | tqdm = "*"
20 | #lintdb = { path = "../.", editable = false }
21 | setuptools = "*"
22 | numpy = { version = "==1.26.4" }
23 | torch = "*"
24 | datasets = "*"
25 | transformers = "*"
26 | fsspec = { version = "==2023.9.2" }
27 | 
--------------------------------------------------------------------------------
/benchmarks/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "benchmarks"
3 | version = "0.1.0"
4 | description = "benchmark LintDB"
5 | authors = ["Matt Barta "]
6 | readme = "README.md"
7 | 
8 | [tool.poetry.dependencies]
9 | python = "^3.9"
10 | typer = "^0.9.0"
11 | colbert-ai = "^0.2.14"
12 | torch = "*"
13 | faiss-cpu = "*"
14 | colpali-engine = { git = "https://github.com/illuin-tech/colpali" }
15 | 
16 | [build-system]
17 | requires = ["poetry-core"]
18 | build-backend = "poetry.core.masonry.api"
19 | 
20 | [tool.pyperformance]
21 | manifest = "MANIFEST"
--------------------------------------------------------------------------------
/benchmarks/run_colbert.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from collections import namedtuple
3 | from colbert import Indexer, Searcher
4 | from colbert.infra import Run, RunConfig, ColBERTConfig
5 | from colbert.data import Queries, Collection
6 | from colbert.modeling.checkpoint import Checkpoint
7 | import os
8 | import sys
9 | import jsonlines
10 | from collections import defaultdict
11 | from tqdm import tqdm
12 | import time
13 | import numpy as np
14 | import typer
15 | import random
16 | from typing import List, Annotated
17 | from lotte.common
import load_lotte, _evaluate_dataset 18 | from common import get_memory_usage 19 | try: 20 | from valgrind import callgrind_start_instrumentation, callgrind_stop_instrumentation, callgrind_dump_stats 21 | except ImportError: 22 | print("didn't find valgrind") 23 | def callgrind_stop_instrumentation(): 24 | pass 25 | 26 | def callgrind_start_instrumentation(): 27 | pass 28 | 29 | def callgrind_dump_stats(path:str): 30 | pass 31 | 32 | 33 | app = typer.Typer() 34 | 35 | 36 | @app.command() 37 | def single_search(experiment='colbert-lifestyle-40k-benchmark', dataset:str='lifestyle', split:str='dev', profile=True, checkpoint:str='colbert-ir/colbertv2.0', index_path:str='indexes/lifestyle'): 38 | d = load_lotte(dataset, split, stop=40000) 39 | latencies = [] 40 | memory = [] 41 | 42 | with Run().context(RunConfig(nranks=1, experiment=experiment)): 43 | config = ColBERTConfig.load_from_checkpoint(checkpoint) 44 | config.kmeans_niters=4 45 | config.ncells = 2 46 | # model = Checkpoint(checkpoint, config) 47 | 48 | # indexer = Indexer(checkpoint=checkpoint, config=config) 49 | # indexer.index(name=experiment, collection=dataset.collection) # "/path/to/MSMARCO/collection.tsv" 50 | 51 | searcher = Searcher(index=experiment, config=config, collection=d.collection) 52 | rankings = {} 53 | 54 | for id, query in zip(d.qids, d.queries): 55 | embeddings = searcher.encode([query]) 56 | 57 | start = time.perf_counter() 58 | if profile: 59 | callgrind_start_instrumentation() 60 | 61 | results = searcher._search_all_Q(Queries.cast({1: query}), embeddings, k=100) 62 | latencies.append(time.perf_counter() - start) 63 | if profile: 64 | callgrind_stop_instrumentation() 65 | callgrind_dump_stats("callgrind.out.single_search") 66 | memory.append(get_memory_usage()) 67 | 68 | for k, v in results.todict().items(): 69 | rankings[id] = [x[0] for x in v] 70 | 71 | _evaluate_dataset(rankings, dataset, 'search', k=5) 72 | 73 | 74 | print(f"Average search latency: {np.mean(latencies):.2f}s") 75 | print(f"Median search latency: {np.median(latencies):.2f}s") 76 | print(f"95th percentile search latency: {np.percentile(latencies, 95):.2f}s") 77 | print(f"99th percentile search latency: {np.percentile(latencies, 99):.2f}s") 78 | 79 | print(f"Average memory usage: {np.mean(memory):.2f}MB") 80 | print(f"Median memory usage: {np.median(memory):.2f}MB") 81 | print(f"95th percentile memory usage: {np.percentile(memory, 95):.2f}MB") 82 | print(f"99th percentile memory usage: {np.percentile(memory, 99):.2f}MB") 83 | 84 | 85 | if __name__ == "__main__": 86 | app() -------------------------------------------------------------------------------- /benchmarks/run_lintdb.py: -------------------------------------------------------------------------------- 1 | import lintdb as ldb 2 | from datasets import load_dataset 3 | from collections import namedtuple 4 | from colbert import Indexer, Searcher 5 | from colbert.infra import Run, RunConfig, ColBERTConfig 6 | from colbert.data import Queries, Collection 7 | import os 8 | import sys 9 | import jsonlines 10 | from collections import defaultdict 11 | from tqdm import tqdm 12 | import time 13 | import numpy as np 14 | import typer 15 | import random 16 | from typing import List, Annotated 17 | from common import get_memory_usage 18 | from lotte.common import _evaluate_dataset, load_lotte 19 | 20 | try: 21 | from valgrind import callgrind_start_instrumentation, callgrind_stop_instrumentation, callgrind_dump_stats 22 | except ImportError: 23 | print("didn't find valgrind") 24 | def 
callgrind_stop_instrumentation(): 25 | pass 26 | 27 | def callgrind_start_instrumentation(): 28 | pass 29 | 30 | def callgrind_dump_stats(path:str): 31 | pass 32 | 33 | 34 | app = typer.Typer() 35 | 36 | @app.command() 37 | def single_search(dataset:str='lifestyle', split:str='dev',profile=False, checkpoint:str='colbert-ir/colbertv2.0', index_path:str='experiments/py_index_bench_test-collection-xtr'): 38 | checkpoint_config = ColBERTConfig.load_from_checkpoint(checkpoint) 39 | config = ColBERTConfig.from_existing(checkpoint_config, None) 40 | 41 | from colbert.modeling.checkpoint import Checkpoint 42 | from colbert import Searcher 43 | checkpoint = Checkpoint(checkpoint, config) 44 | 45 | d = load_lotte(dataset, split, stop=40000) 46 | latencies = [] 47 | memory = [] 48 | 49 | print(f"using index at {index_path}") 50 | index = ldb.IndexIVF(index_path) 51 | rankings = {} 52 | 53 | count=0 54 | for id, query in zip(d.qids, d.queries): 55 | embeddings = checkpoint.queryFromText([query]) 56 | converted = np.squeeze(embeddings.numpy().astype('float32')) 57 | 58 | if profile: 59 | callgrind_start_instrumentation() 60 | start = time.perf_counter() 61 | opts = ldb.SearchOptions() 62 | results = index.search( 63 | 0, 64 | converted, 65 | 32, # nprobe 66 | 100, # k to return 67 | opts 68 | ) 69 | latencies.append((time.perf_counter() - start)*1000) 70 | if profile: 71 | callgrind_stop_instrumentation() 72 | callgrind_dump_stats("callgrind.out.single_search") 73 | memory.append(get_memory_usage()) 74 | rankings[id] = [x.id for x in results] 75 | count+=1 76 | if count == 212: 77 | break 78 | 79 | # Stats(pr).strip_dirs().sort_stats(SortKey.TIME).print_stats(10) 80 | _evaluate_dataset(rankings, dataset, 'search', k=5) 81 | 82 | 83 | print(f"Average search latency: {np.mean(latencies):.2f}ms") 84 | print(f"Median search latency: {np.median(latencies):.2f}ms") 85 | print(f"95th percentile search latency: {np.percentile(latencies, 95):.2f}ms") 86 | print(f"99th percentile search latency: {np.percentile(latencies, 99):.2f}ms") 87 | 88 | print(f"Average memory usage: {np.mean(memory):.2f}MB") 89 | print(f"Median memory usage: {np.median(memory):.2f}MB") 90 | print(f"95th percentile memory usage: {np.percentile(memory, 95):.2f}MB") 91 | print(f"99th percentile memory usage: {np.percentile(memory, 99):.2f}MB") 92 | 93 | 94 | if __name__ == "__main__": 95 | app() -------------------------------------------------------------------------------- /cmake/lintdb-config.cmake.in: -------------------------------------------------------------------------------- 1 | include("${CMAKE_CURRENT_LIST_DIR}/lintdb-targets.cmake") -------------------------------------------------------------------------------- /conda/benchmark_env.yaml: -------------------------------------------------------------------------------- 1 | name: lintdb-benchmark 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - conda-build 8 | - anaconda-client 9 | - faiss-cpu 10 | - pytorch::pytorch 11 | - conda-forge::numpy 12 | - conda-forge::onnxruntime-cpp==1.17.3 13 | - pip: 14 | - chardet 15 | - typer 16 | - jsonlines 17 | - colbert-ai 18 | - datasets 19 | - valgrind -------------------------------------------------------------------------------- /conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.9 3 | - 3.10 4 | - 3.11 5 | - 3.12 6 | MACOSX_SDK_VERSION: # [osx and x86_64] 7 | - "10.13" # [osx and x86_64] 8 | MACOSX_DEPLOYMENT_TARGET: # [osx 
and x86_64] 9 | - "10.13" # [osx and x86_64] 10 | c_compiler: 11 | - clang 12 | c_compiler_version: 13 | - 18 14 | cxx_compiler: 15 | - clangxx 16 | cxx_compiler_version: 17 | - 18 -------------------------------------------------------------------------------- /conda/environment.yaml: -------------------------------------------------------------------------------- 1 | name: lintdb-build 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - conda-build 8 | - anaconda-client 9 | - faiss-cpu 10 | - pytorch::pytorch 11 | - conda-forge::numpy 12 | - pip: 13 | - chardet -------------------------------------------------------------------------------- /conda/lintdb/build-lib-arm64.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | # do we want to specify the build arch explicitly? -DCMAKE_OSX_ARCHITECTURES=arm64 \ 4 | CXXFLAGS="${CXXFLAGS} -D_LIBCPP_DISABLE_AVAILABILITY" cmake -B _build \ 5 | -DBUILD_SHARED_LIBS=ON \ 6 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 7 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 8 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ 9 | -DCMAKE_INSTALL_LIBDIR=lib \ 10 | -DPython_EXECUTABLE=$PYTHON \ 11 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") \ 12 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \ 13 | -DCMAKE_BUILD_TYPE=Release . 14 | 15 | make -C _build -j$(nproc) lintdb 16 | 17 | cmake --install _build --prefix $PREFIX 18 | cmake --install _build --prefix _liblintdb_stage/ -------------------------------------------------------------------------------- /conda/lintdb/build-lib-osx.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | 4 | cmake -B _build \ 5 | -DBUILD_SHARED_LIBS=ON \ 6 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 7 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 8 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ 9 | -DCMAKE_INSTALL_LIBDIR=lib \ 10 | -DPython_EXECUTABLE=$PYTHON \ 11 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") \ 12 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \ 13 | -DCMAKE_BUILD_TYPE=Release . 14 | 15 | make -C _build -j$(nproc) lintdb 16 | 17 | cmake --install _build --prefix $PREFIX 18 | cmake --install _build --prefix _liblintdb_stage/ -------------------------------------------------------------------------------- /conda/lintdb/build-lib.bat: -------------------------------------------------------------------------------- 1 | cmake -B _build ^ 2 | -T v141 ^ 3 | -A x64 ^ 4 | -G "Visual Studio 16 2019" ^ 5 | . 
6 | if %errorlevel% neq 0 exit /b %errorlevel% 7 | 8 | cmake --build _build --config Release -j %CPU_COUNT% 9 | if %errorlevel% neq 0 exit /b %errorlevel% 10 | 11 | cmake --install _build --config Release --prefix %PREFIX% 12 | if %errorlevel% neq 0 exit /b %errorlevel% -------------------------------------------------------------------------------- /conda/lintdb/build-lib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | MKLROOT=_build/vcpkg_installed/x64-linux/lib/intel64 cmake -B _build \ 6 | -DBUILD_SHARED_LIBS=ON \ 7 | -DBUILD_TESTING=OFF \ 8 | -DENABLE_PYTHON=OFF \ 9 | -DCMAKE_INSTALL_LIBDIR=lib \ 10 | -DBLA_VENDOR=Intel10_64lp \ 11 | -DCMAKE_BUILD_TYPE=Release \ 12 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 13 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 14 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.so \ 15 | . 16 | 17 | MKLROOT=_build/vcpkg_installed/x64-linux/lib/intel64 make -C _build -j$(nproc) lintdb 18 | 19 | cmake --install _build --prefix $PREFIX 20 | cmake --install _build --prefix _liblintdb_stage/ -------------------------------------------------------------------------------- /conda/lintdb/build-pkg-arm64.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | 4 | CXXFLAGS="${CXXFLAGS} -D_LIBCPP_DISABLE_AVAILABILITY" cmake -B _build_python_${PY_VER} \ 5 | -Dlintdb_ROOT=_liblintdb_stage/ \ 6 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 7 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 8 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ 9 | -DCMAKE_BUILD_TYPE=Release \ 10 | -DPython_EXECUTABLE=$PYTHON \ 11 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") \ 12 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \ 13 | . 14 | 15 | make -C _build_python_${PY_VER} -j$(nproc) pylintdb 16 | 17 | # Build actual python module. 18 | cd _build_python_${PY_VER}/lintdb/python 19 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX -------------------------------------------------------------------------------- /conda/lintdb/build-pkg-osx.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | 4 | cmake -B _build_python_${PY_VER} \ 5 | -Dlintdb_ROOT=_liblintdb_stage/ \ 6 | -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ 7 | -DOpenMP_CXX_LIB_NAMES=libiomp5 \ 8 | -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ 9 | -DCMAKE_BUILD_TYPE=Release \ 10 | -DPython_EXECUTABLE=$PYTHON \ 11 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") \ 12 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") \ 13 | . 14 | 15 | make -C _build_python_${PY_VER} -j$(nproc) pylintdb 16 | 17 | # Build actual python module. 18 | cd _build_python_${PY_VER}/lintdb/python 19 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX -------------------------------------------------------------------------------- /conda/lintdb/build-pkg.bat: -------------------------------------------------------------------------------- 1 | :: Copyright (c) Facebook, Inc. and its affiliates. 2 | :: 3 | :: This source code is licensed under the MIT license found in the 4 | :: LICENSE file in the root directory of this source tree. 5 | 6 | :: Build vanilla version (no avx). 
7 | cmake -B _build_python_%PY_VER% ^ 8 | -T v141 ^ 9 | -A x64 ^ 10 | -G "Visual Studio 16 2019" ^ 11 | -DPython_EXECUTABLE=%PYTHON% ^ 12 | -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") ^ 13 | -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") ^ 14 | lintdb/python 15 | if %errorlevel% neq 0 exit /b %errorlevel% 16 | 17 | cmake --build _build_python_%PY_VER% --config Release -j %CPU_COUNT% 18 | if %errorlevel% neq 0 exit /b %errorlevel% 19 | 20 | 21 | :: Build actual python module. 22 | cd _build_python_%PY_VER%/ 23 | %PYTHON% setup.py install --single-version-externally-managed --record=record.txt --prefix=%PREFIX% 24 | if %errorlevel% neq 0 exit /b %errorlevel% -------------------------------------------------------------------------------- /conda/lintdb/build-pkg.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | $PYTHON -m pip install . -vv -------------------------------------------------------------------------------- /conda/lintdb/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.1').lstrip('v') %} 2 | {% set suffix = "_nightly" if environ.get('PACKAGE_TYPE') == 'nightly' else "" %} 3 | {% set number = environ.get('GIT_DESCRIBE_NUMBER', '0') %} 4 | 5 | package: 6 | name: lintdb-pkg 7 | version: {{ version }} 8 | 9 | build: 10 | number: {{ number }} 11 | 12 | about: 13 | home: https://github.com/DeployQL/lintdb 14 | license: Apache 2 15 | license_family: Apache 16 | license_file: LICENSE 17 | summary: A multi-vector database for token level interaction. 18 | description: | 19 | LintDB is a multi-vector database meant for Gen AI. LintDB natively supports late interaction like colBERT and PLAID. 
20 | 
21 | source:
22 |   git_url: ../../
23 | 
24 | outputs:
25 |   - name: lintdb
26 |     script: build-pkg.sh
27 |     build:
28 |       string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cpu{{ suffix }}"
29 |     requirements:
30 |       build:
31 |         - {{ compiler('cxx') }}
32 |         - {{ compiler('fortran')}}
33 |         - sysroot_linux-64 =2.17 # [linux64]
34 |         - numpy==1.26.4
35 |         - scikit-build-core
36 |         - cmake >=3.25
37 |         - make # [not win]
38 |         - mkl-devel =2023.0.0 # [x86_64]
39 |         - openblas # [not x86_64]
40 |         - python {{ python }}
41 |       host:
42 |         - python {{ python }}
43 |         - numpy==1.26.4
44 |         - scikit-build-core
45 |         - conda-forge::llvm-openmp # [x86_64]
46 |         - mkl =2023.0.0 # [x86_64]
47 |         - openblas # [not x86_64]
48 |       run:
49 |         - python {{ python }}
50 |         - numpy==1.26.4
51 |         - mkl =2023.0.0 # [x86_64]
52 |         - openblas # [not x86_64]
53 |         - packaging
54 |         - __osx >={{ MACOSX_DEPLOYMENT_TARGET|default("10.13") }} # [osx and x86_64]
55 |     test:
56 |       requires:
57 |         - pytest
58 |         - numpy==1.26.4
59 |       commands:
60 |         - pytest lintdb/python/tests
61 |       source_files:
62 |         - lintdb/python/tests
--------------------------------------------------------------------------------
/docker/Dockerfile.conda.build:
--------------------------------------------------------------------------------
1 | FROM ubuntu
2 | 
3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
4 | ENV PATH /opt/miniconda/bin:$PATH
5 | 
6 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \
7 |     build-essential curl \
8 |     git
9 | 
10 | # Get Rust
11 | RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
12 | ENV PATH="/root/.cargo/bin:${PATH}"
13 | 
14 | # Install Miniconda
15 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
16 |     /bin/bash /tmp/miniconda.sh -b -p /opt/miniconda && \
17 |     rm /tmp/miniconda.sh
18 | 
19 | # Add Miniconda to the path
20 | ENV PATH="/opt/miniconda/bin:$PATH"
21 | 
22 | # Update conda
23 | RUN conda update -n base -c defaults conda -y
24 | 
25 | # Create and activate a new conda environment
26 | RUN conda create -y -n build_env python=3.11
27 | SHELL ["conda", "run", "-n", "build_env", "/bin/bash", "-c"]
28 | RUN echo "source activate build_env" > ~/.bashrc
29 | ENV PATH /opt/miniconda/envs/build_env/bin:$PATH
30 | 
31 | # Install conda-build
32 | RUN conda install -y conda-build
33 | 
34 | RUN conda config --append channels conda-forge
35 | 
36 | CMD [ "conda", "build", "/lintdb/conda/lintdb" ]
37 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
1 | # Development
2 | 
3 | ## LintDB C++ Libraries
4 | To develop on LintDB, there are a few dependencies that you need to install. The below instructions are for Ubuntu.
5 | 
6 | ### [vcpkg](https://learn.microsoft.com/en-us/vcpkg/get_started/overview)
7 | ```bash
8 | git clone https://github.com/microsoft/vcpkg.git
9 | cd vcpkg && ./bootstrap-vcpkg.sh
10 | ```
11 | 
12 | ### [clang](https://apt.llvm.org/)
13 | We expect clang as the compiler. This helps align with our expectations of the MKL libraries detailed below.
14 | ```bash
15 | wget https://apt.llvm.org/llvm.sh
16 | chmod +x llvm.sh
17 | sudo ./llvm.sh all
18 | ```
19 | 
20 | ### [miniforge](https://github.com/conda-forge/miniforge) (recommended)
21 | Miniforge is a minimal conda installer that is preconfigured to use conda-forge packages.
22 | 
23 | We can create an isolated environment for LintDB development.
24 | ```bash
25 | conda create -n lintdb python=3.10
26 | conda activate lintdb
27 | ```
28 | 
29 | ### Recommended Python Libraries
30 | There are a few helpful Python libraries that are used in profiling and testing LintDB.
31 | ```bash
32 | pip install gprof2dot
33 | ```
34 | ---
35 | # Python LintDB
36 | 
37 | In addition to the above, developing with the Python LintDB library requires a few more dependencies.
38 | 
39 | LintDB uses nanobind to create Python bindings. nanobind also ships a helpful CLI tool for generating Python stubs.
40 | 
41 | ```bash
42 | pip install nanobind
43 | ```
44 | 
45 | ### creating python stubs
46 | ```bash
47 | python -m nanobind.stubgen -m lintdb.core -M py.typed -o core.pyi
48 | ```
49 | 
50 | ---
51 | 
52 | # Makefile commands
53 | 
54 | The Makefile at the root of the repository has a few commands that can help you get started.
55 | CMakePresets.json is used to configure the build system.
56 | 
57 | ```bash
58 | # build a debug target with tests.
59 | make build-debug
60 | 
61 | # build a release target
62 | make build-release
63 | 
64 | # run tests
65 | make tests
66 | 
67 | # run benchmarks
68 | make benchmarks
69 | 
70 | # profile LintDB (note some variables need to change in the Makefile)
71 | make callgrind
72 | ```
73 | 
74 | ---
75 | 
76 | You'll notice that each target is statically linked. However, we dynamically depend on finding either MKL or OpenBLAS at runtime.
77 | 
78 | ## MKL vs OpenBLAS
79 | 
80 | LintDB currently uses either MKL or OpenBLAS for linear algebra operations. By default, we use MKL on Windows and Ubuntu. On macOS, we use OpenBLAS.
81 | 
82 | It should be noted that MKL doesn't always play well with OpenMP. We specify linking against Intel's version of OpenMP, but
83 | at runtime, it's possible we find a different version. This can lead to performance issues.
84 | 
85 | It can be helpful to refer to [Intel's threading layer documentation](https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2023-0/dynamic-select-the-interface-and-threading-layer.html) and
86 | try setting the threading layer to `INTEL` or `GNU`. Running `ldd path/to/liblintdb_lib.so` will show which libraries are linked at runtime so you can verify whether there
87 | are issues.
--------------------------------------------------------------------------------
/docs/environment.yaml:
--------------------------------------------------------------------------------
1 | name: lintdb-docs
2 | channels:
3 |   - conda-forge
4 |   - defaults
5 | dependencies:
6 |   - mkdocs-material
7 |   - mike
8 |   - mkdocstrings[python]
9 |   - markdown-callouts
10 |   - mkdocs-literate-nav
--------------------------------------------------------------------------------
/docs/icon.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | --8<-- "README.md"
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 | 
3 | LintDB requires Python 3.10 or later.
4 | 
5 | ## Installing using conda (recommended)
6 | 
7 | We highly recommend using conda to install LintDB.
8 | 
9 | ```bash
10 | conda install -c DeployQL lintdb
11 | ```
12 | 
13 | If you don't have conda, you can install it from [here](https://docs.conda.io/en/latest/miniconda.html).
14 | 
15 | 
16 | [//]: # (## Installing using pip)
17 | 
18 | [//]: # ()
19 | [//]: # (If you don't want to use conda, you can install LintDB using pip.)
20 | 
21 | [//]: # ()
22 | [//]: # (LintDB expects that you have openBLAS or MKL installed.)
23 | 
24 | [//]: # ()
25 | [//]: # (```bash)
26 | 
27 | [//]: # (pip install mkl lintdb)
28 | 
29 | [//]: # (```)
30 | 
31 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
-------------------------------------------------------------------------------- /docs/nav.md: --------------------------------------------------------------------------------
1 | * [Introduction](index.md)
2 | * [Getting Started](getting-started.md)
3 | * [Installation](installation.md)
4 | * [Examples](examples.md)
5 | * [Development](development.md)
6 | * [Reference](reference.md)
-------------------------------------------------------------------------------- /docs/pyproject.toml: --------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "lintdb-docs"
3 | version = "0.1.0"
4 | description = "documentation for LintDB"
5 | authors = ["DeployQL"]
6 | readme = "README.md"
7 | packages = [{include = "lintdb_docs"}]
8 | 
9 | [tool.poetry.dependencies]
10 | python = "^3.10"
11 | chardet = "^5.2.0"
12 | sphinx-immaterial = "^0.11.11"
13 | 
14 | 
15 | [build-system]
16 | requires = ["poetry-core"]
17 | build-backend = "poetry.core.masonry.api"
18 | 
-------------------------------------------------------------------------------- /docs/reference.md: --------------------------------------------------------------------------------
1 | # Reference
2 | 
3 | ::: lintdb.core
4 | 
-------------------------------------------------------------------------------- /docs/requirements.txt: --------------------------------------------------------------------------------
1 | chardet>=5.2.0,<6
2 | sphinx-immaterial>=0.11.11,<0.12
3 | myst-parser
-------------------------------------------------------------------------------- /icon.svg: --------------------------------------------------------------------------------
1 | (SVG image data omitted)
-------------------------------------------------------------------------------- /lintdb/SearchOptions.h: --------------------------------------------------------------------------------
1 | #ifndef LINTDB_SEARCH_OPTIONS_H
2 | #define LINTDB_SEARCH_OPTIONS_H
3 | 
4 | #include 
5 | #include 
6 | #include 
7 | #include "lintdb/api.h"
8 | 
9 | namespace lintdb {
10 | 
11 | /**
12 | * SearchOptions enables custom searching behavior.
13 | * 
14 | * These options expose ways to trade off recall and latency at different levels
15 | * of retrieval. Searching more centroids:
16 | * - decrease centroid_score_threshold and increase k_top_centroids.
17 | * - increase n_probe in search()
18 | * 
19 | * Decreasing latency:
20 | * - increase centroid_score_threshold and decrease k_top_centroids.
21 | * - decrease n_probe in search()
22 | */
23 | struct SearchOptions {
24 | idx_t expected_id = -1; /// expects a document id in the return result;
25 | /// prints additional information during execution.
26 | /// useful for debugging.
27 | float centroid_score_threshold =
28 | 0.45; /// the threshold for centroid scores.
29 | size_t k_top_centroids =
30 | 2; /// the number of top centroids to consider per token.
31 | size_t num_second_pass =
32 | 1024; /// the number of second pass candidates to consider.
33 | size_t n_probe = 32; /// the number of centroids to search overall.
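// An illustrative recall-leaning configuration (example values only, not
// recommended defaults):
//   SearchOptions opts;
//   opts.centroid_score_threshold = 0.3; // keep lower-scoring centroids
//   opts.k_top_centroids = 4;            // consider more centroids per token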
34 | size_t nearest_tokens_to_fetch = 35 | 100; /// the number of nearest tokens to fetch in XTR. 36 | std::string colbert_field = "colbert"; 37 | 38 | SearchOptions() : expected_id(-1){}; 39 | }; 40 | } // namespace lintdb 41 | 42 | #endif -------------------------------------------------------------------------------- /lintdb/SearchResult.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_SEARCH_RESULT_H 2 | #define LINTDB_SEARCH_RESULT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lintdb/api.h" 8 | #include "lintdb/schema/DataTypes.h" 9 | 10 | namespace lintdb { 11 | 12 | /** 13 | * SearchResult is a simple struct to hold the results of a search. 14 | * 15 | */ 16 | struct SearchResult { 17 | idx_t id; /// the document's id. 18 | float score; /// the final score as determined by the database. 19 | std::map 20 | metadata; /// Optionally, metadata that was indexed for the 21 | /// document. 22 | 23 | SearchResult() = default; 24 | 25 | bool operator<(const SearchResult& other) const { 26 | return score < other.score; 27 | } 28 | bool operator>(const SearchResult& other) const { 29 | return score > other.score; 30 | } 31 | }; 32 | 33 | } // namespace lintdb 34 | 35 | #endif -------------------------------------------------------------------------------- /lintdb/api.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_API_H 2 | #define LINTDB_API_H 3 | 4 | #include 5 | #include 6 | 7 | typedef int64_t idx_t; 8 | 9 | // the codes used to save the centroid for each token vector. 10 | // each code is treated as an index, which is defined above. 11 | typedef idx_t code_t; 12 | typedef uint8_t residual_t; // the residual codes saved for each token vector. 13 | 14 | typedef uint16_t float16; 15 | typedef uint16_t bfloat16; 16 | 17 | #endif -------------------------------------------------------------------------------- /lintdb/assert.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef LINTDB_ASSERT_H 3 | #define LINTDB_ASSERT_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "lintdb/exception.h" 10 | 11 | // #define __PRETTY_FUNCTION__ __FUNCSIG__ 12 | 13 | #define LINTDB_THROW_FMT(FMT, ...) \ 14 | do { \ 15 | std::string __s; \ 16 | int __size = snprintf(nullptr, 0, FMT, __VA_ARGS__); \ 17 | __s.resize(__size + 1); \ 18 | snprintf(&__s[0], __s.size(), FMT, __VA_ARGS__); \ 19 | throw lintdb::LintDBException( \ 20 | __s, __PRETTY_FUNCTION__, __FILE__, __LINE__); \ 21 | } while (false) 22 | 23 | /// 24 | /// Exceptions thrown upon a conditional failure 25 | /// 26 | 27 | #define LINTDB_THROW_IF_NOT(X) \ 28 | do { \ 29 | if (!(X)) { \ 30 | LINTDB_THROW_FMT("Error: '%s' failed", #X); \ 31 | } \ 32 | } while (false) 33 | 34 | #define LINTDB_THROW_IF_NOT_MSG(X, MSG) \ 35 | do { \ 36 | if (!(X)) { \ 37 | LINTDB_THROW_FMT("Error: '%s' failed: " MSG, #X); \ 38 | } \ 39 | } while (false) 40 | 41 | #define LINTDB_THROW_IF_NOT_FMT(X, FMT, ...) 
\ 42 | do { \ 43 | if (!(X)) { \ 44 | LINTDB_THROW_FMT("Error: '%s' failed: " FMT, #X, __VA_ARGS__); \ 45 | } \ 46 | } while (false) 47 | 48 | #endif -------------------------------------------------------------------------------- /lintdb/cf.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_CF_H 2 | #define LINTDB_CF_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/constants.h" 9 | 10 | namespace lintdb { 11 | namespace { 12 | rocksdb::ColumnFamilyOptions create_index_table_options() { 13 | rocksdb::ColumnFamilyOptions index_options; 14 | rocksdb::BlockBasedTableOptions table_options; 15 | table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); 16 | index_options.table_factory.reset( 17 | rocksdb::NewBlockBasedTableFactory(table_options)); 18 | 19 | // the inverted index uses 8 bytes for the tenant, and 4 bytes for the 20 | // inverted list id. 21 | index_options.prefix_extractor.reset(rocksdb::NewCappedPrefixTransform(12)); 22 | 23 | return index_options; 24 | }; 25 | } // namespace 26 | inline std::vector create_column_families() { 27 | return {rocksdb::ColumnFamilyDescriptor( 28 | rocksdb::kDefaultColumnFamilyName, 29 | rocksdb::ColumnFamilyOptions()), 30 | rocksdb::ColumnFamilyDescriptor( 31 | kIndexColumnFamily, create_index_table_options()), 32 | rocksdb::ColumnFamilyDescriptor( 33 | kForwardColumnFamily, rocksdb::ColumnFamilyOptions()), 34 | rocksdb::ColumnFamilyDescriptor( 35 | kCodesColumnFamily, rocksdb::ColumnFamilyOptions()), 36 | rocksdb::ColumnFamilyDescriptor( 37 | kResidualsColumnFamily, rocksdb::ColumnFamilyOptions()), 38 | rocksdb::ColumnFamilyDescriptor( 39 | kMappingColumnFamily, rocksdb::ColumnFamilyOptions()), 40 | rocksdb::ColumnFamilyDescriptor( 41 | kDocColumnFamily, rocksdb::ColumnFamilyOptions())}; 42 | } 43 | 44 | } // namespace lintdb 45 | 46 | #endif -------------------------------------------------------------------------------- /lintdb/constants.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_CONSTANTS_H 2 | #define LINTDB_CONSTANTS_H 3 | 4 | #include 5 | #include 6 | #include "lintdb/api.h" 7 | 8 | namespace lintdb { 9 | using std::string; 10 | static const string kIndexColumnFamily = "index"; 11 | static const string kCodesColumnFamily = "codes"; 12 | static const string kResidualsColumnFamily = "residuals"; 13 | static const string kForwardColumnFamily = "forward"; 14 | static const string kMappingColumnFamily = "mapping"; 15 | static const string kDocColumnFamily = "doc"; 16 | 17 | typedef idx_t column_index_t; 18 | static const column_index_t kIndexColumnIndex = 1; 19 | static const column_index_t kForwardColumnIndex = 2; 20 | static const column_index_t kCodesColumnIndex = 3; 21 | static const column_index_t kResidualsColumnIndex = 4; 22 | static const column_index_t kMappingColumnIndex = 5; 23 | static const column_index_t kDocColumnIndex = 6; 24 | 25 | // default tenant is used in testing. 26 | static const uint64_t kDefaultTenant = 0; 27 | } // namespace lintdb 28 | 29 | #endif -------------------------------------------------------------------------------- /lintdb/env.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_ENV_H 2 | #define LINTDB_ENV_H 3 | 4 | namespace lintdb { 5 | // environment variables we use to set the number of threads. 
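// Usage sketch (illustrative values; consumers of these names read them from
// the process environment):
//   LINTDB_INTER_NUM_THREADS=1 LINTDB_INTRA_NUM_THREADS=8 ./your_app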
6 | static const char* ONNX_INTER_THREADS = "LINTDB_INTER_NUM_THREADS";
7 | static const char* ONNX_INTRA_THREADS = "LINTDB_INTRA_NUM_THREADS";
8 | } // namespace lintdb
9 | 
10 | #endif // LINTDB_ENV_H
11 | 
-------------------------------------------------------------------------------- /lintdb/exception.h: --------------------------------------------------------------------------------
1 | #ifndef LINTDB_EXCEPTION_H
2 | #define LINTDB_EXCEPTION_H
3 | 
4 | #include <cstdio>
5 | #include <exception>
6 | #include <stdexcept>
7 | #include <string>
8 | 
9 | namespace lintdb {
10 | class LintDBException : public std::exception {
11 | public:
12 | explicit LintDBException(const std::string& message) : message(message){};
13 | 
14 | LintDBException(
15 | const std::string& m,
16 | const char* funcName,
17 | const char* file,
18 | int line) {
19 | int size = snprintf(
20 | nullptr,
21 | 0,
22 | "Error in %s at %s:%d: %s",
23 | funcName,
24 | file,
25 | line,
26 | m.c_str());
27 | message.resize(size + 1);
28 | snprintf(
29 | &message[0],
30 | message.size(),
31 | "Error in %s at %s:%d: %s",
32 | funcName,
33 | file,
34 | line,
35 | m.c_str());
36 | }
37 | 
38 | const char* what() const noexcept override {
39 | return message.c_str();
40 | };
41 | 
42 | private:
43 | std::string message;
44 | };
45 | } // namespace lintdb
46 | 
47 | #endif
-------------------------------------------------------------------------------- /lintdb/invlists/ContextIterator.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <memory>
4 | #include <string>
5 | #include <rocksdb/db.h>
6 | #include "lintdb/constants.h"
7 | #include "lintdb/invlists/Iterator.h"
8 | #include "lintdb/invlists/KeyBuilder.h"
9 | 
10 | namespace lintdb {
11 | class ContextIterator {
12 | public:
13 | ContextIterator(
14 | const std::shared_ptr<rocksdb::DB> db,
15 | rocksdb::ColumnFamilyHandle* column_family,
16 | const uint64_t tenant,
17 | const uint8_t field)
18 | : tenant(tenant), field(field) {
19 | if (!column_family) {
20 | throw std::runtime_error("Column family not found");
21 | }
22 | cf = column_family->GetID();
23 | KeyBuilder kb;
24 | prefix = kb.add(tenant).add(field).build();
25 | 
26 | prefix_slice = rocksdb::Slice(this->prefix);
27 | auto options = rocksdb::ReadOptions();
28 | 
29 | this->it = std::unique_ptr<rocksdb::Iterator>(
30 | db->NewIterator(options, column_family));
31 | it->Seek(this->prefix);
32 | }
33 | 
34 | bool is_valid() {
35 | if (!has_read_key) {
36 | bool is_valid = it->Valid();
37 | if (!is_valid) {
38 | return false;
39 | }
40 | 
41 | auto key = it->key();
42 | std::string key_str = key.ToString();
43 | if (key_str.compare(0, prefix.size(), prefix) != 0) {
44 | return false;
45 | }
46 | this->current_key = ContextKey(key_str);
47 | }
48 | 
49 | has_read_key = true;
50 | return true;
51 | }
52 | 
53 | void advance(const idx_t doc_id) {
54 | KeyBuilder kb;
55 | 
56 | std::string expected_key =
57 | kb.add(tenant).add(field).add(doc_id).build();
58 | it->Seek(rocksdb::Slice(expected_key));
59 | has_read_key = false;
60 | }
61 | 
62 | void next() {
63 | it->Next();
64 | has_read_key = false;
65 | }
66 | 
67 | ContextKey get_key() const {
68 | return current_key;
69 | }
70 | 
71 | std::string get_value() const {
72 | return it->value().ToString();
73 | }
74 | 
75 | std::unique_ptr<rocksdb::Iterator> it;
76 | 
77 | protected:
78 | lintdb::column_index_t cf;
79 | string prefix;
80 | string end_key;
81 | rocksdb::Slice prefix_slice;
82 | ContextKey current_key;
83 | 
84 | bool has_read_key = false;
85 | const uint64_t tenant;
86 | const uint8_t field;
87 | };
88 | 
89 | } // namespace lintdb
90 | 
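// Usage sketch (illustrative; assumes an open rocksdb::DB and a handle to the
// context column family):
//   lintdb::ContextIterator it(db, context_cf, /*tenant=*/0, /*field=*/1);
//   for (; it.is_valid(); it.next()) {
//       auto key = it.get_key();     // ContextKey of the current entry
//       auto value = it.get_value(); // serialized context payload
//   }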
-------------------------------------------------------------------------------- /lintdb/invlists/ForwardIndexIterator.cpp: -------------------------------------------------------------------------------- 1 | #include "ForwardIndexIterator.h" 2 | #include 3 | 4 | namespace lintdb { 5 | ForwardIndexIterator::ForwardIndexIterator( 6 | std::shared_ptr db, 7 | rocksdb::ColumnFamilyHandle* column_family, 8 | const uint64_t tenant) 9 | : tenant(tenant) { 10 | cf = column_family->GetID(); 11 | KeyBuilder kb; 12 | 13 | prefix = kb.add(tenant).build(); 14 | 15 | prefix_slice = rocksdb::Slice(this->prefix); 16 | auto options = rocksdb::ReadOptions(); 17 | 18 | this->it = std::unique_ptr( 19 | db->NewIterator(options, column_family)); 20 | it->Seek(this->prefix); 21 | } 22 | 23 | bool ForwardIndexIterator::has_next() { 24 | bool is_valid = it->Valid(); 25 | if (!is_valid) { 26 | return false; 27 | } 28 | auto key = it->key().ToString(); 29 | this->current_key = ForwardIndexKey(key); 30 | 31 | if (current_key.tenant() != tenant) { 32 | return false; 33 | } 34 | 35 | return true; 36 | } 37 | 38 | void ForwardIndexIterator::next() { 39 | it->Next(); 40 | } 41 | 42 | ForwardIndexKey ForwardIndexIterator::get_key() const { 43 | return current_key; 44 | } 45 | 46 | std::string ForwardIndexIterator::get_value() const { 47 | return it->value().ToString(); 48 | } 49 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/invlists/ForwardIndexIterator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "lintdb/constants.h" 5 | #include "lintdb/invlists/KeyBuilder.h" 6 | #include "rocksdb/db.h" 7 | 8 | namespace lintdb { 9 | 10 | /** 11 | * ForwardIndexIterator is an iterator over the forward index. 12 | * 13 | * This is somewhat coupled to key format so that we can control 14 | * iteration. Note that while RocksDB has start and stop option, 15 | * it was not working as expected. So we are doing it manually. 16 | */ 17 | struct ForwardIndexIterator { 18 | ForwardIndexIterator( 19 | std::shared_ptr db, 20 | rocksdb::ColumnFamilyHandle* column_family, 21 | const uint64_t tenant); 22 | 23 | bool has_next(); 24 | 25 | void next(); 26 | 27 | ForwardIndexKey get_key() const; 28 | 29 | std::string get_value() const; 30 | 31 | std::unique_ptr it; 32 | 33 | protected: 34 | lintdb::column_index_t cf; 35 | string prefix; 36 | string end_key; 37 | rocksdb::Slice prefix_slice; 38 | ForwardIndexKey current_key; 39 | 40 | const idx_t tenant; 41 | }; 42 | 43 | } // namespace lintdb 44 | -------------------------------------------------------------------------------- /lintdb/invlists/IndexWriter.cpp: -------------------------------------------------------------------------------- 1 | #include "IndexWriter.h" 2 | #include 3 | #include 4 | #include 5 | #include "lintdb/api.h" 6 | #include "lintdb/assert.h" 7 | #include "lintdb/constants.h" 8 | #include "lintdb/invlists/PostingData.h" 9 | 10 | namespace lintdb { 11 | IndexWriter::IndexWriter( 12 | std::shared_ptr db, 13 | std::vector& column_families, 14 | const Version& version) 15 | : db(db), column_families(column_families), version(version) {} 16 | 17 | /** 18 | * Write will batch write all document data to the database. 
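 * All puts are applied through a single rocksdb::WriteBatch, so a document's
 * inverted, mapping, forward, and context entries are committed atomically.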
19 | * @param batch_posting_data 20 | */ 21 | void IndexWriter::write(const BatchPostingData& batch_posting_data) { 22 | rocksdb::WriteBatch batch; 23 | 24 | // write all inverted index data 25 | for (const auto& posting : batch_posting_data.inverted) { 26 | batch.Put( 27 | column_families[kIndexColumnIndex], 28 | rocksdb::Slice(posting.key), 29 | rocksdb::Slice(posting.value)); 30 | } 31 | 32 | // write all mappings 33 | for (const auto& posting : batch_posting_data.inverted_mapping) { 34 | batch.Put( 35 | column_families[kMappingColumnIndex], 36 | rocksdb::Slice(posting.key), 37 | rocksdb::Slice(posting.value)); 38 | } 39 | 40 | // write all document data 41 | batch.Put( 42 | column_families[kDocColumnIndex], 43 | rocksdb::Slice(batch_posting_data.forward.key), 44 | rocksdb::Slice(batch_posting_data.forward.value)); 45 | 46 | // write all context data 47 | for (const auto& posting : batch_posting_data.context) { 48 | batch.Put( 49 | column_families[kCodesColumnIndex], 50 | rocksdb::Slice(posting.key), 51 | rocksdb::Slice(posting.value)); 52 | } 53 | 54 | auto status = db->Write(rocksdb::WriteOptions(), &batch); 55 | assert(status.ok()); 56 | 57 | LINTDB_THROW_IF_NOT(status.ok()); 58 | } 59 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/invlists/IndexWriter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/invlists/PostingData.h" 7 | #include "lintdb/version.h" 8 | 9 | namespace lintdb { 10 | 11 | class IIndexWriter { 12 | public: 13 | virtual void write(const BatchPostingData& batch_posting_data) = 0; 14 | 15 | virtual ~IIndexWriter() = default; 16 | }; 17 | 18 | class IndexWriter : public IIndexWriter { 19 | private: 20 | std::shared_ptr db; 21 | std::vector& column_families; 22 | const Version& version; 23 | 24 | public: 25 | IndexWriter( 26 | std::shared_ptr db, 27 | std::vector& column_families, 28 | const Version& version); 29 | 30 | void write(const BatchPostingData& batch_posting_data) override; 31 | }; 32 | 33 | } // namespace lintdb 34 | -------------------------------------------------------------------------------- /lintdb/invlists/InvertedIterator.cpp: -------------------------------------------------------------------------------- 1 | #include "InvertedIterator.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/constants.h" 7 | #include "lintdb/invlists/ContextIterator.h" 8 | 9 | lintdb::RocksDBIterator::RocksDBIterator( 10 | std::shared_ptr db, 11 | rocksdb::ColumnFamilyHandle* column_family, 12 | const std::string& prefix) 13 | : Iterator(), prefix(prefix), has_read_key(false) { 14 | cf = column_family->GetID(); 15 | 16 | prefix_slice = rocksdb::Slice(this->prefix); 17 | auto options = rocksdb::ReadOptions(); 18 | 19 | this->it = std::unique_ptr( 20 | db->NewIterator(options, column_family)); 21 | it->Seek(this->prefix); 22 | } -------------------------------------------------------------------------------- /lintdb/invlists/InvertedIterator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "lintdb/constants.h" 11 | #include "lintdb/exception.h" 12 | #include "lintdb/invlists/ContextIterator.h" 13 | #include "lintdb/invlists/InvertedList.h" 14 | #include "lintdb/invlists/Iterator.h" 15 | #include 
"lintdb/invlists/KeyBuilder.h" 16 | #include "lintdb/version.h" 17 | 18 | namespace lintdb { 19 | 20 | struct RocksDBIterator : public lintdb::Iterator { 21 | RocksDBIterator( 22 | std::shared_ptr db, 23 | rocksdb::ColumnFamilyHandle* column_family, 24 | const std::string& prefix); 25 | 26 | bool is_valid() override { 27 | if (!has_read_key) { 28 | bool is_valid = it->Valid(); 29 | if (!is_valid) { 30 | return false; 31 | } 32 | 33 | auto key = it->key(); 34 | std::string key_str = key.ToString(); 35 | if (key_str.compare(0, prefix.size(), prefix) != 0) { 36 | return false; 37 | } 38 | 39 | current_key = InvertedIndexKey(key_str); 40 | } 41 | 42 | has_read_key = true; 43 | return true; 44 | } 45 | 46 | void next() override { 47 | it->Next(); 48 | has_read_key = false; 49 | } 50 | 51 | InvertedIndexKey get_key() const override { 52 | return current_key; 53 | } 54 | 55 | string get_value() const override { 56 | return it->value().ToString(); 57 | } 58 | 59 | std::unique_ptr it; 60 | 61 | protected: 62 | lintdb::column_index_t cf; 63 | string prefix; 64 | string end_key; 65 | rocksdb::Slice prefix_slice; 66 | InvertedIndexKey current_key; 67 | 68 | bool has_read_key; 69 | }; 70 | 71 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/invlists/InvertedList.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_INVLISTS_INVERTED_LIST_H 2 | #define LINTDB_INVLISTS_INVERTED_LIST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lintdb/api.h" 8 | #include "lintdb/constants.h" 9 | #include "lintdb/invlists/ContextIterator.h" 10 | #include "lintdb/invlists/EncodedDocument.h" 11 | #include "lintdb/invlists/ForwardIndexIterator.h" 12 | #include "lintdb/invlists/Iterator.h" 13 | #include "lintdb/schema/Schema.h" 14 | 15 | namespace lintdb { 16 | /** 17 | * InvertedList manages the storage of centroid -> codes mappping. 18 | * 19 | * InvertedLists are expected to be fast. The more data stored in the 20 | * invertedList, the slower it will become. 21 | * 22 | * We also expect the InvertedList to manage a mapping of document -> centroids 23 | * to facilitate deletion. 24 | */ 25 | struct InvertedList { 26 | virtual void remove( 27 | const uint64_t tenant, 28 | std::vector ids, 29 | const uint8_t field, 30 | const DataType data_type, 31 | const std::vector field_types) = 0; 32 | virtual void merge( 33 | rocksdb::DB* db, 34 | std::vector& cfs) = 0; 35 | 36 | virtual std::unique_ptr get_iterator( 37 | const std::string& prefix) const = 0; 38 | 39 | virtual std::unique_ptr get_context_iterator( 40 | const uint64_t tenant, 41 | const uint8_t field_id) const = 0; 42 | 43 | virtual std::vector get_mapping(const uint64_t tenant, idx_t id) 44 | const = 0; 45 | 46 | virtual ~InvertedList() = default; 47 | }; 48 | 49 | /** 50 | * ForwardIndex helps retrieve document data from the index. 
51 | */ 52 | struct ForwardIndex { 53 | virtual std::vector> get_metadata( 54 | const uint64_t tenant, 55 | const std::vector& ids) const = 0; 56 | 57 | virtual void remove(const uint64_t tenant, std::vector ids) = 0; 58 | 59 | virtual void merge( 60 | rocksdb::DB* db, 61 | std::vector& cfs) = 0; 62 | 63 | virtual std::unique_ptr get_iterator( 64 | const uint64_t tenant, 65 | const idx_t inverted_list) const = 0; 66 | 67 | virtual ~ForwardIndex() = default; 68 | }; 69 | } // namespace lintdb 70 | 71 | #endif -------------------------------------------------------------------------------- /lintdb/invlists/Iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_INVLISTS_ITERATOR_H 2 | #define LINTDB_INVLISTS_ITERATOR_H 3 | 4 | #include 5 | #include "lintdb/invlists/KeyBuilder.h" 6 | 7 | namespace lintdb { 8 | struct Iterator { 9 | virtual bool is_valid() = 0; 10 | virtual void next() = 0; 11 | 12 | virtual InvertedIndexKey get_key() const = 0; 13 | virtual std::string get_value() const = 0; 14 | 15 | virtual ~Iterator() = default; 16 | }; 17 | } // namespace lintdb 18 | 19 | #endif -------------------------------------------------------------------------------- /lintdb/invlists/PostingData.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace lintdb { 7 | struct PostingData { 8 | std::string key; 9 | std::string value; 10 | }; 11 | 12 | struct BatchPostingData { 13 | std::vector inverted; 14 | PostingData forward; /// A single document has one entry in forward index 15 | std::vector context; 16 | std::vector inverted_mapping; 17 | }; 18 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/invlists/RocksdbForwardIndex.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_ROCKSDB_LIST_H 2 | #define LINTDB_ROCKSDB_LIST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "lintdb/constants.h" 12 | #include "lintdb/invlists/ForwardIndexIterator.h" 13 | #include "lintdb/invlists/InvertedList.h" 14 | #include "lintdb/invlists/Iterator.h" 15 | #include "lintdb/version.h" 16 | 17 | namespace lintdb { 18 | 19 | struct RocksdbForwardIndex : public ForwardIndex { 20 | RocksdbForwardIndex( 21 | std::shared_ptr db, 22 | std::vector& column_families, 23 | const Version& version); 24 | 25 | void remove(const uint64_t tenant, std::vector ids) override; 26 | 27 | void merge(rocksdb::DB* db, std::vector& cfs) 28 | override; 29 | 30 | std::vector> get_metadata( 31 | const uint64_t tenant, 32 | const std::vector& ids) const override; 33 | 34 | std::unique_ptr get_iterator( 35 | const uint64_t tenant, 36 | idx_t column_index) const override; 37 | 38 | protected: 39 | Version version; 40 | std::shared_ptr db_; 41 | std::vector& column_families; 42 | }; 43 | 44 | } // namespace lintdb 45 | 46 | #endif -------------------------------------------------------------------------------- /lintdb/invlists/RocksdbInvertedList.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_ROCKSDBINVERTEDLIST_H 2 | #define LINTDB_ROCKSDBINVERTEDLIST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "lintdb/constants.h" 12 | #include "lintdb/exception.h" 13 | #include "lintdb/invlists/ContextIterator.h" 14 | 
#include "lintdb/invlists/InvertedList.h" 15 | #include "lintdb/invlists/Iterator.h" 16 | #include "lintdb/version.h" 17 | 18 | namespace lintdb { 19 | 20 | /** 21 | * RocksdbInvertedList stores a slim version of the inverted list. There is no 22 | * data associated with each token, only the document id as part of the key. 23 | * 24 | * This inverted list is only capable of telling us what documents are 25 | * associated with what centroids. 26 | */ 27 | struct RocksdbInvertedList : public InvertedList { 28 | RocksdbInvertedList( 29 | std::shared_ptr db, 30 | std::vector& column_families, 31 | const Version& version); 32 | 33 | void remove( 34 | const uint64_t tenant, 35 | std::vector ids, 36 | const uint8_t field, 37 | const DataType data_type, 38 | const std::vector field_types) override; 39 | void merge(rocksdb::DB* db, std::vector& cfs) 40 | override; 41 | 42 | std::vector get_mapping(const uint64_t tenant, idx_t id) 43 | const override; 44 | 45 | [[nodiscard]] std::unique_ptr get_iterator( 46 | const std::string& prefix) const override; 47 | 48 | std::unique_ptr get_context_iterator( 49 | const uint64_t tenant, 50 | const uint8_t field_id) const override; 51 | 52 | protected: 53 | Version version; 54 | std::shared_ptr db_; 55 | std::vector& column_families; 56 | }; 57 | 58 | } // namespace lintdb 59 | 60 | #endif // LINTDB_ROCKSDBINVERTEDLIST_H 61 | -------------------------------------------------------------------------------- /lintdb/python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | cmake_minimum_required(VERSION 3.21 FATAL_ERROR) 4 | 5 | project(lintdb 6 | LANGUAGES CXX 7 | ) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | 11 | find_package(Python 12 | REQUIRED COMPONENTS Interpreter Development.Module 13 | OPTIONAL_COMPONENTS Development.SABIModule 14 | ) 15 | 16 | message("=== Selected Python Variables ===") 17 | message(STATUS "Python3_STDLIB: " ${Python_STDLIB}) 18 | message(STATUS "Python3_SITELIB: " ${Python_SITELIB}) 19 | message(STATUS "Python3_VERSION: " ${Python_VERSION}) 20 | message(STATUS "Python3_EXECUTABLE: " ${Python_EXECUTABLE}) 21 | 22 | if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 23 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) 24 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 25 | endif() 26 | 27 | add_subdirectory(${PROJECT_SOURCE_DIR}/../../third_party/nanobind nanobind EXCLUDE_FROM_ALL) 28 | find_package(nanobind CONFIG REQUIRED) 29 | 30 | nanobind_add_module( 31 | core 32 | STABLE_ABI 33 | pylintdb.cpp 34 | ) 35 | 36 | target_link_libraries(core PRIVATE 37 | lintdb_lib 38 | ) 39 | 40 | install(TARGETS core LIBRARY DESTINATION lintdb) 41 | 42 | # this doesn't work because python looks for typing_extensions and doesn't find it on py3.10 43 | #nanobind_add_stub( 44 | # lintdb_stub 45 | # INSTALL_TIME 46 | # MODULE core 47 | # OUTPUT core.pyi 48 | # PYTHON_PATH $ 49 | # DEPENDS core 50 | # VERBOSE 51 | #) 52 | # 53 | #install(FILES "core.pyi" DESTINATION lintdb) 54 | #install(FILES "py.typed" DESTINATION lintdb) -------------------------------------------------------------------------------- /lintdb/python/version.txt: -------------------------------------------------------------------------------- 1 | 0.2.0 -------------------------------------------------------------------------------- /lintdb/quantizers/Binarizer.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_BINARIZER_H 2 | #define LINTDB_BINARIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/api.h" 9 | #include "lintdb/quantizers/PQDistanceTables.h" 10 | #include "lintdb/quantizers/Quantizer.h" 11 | 12 | namespace lintdb { 13 | struct Binarizer : public Quantizer { 14 | std::vector bucket_cutoffs; 15 | std::vector bucket_weights; 16 | float avg_residual; 17 | size_t nbits; 18 | size_t dim; 19 | std::vector reverse_bitmap; 20 | std::vector decompression_lut; 21 | 22 | Binarizer(size_t nbits, size_t dim); 23 | 24 | Binarizer( 25 | const std::vector& bucket_cutoffs, 26 | const std::vector& bucket_weights, 27 | const float avg_residual, 28 | const size_t nbits, 29 | const size_t dim); 30 | 31 | // copy constructor 32 | Binarizer(const Binarizer& other); 33 | 34 | Binarizer& operator=(Binarizer other) { 35 | swap(*this, other); 36 | return *this; 37 | } 38 | 39 | std::vector binarize(const std::vector& residuals); 40 | void train(const size_t n, const float* x, const size_t dim) override; 41 | void save(const std::string path) override; 42 | 43 | void sa_encode(size_t n, const float* x, residual_t* codes) override; 44 | void sa_decode(size_t n, const residual_t* codes, float* x) override; 45 | size_t code_size() override; 46 | 47 | size_t get_nbits() override { 48 | return nbits; 49 | } 50 | 51 | static std::unique_ptr load(std::string path); 52 | 53 | QuantizerType get_type() override; 54 | 55 | friend void swap(Binarizer& first, Binarizer& second) { 56 | std::swap(first.bucket_cutoffs, second.bucket_cutoffs); 57 | std::swap(first.bucket_weights, second.bucket_weights); 58 | std::swap(first.avg_residual, second.avg_residual); 59 | std::swap(first.nbits, second.nbits); 60 | std::swap(first.dim, second.dim); 61 | std::swap(first.reverse_bitmap, second.reverse_bitmap); 62 | std::swap(first.decompression_lut, second.decompression_lut); 63 | } 64 | 65 | private: 66 | void calculate_quantiles(const std::vector& heldoout_residual); 67 | 68 | std::vector bucketize(const std::vector& residuals); 69 | std::vector packbits(const std::vector& binarized); 70 | std::vector unpackbits( 71 | const std::vector& packed, 72 | size_t dim, 73 | size_t nbits); 74 | // binarize takes in the residuals as floats, bucketizes them, and 
75 | // then returns the binarized version of the residuals.
76 | // the returned vector is of size dim * nbits.
77 | 
78 | std::vector create_reverse_bitmap();
79 | std::vector create_decompression_lut();
80 | };
81 | } // namespace lintdb
82 | 
83 | #endif
-------------------------------------------------------------------------------- /lintdb/quantizers/IdentityQuantizer.cpp: --------------------------------------------------------------------------------
1 | #include "IdentityQuantizer.h"
2 | #include <cstring>
3 | 
4 | namespace lintdb {
5 | void IdentityQuantizer::train(
6 | const size_t n,
7 | const float* x,
8 | const size_t dim) {}
9 | 
10 | void IdentityQuantizer::save(const std::string path) {}
11 | 
12 | void IdentityQuantizer::sa_encode(size_t n, const float* x, residual_t* codes) {
13 | // identity encoding: the codes are the raw float bytes. assigning the
14 | // parameter pointer itself is not visible to the caller, so copy the
15 | // bytes into the caller-provided buffer.
16 | std::memcpy(codes, x, n * dim * sizeof(float));
17 | }
18 | 
19 | void IdentityQuantizer::sa_decode(size_t n, const residual_t* codes, float* x) {
20 | // identity decoding: copy the stored bytes back out as floats.
21 | std::memcpy(x, codes, n * dim * sizeof(float));
22 | }
23 | 
24 | size_t IdentityQuantizer::code_size() {
25 | return dim * sizeof(float);
26 | }
27 | 
28 | size_t IdentityQuantizer::get_nbits() {
29 | return dim * sizeof(float);
30 | }
31 | 
32 | QuantizerType IdentityQuantizer::get_type() {
33 | return NONE;
34 | }
35 | } // namespace lintdb
-------------------------------------------------------------------------------- /lintdb/quantizers/IdentityQuantizer.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "lintdb/quantizers/Quantizer.h"
4 | 
5 | namespace lintdb {
6 | 
7 | class IdentityQuantizer : public Quantizer {
8 | public:
9 | IdentityQuantizer(size_t dim) : dim(dim) {}
10 | 
11 | void train(const size_t n, const float* x, const size_t dim) override;
12 | 
13 | void save(const std::string path) override;
14 | 
15 | void sa_encode(size_t n, const float* x, residual_t* codes) override;
16 | 
17 | void sa_decode(size_t n, const residual_t* codes, float* x) override;
18 | 
19 | size_t code_size() override;
20 | 
21 | size_t get_nbits() override;
22 | 
23 | QuantizerType get_type() override;
24 | 
25 | private:
26 | size_t dim;
27 | };
28 | 
29 | } // namespace lintdb
30 | 
-------------------------------------------------------------------------------- /lintdb/quantizers/InvertedListScanner.cpp: --------------------------------------------------------------------------------
1 | #include "InvertedListScanner.h"
2 | #include <glog/logging.h>
3 | #include <algorithm>
4 | #include <memory>
5 | #include <string>
6 | #include <vector>
7 | 
8 | namespace lintdb {
9 | InvertedListScanner::InvertedListScanner(
10 | std::shared_ptr& quantizer,
11 | const float* query_data,
12 | size_t num_tokens)
13 | : quantizer(quantizer), code_size(quantizer->code_size()) {
14 | distance_tables = quantizer->get_distance_tables(query_data, num_tokens);
15 | }
16 | 
17 | std::vector<ScoredPartialDocumentCodes> InvertedListScanner::scan(
18 | const idx_t key,
19 | const std::unique_ptr list_iterator,
20 | const std::vector<QueryTokenCentroidScore>& query_tokens_to_score) {
21 | std::vector<idx_t> query_token_ids;
22 | query_token_ids.reserve(query_tokens_to_score.size());
23 | for (const auto& q : query_tokens_to_score) {
24 | query_token_ids.push_back(q.query_token);
25 | }
26 | 
27 | std::vector<float> precomputed_distances;
28 | precomputed_distances.reserve(query_tokens_to_score.size());
29 | for (const auto& q : query_tokens_to_score) {
30 | precomputed_distances.push_back(q.distance);
31 | }
32 | 
33 | std::vector<ScoredPartialDocumentCodes> results;
34 | for (; list_iterator->has_next(); list_iterator->next()) {
35 | auto partial_codes = list_iterator->get_value();
36 | size_t num_tokens = partial_codes.partial_residuals.size() / code_size;
37 | if (num_tokens != 1) {
38 | LOG(WARNING)
39 | << "Codes found in inverted index are the wrong size. residual size: "
40 | << partial_codes.partial_residuals.size()
41 | << " code size: " << code_size;
42 | }
43 | 
44 | ScoredPartialDocumentCodes doc_results;
45 | auto token_key = list_iterator->get_key();
46 | doc_results.doc_id = token_key.doc_id;
47 | doc_results.doc_token_id = token_key.token_id;
48 | 
49 | auto scores = distance_tables->calculate_query_distances(
50 | query_token_ids,
51 | precomputed_distances,
52 | partial_codes.partial_residuals);
53 | 
54 | for (idx_t i = 0; i < scores.size(); i++) {
55 | doc_results.query_token_id = query_token_ids[i];
56 | doc_results.score = scores[i];
57 | // record one entry per query token for this document token.
58 | results.push_back(doc_results);
59 | }
60 | }
61 | 
62 | return results;
63 | }
64 | 
65 | } // namespace lintdb
-------------------------------------------------------------------------------- /lintdb/quantizers/InvertedListScanner.h: --------------------------------------------------------------------------------
1 | #ifndef LINTDB_INVERTEDLISTSCANNER_H
2 | #define LINTDB_INVERTEDLISTSCANNER_H
3 | 
4 | #include <memory>
5 | #include <string>
6 | #include <vector>
7 | #include "lintdb/api.h"
8 | #include "lintdb/invlists/EncodedDocument.h"
9 | #include "lintdb/invlists/Iterator.h"
10 | #include "lintdb/quantizers/PQDistanceTables.h"
11 | #include "lintdb/quantizers/Quantizer.h"
12 | #include "ProductEncoder.h"
13 | 
14 | namespace lintdb {
15 | 
16 | /**
17 | * ScoredPartialDocumentCodes holds per-token scores to help calculate
18 | * sum-of-max scores.
19 | * 
20 | * Each token in a document is scored across the query tokens, and we want to
21 | * keep the max score per query token.
22 | */
23 | struct ScoredPartialDocumentCodes {
24 | idx_t doc_id;
25 | idx_t doc_token_id;
26 | idx_t query_token_id;
27 | float score;
28 | 
29 | ScoredPartialDocumentCodes() = default;
30 | 
31 | ScoredPartialDocumentCodes(
32 | idx_t doc_id,
33 | idx_t doc_token_id,
34 | idx_t query_token_id,
35 | float score)
36 | : doc_id(doc_id),
37 | doc_token_id(doc_token_id),
38 | query_token_id(query_token_id),
39 | score(score) {}
40 | };
41 | 
42 | /**
43 | * QueryTokenCentroidScore holds the distance between a query token and a
44 | * centroid.
45 | *
46 | * This is passed to scan to help calculate the score of a token.
47 | *
48 | */
49 | struct QueryTokenCentroidScore {
50 | idx_t query_token;
51 | idx_t centroid_id;
52 | float distance;
53 | };
54 | 
55 | /**
56 | * InvertedListScanner helps us scan through an inverted list and score the
57 | * results.
58 | * 
59 | * The score is going to be a calculation between the stored codes, the
60 | * centroid, and the query.
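 * Concretely, each returned score is the precomputed query-token-to-centroid
 * distance plus a PQ table lookup for the stored residual codes (see
 * PQDistanceTables).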
61 | */ 62 | class InvertedListScanner { 63 | public: 64 | InvertedListScanner( 65 | std::shared_ptr& quantizer, 66 | const float* query_data, 67 | size_t num_tokens); 68 | 69 | std::vector scan( 70 | idx_t key, 71 | const std::unique_ptr list_iterator, 72 | const std::vector& query_tokens_to_score); 73 | 74 | private: 75 | std::unique_ptr distance_tables; 76 | std::shared_ptr quantizer; 77 | size_t code_size; 78 | }; 79 | 80 | } // namespace lintdb 81 | 82 | #endif // LINTDB_INVERTEDLISTSCANNER_H 83 | -------------------------------------------------------------------------------- /lintdb/quantizers/PQDistanceTables.cpp: -------------------------------------------------------------------------------- 1 | #include "PQDistanceTables.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "lintdb/assert.h" 8 | 9 | namespace lintdb { 10 | PQDistanceTables::PQDistanceTables( 11 | const float* query_data, 12 | size_t num_tokens, 13 | size_t dim, 14 | const std::shared_ptr ipq, 15 | bool is_ip) 16 | : ipq(ipq), is_ip(is_ip), dim(dim) { 17 | // right now, we only support IP. 18 | LINTDB_THROW_IF_NOT(ipq->metric_type == faiss::METRIC_INNER_PRODUCT); 19 | 20 | for (size_t i = 0; i < num_tokens; i++) { 21 | std::vector distance_table(ipq->pq.M * ipq->pq.ksub); 22 | ipq->pq.compute_inner_prod_table( 23 | query_data + i * dim, distance_table.data()); 24 | distance_tables.push_back(distance_table); 25 | } 26 | } 27 | 28 | std::vector PQDistanceTables::calculate_query_distances( 29 | const std::vector& query_tokens_to_score, 30 | const std::vector& precomputed_distances, 31 | const std::vector& codes) { 32 | std::vector results(precomputed_distances); 33 | // use the distance to the centroid as a precomputed distance. 34 | // we'll then add the distance from the centroid to the document code. 35 | for (int j = 0; j < query_tokens_to_score.size(); j++) { 36 | auto query_token_id = query_tokens_to_score[j]; 37 | auto sim_table = distance_tables[query_token_id]; 38 | float score = faiss::distance_single_code( 39 | ipq->pq, sim_table.data(), codes.data()); 40 | results[j] += score; 41 | } 42 | return results; 43 | } 44 | 45 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/PQDistanceTables.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_PQDISTANCETABLES_H 2 | #define LINTDB_PQDISTANCETABLES_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/api.h" 9 | 10 | namespace faiss { 11 | struct IndexPQ; 12 | } 13 | 14 | namespace lintdb { 15 | 16 | /** 17 | * PQDistanceTables calculates scores for a given query token and doc token. 18 | * 19 | * This class holds all of the compute logic and returns the pieces of the 20 | * calculation to InvertedListScanner. 21 | * 22 | * This class also knows about the IndexPQ internals, and should be owned by 23 | * the quantizer. In the future, this will move inside ProductEncoder. 24 | */ 25 | class PQDistanceTables { 26 | public: 27 | PQDistanceTables( 28 | const float* query_data, 29 | size_t num_tokens, 30 | size_t dim, 31 | std::shared_ptr ipq, 32 | bool is_ip = true); 33 | 34 | /** 35 | * precompute_list_tables precomputes the distance to the list's centroid 36 | * using the quantizer. We store the initial distance to each query token. 
37 | */ 38 | // std::vector precompute_list_tables(const std::vector& 39 | // query_token_ids); 40 | 41 | std::vector calculate_query_distances( 42 | const std::vector& query_tokens_to_score, 43 | const std::vector& precomputed_distances, 44 | const std::vector& codes); 45 | 46 | private: 47 | std::vector> distance_tables; 48 | std::shared_ptr ipq; 49 | bool is_ip; 50 | size_t dim; 51 | }; 52 | 53 | } // namespace lintdb 54 | 55 | #endif // LINTDB_PQDISTANCETABLES_H 56 | -------------------------------------------------------------------------------- /lintdb/quantizers/ProductEncoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "lintdb/quantizers/PQDistanceTables.h" 5 | #include "lintdb/quantizers/Quantizer.h" 6 | 7 | namespace faiss { 8 | struct IndexPQ; 9 | } 10 | 11 | namespace lintdb { 12 | struct ProductEncoder : public Quantizer{ 13 | std::shared_ptr pq; 14 | size_t nbits; // number of bits used in binarizing the residuals. 15 | size_t dim; // number of dimensions per embedding. 16 | size_t dsub; // dimensionality of each subvector; 17 | size_t ksub; // number of centroids per subquantizer. 18 | size_t num_subquantizers; 19 | 20 | /// This table is used to precompute the inner product between the centroids 21 | /// of the PQ quantizer. 22 | std::vector precomputed_table; 23 | 24 | ProductEncoder(size_t dim, size_t nbits, size_t num_subquantizers); 25 | 26 | ProductEncoder(const ProductEncoder& other); 27 | 28 | friend void swap(ProductEncoder& lhs, ProductEncoder& rhs); 29 | 30 | ProductEncoder& operator=(ProductEncoder& other) { 31 | swap(*this, other); 32 | return *this; 33 | } 34 | 35 | bool is_trained = false; 36 | 37 | void sa_encode(size_t n, const float* x, residual_t* codes) override; 38 | void sa_decode(size_t n, const residual_t* codes, float* x) override; 39 | size_t code_size() override; 40 | 41 | size_t get_nbits() override { 42 | return nbits; 43 | } 44 | 45 | // Compute the inner product table for the given embeddings. 46 | // This currently wraps the underlying faiss PQ index. 47 | std::unique_ptr get_distance_tables( 48 | const float* query_data, 49 | size_t num_tokens) const; 50 | 51 | void save(const std::string path) override; 52 | 53 | static std::unique_ptr load( 54 | std::string path, 55 | QuantizerConfig& config); 56 | 57 | void train(const size_t n, const float* embeddings, const size_t dim) 58 | override; 59 | 60 | QuantizerType get_type() override; 61 | }; 62 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/Quantizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "lintdb/api.h" 6 | #include "lintdb/quantizers/PQDistanceTables.h" 7 | 8 | namespace lintdb { 9 | static const std::string QUANTIZER_FILENAME = "_residual_quantizer.bin"; 10 | static const std::string LEGACY_QUANTIZER_FILENAME = "_binarizer.bin"; 11 | 12 | enum QuantizerType { 13 | UNKNOWN, 14 | NONE, 15 | BINARIZER, 16 | PRODUCT_ENCODER, 17 | }; 18 | 19 | struct QuantizerConfig { 20 | size_t nbits; 21 | size_t dim; 22 | size_t num_subquantizers; // used in ProductEncoder 23 | }; 24 | /** 25 | * Quantizer is responsible for vector encoding. Unlike the Encoder, this isn't 26 | * responsible for IVF assignment. 
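 * The typical lifecycle is: train() once over sample vectors, then
 * sa_encode() to produce codes at write time and sa_decode() to
 * (approximately) reconstruct vectors at read time.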
27 | */ 28 | struct Quantizer { 29 | virtual void train(const size_t n, const float* x, const size_t dim) = 0; 30 | virtual void save(const std::string path) = 0; 31 | 32 | virtual void sa_encode(size_t n, const float* x, residual_t* codes) = 0; 33 | virtual void sa_decode(size_t n, const residual_t* codes, float* x) = 0; 34 | virtual size_t code_size() = 0; 35 | 36 | virtual size_t get_nbits() = 0; 37 | 38 | virtual QuantizerType get_type() = 0; 39 | 40 | virtual ~Quantizer() = default; 41 | }; 42 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/impl/kmeans.cpp: -------------------------------------------------------------------------------- 1 | #include "lintdb/quantizers/impl/kmeans.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/assert.h" 9 | 10 | namespace lintdb { 11 | std::vector kmeans( 12 | const float* data, 13 | size_t n, 14 | size_t dim, 15 | size_t k, 16 | Metric metric, 17 | int iterations) { 18 | LINTDB_THROW_IF_NOT_MSG( 19 | n > k, 20 | "Number of data points must be greater than the number of clusters."); 21 | 22 | LOG(INFO) << "clustering " << n << " points in " << dim 23 | << " dimensions into " << k << " clusters."; 24 | 25 | faiss::IndexFlatIP index(dim); 26 | faiss::ClusteringParameters cp; 27 | cp.niter = iterations; 28 | cp.nredo = 1; 29 | cp.verbose = true; 30 | faiss::Clustering clus(dim, k, cp); 31 | 32 | clus.train(n, data, index); 33 | 34 | return std::vector(index.get_xb(), index.get_xb() + k * dim); 35 | } 36 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/impl/kmeans.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_KMEANS_H 2 | #define LINTDB_KMEANS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace lintdb { 10 | 11 | enum class Metric { EUCLIDEAN, INNER_PRODUCT }; 12 | // Helper function for Euclidean distance 13 | inline float euclidean_distance( 14 | gsl::span a, 15 | gsl::span b) { 16 | float sum = 0.0f; 17 | for (size_t i = 0; i < a.size(); ++i) { 18 | float diff = a[i] - b[i]; 19 | sum += diff * diff; 20 | } 21 | return std::sqrt(sum); 22 | } 23 | 24 | // Helper function for inner product 25 | inline float inner_product(gsl::span a, gsl::span b) { 26 | size_t size = a.size(); 27 | size_t i = 0; 28 | float result = 0.0f; 29 | 30 | // Use manual loop unrolling for better performance 31 | for (; i + 4 <= size; i += 4) { 32 | result += a[i] * b[i]; 33 | result += a[i + 1] * b[i + 1]; 34 | result += a[i + 2] * b[i + 2]; 35 | result += a[i + 3] * b[i + 3]; 36 | } 37 | 38 | // Process remaining elements 39 | for (; i < size; ++i) { 40 | result += a[i] * b[i]; 41 | } 42 | 43 | return result; 44 | } 45 | 46 | inline float inner_product(std::vector& a, std::vector& b) { 47 | size_t size = a.size(); 48 | size_t i = 0; 49 | float result = 0.0f; 50 | 51 | // Use manual loop unrolling for better performance 52 | for (; i + 4 <= size; i += 4) { 53 | result += a[i] * b[i]; 54 | result += a[i + 1] * b[i + 1]; 55 | result += a[i + 2] * b[i + 2]; 56 | result += a[i + 3] * b[i + 3]; 57 | } 58 | 59 | // Process remaining elements 60 | for (; i < size; ++i) { 61 | result += a[i] * b[i]; 62 | } 63 | 64 | return result; 65 | } 66 | 67 | // K-means clustering for a single sub-vector 68 | std::vector kmeans( 69 | const float* data, 70 | size_t n, 71 | size_t dim, 72 | size_t k, 73 | Metric metric, 
74 | int iterations = 100); 75 | 76 | } // namespace lintdb 77 | 78 | #endif // LINTDB_KMEANS_H 79 | -------------------------------------------------------------------------------- /lintdb/quantizers/io.cpp: -------------------------------------------------------------------------------- 1 | #include "lintdb/quantizers/io.h" 2 | #include 3 | 4 | namespace lintdb { 5 | std::unique_ptr load_quantizer( 6 | std::string path, 7 | QuantizerType type, 8 | QuantizerConfig& config) { 9 | if (type == QuantizerType::NONE) { 10 | // the file won't exist, so we check NONE first. 11 | return std::make_unique(config.dim); 12 | } 13 | 14 | if (FILE* file = fopen((path).c_str(), "r")) { 15 | fclose(file); 16 | switch (type) { 17 | case QuantizerType::NONE: 18 | return std::make_unique(config.dim); 19 | case QuantizerType::BINARIZER: 20 | return Binarizer::load(path); 21 | 22 | case QuantizerType::PRODUCT_ENCODER: 23 | return ProductEncoder::load(path, config); 24 | 25 | default: 26 | throw LintDBException("Quantizer type not valid."); 27 | } 28 | return ProductEncoder::load(path, config); 29 | } else { 30 | throw LintDBException("Quantizer not found at path: " + path); 31 | } 32 | } 33 | 34 | void save_quantizer(std::string path, Quantizer* quantizer) { 35 | if (quantizer == nullptr) { 36 | return; 37 | } 38 | 39 | switch (quantizer->get_type()) { 40 | case QuantizerType::NONE: 41 | break; 42 | case QuantizerType::BINARIZER: 43 | quantizer->save(path); 44 | break; 45 | 46 | case QuantizerType::PRODUCT_ENCODER: 47 | quantizer->save(path); 48 | break; 49 | 50 | default: 51 | throw LintDBException("Quantizer type not valid."); 52 | } 53 | } 54 | 55 | std::unique_ptr create_quantizer( 56 | QuantizerType type, 57 | QuantizerConfig& config) { 58 | switch (type) { 59 | case QuantizerType::NONE: 60 | return std::make_unique(config.dim); 61 | ; 62 | 63 | case QuantizerType::BINARIZER: 64 | return std::make_unique(config.nbits, config.dim); 65 | 66 | case QuantizerType::PRODUCT_ENCODER: 67 | return std::make_unique( 68 | config.dim, config.nbits, config.num_subquantizers); 69 | 70 | default: 71 | throw LintDBException("Quantizer type not valid."); 72 | } 73 | } 74 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/quantizers/io.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_QUANTIZERS_IO_H 2 | #define LINTDB_QUANTIZERS_IO_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lintdb/exception.h" 8 | #include "lintdb/quantizers/Binarizer.h" 9 | #include "lintdb/quantizers/IdentityQuantizer.h" 10 | #include "lintdb/quantizers/ProductEncoder.h" 11 | #include "lintdb/quantizers/Quantizer.h" 12 | #include "lintdb/SearchOptions.h" 13 | 14 | namespace lintdb { 15 | std::unique_ptr load_quantizer( 16 | std::string path, 17 | QuantizerType type, 18 | QuantizerConfig& config); 19 | 20 | void save_quantizer(std::string path, Quantizer* quantizer); 21 | 22 | std::unique_ptr create_quantizer( 23 | QuantizerType type, 24 | QuantizerConfig& config); 25 | } // namespace lintdb 26 | 27 | #endif -------------------------------------------------------------------------------- /lintdb/query/DocValue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "lintdb/api.h" 5 | #include "lintdb/invlists/Iterator.h" 6 | #include "lintdb/schema/DataTypes.h" 7 | 8 | namespace lintdb { 9 | /** 10 | * DocValue is a simple struct that holds a field 
value and the field id. 11 | * 12 | * It is the job of the caller to ensure that the field is valid, because this 13 | * class has no concept of what the field should look like. 14 | */ 15 | struct DocValue { 16 | lintdb::SupportedTypes value; 17 | uint8_t field_id; 18 | DataType type; 19 | bool unread_value = 20 | false; /// ColBERT fields do not have their values decoded from the 21 | /// index. We check this flag so that 22 | /// we can throw an exception if the user tries to access the value. 23 | 24 | DocValue(SupportedTypes value, uint8_t field_id, DataType type) 25 | : value(std::move(value)), field_id(field_id), type(type) {} 26 | 27 | SupportedTypes get_value() const { 28 | if (unread_value) { 29 | throw LintDBException( 30 | "Document's value was not decoded from the index. This is likely because a ColBERT field was read"); 31 | } 32 | return value; 33 | } 34 | }; 35 | 36 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/query/KnnNearestCentroids.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/assert.h" 7 | #include "lintdb/quantizers/CoarseQuantizer.h" 8 | 9 | namespace lintdb { 10 | 11 | struct QueryTensor { 12 | const std::vector& query; 13 | size_t num_query_tokens; 14 | }; 15 | 16 | class KnnNearestCentroids { 17 | public: 18 | KnnNearestCentroids() = default; 19 | void calculate( 20 | std::vector& query, 21 | const size_t num_query_tokens, 22 | const std::shared_ptr quantizer, 23 | const size_t total_centroids_to_calculate); 24 | 25 | std::vector> get_top_centroids( 26 | const size_t k_top_centroids, /// k centroids per token to consider. 27 | const size_t n_probe /// overall number of centroids to return. 28 | ) const; 29 | 30 | inline std::vector get_distances() const { 31 | return distances; 32 | } 33 | inline std::vector get_indices() const { 34 | return coarse_idx; 35 | } 36 | 37 | inline size_t get_num_centroids() const { 38 | return num_centroids; 39 | } 40 | 41 | /// Returns the top centroid id for the idx-th token. 42 | inline idx_t get_assigned_centroid(size_t idx) const { 43 | return coarse_idx[idx * total_centroids_to_calculate]; 44 | } 45 | 46 | inline const std::vector& get_reordered_distances() const { 47 | return reordered_distances; 48 | } 49 | 50 | inline bool is_valid() const { 51 | // this works because we don't set num_centroids until we have 52 | // calculated them. 53 | return num_centroids > 0; 54 | } 55 | 56 | inline QueryTensor get_query_tensor() const { 57 | LINTDB_THROW_IF_NOT_MSG(!query.empty(), "query is empty"); 58 | return {query, num_query_tokens}; 59 | } 60 | 61 | private: 62 | std::vector query; 63 | size_t num_query_tokens; 64 | size_t num_centroids; 65 | size_t total_centroids_to_calculate; 66 | std::vector> top_centroids; 67 | std::vector distances; 68 | std::vector coarse_idx; 69 | std::vector reordered_distances; /// distances that match the 70 | /// centroid id position. 
71 | }; 72 | 73 | } // namespace lintdb 74 | -------------------------------------------------------------------------------- /lintdb/query/Query.cpp: -------------------------------------------------------------------------------- 1 | #include "Query.h" 2 | 3 | namespace lintdb { 4 | 5 | Query::Query(std::unique_ptr root) : root(std::move(root)) {} 6 | 7 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/query/Query.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "QueryNode.h" 4 | 5 | namespace lintdb { 6 | struct Query { 7 | public: 8 | Query(std::unique_ptr root); 9 | 10 | std::unique_ptr root; 11 | }; 12 | 13 | } // namespace lintdb 14 | -------------------------------------------------------------------------------- /lintdb/query/QueryContext.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "lintdb/invlists/InvertedList.h" 6 | #include "lintdb/quantizers/CoarseQuantizer.h" 7 | #include "lintdb/quantizers/Quantizer.h" 8 | #include "lintdb/query/KnnNearestCentroids.h" 9 | #include "lintdb/schema/FieldMapper.h" 10 | 11 | namespace lintdb { 12 | 13 | class QueryContext { 14 | public: 15 | const std::string colbert_context; 16 | 17 | explicit QueryContext( 18 | const uint64_t tenant, 19 | const std::string colbert_field, 20 | const std::shared_ptr invertedList, 21 | const std::shared_ptr fieldMapper, 22 | const std::unordered_map< 23 | std::string, 24 | std::shared_ptr>& coarse_quantizer_map, 25 | const std::unordered_map>& 26 | quantizer_map) 27 | : colbert_context(colbert_field), 28 | tenant(tenant), 29 | db_(invertedList), 30 | fieldMapper_(fieldMapper), 31 | coarse_quantizer_map(coarse_quantizer_map), 32 | quantizer_map(quantizer_map) {} 33 | 34 | inline std::shared_ptr getFieldMapper() const { 35 | return fieldMapper_; 36 | } 37 | 38 | inline std::shared_ptr getIndex() const { 39 | return db_; 40 | } 41 | 42 | inline uint64_t getTenant() const { 43 | return tenant; 44 | } 45 | 46 | inline std::shared_ptr getCoarseQuantizer( 47 | const std::string& field) const { 48 | return coarse_quantizer_map.at(field); 49 | } 50 | 51 | inline std::shared_ptr getQuantizer( 52 | const std::string& field) const { 53 | return quantizer_map.at(field); 54 | } 55 | 56 | inline std::shared_ptr getOrCreateNearestCentroids( 57 | const std::string& field) { 58 | if (knnNearestCentroidsMap.find(field) == 59 | knnNearestCentroidsMap.end()) { 60 | auto knnNearestCentroids = std::make_shared(); 61 | knnNearestCentroidsMap.insert( 62 | {field, std::move(knnNearestCentroids)}); 63 | } 64 | return knnNearestCentroidsMap.at(field); 65 | } 66 | 67 | inline void setNearestCentroids( 68 | const std::string& field, 69 | std::shared_ptr knnNearestCentroids) { 70 | knnNearestCentroidsMap.insert({field, knnNearestCentroids}); 71 | } 72 | 73 | private: 74 | const uint64_t tenant; 75 | const std::shared_ptr db_; 76 | const std::shared_ptr fieldMapper_; 77 | const std::unordered_map>& 78 | coarse_quantizer_map; 79 | const std::unordered_map>& 80 | quantizer_map; 81 | std::unordered_map> 82 | knnNearestCentroidsMap; 83 | }; 84 | 85 | } // namespace lintdb 86 | -------------------------------------------------------------------------------- /lintdb/query/QueryExecutor.cpp: -------------------------------------------------------------------------------- 1 | #include "QueryExecutor.h" 2 | #include 3 | 
 3 | #include <glog/logging.h>
 4 | #include "decode.h"
 5 | #include "DocIterator.h"
 6 | #include "DocValue.h"
 7 | #include "lintdb/query/KnnNearestCentroids.h"
 8 | #include "lintdb/scoring/ContextCollector.h"
 9 | #include "lintdb/scoring/ScoredDocument.h"
10 |
11 | namespace lintdb {
12 | QueryExecutor::QueryExecutor(Scorer& ranker) : ranker(ranker) {}
13 |
14 | std::vector<ScoredDocument> QueryExecutor::execute(
15 |         QueryContext& context,
16 |         const Query& query,
17 |         const size_t num_results,
18 |         const SearchOptions& opts) {
19 |     std::unique_ptr<DocIterator> doc_it = query.root->process(context, opts);
20 |
21 |     std::vector<std::pair<idx_t, std::vector<DocValue>>> documents;
22 |     for (; doc_it->is_valid(); doc_it->advance()) {
23 |         std::vector<DocValue> dvs = doc_it->fields();
24 |
25 |         documents.emplace_back(doc_it->doc_id(), dvs);
26 |     }
27 |
28 |     std::vector<ScoredDocument> results(documents.size());
29 | #pragma omp parallel for if (documents.size() > 100)
30 |     for (int i = 0; i < static_cast<int>(documents.size()); i++) {
31 |         auto doc = documents[i];
32 |         // for (auto& dv : doc.second) {
33 |         //     // ColBERT is a special case where we don't have a value to
34 |         //     // decode.
35 |         //     if (dv.unread_value) {
36 |         //         continue;
37 |         //     }
38 |         //     dv = decode_vectors(context, dv);
39 |         // }
40 |         ScoredDocument scored = doc_it->score(doc.second);
41 |         scored.doc_id = doc.first;
42 |
43 |         if (opts.expected_id != -1 && doc.first == opts.expected_id) {
44 |             LOG(INFO) << "\tscore: " << scored.score;
45 |         }
46 |
47 |         results[i] = scored;
48 |     } // end for
49 |
50 |     std::sort(results.begin(), results.end(), std::greater<>());
51 |
52 |     size_t num_to_rank = std::min(results.size(), opts.num_second_pass);
53 |
54 |     std::vector<ScoredDocument> top_results_ranked(num_to_rank);
55 |     for (size_t i = 0; i < num_to_rank; i++) {
56 |         top_results_ranked[i] = ranker.score(
57 |                 context, results[i].doc_id, results[i].values);
58 |     }
59 |
60 |     std::sort(
61 |             top_results_ranked.begin(),
62 |             top_results_ranked.end(),
63 |             std::greater<>());
64 |
65 |     // return num_results from top_results_ranked
66 |     std::vector<ScoredDocument> final_results;
67 |     for (size_t i = 0; i < num_results && i < top_results_ranked.size(); i++) {
68 |         final_results.push_back(top_results_ranked[i]);
69 |     }
70 |
71 |     return final_results;
72 | }
73 |
74 | } // namespace lintdb
--------------------------------------------------------------------------------
/lintdb/query/QueryExecutor.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 |
 3 | #include "lintdb/invlists/RocksdbInvertedList.h"
 4 | #include "lintdb/query/DocValue.h"
 5 | #include "lintdb/schema/FieldMapper.h"
 6 | #include "lintdb/scoring/Scorer.h"
 7 | #include "lintdb/SearchOptions.h"
 8 | #include "lintdb/SearchResult.h"
 9 | #include "Query.h"
10 | #include "QueryContext.h"
11 | #include "lintdb/scoring/ScoredDocument.h"
12 |
13 | namespace lintdb {
14 | /**
15 |  * QueryExecutor helps manage the execution of queries.
16 |  *
17 |  * The basic flow of retrieval is:
18 |  * 1. Optimize the query.
19 |  * 2. Translate the query into a series of document iterators.
20 |  * 3. Scan those iterators to retrieve the right documents.
21 |  * 4. Score the documents.
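 * 5. Re-rank the best candidates with the second-pass scorer.
 *
 * A sketch of driving the executor (index and context setup are assumed):
 *
 *   PlaidScorer ranker(context);
 *   QueryExecutor executor(ranker);
 *   std::vector<ScoredDocument> results =
 *           executor.execute(context, query, /*num_results=*/10, opts);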
22 | * 23 | */ 24 | class QueryExecutor { 25 | public: 26 | QueryExecutor(Scorer& ranker); 27 | 28 | std::vector execute( 29 | QueryContext& context, 30 | const Query& query, 31 | const size_t num_results, 32 | const SearchOptions& opts); 33 | 34 | private: 35 | Scorer& ranker; 36 | }; 37 | 38 | } // namespace lintdb 39 | -------------------------------------------------------------------------------- /lintdb/query/decode.cpp: -------------------------------------------------------------------------------- 1 | #include "decode.h" 2 | #include 3 | #include 4 | #include "DocValue.h" 5 | #include "lintdb/schema/DataTypes.h" 6 | 7 | namespace lintdb { 8 | DocValue decode_vectors( 9 | const lintdb::QueryContext& context, 10 | const lintdb::DocValue& doc_value) { 11 | if (doc_value.unread_value) { 12 | return doc_value; 13 | } 14 | switch (doc_value.type) { 15 | case lintdb::QUANTIZED_TENSOR: { 16 | std::string field = 17 | context.getFieldMapper()->getFieldName(doc_value.field_id); 18 | // check if field has a quantizer. 19 | if (!context.getQuantizer(field)) { 20 | return doc_value; 21 | } 22 | 23 | auto quantizer = context.getQuantizer(field); 24 | 25 | std::vector quantized = 26 | std::get(doc_value.get_value()); 27 | size_t dim = context.getFieldMapper()->getFieldDimensions( 28 | doc_value.field_id); 29 | size_t num_vectors = quantized.size() / quantizer->code_size(); 30 | 31 | std::vector tensor(num_vectors * dim, 0); 32 | quantizer->sa_decode(num_vectors, quantized.data(), tensor.data()); 33 | 34 | return {tensor, doc_value.field_id, lintdb::TENSOR}; 35 | } 36 | default: 37 | return doc_value; 38 | } 39 | } 40 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/query/decode.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DocValue.h" 4 | #include "QueryContext.h" 5 | 6 | namespace lintdb { 7 | /** 8 | * decode_vectors manages the decoding of vectors from the doc_value. All 9 | * tensors become QuantizedTensor values going into the index, and we need to 10 | * decode any tensors that have an associated quantizer. 
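 *
 * A sketch (assumes `dv` came from a doc iterator over a quantized field):
 *
 *   DocValue decoded = decode_vectors(context, dv);
 *   // decoded.type is TENSOR if the field had a quantizer; otherwise the
 *   // value is returned unchanged.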
11 | * 12 | * @param context 13 | * @param doc_value 14 | * @return 15 | */ 16 | DocValue decode_vectors(const QueryContext& context, const DocValue& doc_value); 17 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/schema/DocEncoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/invlists/PostingData.h" 7 | #include "lintdb/schema/DataTypes.h" 8 | #include "lintdb/schema/ProcessedData.h" 9 | #include "lintdb/api.h" 10 | 11 | namespace lintdb { 12 | 13 | class DocEncoder { 14 | public: 15 | static std::vector encode_inverted_data( 16 | const ProcessedData& data, 17 | size_t code_size); 18 | 19 | static PostingData encode_forward_data( 20 | const std::vector& data); 21 | 22 | static PostingData encode_context_data(const ProcessedData& data); 23 | 24 | static std::vector encode_inverted_mapping_data( 25 | const ProcessedData& data); 26 | 27 | static SupportedTypes decode_supported_types(std::string& data); 28 | 29 | static std::map decode_forward_data( 30 | std::string& data); 31 | 32 | static std::vector decode_inverted_mapping_data(std::string& data); 33 | }; 34 | 35 | } // namespace lintdb 36 | -------------------------------------------------------------------------------- /lintdb/schema/DocProcessor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/invlists/IndexWriter.h" 7 | #include "lintdb/quantizers/CoarseQuantizer.h" 8 | #include "lintdb/quantizers/Quantizer.h" 9 | #include "lintdb/schema/DataTypes.h" 10 | #include "lintdb/schema/Document.h" 11 | #include "lintdb/schema/FieldMapper.h" 12 | #include "lintdb/schema/ProcessedData.h" 13 | #include "lintdb/schema/Schema.h" 14 | 15 | namespace lintdb { 16 | 17 | class DocumentProcessor { 18 | public: 19 | DocumentProcessor( 20 | const Schema& schema, 21 | const std::unordered_map>& 22 | quantizer_map, 23 | const std::unordered_map< 24 | std::string, 25 | std::shared_ptr>& coarse_quantizer_map, 26 | const std::shared_ptr field_mapper, 27 | std::unique_ptr index_writer); 28 | void processDocument(const uint64_t tenant, const Document& document); 29 | 30 | private: 31 | static void validateField(const Field& field, const FieldValue& value); 32 | FieldValue quantizeField(const Field& field, const FieldValue& value); 33 | std::vector assignIVFCentroids( 34 | const Field& field, 35 | const FieldValue& value); 36 | 37 | Schema schema; 38 | std::unordered_map field_map; 39 | const std::shared_ptr field_mapper; 40 | // each tensor/tensor_array field has a quantizer 41 | const std::unordered_map>& 42 | quantizer_map; 43 | const std::unordered_map>& 44 | coarse_quantizer_map; 45 | 46 | std::unique_ptr index_writer; 47 | }; 48 | 49 | } // namespace lintdb 50 | -------------------------------------------------------------------------------- /lintdb/schema/Document.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "DataTypes.h" 5 | 6 | namespace lintdb { 7 | /** 8 | * Documents hold data as they are passed into the database from the user. 9 | * 10 | * Each Document must have a unique id. 
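 *
 * JSON round-trip sketch (the tensor-style FieldValue constructor used
 * elsewhere in this repo is assumed; values are illustrative):
 *
 *   Tensor embedding(128, 0.1f);
 *   Document doc(1, {FieldValue("colbert", embedding, 1)});
 *   Json::Value j = doc.toJson();
 *   Document restored = Document::fromJson(j);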
11 | */ 12 | struct Document { 13 | std::vector fields; 14 | idx_t id; /// the unique id of the document 15 | 16 | Document(idx_t id, const std::vector& fields) 17 | : fields(fields), id(id) {} 18 | 19 | Json::Value toJson() const { 20 | Json::Value root; 21 | root["id"] = static_cast(id); 22 | 23 | Json::Value fieldsArray(Json::arrayValue); 24 | for (const auto &field : fields) { 25 | fieldsArray.append(field.toJson()); 26 | } 27 | root["fields"] = fieldsArray; 28 | 29 | return root; 30 | } 31 | 32 | static Document fromJson(const Json::Value &json) { 33 | idx_t id = json["id"].asInt64(); 34 | 35 | std::vector fields; 36 | const Json::Value &fieldsArray = json["fields"]; 37 | for (const auto &fieldJson : fieldsArray) { 38 | fields.push_back(FieldValue::fromJson(fieldJson)); 39 | } 40 | 41 | return Document(id, fields); 42 | } 43 | }; 44 | 45 | } // namespace lintdb 46 | -------------------------------------------------------------------------------- /lintdb/schema/FieldMapper.cpp: -------------------------------------------------------------------------------- 1 | #include "FieldMapper.h" 2 | #include 3 | 4 | namespace lintdb { 5 | std::shared_ptr FieldMapper::fromJson(const Json::Value& json) { 6 | std::shared_ptr mapper = std::make_shared(); 7 | int highest_id = 0; 8 | for (const auto& member : json["nameToID"].getMemberNames()) { 9 | mapper->nameToID[member] = json["nameToID"][member].asInt(); 10 | if (mapper->nameToID[member] > highest_id) { 11 | highest_id = mapper->nameToID[member]; 12 | } 13 | } 14 | 15 | for (const auto& field : json["idToField"]) { 16 | mapper->idToField[field["id"].asInt()] = Field::fromJson(field); 17 | } 18 | 19 | mapper->fieldID = highest_id + 1; 20 | return mapper; 21 | } 22 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/schema/FieldMapper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/schema/DataTypes.h" 9 | #include "lintdb/schema/Schema.h" 10 | 11 | namespace lintdb { 12 | 13 | class FieldMapper { 14 | public: 15 | FieldMapper() = default; 16 | // copy constructor 17 | FieldMapper(const FieldMapper& other) { 18 | nameToID = other.nameToID; 19 | fieldID = other.fieldID; 20 | idToField = other.idToField; 21 | } 22 | 23 | // copy assignment operator 24 | // using copy and swap idiom. 
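// Taking `other` by value makes the copy; swapping into *this installs the
// new state, and the old state is released when `other` goes out of scope.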
25 |     FieldMapper& operator=(FieldMapper other) {
26 |         std::swap(nameToID, other.nameToID);
27 |         std::swap(fieldID, other.fieldID);
28 |         std::swap(idToField, other.idToField);
29 |         return *this;
30 |     }
31 |
32 |     inline void addSchema(const Schema& schema) {
33 |         for (const auto& field : schema.fields) {
34 |             addMapping(field);
35 |         }
36 |     }
37 |
38 |     inline DataType getDataType(const uint8_t field_id) const {
39 |         return idToField.at(field_id).data_type;
40 |     }
41 |
42 |     inline std::vector<FieldType> getFieldTypes(const uint8_t field_id) const {
43 |         return idToField.at(field_id).field_types;
44 |     }
45 |
46 |     inline int getFieldID(const std::string& fieldName) const {
47 |         auto it = nameToID.find(fieldName);
48 |         if (it != nameToID.end()) {
49 |             return it->second;
50 |         }
51 |         throw std::runtime_error("Field name not found: " + fieldName);
52 |     }
53 |
54 |     inline std::string getFieldName(int fieldID) const {
55 |         auto it = idToField.find(fieldID);
56 |         if (it != idToField.end()) {
57 |             return it->second.name;
58 |         }
59 |         throw std::runtime_error("Field ID not found");
60 |     }
61 |
62 |     inline size_t getFieldDimensions(int field_id) const {
63 |         return idToField.at(field_id).parameters.dimensions;
64 |     }
65 |
66 |     inline Json::Value toJson() const {
67 |         Json::Value json;
68 |         for (const auto& pair : nameToID) {
69 |             json["nameToID"][pair.first] = pair.second;
70 |         }
71 |         for (const auto& pair : idToField) {
72 |             json["idToField"][pair.first] = pair.second.toJson();
73 |         }
74 |         return json;
75 |     }
76 |
77 |     static std::shared_ptr<FieldMapper> fromJson(const Json::Value& json);
78 |
79 |    private:
80 |     std::unordered_map<int, Field> idToField;
81 |     std::unordered_map<std::string, int> nameToID;
82 |     int fieldID = 0;
83 |
84 |     inline void addMapping(const Field& field) {
85 |         if (nameToID.find(field.name) != nameToID.end()) {
86 |             throw std::runtime_error(
87 |                     "Field name already exists: " + field.name);
88 |         }
89 |         nameToID[field.name] = fieldID;
90 |         idToField[fieldID] = field;
91 |
92 |         fieldID++;
93 |     }
94 | };
95 | } // namespace lintdb
--------------------------------------------------------------------------------
/lintdb/schema/ProcessedData.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 |
 3 | #include "lintdb/schema/DataTypes.h"
 4 |
 5 | namespace lintdb {
 6 | /**
 7 |  * ProcessedData is the result of inverting a single field of a document.
 8 |  *
 9 |  * Once the document processor has assigned codes to a tensor, we can invert it
10 |  * into the index.
11 | * 12 | * inverted index: 13 | * key => tenant, field, IVF centroid id, doc_id 14 | * value => codes assigned to this centroid 15 | * context index: 16 | * key => tenant, field, doc_id 17 | * value => values of the field 18 | * forward index: 19 | * key => tenant, doc_id 20 | * value => all stored data of the document 21 | */ 22 | struct ProcessedData { 23 | uint64_t tenant; 24 | uint8_t field; 25 | std::vector centroid_ids; 26 | idx_t doc_id; 27 | 28 | FieldValue value; 29 | }; 30 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/schema/Schema.cpp: -------------------------------------------------------------------------------- 1 | #include "Schema.h" 2 | 3 | namespace lintdb { 4 | Json::Value Field::toJson() const { 5 | Json::Value json; 6 | json["name"] = name; 7 | json["data_type"] = static_cast(data_type); 8 | 9 | Json::Value fieldTypesJson(Json::arrayValue); 10 | for (const auto& fieldType : field_types) { 11 | fieldTypesJson.append(static_cast(fieldType)); 12 | } 13 | json["field_types"] = fieldTypesJson; 14 | 15 | Json::Value params; 16 | params["dimensions"] = static_cast(parameters.dimensions); 17 | params["analyzer"] = parameters.analyzer; 18 | params["quantization"] = static_cast(parameters.quantization); 19 | params["num_centroids"] = static_cast(parameters.num_centroids); 20 | params["num_iterations"] = static_cast(parameters.num_iterations); 21 | params["num_subquantizers"] = static_cast(parameters.num_subquantizers); 22 | params["nbits"] = static_cast(parameters.nbits); 23 | json["parameters"] = params; 24 | 25 | return json; 26 | } 27 | 28 | Field Field::fromJson(const Json::Value& json) { 29 | Field field; 30 | field.name = json["name"].asString(); 31 | field.data_type = static_cast(json["data_type"].asInt()); 32 | 33 | const Json::Value& fieldTypesJson = json["field_types"]; 34 | for (const auto& fieldTypeJson : fieldTypesJson) { 35 | field.field_types.push_back( 36 | static_cast(fieldTypeJson.asInt())); 37 | } 38 | 39 | const Json::Value& params = json["parameters"]; 40 | field.parameters.dimensions = params["dimensions"].asUInt(); 41 | field.parameters.analyzer = params["analyzer"].asString(); 42 | field.parameters.quantization = 43 | static_cast(params["quantization"].asInt()); 44 | field.parameters.num_centroids = params["num_centroids"].asUInt(); 45 | field.parameters.num_iterations = params["num_iterations"].asUInt(); 46 | field.parameters.num_subquantizers = params["num_subquantizers"].asUInt(); 47 | field.parameters.nbits = params["nbits"].asUInt(); 48 | 49 | return field; 50 | } 51 | 52 | Json::Value Schema::toJson() const { 53 | Json::Value json; 54 | for (const auto& field : fields) { 55 | json["fields"].append(field.toJson()); 56 | } 57 | return json; 58 | } 59 | 60 | Schema Schema::fromJson(const Json::Value& json) { 61 | Schema schema; 62 | for (const auto& jsonField : json["fields"]) { 63 | schema.fields.push_back(Field::fromJson(jsonField)); 64 | } 65 | return schema; 66 | } 67 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/schema/Schema.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/quantizers/Quantizer.h" 9 | #include "lintdb/schema/DataTypes.h" 10 | 11 | namespace lintdb { 12 | 13 | enum class FieldType { Indexed, Context, Stored, Colbert }; 14 | 15 | struct FieldParameters { 16 | 
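/// A hypothetical ColBERT-style configuration, for illustration only (these
/// are not defaults and the quantizer name is assumed): dimensions = 128,
/// quantization = PQ, num_centroids = 16384, num_subquantizers = 16, nbits = 4.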
size_t dimensions = 0; 17 | std::string analyzer = ""; 18 | QuantizerType quantization = QuantizerType::UNKNOWN; 19 | size_t num_centroids = 0; 20 | size_t num_iterations = 10; 21 | size_t num_subquantizers = 0; // used for PQ quantizer 22 | size_t nbits = 1; // used for PQ quantizer 23 | }; 24 | 25 | /** 26 | * A Schema is made up of multiple fields. 27 | */ 28 | struct Field { 29 | std::string name; /// the name of the field 30 | DataType data_type; /// the data type. e.g. int, float, string, embedding. 31 | std::vector field_types; /// the field types. e.g. indexed or 32 | /// stored in the database. 33 | FieldParameters parameters; /// parameters for the field. 34 | 35 | Field() = default; 36 | Field(const std::string& name, 37 | const DataType data_type, 38 | const std::vector& field_types, 39 | const FieldParameters& parameters) 40 | : name(name), 41 | data_type(data_type), 42 | field_types(field_types), 43 | parameters(parameters) {} 44 | 45 | Json::Value toJson() const; 46 | static Field fromJson(const Json::Value& json); 47 | 48 | void add_field_type(FieldType field_type) { 49 | field_types.push_back(field_type); 50 | } 51 | }; 52 | 53 | struct IndexedField : public Field { 54 | IndexedField( 55 | const std::string& name, 56 | const DataType data_type, 57 | const FieldParameters& parameters) 58 | : Field(name, data_type, {FieldType::Indexed}, parameters) {} 59 | }; 60 | 61 | struct ContextField : public Field { 62 | ContextField( 63 | const std::string& name, 64 | const DataType data_type, 65 | const FieldParameters& parameters) 66 | : Field(name, data_type, {FieldType::Context}, parameters) {} 67 | }; 68 | 69 | struct StoredField : public Field { 70 | StoredField( 71 | const std::string& name, 72 | const DataType data_type, 73 | const FieldParameters& parameters) 74 | : Field(name, data_type, {FieldType::Stored}, parameters) {} 75 | }; 76 | 77 | struct ColbertField : public Field { 78 | ColbertField( 79 | const std::string& name, 80 | const DataType data_type, 81 | const FieldParameters& parameters) 82 | : Field(name, data_type, {FieldType::Colbert}, parameters) {} 83 | }; 84 | 85 | /** 86 | * A schema dictates what data is stored, how it is stored, and the way we are 87 | * able to interact with the data. 88 | */ 89 | struct Schema { 90 | std::vector fields; 91 | 92 | Schema() = default; 93 | explicit Schema(const std::vector& fields) : fields(fields) {} 94 | 95 | Json::Value toJson() const; 96 | static Schema fromJson(const Json::Value& json); 97 | 98 | inline void add_field(Field& field) { 99 | fields.push_back(field); 100 | } 101 | }; 102 | 103 | } // namespace lintdb 104 | -------------------------------------------------------------------------------- /lintdb/scoring/ContextCollector.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) 2024 ${ORGANIZATION_NAME}. All rights reserved. 
3 | // 4 | 5 | #include "ContextCollector.h" 6 | 7 | namespace lintdb {} // namespace lintdb -------------------------------------------------------------------------------- /lintdb/scoring/ContextCollector.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/query/QueryContext.h" 7 | #include "lintdb/query/DocValue.h" 8 | #include "lintdb/invlists/ContextIterator.h" 9 | #include "lintdb/schema/DocEncoder.h" 10 | #include 11 | #include 12 | 13 | namespace lintdb { 14 | 15 | class ContextCollector { 16 | public: 17 | ContextCollector() = default; 18 | 19 | void add_field(const QueryContext& context, const std::string& field) { 20 | context_fields.push_back(field); 21 | 22 | uint8_t colbert_field_id = 23 | context.getFieldMapper()->getFieldID(context.colbert_context); 24 | context_field_ids.push_back(colbert_field_id); 25 | 26 | bool is_colbert = false; 27 | auto field_types = context.getFieldMapper()->getFieldTypes(colbert_field_id); 28 | /** 29 | * This is a pretty big hack because we modify the ColBERT fields internally. A user passes in 30 | * a tensor data type, and we process it distinctly for colbert and reset it to be datatype::colbert. 31 | * 32 | * A solution is to stop modifying datatypes internally, or we could expose ColBERT 33 | * as a datatype. However, our colbert storage is meant to be internal. 34 | */ 35 | if (std::find(field_types.begin(), field_types.end(), FieldType::Colbert) != field_types.end()) { 36 | is_colbert = true; 37 | } 38 | if (!is_colbert) { 39 | context_data_types.push_back(context.getFieldMapper()->getDataType(colbert_field_id)); 40 | } else { 41 | context_data_types.push_back(DataType::COLBERT); 42 | } 43 | 44 | auto it = context.getIndex()->get_context_iterator( 45 | context.getTenant(), colbert_field_id); 46 | 47 | context_iterators.push_back(std::move(it)); 48 | } 49 | 50 | std::vector get_context_values(const idx_t doc_id) const { 51 | std::vector results; 52 | results.reserve(context_iterators.size()); 53 | 54 | for(int i=0; i < context_iterators.size(); i++) { 55 | auto it = context_iterators[i].get(); 56 | it->advance(doc_id); 57 | 58 | if(it->is_valid() && it->get_key().doc_id() == doc_id) { 59 | std::string context_str = it->get_value(); 60 | SupportedTypes colbert_context = 61 | DocEncoder::decode_supported_types(context_str); 62 | 63 | // create DocValues for the context info. 
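// The data type recorded in add_field is carried along here, so scorers can
// branch on DataType::COLBERT without consulting the field mapper again.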
64 | uint8_t colbert_field_id = context_field_ids[i]; 65 | results.emplace_back(colbert_context, colbert_field_id, context_data_types[i]); 66 | } else { 67 | LOG(WARNING) << "No context found for doc_id: " << doc_id << " field: " << context_fields[i]; 68 | } 69 | } 70 | 71 | return results; 72 | } 73 | 74 | 75 | private: 76 | std::vector context_fields; 77 | std::vector context_field_ids; 78 | std::vector context_data_types; 79 | std::vector> context_iterators; 80 | 81 | }; 82 | 83 | } // namespace lintdb 84 | -------------------------------------------------------------------------------- /lintdb/scoring/ScoredDocument.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "lintdb/query/DocValue.h" 5 | 6 | namespace lintdb { 7 | struct ScoredDocument { 8 | double score = 0; 9 | idx_t doc_id = -1; 10 | std::vector 11 | values; /// ScoredDocument takes ownership of the values, because 12 | /// we assume we are iterating over a DocIterator and the values are only 13 | /// valid for the duration of the iteration. 14 | 15 | ScoredDocument() = default; 16 | 17 | ScoredDocument(float score, idx_t doc_id, std::vector values) 18 | : score(score), doc_id(doc_id), values(std::move(values)) {} 19 | 20 | bool operator<(const ScoredDocument& other) const { 21 | return score < other.score; 22 | } 23 | 24 | bool operator>(const ScoredDocument& other) const { 25 | return score > other.score; 26 | } 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /lintdb/scoring/Scorer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "lintdb/invlists/ContextIterator.h" 6 | #include "lintdb/invlists/InvertedList.h" 7 | #include "lintdb/quantizers/Quantizer.h" 8 | #include "lintdb/query/DocIterator.h" 9 | #include "lintdb/query/DocValue.h" 10 | #include "lintdb/query/QueryContext.h" 11 | #include "lintdb/schema/DataTypes.h" 12 | #include "lintdb/scoring/plaid.h" 13 | #include "ScoredDocument.h" 14 | 15 | namespace lintdb { 16 | 17 | /** 18 | * Scorer is an interface for scoring documents. 19 | * 20 | * Scorers will iterate over a DocIterator and score each document. 21 | * The caller of Scorer.score() will be responsible for keeping the scores in 22 | * order. 23 | * 24 | * Additionally, different scorers can retrieve different context from fast 25 | * fields. For example, ColBERT will use a context field to retrieve all 26 | * document codes during scoring. 
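 *
 * A minimal sketch of invoking a scorer (context and document setup are
 * assumed):
 *
 *   PlaidScorer scorer(context);
 *   ScoredDocument scored = scorer.score(context, doc_id, doc_values);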
27 | */ 28 | class Scorer { 29 | public: 30 | virtual ~Scorer() = default; 31 | virtual ScoredDocument score( 32 | QueryContext& context, 33 | idx_t doc_id, 34 | std::vector& fvs) const = 0; 35 | }; 36 | 37 | class PlaidScorer : public Scorer { 38 | public: 39 | explicit PlaidScorer(const QueryContext& context); 40 | ScoredDocument score( 41 | QueryContext& context, 42 | idx_t doc_id, 43 | std::vector& fvs) const override; 44 | ~PlaidScorer() override = default; 45 | 46 | }; 47 | 48 | class ColBERTScorer : public Scorer { 49 | public: 50 | explicit ColBERTScorer(const QueryContext& context); 51 | ScoredDocument score( 52 | QueryContext& context, 53 | idx_t doc_id, 54 | std::vector& fvs) const override; 55 | ~ColBERTScorer() override = default; 56 | 57 | }; 58 | 59 | // class XTRScorer: public Scorer { 60 | // double score(idx_t doc_id, std::vector& fvs) const override; 61 | // }; 62 | 63 | } // namespace lintdb 64 | -------------------------------------------------------------------------------- /lintdb/scoring/plaid.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_PLAID_H 2 | #define LINTDB_PLAID_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "lintdb/api.h" 9 | 10 | namespace lintdb { 11 | /** 12 | * score_documents_by_codes aggregates a document score based on each token's 13 | * code and how well it matches the query. 14 | * 15 | * We return the list of scores for each centroid. 16 | */ 17 | float score_documents_by_codes( 18 | const gsl::span 19 | max_scores_by_centroid, // the max score per centroid across the 20 | // query tokens. 21 | const std::vector& doc_codes, 22 | const float centroid_score_threshold, 23 | const idx_t expected_id = -1); 24 | 25 | std::vector max_score_by_centroid( 26 | gsl::span coarse_idx, 27 | gsl::span distances, 28 | size_t k_per_token, 29 | size_t num_tokens, 30 | size_t num_centroids); 31 | 32 | float colbert_centroid_score( 33 | const std::vector& doc_codes, /// codes from the document. each 34 | /// token is assigned a code. 35 | const std::vector& 36 | centroid_scores, /// the score of those codes to the query. 37 | const size_t nquery_vectors, /// the number of query vectors. 38 | const size_t n_centroids, /// how many centroids there are. this may 39 | /// change based on how many scores we choose 40 | /// to calculate. 
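/// expected_id is a document id to emit extra debug logging for;
/// pass -1 to disable it.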
41 |         const idx_t expected_id);
42 |
43 | struct DocumentScore {
44 |     float score;
45 |     std::vector<float> tokens;
46 | };
47 |
48 | DocumentScore score_document_by_residuals(
49 |         const gsl::span<const float>
50 |                 query_vectors, // size: (num_query_tokens, num_dim)
51 |         const size_t num_query_tokens,
52 |         float* doc_residuals, // size: (num_doc_tokens, num_dim)
53 |         const size_t num_doc_tokens,
54 |         const size_t dim,
55 |         const idx_t doc_id,
56 |         bool normalize = true);
57 |
58 | } // namespace lintdb
59 |
60 | #endif
--------------------------------------------------------------------------------
/lintdb/scoring/scoring_methods.cpp:
--------------------------------------------------------------------------------
  1 | #include "scoring_methods.h"
  2 |
  3 | namespace lintdb {
  4 | score_t score_one(const std::vector<DocValue>& values) {
  5 |     return 1.0;
  6 | }
  7 |
  8 | score_t plaid_similarity(
  9 |         const std::vector<DocValue>& values,
 10 |         std::shared_ptr<KnnNearestCentroids> knn) {
 11 |     int colbert_idx = -1;
 12 |     for (size_t i = 0; i < values.size(); i++) {
 13 |         if (values[i].type == DataType::COLBERT) {
 14 |             colbert_idx = static_cast<int>(i);
 15 |             break;
 16 |         }
 17 |     }
 18 |
 19 |     if (colbert_idx == -1) {
 20 |         LOG(WARNING) << "plaid context field not found for doc_id";
 21 |         return 0.0;
 22 |     }
 23 |
 24 |     // rank phase 1: use the codes to score the document using the centroid
 25 |     // scores.
 26 |     auto reordered_distances = knn->get_reordered_distances();
 27 |
 28 |     // gives us a potentially quantized vector
 29 |     SupportedTypes colbert_context = values[colbert_idx].value;
 30 |     ColBERTContextData codes = std::get<ColBERTContextData>(colbert_context);
 31 |
 32 |     QueryTensor query = knn->get_query_tensor();
 33 |     float score = colbert_centroid_score(
 34 |             codes.doc_codes,
 35 |             reordered_distances,
 36 |             query.num_query_tokens,
 37 |             knn->get_num_centroids(),
 38 |             -1);
 39 |
 40 |     return score;
 41 | }
 42 |
 43 | UnaryScoringMethodFunction unary_scoring_methods[] = {
 44 |         score_one,
 45 | };
 46 |
 47 | score_t score(const UnaryScoringMethod method, const std::vector<DocValue>& values) {
 48 |     int scoring_type = static_cast<int>(method);
 49 |     return unary_scoring_methods[scoring_type](values);
 50 | }
 51 |
 52 | EmbeddingScoringMethodFunction embedding_scoring_methods[] = {
 53 |         plaid_similarity,
 54 | };
 55 |
 56 | score_t score_embeddings(
 57 |         const EmbeddingScoringMethod method,
 58 |         const std::vector<DocValue>& values,
 59 |         std::shared_ptr<KnnNearestCentroids> knn) {
 60 |     // EmbeddingScoringMethod values start at 1, so shift down by one to index
 61 |     // the function table. Indexing with the raw enum value would read past
 62 |     // the end of the array.
 63 |     int scoring_type = static_cast<int>(method) - 1;
 64 |     return embedding_scoring_methods[scoring_type](values, knn);
 65 | }
 66 |
 67 | score_t sum(const std::vector<score_t>& values) {
 68 |     score_t sum = 0;
 69 |     for (const score_t value : values) {
 70 |         sum += value;
 71 |     }
 72 |     return sum;
 73 | }
 74 |
 75 | score_t reduce(const std::vector<score_t>& values) {
 76 |     score_t product = 1;
 77 |     for (const score_t value : values) {
 78 |         product *= value;
 79 |     }
 80 |     return product;
 81 | }
 82 |
 83 | score_t max(const std::vector<score_t>& values) {
 84 |     score_t max = values[0];
 85 |     for (const score_t value : values) {
 86 |         if (value > max) {
 87 |             max = value;
 88 |         }
 89 |     }
 90 |     return max;
 91 | }
 92 |
 93 | NaryScoringMethodFunction nary_scoring_methods[] = {
 94 |         sum,
 95 |         reduce,
 96 |         max,
 97 | };
 98 |
 99 | score_t score(const NaryScoringMethod method, const std::vector<score_t>& values) {
100 |     int scoring_type = static_cast<int>(method);
101 |     return nary_scoring_methods[scoring_type](values);
102 | }
103 |
104 | } // namespace lintdb
--------------------------------------------------------------------------------
/lintdb/scoring/scoring_methods.h:
--------------------------------------------------------------------------------
1 | #pragma once
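// Scoring methods are dispatched by casting the enum to an index into a
// function table (see scoring_methods.cpp). A small illustration with
// made-up values:
//
//   std::vector<score_t> parts = {0.2, 0.5, 0.3};
//   score_t total = score(NaryScoringMethod::SUM, parts); // 1.0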
2 | 3 | #include "lintdb/schema/DataTypes.h" 4 | #include "lintdb/query/DocValue.h" 5 | #include "lintdb/query/KnnNearestCentroids.h" 6 | #include "lintdb/scoring/plaid.h" 7 | #include 8 | #include 9 | #include 10 | 11 | namespace lintdb { 12 | 13 | typedef double score_t; 14 | typedef score_t (*UnaryScoringMethodFunction)(const std::vector& values); 15 | typedef score_t (*NaryScoringMethodFunction)(const std::vector& values); 16 | typedef score_t (*EmbeddingScoringMethodFunction)(const std::vector& values, std::shared_ptr knn); 17 | 18 | score_t score_one(const std::vector& values); 19 | 20 | score_t plaid_similarity(const std::vector& values, std::shared_ptr knn); 21 | 22 | 23 | enum class UnaryScoringMethod { 24 | ONE = 0, 25 | }; 26 | 27 | score_t score(const UnaryScoringMethod method, const std::vector& values); 28 | 29 | enum class EmbeddingScoringMethod { 30 | PLAID = 1, 31 | COLBERT = 2 32 | }; 33 | 34 | score_t score_embeddings(const EmbeddingScoringMethod method, const std::vector& values, std::shared_ptr knn); 35 | 36 | score_t sum(const std::vector& values); 37 | 38 | score_t reduce(const std::vector& values); 39 | 40 | score_t max(const std::vector& values); 41 | 42 | enum class NaryScoringMethod { 43 | SUM = 0, 44 | REDUCE = 1, 45 | MAX = 2, 46 | }; 47 | score_t score(const NaryScoringMethod method, const std::vector& values); 48 | 49 | } -------------------------------------------------------------------------------- /lintdb/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LINTDB_SERVER_SRCS 2 | main.cpp 3 | controllers/v1/query_node_translator.h 4 | controllers/v1/result_translator.h 5 | ) 6 | add_executable(lintdb-server ${LINTDB_SERVER_SRCS}) 7 | 8 | find_package(Drogon CONFIG REQUIRED) 9 | target_link_libraries(lintdb-server PRIVATE Drogon::Drogon) 10 | 11 | find_package(args CONFIG REQUIRED) 12 | target_link_libraries(lintdb-server PRIVATE taywee::args) 13 | 14 | target_link_libraries(lintdb-server PRIVATE lintdb_lib) 15 | 16 | 17 | install( 18 | TARGETS lintdb-server 19 | EXPORT lintdb-targets 20 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 21 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} 22 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 23 | INCLUDES 24 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -------------------------------------------------------------------------------- /lintdb/server/api_tests.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | def test_search(): 4 | tensor = [0.1] * 128 * 32 5 | request = { 6 | "query": { 7 | "type": "TENSOR", 8 | "name": "colbert", 9 | "value": tensor, 10 | "num_tensors": 1 11 | }, 12 | "options": { 13 | "colbert_field": "colbert" 14 | }, 15 | "k": 10 16 | } 17 | 18 | resp = requests.post("http://0.0.0.0:8080/v1/Index/search/0", json=request) 19 | assert resp.status_code == 200 20 | data = resp.json() 21 | 22 | assert('results' in data), "Results not found in response" 23 | 24 | print("search test passed") 25 | 26 | def test_add(): 27 | tensor = [0.1] * 128 * 32 28 | request = { 29 | "documents": [ 30 | { 31 | "id": 50001, 32 | "fields": [ 33 | { 34 | "name": "colbert", 35 | "data_type": "TENSOR", 36 | "value": tensor, 37 | } 38 | ] 39 | } 40 | ] 41 | } 42 | 43 | resp = requests.post("http://0.0.0.0:8080/v1/Index/add/0", json=request) 44 | assert resp.status_code == 200 45 | data = resp.json() 46 | 47 | assert('ok' in data) 48 | 49 | print("add test passed") 50 | 51 | def test_update(): 52 | tensor = 
[0.2] * 128 * 32 53 | request = { 54 | "documents": [ 55 | { 56 | "id": 50001, 57 | "fields": [ 58 | { 59 | "name": "colbert", 60 | "data_type": "TENSOR", 61 | "value": tensor, 62 | } 63 | ] 64 | } 65 | ] 66 | } 67 | 68 | resp = requests.post("http://0.0.0.0:8080/v1/Index/update/0", json=request) 69 | assert resp.status_code == 200 70 | data = resp.json() 71 | 72 | assert('ok' in data) 73 | 74 | print("update test passed") 75 | 76 | def test_remove(): 77 | request = { 78 | 'ids': [50001] 79 | } 80 | resp = requests.post("http://0.0.0.0:8080/v1/Index/remove/0", json=request) 81 | assert resp.status_code == 200 82 | data = resp.json() 83 | 84 | assert('ok' in data) 85 | 86 | print("remove test passed") 87 | 88 | 89 | if __name__ == "__main__": 90 | test_search() 91 | test_add() 92 | test_update() 93 | test_remove() -------------------------------------------------------------------------------- /lintdb/server/controllers/v1/Index.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) 2024 ${ORGANIZATION_NAME}. All rights reserved. 3 | // 4 | 5 | #include "Index.h" 6 | -------------------------------------------------------------------------------- /lintdb/server/controllers/v1/query_node_translator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "lintdb/query/QueryNode.h" 4 | #include "lintdb/schema/DataTypes.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace server { 10 | class QueryNodeJsonTranslator { 11 | public: 12 | static std::unique_ptr fromJson(const Json::Value& json) { 13 | std::string type_string = json["type"].asString(); 14 | 15 | lintdb::QueryNodeType type; 16 | if(type_string == "TERM") { 17 | type = lintdb::QueryNodeType::TERM; 18 | } else if (type_string == "TENSOR") { 19 | type = lintdb::QueryNodeType::VECTOR; 20 | } else if (type_string == "AND") { 21 | type = lintdb::QueryNodeType::AND; 22 | } else { 23 | throw std::runtime_error("unknown QueryNodeType"); 24 | } 25 | 26 | switch (type) { 27 | case lintdb::QueryNodeType::TERM: { 28 | lintdb::FieldValue value = lintdb::FieldValue::fromJson(json["value"]); 29 | return std::make_unique(value); 30 | } 31 | case lintdb::QueryNodeType::VECTOR: { 32 | lintdb::Tensor value; 33 | for(auto& v : json["value"]) { 34 | value.push_back(v.asFloat()); 35 | } 36 | uint64_t num_tensors = json["num_tensors"].asUInt64(); 37 | std::string field = json["name"].asString(); 38 | lintdb::FieldValue fv = lintdb::FieldValue(field, value, size_t(num_tensors)); 39 | return std::make_unique(fv); 40 | } 41 | case lintdb::QueryNodeType::AND: { 42 | std::vector> children; 43 | for (const auto& childJson : json["children"]) { 44 | children.push_back(fromJson(childJson)); 45 | } 46 | return std::make_unique(std::move(children)); 47 | } 48 | default: 49 | throw std::runtime_error("Unknown QueryNodeType"); 50 | } 51 | } 52 | }; 53 | } -------------------------------------------------------------------------------- /lintdb/server/controllers/v1/result_translator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/SearchResult.h" 7 | #include "lintdb/schema/DataTypes.h" 8 | 9 | namespace server { 10 | class SearchResultJsonTranslator { 11 | public: 12 | static Json::Value toJson(const lintdb::SearchResult& result) { 13 | Json::Value root; 14 | root["id"] = static_cast(result.id); 15 | root["score"] = result.score; 
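// Illustrative shape of the JSON built here (values are made up):
//   { "id": 50001, "score": 0.87, "metadata": { "title": "..." } }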
16 | 17 | Json::Value metadataJson; 18 | for (const auto& [key, value] : result.metadata) { 19 | 20 | metadataJson[key] = lintdb::supportedTypeToJSON(value); 21 | } 22 | root["metadata"] = metadataJson; 23 | 24 | return root; 25 | } 26 | }; 27 | } -------------------------------------------------------------------------------- /lintdb/server/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "controllers/v1/Index.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace drogon; 8 | 9 | int main(int argc, char**argv) 10 | { 11 | args::ArgumentParser parser("LintDB Server."); 12 | args::HelpFlag help(parser, "help", "Display this help menu", {'h', "help"}); 13 | 14 | args::ValueFlag path(parser, "path", "Set the path to the database", {'p', "path"}); 15 | args::Flag read_only(parser, "read-only", "Set the database to read-only mode", {'r', "read-only"}); 16 | 17 | try 18 | { 19 | parser.ParseCLI(argc, argv); 20 | } 21 | catch (args::Help) 22 | { 23 | std::cout << parser; 24 | return 0; 25 | } 26 | catch (args::ParseError e) 27 | { 28 | std::cerr << e.what() << std::endl; 29 | std::cerr << parser; 30 | return 1; 31 | } 32 | catch (args::ValidationError e) 33 | { 34 | std::cerr << e.what() << std::endl; 35 | std::cerr << parser; 36 | return 1; 37 | } 38 | 39 | std::string p = args::get(path); 40 | auto indexController = std::make_shared(p, !!read_only); 41 | 42 | app().setLogPath("./", "lintdb-server.log") 43 | .setLogLevel(trantor::Logger::kDebug) 44 | .addListener("0.0.0.0", 8080) 45 | .setThreadNum(12) 46 | .registerController(indexController) 47 | // .enableRunAsDaemon() 48 | .run(); 49 | } -------------------------------------------------------------------------------- /lintdb/util.cpp: -------------------------------------------------------------------------------- 1 | #include "lintdb/util.h" 2 | #include 3 | #include 4 | #include 5 | #include "lintdb/api.h" 6 | #include "lintdb/exception.h" 7 | #include "lintdb/SearchOptions.h" 8 | 9 | namespace lintdb { 10 | extern "C" { 11 | // this is to keep the clang syntax checker happy 12 | #ifndef FINTEGER 13 | #define FINTEGER int 14 | #endif 15 | 16 | /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ 17 | 18 | float cblas_snrm2(FINTEGER n, const float* x, FINTEGER incx); 19 | 20 | int cblas_sscal(FINTEGER n, const float alpha, float* x, FINTEGER incx); 21 | } 22 | 23 | void normalize_vector( 24 | float* doc_residuals, 25 | const size_t num_doc_tokens, 26 | const size_t dim) { 27 | float mod = 0.0; 28 | 29 | int dim2 = dim; 30 | 31 | for (size_t i = 0; i < num_doc_tokens; i++) { 32 | mod = cblas_snrm2(dim2, doc_residuals + i * dim2, 1); 33 | if (mod == 1.0) { 34 | continue; 35 | } 36 | 37 | int dim2 = dim; 38 | float mod2 = 1.0 / mod; 39 | int incx = 1; 40 | // auto adjusted = std::max(mod, 1e-12f); 41 | cblas_sscal(dim2, mod2, doc_residuals + i * dim, incx); 42 | } 43 | } 44 | 45 | Json::Value loadJson(const std::string& path) { 46 | Json::Value root; 47 | std::ifstream in(path); 48 | Json::CharReaderBuilder readerBuilder; 49 | std::string errs; 50 | if (in.is_open()) { 51 | if (!Json::parseFromStream(readerBuilder, in, &root, &errs)) { 52 | LOG(ERROR) << "Failed to parse JSON from file: " << path 53 | << ", Error: " << errs; 54 | } 55 | in.close(); 56 | } else { 57 | LOG(ERROR) << "Unable to open file for reading: " << path; 58 | } 59 | 60 | return root; 61 | } 62 | } // namespace lintdb 
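// Usage sketch for normalize_vector (illustrative): two 2-d rows are scaled
// to unit L2 norm in place, {3, 4} -> {0.6, 0.8} and {0, 1} stays {0, 1}.
//
//   float vecs[4] = {3.f, 4.f, 0.f, 1.f};
//   lintdb::normalize_vector(vecs, /*num_doc_tokens=*/2, /*dim=*/2);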
-------------------------------------------------------------------------------- /lintdb/util.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_UTIL_H 2 | #define LINTDB_UTIL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "lintdb/SearchOptions.h" 13 | 14 | namespace lintdb { 15 | /** 16 | * Normalize vector normalizes vectors in place. 17 | * 18 | * do i need to consider simd instructions for optimizations? 19 | * https://stackoverflow.com/questions/57469359/how-to-efficiently-normalize-vector-c 20 | */ 21 | void normalize_vector( 22 | float* doc_residuals, 23 | const size_t num_doc_tokens, 24 | const size_t dim); 25 | 26 | template 27 | void product_helper( 28 | const std::vector>& pools, 29 | std::vector& result, 30 | size_t index, 31 | std::vector& current) { 32 | if (index == pools.size()) { 33 | for (const auto& elem : current) { 34 | result.push_back(elem); 35 | } 36 | return; 37 | } 38 | for (const auto& element : pools[index]) { 39 | current.push_back(element); 40 | product_helper(pools, result, index + 1, current); 41 | current.pop_back(); 42 | } 43 | } 44 | 45 | /** 46 | * product creates the cartesian product of a range of elements. Similar to 47 | * python, it enables us to repeat the input a certain amount of times. 48 | */ 49 | template 50 | std::vector product( 51 | const std::vector>& args, 52 | size_t repeat = 1) { 53 | std::vector> pools; 54 | for (const auto& arg : args) { 55 | pools.insert(pools.end(), repeat, arg); 56 | } 57 | std::vector result; 58 | std::vector current; 59 | product_helper(pools, result, 0, current); 60 | return result; 61 | } 62 | 63 | Json::Value loadJson(const std::string& path); 64 | 65 | inline std::vector subsample(const size_t total, const size_t sample) { 66 | std::mt19937 rng; 67 | std::seed_seq seed{1234}; 68 | 69 | rng.seed(seed); 70 | 71 | std::uniform_int_distribution dist(0, total - 1); 72 | std::vector indices; 73 | for (size_t i = 0; i < sample; i++) { 74 | indices.push_back(dist(rng)); 75 | } 76 | 77 | return indices; 78 | } 79 | 80 | } // namespace lintdb 81 | 82 | #endif -------------------------------------------------------------------------------- /lintdb/utils/endian.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace lintdb { 7 | template 8 | T load_bigendian(void const* bytes) { 9 | T num = 0; 10 | for (size_t i = 0; i < sizeof(T); ++i) { 11 | num |= static_cast(static_cast(bytes)[i]) 12 | << (8 * (sizeof(T) - i - 1)); 13 | } 14 | return num; 15 | } 16 | 17 | template 18 | void store_bigendian(T num, std::vector& bigEndian) { 19 | for (int i = sizeof(T) - 1; i >= 0; i--) { 20 | bigEndian.push_back((num >> (8 * i)) & 0xff); 21 | } 22 | } 23 | } // namespace lintdb -------------------------------------------------------------------------------- /lintdb/version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #define LINTDB_VERSION_STRING "0.5.1" 6 | 7 | namespace lintdb { 8 | struct Version { 9 | Version(std::string versionStr = LINTDB_VERSION_STRING) { 10 | sscanf(versionStr.c_str(), "%d.%d.%d", &major, &minor, &revision); 11 | metadata_enabled = major >= 0 && minor >= 3 && revision >= 0; 12 | } 13 | 14 | bool operator==(const Version& otherVersion) const { 15 | return major == otherVersion.major && minor == otherVersion.minor 
&& 16 | revision == otherVersion.revision; 17 | } 18 | 19 | bool operator<(const Version& otherVersion) { 20 | if (major < otherVersion.major) 21 | return true; 22 | if (minor < otherVersion.minor) 23 | return true; 24 | if (revision < otherVersion.revision) 25 | return true; 26 | return false; 27 | } 28 | 29 | bool metadata_enabled; 30 | 31 | int major, minor, revision, build; 32 | }; 33 | 34 | static const Version LINTDB_VERSION(LINTDB_VERSION_STRING); 35 | } // namespace lintdb 36 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "LintDB" 2 | site_description: "A vector database for multi vector representations and late interaction scoring." 3 | site_url: "https://deployql.github.io/LintDB/" 4 | repo_url: "https://github.com/DeployQL/lintdb" 5 | repo_name: "DeployQL/lintdb" 6 | edit_uri: "edit/main/docs/" 7 | 8 | extra: 9 | version: 10 | provider: mike 11 | 12 | theme: 13 | name: "material" 14 | logo: icon.svg 15 | features: 16 | - announce.dismiss 17 | - content.action.edit 18 | - content.action.view 19 | - content.code.annotate 20 | - content.code.copy 21 | - content.tooltips 22 | - navigation.footer 23 | - navigation.indexes 24 | - search.highlight 25 | - search.suggest 26 | - toc.follow 27 | palette: 28 | - media: "(prefers-color-scheme)" 29 | toggle: 30 | icon: material/link 31 | name: Switch to light mode 32 | - media: "(prefers-color-scheme: light)" 33 | scheme: default 34 | primary: indigo 35 | accent: indigo 36 | toggle: 37 | icon: material/toggle-switch 38 | name: Switch to dark mode 39 | - media: "(prefers-color-scheme: dark)" 40 | scheme: slate 41 | primary: indigo 42 | accent: black 43 | toggle: 44 | icon: material/toggle-switch-off 45 | name: Switch to system preference 46 | 47 | markdown_extensions: 48 | - attr_list 49 | - admonition 50 | - callouts 51 | - footnotes 52 | - pymdownx.details 53 | - pymdownx.emoji: 54 | emoji_index: !!python/name:material.extensions.emoji.twemoji 55 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 56 | - pymdownx.highlight: 57 | pygments_lang_class: true 58 | - pymdownx.magiclink 59 | - pymdownx.snippets: 60 | base_path: [ !relative $config_dir ] 61 | check_paths: true 62 | - pymdownx.superfences 63 | - pymdownx.tabbed: 64 | alternate_style: true 65 | slugify: !!python/object/apply:pymdownx.slugs.slugify 66 | kwds: 67 | case: lower 68 | - pymdownx.tasklist: 69 | custom_checkbox: true 70 | - pymdownx.tilde 71 | - toc: 72 | permalink: "¤" 73 | 74 | plugins: 75 | - search 76 | - literate-nav: 77 | nav_file: "nav.md" 78 | - mkdocstrings: 79 | handlers: 80 | python: 81 | options: 82 | find_stubs_package: true 83 | docstring_options: 84 | ignore_init_summary: true 85 | docstring_section_style: list 86 | filters: [ "!^_" ] 87 | heading_level: 2 88 | inherited_members: true 89 | merge_init_into_class: true 90 | parameter_headings: true 91 | separate_signature: true 92 | show_root_heading: true 93 | show_root_full_path: false 94 | show_signature: true 95 | show_signature_annotations: true 96 | show_symbol_type_heading: true 97 | show_symbol_type_toc: true 98 | signature_crossrefs: true 99 | summary: true 100 | - mike: 101 | # These fields are all optional; the defaults are as below... 
102 | alias_type: symlink 103 | redirect_template: null 104 | deploy_prefix: '' 105 | canonical_version: latest 106 | version_selector: true 107 | css_dir: css 108 | javascript_dir: js -------------------------------------------------------------------------------- /ports/bitsery/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO fraillt/bitsery 4 | REF "v${VERSION}" 5 | SHA512 26e525d799d1777e182753c6c970765be8695a557e0fef35224ab8f4629a094c04fd8d7e456da369938d74acb0ca84084f394f212ae1343fa62a27256dba971f 6 | HEAD_REF master 7 | ) 8 | 9 | vcpkg_cmake_configure( 10 | SOURCE_PATH "${SOURCE_PATH}" 11 | ) 12 | 13 | vcpkg_cmake_install() 14 | 15 | vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/${PORT}) 16 | 17 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/lib") 18 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug") 19 | 20 | file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) 21 | -------------------------------------------------------------------------------- /ports/bitsery/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bitsery", 3 | "version": "5.2.4", 4 | "description": "Header only C++ binary serialization library", 5 | "homepage": "https://github.com/fraillt/bitsery", 6 | "dependencies": [ 7 | { 8 | "name": "vcpkg-cmake", 9 | "host": true 10 | }, 11 | { 12 | "name": "vcpkg-cmake-config", 13 | "host": true 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /ports/faiss/faiss.patch: -------------------------------------------------------------------------------- 1 | diff --git a/cmake/faiss-config.cmake.in b/cmake/faiss-config.cmake.in 2 | index 43ea9d4c..a7beff69 100644 3 | --- a/cmake/faiss-config.cmake.in 4 | +++ b/cmake/faiss-config.cmake.in 5 | @@ -4,4 +4,6 @@ 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 
8 | 9 | +find_dependency(MKL REQUIRED) 10 | + 11 | include("${CMAKE_CURRENT_LIST_DIR}/faiss-targets.cmake") 12 | diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt 13 | index 1fea676c..8723be27 100644 14 | --- a/faiss/CMakeLists.txt 15 | +++ b/faiss/CMakeLists.txt 16 | @@ -269,16 +269,16 @@ target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX) 17 | 18 | find_package(MKL) 19 | if(MKL_FOUND) 20 | - target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES}) 21 | - target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES}) 22 | + target_link_libraries(faiss PRIVATE MKL::MKL) 23 | + target_link_libraries(faiss_avx2 PRIVATE MKL::MKL) 24 | else() 25 | find_package(BLAS REQUIRED) 26 | - target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES}) 27 | - target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES}) 28 | + target_link_libraries(faiss PRIVATE BLAS::BLAS) 29 | + target_link_libraries(faiss_avx2 PRIVATE BLAS::BLAS) 30 | 31 | find_package(LAPACK REQUIRED) 32 | - target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES}) 33 | - target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES}) 34 | + target_link_libraries(faiss PRIVATE lAPACK::LAPACK) 35 | + target_link_libraries(faiss_avx2 PRIVATE LAPACK::LAPACK) 36 | endif() 37 | 38 | install(TARGETS faiss 39 | -------------------------------------------------------------------------------- /ports/faiss/fix-dependencies.patch: -------------------------------------------------------------------------------- 1 | diff --git a/cmake/faiss-config.cmake.in b/cmake/faiss-config.cmake.in 2 | index 43ea9d4..437a7f8 100644 3 | --- a/cmake/faiss-config.cmake.in 4 | +++ b/cmake/faiss-config.cmake.in 5 | @@ -4,4 +4,7 @@ 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 
8 | 9 | +find_dependency(OpenMP REQUIRED) 10 | +find_dependency(BLAS REQUIRED) 11 | +find_dependency(LAPACK REQUIRED) 12 | include("${CMAKE_CURRENT_LIST_DIR}/faiss-targets.cmake") 13 | diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt 14 | index 30d573f..9af8baf 100644 15 | --- a/faiss/CMakeLists.txt 16 | +++ b/faiss/CMakeLists.txt 17 | @@ -212,17 +212,17 @@ target_link_libraries(faiss PRIVATE OpenMP::OpenMP_CXX) 18 | target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX) 19 | 20 | find_package(MKL) 21 | -if(MKL_FOUND) 22 | +if(MKL_FOUND) 23 | target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES}) 24 | target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES}) 25 | else() 26 | find_package(BLAS REQUIRED) 27 | - target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES}) 28 | - target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES}) 29 | + target_link_libraries(faiss PRIVATE BLAS::BLAS) 30 | + target_link_libraries(faiss_avx2 PRIVATE BLAS::BLAS) 31 | 32 | find_package(LAPACK REQUIRED) 33 | - target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES}) 34 | - target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES}) 35 | + target_link_libraries(faiss PRIVATE LAPACK::LAPACK) 36 | + target_link_libraries(faiss_avx2 PRIVATE LAPACK::LAPACK) 37 | endif() 38 | 39 | install(TARGETS faiss 40 | -------------------------------------------------------------------------------- /ports/faiss/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO facebookresearch/faiss 4 | # REF v1.8.0 5 | # SHA512 38d4215e3e019915d8b367ff0e8d14901b1495f6f45b835e9248276567a422b0370baab6bd887045442dd1e268b7fe7c347107162e66bb3ec6b1a53be4b2e441 6 | REF v1.7.4 7 | SHA512 9622fb989cb2e1879450c2ad257cb55d0c0c639f54f0815e4781f4e4b2ae2f01779f5c8c0738ae9a29fde7e418587e6a92e91240d36c1ca051a6228bfb777638 8 | HEAD_REF master 9 | ) 10 | 11 | vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS 12 | FEATURES 13 | gpu FAISS_ENABLE_GPU 14 | ) 15 | 16 | if ("${FAISS_ENABLE_GPU}") 17 | if (NOT VCPKG_CMAKE_SYSTEM_NAME AND NOT ENV{CUDACXX}) 18 | set(ENV{CUDACXX} "$ENV{CUDA_PATH}/bin/nvcc.exe") 19 | endif() 20 | endif() 21 | 22 | 23 | vcpkg_cmake_configure( 24 | SOURCE_PATH "${SOURCE_PATH}" 25 | OPTIONS 26 | ${FEATURE_OPTIONS} 27 | -DFAISS_ENABLE_PYTHON=OFF # Requires SWIG 28 | -DBUILD_TESTING=OFF 29 | -DCMAKE_BUILD_TYPE=Release 30 | # -DBLA_VENDOR=Intel10_64lp 31 | # -DCMAKE_TOOLCHAIN_FILE="${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake" 32 | ) 33 | 34 | # # Setup vcpkg script with CMake (note: should be placed before project() call) 35 | # set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake CACHE STRING "Vcpkg toolchain file") 36 | 37 | 38 | vcpkg_cmake_install() 39 | 40 | vcpkg_cmake_config_fixup() 41 | 42 | vcpkg_copy_pdbs() 43 | 44 | file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) 45 | 46 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") 47 | -------------------------------------------------------------------------------- /ports/faiss/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "faiss", 3 | "version": "1.7.4", 4 | "description": "Faiss is a library for efficient similarity search and clustering of dense vectors.", 5 | "homepage": "https://github.com/facebookresearch/faiss", 6 | "license": "MIT", 7 | "supports": 
"!uwp & !osx & !x86", 8 | "dependencies": [ 9 | "lapack", 10 | "openblas", 11 | { 12 | "name": "vcpkg-cmake", 13 | "host": true 14 | }, 15 | { 16 | "name": "vcpkg-cmake-config", 17 | "host": true 18 | } 19 | ], 20 | "features": { 21 | "gpu": { 22 | "description": "Whether to enable GPU support", 23 | "dependencies": [ 24 | "cuda" 25 | ] 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /ports/intel-mkl/copy-from-dmg.cmake: -------------------------------------------------------------------------------- 1 | find_program(HDIUTIL NAMES hdiutil REQUIRED) 2 | set(dmg_path "NOTFOUND" CACHE FILEPATH "Where to find the DMG") 3 | set(output_dir "output_dir" CACHE FILEPATH "Where to put the packages") 4 | 5 | if(NOT EXISTS "${dmg_path}") 6 | message(FATAL_ERROR "'dmg_path' (${dmg_path}) does not exist.") 7 | endif() 8 | if(NOT IS_DIRECTORY "${output_dir}") 9 | message(FATAL_ERROR "'output_dir' (${output_dir}) is not a directory.") 10 | endif() 11 | 12 | execute_process( 13 | COMMAND mktemp -d 14 | RESULT_VARIABLE mktemp_result 15 | OUTPUT_VARIABLE mount_point 16 | OUTPUT_STRIP_TRAILING_WHITESPACE 17 | ) 18 | if(NOT mktemp_result STREQUAL "0") 19 | message(FATAL_ERROR "mktemp -d failed: ${mktemp_result}") 20 | elseif(NOT IS_DIRECTORY "${mount_point}") 21 | message(FATAL_ERROR "'mount_point' (${mount_point}) is not a directory.") 22 | endif() 23 | 24 | execute_process( 25 | COMMAND "${HDIUTIL}" attach "${dmg_path}" -mountpoint "${mount_point}" -readonly 26 | RESULT_VARIABLE mount_result 27 | ) 28 | if(mount_result STREQUAL "0") 29 | set(dmg_packages_dir "${mount_point}/bootstrapper.app/Contents/Resources/packages") 30 | file(GLOB packages 31 | "${dmg_packages_dir}/intel.oneapi.mac.mkl.devel,*" 32 | "${dmg_packages_dir}/intel.oneapi.mac.mkl.runtime,*" 33 | "${dmg_packages_dir}/intel.oneapi.mac.mkl.product,*" 34 | "${dmg_packages_dir}/intel.oneapi.mac.openmp,*" 35 | ) 36 | # Using execute_process to avoid direct errors 37 | execute_process( 38 | COMMAND cp -R ${packages} "${output_dir}/" 39 | RESULT_VARIABLE copy_result 40 | ) 41 | endif() 42 | execute_process( 43 | COMMAND "${HDIUTIL}" detach "${mount_point}" 44 | RESULT_VARIABLE unmount_result 45 | ) 46 | 47 | if(NOT mount_result STREQUAL "0") 48 | message(FATAL_ERROR "Mounting ${dmg_path} failed: ${mount_result}") 49 | elseif(NOT copy_result STREQUAL "0") 50 | message(FATAL_ERROR "Coyping packages failed: ${copy_result}") 51 | elseif(NOT unmount_result STREQUAL "0") 52 | message(FATAL_ERROR "Unounting ${dmg_path} failed: ${unmount_result}") 53 | endif() 54 | -------------------------------------------------------------------------------- /ports/intel-mkl/usage: -------------------------------------------------------------------------------- 1 | intel-mkl provides CMake targets: 2 | 3 | find_package(MKL CONFIG REQUIRED) 4 | target_link_libraries(main PRIVATE MKL::MKL) 5 | -------------------------------------------------------------------------------- /ports/intel-mkl/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "intel-mkl", 3 | "version": "2023.0.0", 4 | "port-version": 3, 5 | "description": "Intel® Math Kernel Library (Intel® MKL) accelerates math processing routines, increases application performance, and reduces development time on Intel® processors.", 6 | "homepage": "https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html", 7 | "license": null, 8 | "supports": "(windows | linux | osx) & x64", 9 | 
"dependencies": [ 10 | { 11 | "name": "vcpkg-tool-lessmsi", 12 | "host": true, 13 | "platform": "windows" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /ports/onnxruntime/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "onnxruntime", 3 | "version": "1.17.3", 4 | "description": "onnxruntime", 5 | "homepage": "https://github.com/microsoft/onnxruntime", 6 | "license": "MIT", 7 | "supports": "(windows | linux | osx)" 8 | } 9 | -------------------------------------------------------------------------------- /ports/rocksdb/0001-fix-dependencies.patch: -------------------------------------------------------------------------------- 1 | CMakeLists.txt | 33 +++++++++++++++------------------ 2 | cmake/RocksDBConfig.cmake.in | 11 ++++++++--- 3 | 2 files changed, 23 insertions(+), 21 deletions(-) 4 | 5 | diff --git a/CMakeLists.txt b/CMakeLists.txt 6 | index 23a4014bc..045f5a36d 100644 7 | --- a/CMakeLists.txt 8 | +++ b/CMakeLists.txt 9 | @@ -87,7 +87,7 @@ endif() 10 | 11 | include(CMakeDependentOption) 12 | 13 | -if(MSVC) 14 | +if(0) 15 | option(WITH_GFLAGS "build with GFlags" OFF) 16 | option(WITH_XPRESS "build with windows built in compression" OFF) 17 | option(ROCKSDB_SKIP_THIRDPARTY "skip thirdparty.inc" OFF) 18 | @@ -136,10 +136,7 @@ else() 19 | endif() 20 | 21 | if(WITH_SNAPPY) 22 | - find_package(Snappy CONFIG) 23 | - if(NOT Snappy_FOUND) 24 | - find_package(Snappy REQUIRED) 25 | - endif() 26 | + find_package(Snappy CONFIG REQUIRED) 27 | add_definitions(-DSNAPPY) 28 | list(APPEND THIRDPARTY_LIBS Snappy::snappy) 29 | endif() 30 | @@ -163,16 +160,19 @@ else() 31 | endif() 32 | 33 | if(WITH_LZ4) 34 | - find_package(lz4 REQUIRED) 35 | + find_package(lz4 CONFIG REQUIRED) 36 | add_definitions(-DLZ4) 37 | list(APPEND THIRDPARTY_LIBS lz4::lz4) 38 | endif() 39 | 40 | if(WITH_ZSTD) 41 | - find_package(zstd REQUIRED) 42 | + find_package(zstd CONFIG REQUIRED) 43 | add_definitions(-DZSTD) 44 | - include_directories(${ZSTD_INCLUDE_DIR}) 45 | - list(APPEND THIRDPARTY_LIBS zstd::zstd) 46 | + if(TARGET zstd::libzstd_shared) 47 | + list(APPEND THIRDPARTY_LIBS zstd::libzstd_shared) 48 | + elseif(TARGET zstd::libzstd_static) 49 | + list(APPEND THIRDPARTY_LIBS zstd::libzstd_static) 50 | + endif() 51 | endif() 52 | endif() 53 | 54 | @@ -312,11 +312,10 @@ int main() { 55 | endif() 56 | 57 | if (WITH_LIBURING) 58 | - find_package(uring) 59 | - if (uring_FOUND) 60 | - add_definitions(-DROCKSDB_IOURING_PRESENT) 61 | - list(APPEND THIRDPARTY_LIBS uring::uring) 62 | - endif() 63 | + find_package(PkgConfig) 64 | + pkg_check_modules(liburing REQUIRED IMPORTED_TARGET GLOBAL liburing>=2.0) 65 | + add_definitions(-DROCKSDB_IOURING_PRESENT) 66 | + list(APPEND THIRDPARTY_LIBS PkgConfig::liburing) 67 | endif() 68 | 69 | # Reset the required flags 70 | @@ -382,9 +381,9 @@ endif() 71 | 72 | option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF) 73 | if(WITH_TBB) 74 | - find_package(TBB REQUIRED) 75 | + find_package(TBB CONFIG REQUIRED) 76 | add_definitions(-DTBB) 77 | - list(APPEND THIRDPARTY_LIBS TBB::TBB) 78 | + list(APPEND THIRDPARTY_LIBS TBB::tbb) 79 | endif() 80 | 81 | # Stall notifications eat some performance from inserts 82 | @@ -1202,8 +1201,6 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) 83 | endforeach() 84 | endforeach() 85 | 86 | - install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" COMPONENT devel DESTINATION ${package_config_destination}) 87 | - 88 | install( 89 | TARGETS 
${ROCKSDB_STATIC_LIB} 90 | EXPORT RocksDBTargets 91 | diff --git a/cmake/RocksDBConfig.cmake.in b/cmake/RocksDBConfig.cmake.in 92 | index 0bd14be11..a420d8bfe 100644 93 | --- a/cmake/RocksDBConfig.cmake.in 94 | +++ b/cmake/RocksDBConfig.cmake.in 95 | @@ -33,11 +33,11 @@ if(@WITH_BZ2@) 96 | endif() 97 | 98 | if(@WITH_LZ4@) 99 | - find_dependency(lz4) 100 | + find_dependency(lz4 CONFIG) 101 | endif() 102 | 103 | if(@WITH_ZSTD@) 104 | - find_dependency(zstd) 105 | + find_dependency(zstd CONFIG) 106 | endif() 107 | 108 | if(@WITH_NUMA@) 109 | @@ -45,7 +45,12 @@ if(@WITH_NUMA@) 110 | endif() 111 | 112 | if(@WITH_TBB@) 113 | - find_dependency(TBB) 114 | + find_dependency(TBB CONFIG) 115 | +endif() 116 | + 117 | +if(@WITH_LIBURING@) 118 | + find_dependency(PkgConfig) 119 | + pkg_check_modules(liburing REQUIRED IMPORTED_TARGET GLOBAL liburing>=2.0) 120 | endif() 121 | 122 | find_dependency(Threads) 123 | -------------------------------------------------------------------------------- /ports/rocksdb/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO facebook/rocksdb 4 | REF "v${VERSION}" 5 | SHA512 524e3e70ed2b1d2e6c61a7b401946e50473cc95684ce4efc6250062f5bc945e443e96f7907fcc3ee1ab98c71179a8b56a654383cf2c0bbe1bb20907ab1ac7523 6 | HEAD_REF main 7 | PATCHES 8 | 0001-fix-dependencies.patch 9 | ) 10 | 11 | string(COMPARE EQUAL "${VCPKG_CRT_LINKAGE}" "dynamic" WITH_MD_LIBRARY) 12 | string(COMPARE EQUAL "${VCPKG_LIBRARY_LINKAGE}" "dynamic" ROCKSDB_BUILD_SHARED) 13 | 14 | vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS 15 | FEATURES 16 | "liburing" WITH_LIBURING 17 | "snappy" WITH_SNAPPY 18 | "lz4" WITH_LZ4 19 | "zlib" WITH_ZLIB 20 | "zstd" WITH_ZSTD 21 | "bzip2" WITH_BZ2 22 | "numa" WITH_NUMA 23 | "tbb" WITH_TBB 24 | ) 25 | 26 | vcpkg_cmake_configure( 27 | SOURCE_PATH "${SOURCE_PATH}" 28 | OPTIONS 29 | -DWITH_GFLAGS=OFF 30 | -DWITH_TESTS=OFF 31 | -DWITH_BENCHMARK_TOOLS=OFF 32 | -DWITH_TOOLS=OFF 33 | -DUSE_RTTI=ON 34 | -DROCKSDB_INSTALL_ON_WINDOWS=ON 35 | -DFAIL_ON_WARNINGS=OFF 36 | -DWITH_MD_LIBRARY=${WITH_MD_LIBRARY} 37 | -DPORTABLE=1 # Minimum CPU arch to support, or 0 = current CPU, 1 = baseline CPU 38 | -DROCKSDB_BUILD_SHARED=${ROCKSDB_BUILD_SHARED} 39 | -DCMAKE_DISABLE_FIND_PACKAGE_Git=TRUE 40 | ${FEATURE_OPTIONS} 41 | OPTIONS_DEBUG 42 | -DCMAKE_DEBUG_POSTFIX=d 43 | -DWITH_RUNTIME_DEBUG=ON 44 | OPTIONS_RELEASE 45 | -DWITH_RUNTIME_DEBUG=OFF 46 | ) 47 | 48 | vcpkg_cmake_install() 49 | 50 | vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/rocksdb) 51 | 52 | vcpkg_copy_pdbs() 53 | 54 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") 55 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share") 56 | 57 | vcpkg_fixup_pkgconfig() 58 | 59 | vcpkg_install_copyright(COMMENT [[ 60 | RocksDB is dual-licensed under both the GPLv2 (found in COPYING) 61 | and Apache 2.0 License (found in LICENSE.Apache). You may select, 62 | at your option, one of the above-listed licenses. 
63 | ]] 64 | FILE_LIST 65 | "${SOURCE_PATH}/LICENSE.leveldb" 66 | "${SOURCE_PATH}/LICENSE.Apache" 67 | "${SOURCE_PATH}/COPYING" 68 | ) 69 | -------------------------------------------------------------------------------- /ports/rocksdb/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rocksdb", 3 | "version": "8.10.0", 4 | "description": "A library that provides an embeddable, persistent key-value store for fast storage", 5 | "homepage": "https://github.com/facebook/rocksdb", 6 | "license": "GPL-2.0-only OR Apache-2.0", 7 | "supports": "!uwp & !(arm & !arm64 & android)", 8 | "dependencies": [ 9 | { 10 | "name": "vcpkg-cmake", 11 | "host": true 12 | }, 13 | { 14 | "name": "vcpkg-cmake-config", 15 | "host": true 16 | } 17 | ], 18 | "default-features": [ 19 | "zlib" 20 | ], 21 | "features": { 22 | "bzip2": { 23 | "description": "build with bzip2", 24 | "dependencies": [ 25 | "bzip2" 26 | ] 27 | }, 28 | "liburing": { 29 | "description": "build with liburing", 30 | "supports": "linux", 31 | "dependencies": [ 32 | { 33 | "name": "liburing", 34 | "platform": "linux" 35 | } 36 | ] 37 | }, 38 | "lz4": { 39 | "description": "build with lz4", 40 | "dependencies": [ 41 | "lz4" 42 | ] 43 | }, 44 | "numa": { 45 | "description": "build with NUMA policy support", 46 | "supports": "linux" 47 | }, 48 | "snappy": { 49 | "description": "build with SNAPPY", 50 | "dependencies": [ 51 | "snappy" 52 | ] 53 | }, 54 | "tbb": { 55 | "description": "build with Threading Building Blocks (TBB)", 56 | "dependencies": [ 57 | "tbb" 58 | ] 59 | }, 60 | "zlib": { 61 | "description": "build with zlib", 62 | "dependencies": [ 63 | "zlib" 64 | ] 65 | }, 66 | "zstd": { 67 | "description": "build with zstd", 68 | "dependencies": [ 69 | "zstd" 70 | ] 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["scikit-build-core >=0.4.3", "nanobind >=1.3.2"] 3 | build-backend = "scikit_build_core.build" 4 | 5 | [project] 6 | name = "lintdb" 7 | version = "0.5.1" 8 | description = "Python library for LintDB, a vector database for token embeddings and late interaction." 
9 | readme = "README.md" 10 | license = {text = "Apache-2.0 License"} 11 | requires-python = ">=3.9" 12 | authors = [ 13 | { name = "Matt Barta", email = "matt@deployql.com" }, 14 | ] 15 | classifiers = [ 16 | "License :: OSI Approved :: Apache-2.0 License", 17 | ] 18 | dependencies = [ 19 | "typing_extensions", 20 | ] 21 | 22 | [project.urls] 23 | Homepage = "https://github.com/deployQL/lintdb" 24 | 25 | [tool.pytest.ini_options] 26 | #pythonpath = [ "./builds/python/lintdb/python/Release", "lintdb/python" ] 27 | testpaths = [ 28 | "lintdb/python/tests" 29 | ] 30 | 31 | [tool.scikit-build] 32 | # Protect the configuration against future changes in scikit-build-core 33 | minimum-version = "0.4" 34 | 35 | # Setuptools-style build caching in a local directory 36 | build-dir = "builds/{wheel_tag}" 37 | 38 | # Build stable ABI wheels for CPython 3.12+ 39 | #wheel.py-api = "cp310" 40 | 41 | cmake.build-type = "Release" 42 | 43 | sdist.cmake = true 44 | wheel.packages = ["lintdb/python"] 45 | cmake.targets = ['lintdb_lib', 'core'] 46 | 47 | [tool.scikit-build.cmake.define] 48 | # to use a shared library, we need to jump through some hoops for python: https://github.com/scikit-build/scikit-build/issues/272 49 | # SO link: https://stackoverflow.com/questions/70044257/packaging-executable-shared-library-and-python-bindings-not-finding-library 50 | BUILD_SHARED_LIBS = "OFF" 51 | CMAKE_CXX_COMPILER = "clang++" 52 | #CMAKE_INSTALL_LIBDIR = "lib" 53 | CMAKE_VERBOSE_MAKEFILE = "OFF" 54 | ENABLE_PYTHON = "ON" 55 | BUILD_TESTING = "OFF" 56 | #BLA_VENDOR = "Intel10_64lp" -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LINT_DB_TESTS 2 | util.h 3 | coarse_quantizer_test.cpp 4 | doc_iterator_test.cpp 5 | index_test.cpp 6 | mocks.h 7 | keys_test.cpp 8 | doc_encoder_test.cpp 9 | colbert_test.cpp 10 | plaid_test.cpp 11 | binarizer_test.cpp 12 | inverted_list_test.cpp 13 | doc_processor_test.cpp 14 | product_quantizer_test.cpp) 15 | 16 | add_executable(lintdb-tests ${LINT_DB_TESTS}) 17 | 18 | target_link_libraries(lintdb-tests PRIVATE lintdb_lib) 19 | 20 | find_package(Bitsery CONFIG REQUIRED) 21 | target_link_libraries(lintdb-tests PRIVATE Bitsery::bitsery) 22 | 23 | enable_testing() 24 | 25 | find_package(GTest CONFIG REQUIRED) 26 | 27 | include(FetchContent) 28 | set(BUILD_GMOCK CACHE BOOL OFF) 29 | set(INSTALL_GTEST CACHE BOOL OFF) 30 | FetchContent_Declare( 31 | googletest 32 | URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip 33 | ) 34 | 35 | # target_include_directories(lintdb-tests PUBLIC 36 | # $) 37 | 38 | target_link_libraries(lintdb-tests PRIVATE GTest::gtest GTest::gtest_main 39 | GTest::gmock GTest::gmock_main) 40 | 41 | include(GoogleTest) 42 | gtest_discover_tests(lintdb-tests 43 | WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/tests") 44 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/__init__.py -------------------------------------------------------------------------------- /tests/colbert_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "lintdb/index.h" 7 | #include 
"lintdb/query/Query.h" 8 | #include "lintdb/query/QueryNode.h" 9 | #include "lintdb/schema/DataTypes.h" 10 | #include 11 | #include 12 | #include 13 | 14 | #define DATABASE_PATH "data/colbert_test.db" 15 | #define QUERY_EMBEDDING_PATH "data/query.txt" 16 | #define EXPECTED_RESULTS_PATH "data/colbert.ranking.tsv" 17 | 18 | using namespace std; 19 | /** 20 | * This test uses 1 query from LoTTE lifestyle and 1,000 documents. 21 | * 22 | * This is a fairly relaxed test. We ensure that the top doc ids are correct, but don't 23 | * enforce the order or score. 24 | * 25 | * We can notice scores change slightly between any given indexing run. 26 | */ 27 | TEST(ColBertTests, ScoresCorrectly) { 28 | auto index = lintdb::IndexIVF(DATABASE_PATH); 29 | 30 | // read query embeddings 31 | std::ifstream queryFile; 32 | queryFile.open(QUERY_EMBEDDING_PATH); 33 | std::string line; 34 | std::vector embeddings; 35 | 36 | while(std::getline(queryFile, line)) { 37 | std::stringstream buf(line); 38 | std::string tmp; 39 | while(getline(buf, tmp, ' ')) { 40 | float f = std::stof(tmp); 41 | embeddings.push_back(f); 42 | } 43 | } 44 | // we save a padded query, which should be 32 tokens long. 45 | ASSERT_EQ(embeddings.size(), 32 * 128); 46 | 47 | lintdb::SearchOptions searchOpts; 48 | searchOpts.k_top_centroids = 32; 49 | 50 | lintdb::FieldValue fv("colbert", embeddings, 32); 51 | std::unique_ptr root = std::make_unique(fv); 52 | lintdb::Query query(std::move(root)); 53 | 54 | std::vector results = index.search(0, query, 4, searchOpts); 55 | 56 | // print result ids and score 57 | for (auto& result : results) { 58 | std::cout << result.id << " " << result.score << std::endl; 59 | } 60 | 61 | ifstream dataFile; 62 | dataFile.open(EXPECTED_RESULTS_PATH); 63 | 64 | // read each line. 65 | std::unordered_set doc_ids; 66 | int count = 0; 67 | while(!dataFile.eof() && count < 4) { 68 | std::string str; 69 | std::getline( dataFile, str); 70 | std::stringstream buffer(str); 71 | std::string tmp; 72 | 73 | // read each column 74 | int doc_id; 75 | float doc_score; 76 | int ranking = 0; 77 | 78 | int i = 0; 79 | while( getline( buffer, tmp, '\t') ) { 80 | if (i == 1) { 81 | // doc id 82 | doc_id = std::stoi(tmp); 83 | } 84 | if (i==2) { 85 | ranking = std::stoi(tmp); 86 | } 87 | if (i==3) { 88 | doc_score = std::stof(tmp); 89 | } 90 | i++; 91 | } 92 | doc_ids.insert(doc_id); 93 | count++; 94 | } 95 | 96 | // check if the top 10 doc ids are in the expected results. 
97 | for (auto& result : results) { 98 | ASSERT_TRUE(doc_ids.find(result.id) != doc_ids.end()) << "Doc id " << result.id << " not found in expected results"; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /tests/data/colbert.ranking.tsv: -------------------------------------------------------------------------------- 1 | 1 509 1 15.1640625 2 | 1 619 2 14.296875 3 | 1 795 3 14.2734375 4 | 1 637 4 14.046875 5 | 1 716 5 14.0078125 6 | 1 55 6 13.546875 7 | 1 311 7 13.3203125 8 | 1 513 8 12.6875 9 | 1 313 9 12.2265625 10 | 1 787 10 11.5390625 11 | 1 323 11 11.3125 12 | 1 960 12 11.28125 13 | 1 686 13 11.2421875 14 | 1 767 14 11.1640625 15 | 1 33 15 11.1171875 16 | 1 267 16 11.09375 17 | 1 629 17 11.0625 18 | 1 451 18 10.984375 19 | 1 321 19 10.9375 20 | 1 682 20 10.6953125 21 | 1 237 21 10.390625 22 | 1 644 22 10.3828125 23 | 1 683 23 10.328125 24 | 1 937 24 10.2734375 25 | 1 362 25 10.1796875 26 | 1 25 26 10.078125 27 | 1 45 27 10.078125 28 | 1 886 28 9.96875 29 | 1 727 29 9.828125 30 | 1 609 30 9.6796875 31 | 1 478 31 9.6484375 32 | 1 602 32 9.5703125 33 | 1 338 33 9.484375 34 | 1 514 34 9.40625 35 | 1 608 35 9.359375 36 | 1 390 36 9.3046875 37 | 1 822 37 9.28125 38 | 1 598 38 9.2734375 39 | 1 755 39 9.265625 40 | 1 835 40 9.1484375 41 | 1 701 41 9.140625 42 | 1 229 42 9.1328125 43 | 1 456 43 9.125 44 | 1 621 44 9.0390625 45 | 1 786 45 9.0234375 46 | 1 914 46 9.0234375 47 | 1 403 47 9.015625 48 | 1 620 48 9.0 49 | 1 24 49 8.9453125 50 | 1 42 50 8.9375 51 | 1 356 51 8.9375 52 | 1 373 52 8.9140625 53 | 1 500 53 8.8828125 54 | 1 687 54 8.8828125 55 | 1 344 55 8.84375 56 | 1 924 56 8.828125 57 | 1 988 57 8.828125 58 | 1 197 58 8.75 59 | 1 912 59 8.71875 60 | 1 925 60 8.7109375 61 | 1 378 61 8.6875 62 | 1 867 62 8.6875 63 | 1 417 63 8.65625 64 | 1 695 64 8.65625 65 | 1 601 65 8.6484375 66 | 1 436 66 8.640625 67 | 1 699 67 8.640625 68 | 1 473 68 8.609375 69 | 1 603 69 8.609375 70 | 1 111 70 8.59375 71 | 1 315 71 8.5859375 72 | 1 455 72 8.5859375 73 | 1 582 73 8.5 74 | 1 148 74 8.4921875 75 | 1 492 75 8.4921875 76 | 1 352 76 8.46875 77 | 1 384 77 8.4609375 78 | 1 814 78 8.4609375 79 | 1 249 79 8.375 80 | 1 260 80 8.3671875 81 | 1 864 81 8.359375 82 | 1 333 82 8.3359375 83 | 1 944 83 8.3203125 84 | 1 196 84 8.3125 85 | 1 604 85 8.3125 86 | 1 471 86 8.2890625 87 | 1 409 87 8.2734375 88 | 1 553 88 8.2734375 89 | 1 992 89 8.2734375 90 | 1 275 90 8.25 91 | 1 804 91 8.2265625 92 | 1 673 92 8.21875 93 | 1 948 93 8.15625 94 | 1 995 94 8.15625 95 | 1 309 95 8.1328125 96 | 1 281 96 8.09375 97 | 1 950 97 8.0546875 98 | 1 649 98 8.0234375 99 | 1 357 99 8.015625 100 | 1 73 100 8.0 101 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000008.sst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000008.sst -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000009.sst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000009.sst -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000010.sst: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000010.sst -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000011.sst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000011.sst -------------------------------------------------------------------------------- /tests/data/colbert_test.db/000176.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/000176.log -------------------------------------------------------------------------------- /tests/data/colbert_test.db/CURRENT: -------------------------------------------------------------------------------- 1 | MANIFEST-000177 2 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/IDENTITY: -------------------------------------------------------------------------------- 1 | ca27e180-c0ac-40a6-8ba1-abc5931d9ca6 -------------------------------------------------------------------------------- /tests/data/colbert_test.db/LOCK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/LOCK -------------------------------------------------------------------------------- /tests/data/colbert_test.db/MANIFEST-000177: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/MANIFEST-000177 -------------------------------------------------------------------------------- /tests/data/colbert_test.db/_field_mapper.json: -------------------------------------------------------------------------------- 1 | { 2 | "idToField" : [ 3 | { 4 | "data_type" : 0, 5 | "field_types" : [ 3 ], 6 | "name" : "colbert", 7 | "parameters" : { 8 | "analyzer" : "", 9 | "dimensions" : 128, 10 | "nbits" : 1, 11 | "num_centroids" : 32768, 12 | "num_iterations" : 10, 13 | "num_subquantizers" : 0, 14 | "quantization" : 2 15 | } 16 | } 17 | ], 18 | "nameToID" : { 19 | "colbert" : 0 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/_lintdb_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "lintdb_version" : "0.4.1" 3 | } 4 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields" : [ 3 | { 4 | "data_type" : 0, 5 | "field_types" : [ 3 ], 6 | "name" : "colbert", 7 | "parameters" : { 8 | "analyzer" : "", 9 | "dimensions" : 128, 10 | "nbits" : 1, 11 | "num_centroids" : 32768, 12 | "num_iterations" : 10, 13 | "num_subquantizers" : 0, 14 | "quantization" : 2 15 | } 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tests/data/colbert_test.db/colbert_coarse_quantizer: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DeployQL/LintDB/bba57a67d1289edc14bdc00cb3c45df60877e2e4/tests/data/colbert_test.db/colbert_coarse_quantizer -------------------------------------------------------------------------------- /tests/doc_encoder_test.cpp: -------------------------------------------------------------------------------- 1 | #include <gtest/gtest.h> 2 | #include "lintdb/schema/DocEncoder.h" 3 | #include "bitsery/bitsery.h" 4 | #include "bitsery/adapter/buffer.h" 5 | #include "lintdb/schema/DocEncoder.h" 6 | #include "lintdb/schema/ProcessedData.h" 7 | 8 | TEST(DocEncoder, EncodeInvertedDataForTensorDataType) { 9 | lintdb::DocEncoder encoder; 10 | lintdb::ProcessedData data; 11 | data.value.data_type = lintdb::DataType::TENSOR; 12 | data.value.num_tensors = 2; 13 | data.centroid_ids = {1, 2}; 14 | data.tenant = 0; 15 | data.field = 1; 16 | data.doc_id = 1; 17 | data.value.value = lintdb::Tensor{1.0f, 2.0f, 3.0f, 4.0f}; 18 | 19 | auto result = encoder.encode_inverted_data(data, 2); 20 | 21 | EXPECT_EQ(result.size(), 2); 22 | } 23 | 24 | TEST(DocEncoder, EncodeInvertedDataForNonTensorDataType) { 25 | lintdb::DocEncoder encoder; 26 | lintdb::ProcessedData data; 27 | data.value.data_type = lintdb::DataType::INTEGER; 28 | data.value.value = 10; 29 | data.tenant = 0; 30 | data.field = 1; 31 | data.doc_id = 1; 32 | 33 | auto result = encoder.encode_inverted_data(data, 2); 34 | 35 | EXPECT_EQ(result.size(), 1); 36 | } 37 | 38 | TEST(DocEncoder, EncodeInvertedMappingData) { 39 | lintdb::DocEncoder encoder; 40 | lintdb::ProcessedData data; 41 | data.tenant = 0; 42 | data.field = 1; 43 | data.doc_id = 1; 44 | data.centroid_ids = {1, 2, 3}; 45 | 46 | auto result = encoder.encode_inverted_mapping_data(data); 47 | 48 | EXPECT_EQ(result.size(), 1); 49 | } 50 | 51 | 52 | TEST(DocEncoder, EncodeContextData) { 53 | lintdb::DocEncoder encoder; 54 | lintdb::ProcessedData data; 55 | data.tenant = 0; 56 | data.field = 1; 57 | data.doc_id = 1; 58 | data.value.value = "context"; 59 | 60 | auto result = encoder.encode_context_data(data); 61 | 62 | EXPECT_FALSE(result.key.empty()); 63 | EXPECT_FALSE(result.value.empty()); 64 | } -------------------------------------------------------------------------------- /tests/inverted_list_test.cpp: -------------------------------------------------------------------------------- 1 | #include <gtest/gtest.h> 2 | #include <gmock/gmock.h> 3 | #include <rocksdb/db.h> 4 | #include <rocksdb/options.h> 5 | #include <cassert> 6 | #include <filesystem> 7 | #include <memory> 8 | #include <string> 9 | #include <vector> 10 | #include "lintdb/cf.h" 11 | #include "lintdb/index.h" 12 | #include "lintdb/version.h" 13 | #include "util.h" 14 | #include "lintdb/invlists/KeyBuilder.h" 15 | 16 | using ::testing::Test; 17 | using ::testing::Values; 18 | 19 | class InvertedListTest : public Test { 20 | public: 21 | ~InvertedListTest() override {} 22 | void SetUp() override { 23 | version = lintdb::Version(); 24 | temp_db = create_temporary_directory(); 25 | rocksdb::Options options; 26 | options.create_if_missing = true; 27 | options.create_missing_column_families = true; 28 | 29 | auto cfs = lintdb::create_column_families(); 30 | 31 | rocksdb::DB* ptr; 32 | rocksdb::Status s = rocksdb::DB::Open( 33 | options, temp_db, cfs, &column_families, &ptr); 34 | 35 | assert(s.ok()); 36 | this->db = std::shared_ptr<rocksdb::DB>(ptr); 37 | } 38 | void TearDown() override { 39 | for (auto cf : column_families) { 40 | db->DestroyColumnFamilyHandle(cf); 41 | } 42 | std::filesystem::remove_all(temp_db); 43 | } 44 | 45 | protected: 46 | lintdb::Version version; 47 | std::filesystem::path temp_db; 48 | std::shared_ptr<rocksdb::DB> db; 49 | std::vector<rocksdb::ColumnFamilyHandle*>
column_families; 50 | }; 51 | 52 | TEST_F(InvertedListTest, StoresCodesCorrectly) { 53 | lintdb::RocksdbInvertedList invlist(db, column_families, version); 54 | 55 | 56 | auto one = lintdb::create_index_id(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 1, 555); 57 | auto two = lintdb::create_index_id(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 1, 556); 58 | auto three = lintdb::create_index_id(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 3, 555); 59 | rocksdb::WriteOptions wo; 60 | this->db->Put(wo, column_families[lintdb::kIndexColumnIndex], one, "value"); 61 | this->db->Put(wo, column_families[lintdb::kIndexColumnIndex], two, "value"); 62 | this->db->Put(wo, column_families[lintdb::kIndexColumnIndex], three, "value"); 63 | 64 | std::string prefix = lintdb::create_index_prefix(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 1); 65 | auto it1 = invlist.get_iterator(prefix); 66 | 67 | // inverted list should have 2 entries 68 | EXPECT_TRUE(it1->is_valid()); 69 | auto key = it1->get_key(); 70 | ASSERT_EQ(key.doc_id(), 555); 71 | 72 | std::string val = it1->get_value(); 73 | ASSERT_EQ(val, "value"); 74 | 75 | it1->next(); 76 | 77 | EXPECT_TRUE(it1->is_valid()); 78 | key = it1->get_key(); 79 | ASSERT_EQ(key.doc_id(), 556); 80 | 81 | val = it1->get_value(); 82 | ASSERT_EQ(val, "value"); 83 | 84 | // only two documents. 85 | it1->next(); 86 | EXPECT_FALSE(it1->is_valid()); 87 | 88 | 89 | std::string prefix_three = lintdb::create_index_prefix(0, 1, lintdb::DataType::QUANTIZED_TENSOR, 3); 90 | auto it3 = invlist.get_iterator(prefix_three); 91 | 92 | EXPECT_TRUE(it3->is_valid()); 93 | 94 | auto key_three = it3->get_key(); 95 | ASSERT_EQ(key_three.doc_id(), 555); 96 | 97 | std::string val_three = it3->get_value(); 98 | ASSERT_EQ(val_three, "value"); 99 | 100 | // only one document. 
101 | it3->next(); 102 | EXPECT_FALSE(it3->is_valid()); 103 | 104 | } -------------------------------------------------------------------------------- /tests/keys_test.cpp: -------------------------------------------------------------------------------- 1 | #include <gtest/gtest.h> 2 | #include "lintdb/invlists/KeyBuilder.h" 3 | #include "lintdb/schema/DataTypes.h" 4 | #include <chrono> 5 | #include <iostream> 6 | 7 | class KeySerializationTests : public ::testing::Test { 8 | protected: 9 | lintdb::KeyBuilder builder; 10 | }; 11 | 12 | TEST_F(KeySerializationTests, SerializeAndDeserializeInvertedIndexKey_IntegerType) { 13 | std::string expectedKey = builder.add(static_cast<uint64_t>(1)) // tenant 14 | .add(static_cast<uint8_t>(2)) // field 15 | .add(lintdb::DataType::INTEGER) // field_type 16 | .add(static_cast<idx_t>(3)) // inverted_list 17 | .add(static_cast<idx_t>(4)) // doc_id 18 | .build(); 19 | lintdb::InvertedIndexKey key(expectedKey); 20 | ASSERT_EQ(key.field(), uint8_t(2)); 21 | idx_t actual = std::get<idx_t>(key.field_value()); 22 | ASSERT_EQ(actual, 3); 23 | ASSERT_EQ(key.doc_id(), 4); 24 | } 25 | 26 | TEST_F(KeySerializationTests, SerializeAndDeserializeInvertedIndexKey_StringType) { 27 | std::string expectedKey = lintdb::create_index_id(1, 2, lintdb::DataType::TEXT, "some value", 123); 28 | lintdb::InvertedIndexKey key(expectedKey); 29 | ASSERT_EQ(key.field(), uint8_t(2)); 30 | auto actual = std::get<std::string>(key.field_value()); 31 | ASSERT_EQ(actual, "some value"); 32 | ASSERT_EQ(key.doc_id(), 123); 33 | } 34 | 35 | TEST_F(KeySerializationTests, SerializeAndDeserializeInvertedIndexKey_DateType) { 36 | lintdb::DateTime now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::system_clock::now()); 37 | std::string expectedKey = lintdb::create_index_id(1, 2, lintdb::DataType::DATETIME, lintdb::DateTime(now), 123); 38 | lintdb::InvertedIndexKey key(expectedKey); 39 | ASSERT_EQ(key.field(), uint8_t(2)); 40 | 41 | std::visit([](auto&& arg) { 42 | using T = std::decay_t<decltype(arg)>; 43 | if constexpr (std::is_same_v<T, lintdb::DateTime>) { 44 | // Handle DateTime 45 | std::cout << "DateTime with ms: " << arg.time_since_epoch().count() << std::endl; 46 | } else { 47 | // Handle other types 48 | std::cout << "Not a DateTime" << std::endl; 49 | } 50 | }, key.field_value()); 51 | 52 | auto actual = std::get<lintdb::DateTime>(key.field_value()); 53 | ASSERT_EQ(actual, now); 54 | ASSERT_EQ(key.doc_id(), 123); 55 | } 56 | 57 | TEST_F(KeySerializationTests, SerializeAndDeserializeContextKey) { 58 | std::string expectedKey = builder.add(static_cast<uint64_t>(1)) // tenant 59 | .add(static_cast<uint8_t>(2)) // field 60 | .add(static_cast<idx_t>(3)) // doc_id 61 | .build(); 62 | lintdb::ContextKey key(expectedKey); 63 | ASSERT_EQ(key.doc_id(), 3); 64 | } 65 | 66 | TEST_F(KeySerializationTests, SerializeAndDeserializeForwardIndexKey) { 67 | std::string expectedKey = builder.add(static_cast<uint64_t>(1)) // tenant 68 | .add(static_cast<idx_t>(2)) // doc_id 69 | .build(); 70 | lintdb::ForwardIndexKey key(expectedKey); 71 | ASSERT_EQ(key.doc_id(), 2); 72 | } -------------------------------------------------------------------------------- /tests/mocks.h: -------------------------------------------------------------------------------- 1 | #ifndef LINTDB_MOCKS_H 2 | #define LINTDB_MOCKS_H 3 | 4 | #include "lintdb/invlists/InvertedList.h" 5 | #include "lintdb/invlists/Iterator.h" 6 | #include "lintdb/quantizers/ProductEncoder.h" 7 | #include "lintdb/quantizers/Quantizer.h" 8 | #include "lintdb/invlists/IndexWriter.h" 9 | #include "lintdb/quantizers/CoarseQuantizer.h" 10 | #include <gmock/gmock.h> 11 | #include <gtest/gtest.h> 12 | #include <memory> 13 | #include <string> 14 | 15 | 16 | class MockIndexWriter : public
lintdb::IIndexWriter { 17 | public: 18 | MOCK_METHOD(void, write, (const lintdb::BatchPostingData& batch_posting_data), (override)); 19 | }; 20 | 21 | class MockQuantizer : public lintdb::Quantizer { 22 | public: 23 | MOCK_METHOD(void, train, (const size_t n, const float* x, const size_t dim), (override)); 24 | MOCK_METHOD(void, save, (const std::string path), (override)); 25 | MOCK_METHOD(void, sa_encode, (size_t n, const float* x, residual_t* codes), (override)); 26 | MOCK_METHOD(void, sa_decode, (size_t n, const residual_t* codes, float* x), (override)); 27 | MOCK_METHOD(size_t, code_size, (), (override)); 28 | MOCK_METHOD(size_t, get_nbits, (), (override)); 29 | MOCK_METHOD(lintdb::QuantizerType, get_type, (), (override)); 30 | }; 31 | 32 | class MockCoarseQuantizer : public lintdb::ICoarseQuantizer { 33 | public: 34 | MOCK_METHOD(void, train, (const size_t n, const float* x, size_t k, size_t num_iter), (override)); 35 | MOCK_METHOD(void, save, (const std::string& path), (override)); 36 | MOCK_METHOD(void, assign, (size_t n, const float* x, idx_t* codes), (override)); 37 | MOCK_METHOD(void, sa_decode, (size_t n, const idx_t* codes, float* x), (override)); 38 | MOCK_METHOD(void, compute_residual, (const float* vec, float* residual, idx_t centroid_id), (override)); 39 | MOCK_METHOD(void, compute_residual_n, (int n, const float* vec, float* residual, idx_t* centroid_ids), (override)); 40 | MOCK_METHOD(void, reconstruct, (idx_t centroid_id, float* embedding), (override)); 41 | MOCK_METHOD(void, search, (size_t num_query_tok, const float* data, size_t k_top_centroids, float* distances, idx_t* coarse_idx), (override)); 42 | MOCK_METHOD(void, reset, (), (override)); 43 | MOCK_METHOD(void, add, (int n, float* data), (override)); 44 | MOCK_METHOD(size_t, code_size, (), (override)); 45 | MOCK_METHOD(size_t, num_centroids, (), (override)); 46 | MOCK_METHOD(float*, get_xb, (), (override)); 47 | MOCK_METHOD(void, serialize, (const std::string& filename), (const, override)); 48 | MOCK_METHOD(bool, is_trained, (), (const, override)); 49 | 50 | }; 51 | 52 | #endif // LINTDB_MOCKS_H 53 | -------------------------------------------------------------------------------- /tests/util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <cstdint> 3 | #include <filesystem> 4 | #include <random> 5 | #include <sstream> 6 | #include <stdexcept> 7 | 8 | #include <string> 9 | 10 | inline std::filesystem::path create_temporary_directory( 11 | unsigned long long max_tries = 1000) { 12 | auto tmp_dir = std::filesystem::temp_directory_path(); 13 | unsigned long long i = 0; 14 | std::random_device dev; 15 | std::mt19937 prng(dev()); 16 | std::uniform_int_distribution<uint64_t> rand(0); 17 | std::filesystem::path path; 18 | while (true) { 19 | std::stringstream ss; 20 | ss << std::hex << rand(prng); 21 | path = tmp_dir / ss.str(); 22 | // true if the directory was created.
23 | if (std::filesystem::create_directory(path)) { 24 | break; 25 | } 26 | if (i == max_tries) { 27 | throw std::runtime_error("could not find non-existing directory"); 28 | } 29 | i++; 30 | } 31 | return path; 32 | } 33 | -------------------------------------------------------------------------------- /vcpkg-configuration.json: -------------------------------------------------------------------------------- 1 | { 2 | "overlay-ports": [ 3 | "./ports" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lintdb", 3 | "version-string": "0.5.1", 4 | "license": "MIT", 5 | "dependencies": [ 6 | "faiss", 7 | "rocksdb", 8 | "flatbuffers", 9 | "gtest", 10 | "glog", 11 | "jsoncpp", 12 | "ms-gsl", 13 | "benchmark", 14 | "intel-mkl", 15 | "openblas", 16 | "bitsery", 17 | "drogon", 18 | "args" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.5.1 --------------------------------------------------------------------------------
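A note on how the manifests above fit together (a minimal sketch, not a file from this repository): vcpkg-configuration.json registers ./ports as an overlay, so the patched ports earlier in this tree (faiss, rocksdb, intel-mkl, onnxruntime) take precedence over the default vcpkg registry for any dependency in vcpkg.json that resolves to them. Assuming a local vcpkg checkout at $VCPKG_ROOT (that variable name is an assumption, not something this repository defines), a manifest-mode configure and build would look like:

# Point CMake at the vcpkg toolchain; the dependencies listed in vcpkg.json
# are built and installed automatically, and the overlay ports in ./ports
# are picked up through vcpkg-configuration.json.
cmake -B build -S . -DCMAKE_TOOLCHAIN_FILE="$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake"
cmake --build build

The CMakePresets.json at the repository root is the other natural entry point; the sketch above shows only the bare toolchain wiring.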