├── util ├── .gitattributes ├── version.cpp ├── version.h.in ├── delta.h ├── CMakeLists.txt ├── codec.h ├── util.h └── bitvec.h ├── embeddings ├── .gitignore ├── src │ ├── test_modules.rs │ ├── lib.rs │ ├── ffi.rs │ ├── error.rs │ └── model │ │ ├── mod.rs │ │ ├── openai.rs │ │ └── voyage.rs ├── cbindgen.toml ├── README.md ├── .editorconfig ├── build.rs ├── Cargo.toml └── manticoresearch_text_embeddings.h ├── NOTICE ├── .gitignore ├── benchmarks ├── hn_small_es_ms_100MB.png ├── hn_small_es_ms_30MB.png ├── hn_small_ma_co_100MB.png ├── hn_small_ma_co_30MB.png ├── hn_small_es_ms_1024MB.png ├── hn_small_ma_co_1024MB.png ├── logs116m_es_ms_1500MB.png ├── logs116m_es_ms_36000MB.png └── logs116m_es_ms_4400MB.png ├── manticore_src.txt ├── cmake ├── builds │ ├── build_rhel.cmake │ ├── build_default.cmake │ ├── build_rhel8.cmake │ ├── build_rhel7.cmake │ ├── build_rhel9.cmake │ ├── build_rhel10.cmake │ ├── build_buster.cmake │ ├── build_bookworm.cmake │ ├── build_bullseye.cmake │ ├── build_focal.cmake │ ├── build_jammy.cmake │ ├── build_bionic.cmake │ ├── build_macos.cmake │ ├── CommonDeb.cmake │ ├── CommonRpm.cmake │ └── build_windows.cmake ├── CPackOptions.cmake.in ├── GetHNSW.cmake ├── init_cache_settings.cmake ├── GetFastPFOR.cmake ├── GetStreamvbyte.cmake ├── GetPGM.cmake ├── external-build.cmake.in ├── helpers.cmake ├── revcheck.cmake ├── CommonInfo.cmake ├── SetBuildType.cmake ├── build_embeddings.cmake ├── printers.cmake ├── rev.cmake ├── citest.cmake └── update_bundle.cmake ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── feature_request.yml │ └── bug_report.yml └── workflows │ ├── checklist_validator.yml │ ├── clt_tests.yml │ ├── mirror.yml │ └── test_template.yml ├── smoke.sh ├── config └── CPackOptions.cmake ├── knn ├── embeddings.h ├── iterator.h ├── quantizer.h ├── CMakeLists.txt ├── iterator.cpp ├── knn.h └── space.h ├── columnar ├── builder │ ├── builderbool.h │ ├── builderstr.h │ ├── CMakeLists.txt │ ├── buildermva.h │ ├── builderint.h │ ├── 
buildertraits.cpp │ └── builderminmax.h ├── accessor │ ├── accessor.cpp │ ├── CMakeLists.txt │ ├── accessortraits.cpp │ ├── accessormva.h │ ├── accessorstr.h │ ├── accessorbool.h │ ├── accessorint.h │ ├── attributeheader.h │ ├── accessor.h │ └── check.h ├── CMakeLists.txt ├── builder.h └── columnar.h ├── licenses-binary └── LICENSE-simde ├── secondary ├── CMakeLists.txt ├── builder.h ├── iterator.h ├── secondary.h ├── blockreader.h └── pgm.h ├── common ├── CMakeLists.txt ├── blockiterator.h ├── schema.h ├── filter.h ├── filter.cpp └── interval.h ├── pgm └── CMakeLists.txt ├── gitsync_columnar.sh ├── NOTICE-binary ├── testing.cmake ├── README.md └── Changelog.md /util/.gitattributes: -------------------------------------------------------------------------------- 1 | version.h.in export-subst -------------------------------------------------------------------------------- /embeddings/.gitignore: -------------------------------------------------------------------------------- 1 | target/** 2 | .cache 3 | cache/** 4 | relative/** 5 | 6 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2017-2025 Manticore Software, Ltd. 
2 | 3 | https://manticoresearch.com/ 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /.idea/ 3 | build 4 | cache 5 | 6 | # local non-traced url for external tools 7 | /local_manticore_src.txt -------------------------------------------------------------------------------- /benchmarks/hn_small_es_ms_100MB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/hn_small_es_ms_100MB.png -------------------------------------------------------------------------------- /benchmarks/hn_small_es_ms_30MB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/hn_small_es_ms_30MB.png -------------------------------------------------------------------------------- /benchmarks/hn_small_ma_co_100MB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/hn_small_ma_co_100MB.png -------------------------------------------------------------------------------- /benchmarks/hn_small_ma_co_30MB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/hn_small_ma_co_30MB.png -------------------------------------------------------------------------------- /benchmarks/hn_small_es_ms_1024MB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/hn_small_es_ms_1024MB.png -------------------------------------------------------------------------------- /benchmarks/hn_small_ma_co_1024MB.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/hn_small_ma_co_1024MB.png -------------------------------------------------------------------------------- /benchmarks/logs116m_es_ms_1500MB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/logs116m_es_ms_1500MB.png -------------------------------------------------------------------------------- /benchmarks/logs116m_es_ms_36000MB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/logs116m_es_ms_36000MB.png -------------------------------------------------------------------------------- /benchmarks/logs116m_es_ms_4400MB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manticoresoftware/columnar/HEAD/benchmarks/logs116m_es_ms_4400MB.png -------------------------------------------------------------------------------- /manticore_src.txt: -------------------------------------------------------------------------------- 1 | GIT_REPOSITORY https://github.com/manticoresoftware/manticoresearch.git GIT_TAG b83863e29cebc1a3de6f1ae4b0c1cd92a0c9c2dc 2 | -------------------------------------------------------------------------------- /util/version.cpp: -------------------------------------------------------------------------------- 1 | #include "gen_version.h" 2 | 3 | #define LIB_VERSION_STR VERSION_STR " " GIT_COMMIT_ID "@" GIT_TIMESTAMP_ID 4 | 5 | const char * LIB_VERSION = LIB_VERSION_STR; -------------------------------------------------------------------------------- /cmake/builds/build_rhel.cmake: -------------------------------------------------------------------------------- 1 | # ---------- rhel ---------- 2 | # Above line 
is mandatory! 3 | # rules to build rpm package for Red Hat/ Centos 4 | 5 | message ( STATUS "Will create RPM for generic RedHat/Centos" ) 6 | include ( builds/CommonRpm ) -------------------------------------------------------------------------------- /embeddings/src/test_modules.rs: -------------------------------------------------------------------------------- 1 | // Add test modules to lib.rs 2 | #[cfg(test)] 3 | mod utils_test; 4 | 5 | #[cfg(test)] 6 | mod error_test; 7 | 8 | #[cfg(test)] 9 | mod integration_test; 10 | 11 | #[cfg(test)] 12 | mod error_handling_test; -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: "Manticore Team's professional services" 3 | about: "Looking for a faster solution to your issues with Manticore? Manticore Team can help." 4 | url: "https://manticoresearch.com/services" 5 | -------------------------------------------------------------------------------- /smoke.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Run the very same test suite as run on CI, but locally 3 | 4 | #export CTEST_BUILD_CONFIGURATION=Debug 5 | #export VERBOSE=1 6 | #export DIAGNOSTIC=1 7 | #ctest -VV -S cmake/citest.cmake 8 | ctest -V -S cmake/citest.cmake --progress 9 | -------------------------------------------------------------------------------- /cmake/builds/build_default.cmake: -------------------------------------------------------------------------------- 1 | # ---------- default ---------- 2 | # Above line is mandatory! 
3 | # rules to build default zip archive 4 | 5 | message ( STATUS "Will create default ZIP" ) 6 | set ( CPACK_PACKAGING_INSTALL_PREFIX "/" ) 7 | set ( CPACK_GENERATOR "ZIP" ) 8 | -------------------------------------------------------------------------------- /cmake/builds/build_rhel8.cmake: -------------------------------------------------------------------------------- 1 | # ---------- rhel8 ---------- 2 | # Above line is mandatory! 3 | # rules to build rpm package for Red Hat linux 8 / Centos 8 4 | 5 | message ( STATUS "Will create RPM for RedHat/Centos 8" ) 6 | set ( RELEASE_DIST ".el8" ) 7 | include ( builds/CommonRpm ) -------------------------------------------------------------------------------- /cmake/builds/build_rhel7.cmake: -------------------------------------------------------------------------------- 1 | # ---------- rhel7 ---------- 2 | # Above line is mandatory! 3 | # rules to build rpm package for Red Hat linux 7 / Centos 7 4 | 5 | message ( STATUS "Will create RPM for RedHat/Centos 7" ) 6 | set ( RELEASE_DIST ".el7.centos" ) 7 | include ( builds/CommonRpm ) -------------------------------------------------------------------------------- /cmake/builds/build_rhel9.cmake: -------------------------------------------------------------------------------- 1 | # ---------- rhel9 ---------- 2 | # Above line is mandatory! 3 | # rules to build rpm package for Red Hat linux 9 / Centos Stream 9 4 | 5 | message ( STATUS "Will create RPM for RedHat/Centos Stream 9" ) 6 | set ( RELEASE_DIST ".el9" ) 7 | include ( builds/CommonRpm ) -------------------------------------------------------------------------------- /cmake/builds/build_rhel10.cmake: -------------------------------------------------------------------------------- 1 | # ---------- rhel10 ---------- 2 | # Above line is mandatory! 
3 | # rules to build rpm package for Red Hat linux 10 / Centos Stream 10 4 | 5 | message ( STATUS "Will create RPM for RedHat/Centos Stream 10" ) 6 | set ( RELEASE_DIST ".el10" ) 7 | include ( builds/CommonRpm ) -------------------------------------------------------------------------------- /util/version.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION_STR "@CMAKE_PROJECT_VERSION@" 2 | // GIT_COMMIT_ID "60baa521" 3 | // GIT_TIMESTAMP_ID "2025-12-14 18:02:53 +0300" 4 | // GIT_EPOCH_ID "1765724573" 5 | #define GIT_COMMIT_ID "@GIT_COMMIT_ID@" 6 | #define GIT_TIMESTAMP_ID "@GIT_TIMESTAMP_ID@" 7 | #cmakedefine GIT_BRANCH_ID "@GIT_BRANCH_ID@" 8 | -------------------------------------------------------------------------------- /embeddings/cbindgen.toml: -------------------------------------------------------------------------------- 1 | language = "C++" 2 | # namespace = "manticoresearch" 3 | include_guard = "MANTICORESEARCH_TEXT_EMBEDDINGS_H" 4 | header = "// Auto-generated file. Do not edit." 
5 | 6 | [struct] 7 | derive_eq = false 8 | derive_neq = false 9 | 10 | [export] 11 | include = ["FloatVecResult", "FloatVec", "TextModelResult", "TextModelWrapper"] 12 | 13 | -------------------------------------------------------------------------------- /embeddings/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod error; 2 | mod ffi; 3 | mod model; 4 | mod utils; 5 | 6 | #[cfg(test)] 7 | mod utils_test; 8 | 9 | #[cfg(test)] 10 | mod error_test; 11 | 12 | #[cfg(test)] 13 | mod integration_test; 14 | 15 | #[cfg(test)] 16 | mod error_handling_test; 17 | 18 | pub use error::LibError; 19 | pub use ffi::{EmbedLib, GetLibFuncs}; 20 | pub use model::TextModel; 21 | -------------------------------------------------------------------------------- /embeddings/README.md: -------------------------------------------------------------------------------- 1 | # manticore-knn-embeddings 2 | Proof of concept: a library written in Rust for generating text embeddings 3 | 4 | 5 | ## How to build the Rust library 6 | 7 | ```bash 8 | cargo build --lib --release 9 | ``` 10 | 11 | ## How to build examples/test.cpp 12 | 13 | ```bash 14 | g++ -o test examples/test.cpp -Ltarget/release -lmanticore_knn_embeddings -I. 
-lpthread -ldl -std=c++17 15 | ``` 16 | 17 | -------------------------------------------------------------------------------- /.github/workflows/checklist_validator.yml: -------------------------------------------------------------------------------- 1 | name: 📝 Checklist Validator 2 | run-name: 📝 Checklist Validator for issue ${{ github.event.issue.number }} 3 | 4 | on: 5 | issues: 6 | types: 7 | - closed 8 | 9 | jobs: 10 | checklist-validation: 11 | name: ✅ Checklist Completion Check 12 | runs-on: ubuntu-22.04 13 | steps: 14 | - uses: manticoresoftware/manticoresearch/actions/checklist-validator@master 15 | -------------------------------------------------------------------------------- /cmake/CPackOptions.cmake.in: -------------------------------------------------------------------------------- 1 | execute_process(COMMAND "${CMAKE_COMMAND}" 2 | -D CONFIGURED_GIT_COMMIT_ID=@GIT_COMMIT_ID@ 3 | -D SOURCE_DIR=@columnar_SOURCE_DIR@ 4 | -P @columnar_SOURCE_DIR@/cmake/revcheck.cmake 5 | RESULT_VARIABLE check_failed 6 | ) 7 | 8 | if (check_failed) 9 | MESSAGE (FATAL_ERROR "Version check failed. Configured @GIT_COMMIT_ID@. Run 'cmake .' in build tree to fix version/hash") 10 | endif() -------------------------------------------------------------------------------- /cmake/builds/build_buster.cmake: -------------------------------------------------------------------------------- 1 | # ---------- buster ---------- 2 | # Above line is mandatory! 
3 | # rules to build deb package for Debian Buster 4 | 5 | message ( STATUS "Will create DEB for Debian Buster" ) 6 | 7 | # we provide explicit dependencies, so shlibdeps is not necessary 8 | set ( disable_shlibdeps ON ) 9 | set ( CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.27), libgcc1 (>= 4.2), libstdc++6 (>= 5.2)" ) 10 | 11 | include ( builds/CommonDeb ) 12 | -------------------------------------------------------------------------------- /cmake/builds/build_bookworm.cmake: -------------------------------------------------------------------------------- 1 | # ---------- bookworm ---------- 2 | # Above line is mandatory! 3 | # rules to build deb package for Debian Bookworm 4 | 5 | message ( STATUS "Will create DEB for Debian Bookworm" ) 6 | 7 | # we provide explicit dependencies, so shlibdeps is not necessary 8 | set ( disable_shlibdeps ON ) 9 | set ( CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.34), libgcc1 (>= 4.2), libstdc++6 (>= 11)" ) 10 | 11 | include ( builds/CommonDeb ) 12 | -------------------------------------------------------------------------------- /cmake/builds/build_bullseye.cmake: -------------------------------------------------------------------------------- 1 | # ---------- bullseye ---------- 2 | # Above line is mandatory! 3 | # rules to build deb package for Debian Bullseye 4 | 5 | message ( STATUS "Will create DEB for Debian Bullseye" ) 6 | 7 | # we provide explicit dependencies, so shlibdeps is not necessary 8 | set ( disable_shlibdeps ON ) 9 | set ( CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.29), libgcc1 (>= 4.2), libstdc++6 (>= 9)" ) 10 | 11 | include ( builds/CommonDeb ) 12 | -------------------------------------------------------------------------------- /cmake/builds/build_focal.cmake: -------------------------------------------------------------------------------- 1 | # ---------- focal ---------- 2 | # Above line is mandatory! 
3 | # rules to build deb package for Ubuntu 20.04 (focal) 4 | 5 | message ( STATUS "Will create DEB for Ubuntu 20.04 (focal)" ) 6 | 7 | # we provide explicit dependencies, so shlibdeps is not necessary 8 | set ( disable_shlibdeps ON ) 9 | set ( CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.29), libgcc1 (>= 4.2), libstdc++6 (>= 9)" ) 10 | 11 | include ( builds/CommonDeb ) 12 | -------------------------------------------------------------------------------- /cmake/builds/build_jammy.cmake: -------------------------------------------------------------------------------- 1 | # ---------- jammy ---------- 2 | # Above line is mandatory! 3 | # rules to build deb package for Ubuntu 22.04 (jammy) 4 | 5 | message ( STATUS "Will create DEB for Ubuntu 22.04 (jammy)" ) 6 | 7 | # we provide explicit dependencies, so shlibdeps is not necessary 8 | set ( disable_shlibdeps ON ) 9 | set ( CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.34), libgcc1 (>= 4.2), libstdc++6 (>= 11)" ) 10 | 11 | include ( builds/CommonDeb ) 12 | -------------------------------------------------------------------------------- /cmake/builds/build_bionic.cmake: -------------------------------------------------------------------------------- 1 | # ---------- bionic ---------- 2 | # Above line is mandatory! 3 | # rules to build deb package for Ubuntu 18.04 (bionic) 4 | 5 | message ( STATUS "Will create DEB for Ubuntu 18.04 (bionic)" ) 6 | 7 | # we provide explicit dependencies, so shlibdeps is not necessary 8 | set ( disable_shlibdeps ON ) 9 | set ( CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.27), libgcc1 (>= 4.2), libstdc++6 (>= 5.2)" ) 10 | 11 | include ( builds/CommonDeb ) 12 | -------------------------------------------------------------------------------- /cmake/builds/build_macos.cmake: -------------------------------------------------------------------------------- 1 | # ---------- macos ---------- 2 | # Above line is mandatory! 
3 | # rules to build tgz archive for Mac OS X 4 | 5 | message ( STATUS "Will create TGZ with build for Mac Os X" ) 6 | 7 | set ( CPACK_PACKAGING_INSTALL_PREFIX /usr/local ) 8 | set ( CPACK_ARCHIVE_COMPONENT_INSTALL OFF ) 9 | set ( CPACK_GENERATOR "TGZ" ) 10 | set ( CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_FILE_NAME}-osx${CMAKE_OSX_DEPLOYMENT_TARGET}-${CMAKE_SYSTEM_PROCESSOR}" ) -------------------------------------------------------------------------------- /embeddings/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | visual_wrap = true 9 | max_line_length = 120 10 | 11 | [*] 12 | indent_style = tab 13 | indent_size = 2 14 | 15 | [**.yaml] 16 | indent_style = space 17 | indent_size = 2 18 | 19 | [**.yml] 20 | indent_style = space 21 | indent_size = 2 22 | 23 | [**.md] 24 | indent_style = space 25 | indent_size = 2 26 | -------------------------------------------------------------------------------- /embeddings/build.rs: -------------------------------------------------------------------------------- 1 | extern crate cbindgen; 2 | 3 | use std::env; 4 | use std::path::PathBuf; 5 | 6 | fn main() { 7 | let crate_dir = PathBuf::from( 8 | env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR env var is not defined"), 9 | ); 10 | 11 | let config = cbindgen::Config::from_file("cbindgen.toml") 12 | .expect("Unable to find cbindgen.toml configuration file"); 13 | 14 | cbindgen::generate_with_config(&crate_dir, config) 15 | .expect("Unable to generate bindings") 16 | .write_to_file(crate_dir.join("manticoresearch_text_embeddings.h")); 17 | } 18 | -------------------------------------------------------------------------------- /config/CPackOptions.cmake: -------------------------------------------------------------------------------- 1 | # Custom CPack options 2 | 3 | # Ensure the library 
doesn't need a Build ID - specifically for embeddings 4 | set(CPACK_DEBIAN_EMBEDDINGS_PACKAGE_SHLIBDEPS OFF) 5 | 6 | # Skip other dependency checks 7 | set(CPACK_DEBIAN_PACKAGE_DEBUG OFF) 8 | 9 | # Make sure the specified version matches the actual build 10 | if(NOT "${CPACK_PACKAGE_VERSION}" STREQUAL "${CMAKE_PROJECT_VERSION}") 11 | message(WARNING "CPACK_PACKAGE_VERSION ${CPACK_PACKAGE_VERSION} differs from the project version ${CMAKE_PROJECT_VERSION}, using project version.") 12 | set(CPACK_PACKAGE_VERSION "${CMAKE_PROJECT_VERSION}") 13 | endif() -------------------------------------------------------------------------------- /cmake/builds/CommonDeb.cmake: -------------------------------------------------------------------------------- 1 | # only cmake since 3.13 supports packaging of debuginfo 2 | cmake_minimum_required ( VERSION 3.17 ) 3 | 4 | set ( CPACK_PACKAGING_INSTALL_PREFIX "/usr" ) 5 | set ( CPACK_GENERATOR DEB ) 6 | set ( CPACK_DEBIAN_FILE_NAME DEB-DEFAULT ) 7 | set ( CPACK_DEB_COMPONENT_INSTALL OFF ) 8 | set ( CPACK_DEBIAN_DEBUGINFO_PACKAGE ON ) 9 | set ( CPACK_DEBIAN_PACKAGE_SECTION misc ) 10 | set ( CPACK_DEBIAN_PACKAGE_PRIORITY optional ) 11 | set ( CPACK_DEBIAN_PACKAGE_CONTROL_STRICT_PERMISSION ON ) 12 | set ( CPACK_DEBIAN_PACKAGE_SUGGESTS "manticore (>= 3.6.1)" ) 13 | 14 | if (NOT disable_shlibdeps) 15 | set ( CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON ) 16 | endif () 17 | -------------------------------------------------------------------------------- /knn/embeddings.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #pragma once 18 | 19 | #include "knn.h" 20 | 21 | namespace knn 22 | { 23 | 24 | knn::EmbeddingsLib_i * LoadEmbeddingsLib ( const std::string & sLibPath, std::string & sError ); 25 | 26 | } // namespace knn 27 | -------------------------------------------------------------------------------- /columnar/builder/builderbool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "util_private.h" 20 | 21 | namespace columnar 22 | { 23 | 24 | enum class BoolPacking_e : uint32_t 25 | { 26 | CONST, 27 | BITMAP, 28 | 29 | TOTAL 30 | }; 31 | 32 | 33 | class Packer_i; 34 | struct Settings_t; 35 | Packer_i * CreatePackerBool ( const Settings_t & tSettings, const std::string & sName ); 36 | 37 | } // namespace columnar 38 | -------------------------------------------------------------------------------- /columnar/builder/builderstr.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "builder.h" 20 | 21 | namespace columnar 22 | { 23 | 24 | enum class StrPacking_e : uint32_t 25 | { 26 | CONST = 0, 27 | CONSTLEN, 28 | TABLE, 29 | GENERIC, 30 | 31 | TOTAL 32 | }; 33 | 34 | class Packer_i; 35 | struct Settings_t; 36 | Packer_i * CreatePackerStr ( const Settings_t & tSettings, const std::string & sName ); 37 | 38 | } // namespace columnar 39 | -------------------------------------------------------------------------------- /columnar/accessor/accessor.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #include "accessor.h" 18 | 19 | #include "columnar.h" 20 | #include 21 | 22 | namespace columnar 23 | { 24 | 25 | bool CheckEmptySpan ( uint32_t * pRowID, uint32_t * pRowIdStart, util::Span_T & dRowIdBlock ) 26 | { 27 | if ( pRowID==pRowIdStart ) 28 | return false; 29 | 30 | dRowIdBlock = { pRowIdStart, size_t(pRowID-pRowIdStart) }; 31 | return true; 32 | } 33 | 34 | } // namespace columnar 35 | -------------------------------------------------------------------------------- /licenses-binary/LICENSE-simde: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Evan Nemerson 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /cmake/builds/CommonRpm.cmake: -------------------------------------------------------------------------------- 1 | # Common rpm-specific build variables 2 | cmake_minimum_required ( VERSION 3.17 ) 3 | 4 | set ( CPACK_PACKAGING_INSTALL_PREFIX "/usr" ) 5 | set ( CPACK_GENERATOR "RPM" ) 6 | 7 | ## RPM commons 8 | set ( CPACK_RPM_FILE_NAME RPM-DEFAULT ) 9 | if (RELEASE_DIST) 10 | set ( CPACK_RPM_PACKAGE_RELEASE 1${RELEASE_DIST} ) 11 | set ( MYVER "${CPACK_RPM_PACKAGE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}" ) 12 | else () 13 | set ( CPACK_RPM_PACKAGE_RELEASE 1 ) 14 | set ( CPACK_RPM_PACKAGE_RELEASE_DIST ON ) # that adds 'el7', 'el8', etc. 15 | set ( MYVER "${CPACK_RPM_PACKAGE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}%{?dist}" ) 16 | endif () 17 | 18 | set ( CPACK_RPM_PACKAGE_GROUP "Applications/Internet" ) 19 | set ( CPACK_RPM_PACKAGE_AUTOREQ ON ) 20 | set ( CPACK_RPM_PACKAGE_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR} ) 21 | 22 | set ( CPACK_RPM_COMPONENT_INSTALL OFF ) 23 | set ( CPACK_RPM_INSTALL_WITH_EXEC ON ) 24 | set ( CPACK_RPM_DEBUGINFO_PACKAGE ON ) 25 | if (DEFINED ENV{CPACK_RPM_BUILD_SOURCE_DIRS_PREFIX}) 26 | set ( CPACK_RPM_BUILD_SOURCE_DIRS_PREFIX "$ENV{CPACK_RPM_BUILD_SOURCE_DIRS_PREFIX}" ) 27 | endif () 28 | 29 | # uncomment this line to produce long (really long) verbose output of rpm building 30 | #set ( CPACK_RPM_PACKAGE_DEBUG ON ) -------------------------------------------------------------------------------- /secondary/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | cmake_minimum_required ( VERSION 3.17 ) 18 | 19 | include ( GetPGM ) 20 | 21 | add_library ( secondary_index MODULE 22 | builder.cpp iterator.cpp blockreader.cpp secondary.cpp 23 | builder.h iterator.h blockreader.h secondary.h ) 24 | 25 | target_link_libraries ( secondary_index PRIVATE PGM::pgmindexlib FastPFOR::FastPFOR columnar_root util common ) 26 | set_target_properties ( secondary_index PROPERTIES 27 | POSITION_INDEPENDENT_CODE ON 28 | INTERPROCEDURAL_OPTIMIZATION OFF 29 | PREFIX "" 30 | OUTPUT_NAME lib_manticore_secondary${lib_arch_suffix} ) 31 | -------------------------------------------------------------------------------- /columnar/builder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | cmake_minimum_required ( VERSION 3.17 ) 18 | 19 | add_library ( builder OBJECT 20 | builderbool.cpp 21 | builderint.cpp 22 | buildermva.cpp 23 | builderstr.cpp 24 | buildertraits.cpp 25 | builderbool.h 26 | builderint.h 27 | builderminmax.h 28 | buildermva.h 29 | builderstr.h 30 | buildertraits.h 31 | ) 32 | 33 | target_link_libraries ( builder PRIVATE FastPFOR::FastPFOR columnar_root ) 34 | set_property ( TARGET builder PROPERTY POSITION_INDEPENDENT_CODE TRUE ) 35 | target_include_directories ( builder INTERFACE ${CMAKE_CURRENT_SOURCE_DIR} ) 36 | 37 | -------------------------------------------------------------------------------- /columnar/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | cmake_minimum_required ( VERSION 3.17 ) 18 | 19 | target_include_directories ( columnar_root INTERFACE accessor builder ) 20 | add_subdirectory ( builder ) 21 | add_subdirectory ( accessor ) 22 | 23 | # main library 24 | add_library ( columnar_lib MODULE columnar.cpp builder.cpp columnar.h builder.h ) 25 | target_compile_options ( columnar_lib PRIVATE $<$<CXX_COMPILER_ID:MSVC>:-wd4996> ) 26 | target_link_libraries ( columnar_lib PRIVATE columnar_root util common builder accessor ) 27 | set_target_properties( columnar_lib PROPERTIES PREFIX "" OUTPUT_NAME lib_manticore_columnar${lib_arch_suffix} ) 28 | -------------------------------------------------------------------------------- /common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | cmake_minimum_required ( VERSION 3.17 ) 19 | 20 | add_library ( common OBJECT 21 | filter.cpp 22 | blockiterator.h 23 | filter.h 24 | interval.h 25 | schema.h 26 | ) 27 | 28 | target_link_libraries ( common PRIVATE columnar_root ) 29 | 30 | # workaround - POSITION_INDEPENDENT_CODE must be transitive from columnar_root, but it doesn't work (a bug in cmake?) 
31 | get_target_property ( pic columnar_root INTERFACE_POSITION_INDEPENDENT_CODE ) 32 | set_property ( TARGET common PROPERTY POSITION_INDEPENDENT_CODE ${pic} ) 33 | target_include_directories ( common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR} ) -------------------------------------------------------------------------------- /embeddings/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "manticore-knn-embeddings" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | [dependencies] 8 | candle-core = { git = "https://github.com/huggingface/candle.git", rev = "59b18d974ec3cad6963b774aa245e23f8c80414f" } 9 | tokenizers = "0.15.2" 10 | hf-hub = { git = "https://github.com/huggingface/hf-hub.git", rev = "ac22200ea0b5af4d8c362f699be0340647b19060", default-features = false,features = ["ureq"] } 11 | anyhow = "1.0.81" 12 | candle-nn = { git = "https://github.com/huggingface/candle.git", rev = "59b18d974ec3cad6963b774aa245e23f8c80414f" } 13 | candle-transformers = { git = "https://github.com/huggingface/candle.git", rev = "59b18d974ec3cad6963b774aa245e23f8c80414f" } 14 | serde_json = "1.0.114" 15 | serde = "1.0.197" 16 | rand = "0.8.5" 17 | reqwest = { version = "0.12.8", default-features = false, features = ["blocking", "json", "rustls-tls"] } 18 | rayon = "1.10.0" 19 | 20 | [build-dependencies] 21 | cbindgen = "0.26.0" 22 | 23 | # Example of customizing the library in Cargo.toml. 
24 | [lib] 25 | name = "manticore_knn_embeddings" 26 | crate-type = ["cdylib"] 27 | 28 | [profile.release] 29 | opt-level = "z" 30 | codegen-units = 1 31 | lto = true 32 | strip = "debuginfo" 33 | 34 | [dev-dependencies] 35 | approx = "0.5.1" 36 | -------------------------------------------------------------------------------- /knn/iterator.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // 17 | // This file is a part of the common headers (API). 18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION. 
19 | 20 | 21 | #pragma once 22 | 23 | #include "knn.h" 24 | 25 | namespace knn 26 | { 27 | 28 | class KNNIndex_i 29 | { 30 | public: 31 | virtual ~KNNIndex_i() = default; 32 | virtual void Search ( std::vector & dResults, const util::Span_T & dData, int64_t iResults, int iEf, std::vector & dQuantized ) const = 0; 33 | }; 34 | 35 | Iterator_i * CreateIterator ( KNNIndex_i & tIndex, const util::Span_T & dData, int64_t iResults, int iEf ); 36 | 37 | } // namespace knn 38 | -------------------------------------------------------------------------------- /columnar/accessor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | cmake_minimum_required ( VERSION 3.17 ) 18 | 19 | add_library ( accessor OBJECT 20 | attributeheader.cpp 21 | accessor.cpp 22 | accessorbool.cpp 23 | accessorint.cpp 24 | accessormva.cpp 25 | accessorstr.cpp 26 | accessortraits.cpp 27 | check.cpp 28 | attributeheader.h 29 | accessor.h 30 | accessorbool.h 31 | accessorint.h 32 | accessormva.h 33 | accessorstr.h 34 | accessortraits.h 35 | check.h 36 | ) 37 | 38 | target_link_libraries ( accessor PRIVATE columnar_root ) 39 | set_property ( TARGET accessor PROPERTY POSITION_INDEPENDENT_CODE TRUE ) 40 | target_include_directories ( accessor INTERFACE ${CMAKE_CURRENT_SOURCE_DIR} ) 41 | 42 | -------------------------------------------------------------------------------- /columnar/accessor/accessortraits.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #include "accessortraits.h" 18 | 19 | namespace columnar 20 | { 21 | 22 | SubblockCalc_t::SubblockCalc_t ( int iSubblockSize ) 23 | : m_iSubblockSize ( iSubblockSize ) 24 | , m_iSubblockShift ( util::CalcNumBits(iSubblockSize) - 1 ) 25 | , m_iSubblocksPerBlock ( DOCS_PER_BLOCK / iSubblockSize ) 26 | {} 27 | 28 | 29 | void StoredBlockTraits_t::SetBlockId ( uint32_t uBlockId, uint32_t uNumDocsInBlock ) 30 | { 31 | m_uBlockId = uBlockId; 32 | m_uNumDocsInBlock = uNumDocsInBlock; 33 | m_iNumSubblocks = (uNumDocsInBlock+m_iSubblockSize-1) / m_iSubblockSize; 34 | m_tStartBlockRowId = BlockId2RowId(uBlockId); 35 | } 36 | 37 | } // namespace columnar -------------------------------------------------------------------------------- /columnar/accessor/accessormva.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "buildertraits.h" 20 | 21 | namespace columnar 22 | { 23 | 24 | class Analyzer_i; 25 | class Checker_i; 26 | class AttributeHeader_i; 27 | 28 | Iterator_i * CreateIteratorMVA ( const AttributeHeader_i & tHeader, uint32_t uVersion, util::FileReader_c * pReader, bool bBuffered ); 29 | Analyzer_i * CreateAnalyzerMVA ( const AttributeHeader_i & tHeader, uint32_t uVersion, util::FileReader_c * pReader, const common::Filter_t & tSettings, bool bHaveMatchingBlocks ); 30 | Checker_i * CreateCheckerMva ( const AttributeHeader_i & tHeader, util::FileReader_c * pReader, Reporter_fn & fnProgress, Reporter_fn & fnError ); 31 | 32 | } // namespace columnar 33 | -------------------------------------------------------------------------------- /columnar/accessor/accessorstr.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "buildertraits.h" 20 | 21 | namespace columnar 22 | { 23 | 24 | class Iterator_i; 25 | class Analyzer_i; 26 | class Checker_i; 27 | class AttributeHeader_i; 28 | 29 | Iterator_i * CreateIteratorStr ( const AttributeHeader_i & tHeader, uint32_t uVersion, util::FileReader_c * pReader ); 30 | Analyzer_i * CreateAnalyzerStr ( const AttributeHeader_i & tHeader, uint32_t uVersion, util::FileReader_c * pReader, const common::Filter_t & tSettings, bool bHaveMatchingBlocks ); 31 | Checker_i * CreateCheckerStr ( const AttributeHeader_i & tHeader, util::FileReader_c * pReader, Reporter_fn & fnProgress, Reporter_fn & fnError ); 32 | 33 | } // namespace columnar 34 | -------------------------------------------------------------------------------- /cmake/GetHNSW.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | set ( HNSW_GITHUB "https://github.com/manticoresoftware/hnswlib/archive/d7bb3bb.zip" ) 17 | set ( HNSW_BUNDLEZIP "${LIBS_BUNDLE}/hnswlib-0.7.0.tar.gz" ) 18 | 19 | cmake_minimum_required ( VERSION 3.17 FATAL_ERROR ) 20 | include ( update_bundle ) 21 | 22 | # determine destination folder where we expect pre-built hnswlib 23 | find_package ( hnswlib QUIET CONFIG ) 24 | return_if_target_found ( hnswlib::hnswlib "found ready" ) 25 | 26 | # not found. Populate and prepare sources 27 | select_nearest_url ( HNSW_PLACE hnswlib ${HNSW_BUNDLEZIP} ${HNSW_GITHUB} ) 28 | fetch_sources ( hnswlib ${HNSW_PLACE} HNSW_SRC ) 29 | 30 | get_build ( HNSW_BUILD hnswlib ) 31 | external_build ( hnswlib HNSW_SRC HNSW_BUILD ) 32 | 33 | find_package ( hnswlib REQUIRED CONFIG ) 34 | -------------------------------------------------------------------------------- /columnar/accessor/accessorbool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "builder.h" 20 | #include "accessor.h" 21 | 22 | namespace util 23 | { 24 | class FileReader_c; 25 | } 26 | 27 | namespace columnar 28 | { 29 | 30 | class Iterator_i; 31 | class Analyzer_i; 32 | class AttributeHeader_i; 33 | 34 | Iterator_i * CreateIteratorBool ( const AttributeHeader_i & tHeader, util::FileReader_c * pReader ); 35 | Analyzer_i * CreateAnalyzerBool ( const AttributeHeader_i & tHeader, util::FileReader_c * pReader, const common::Filter_t & tSettings, bool bHaveMatchingBlocks ); 36 | Checker_i * CreateCheckerBool ( const AttributeHeader_i & tHeader, util::FileReader_c * pReader, Reporter_fn & fnProgress, Reporter_fn & fnError ); 37 | 38 | } // namespace columnar 39 | -------------------------------------------------------------------------------- /columnar/builder/buildermva.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "util_private.h" 20 | 21 | namespace columnar 22 | { 23 | 24 | enum class MvaPacking_e : uint32_t 25 | { 26 | CONST, 27 | CONSTLEN, 28 | TABLE, 29 | DELTA_PFOR, 30 | CONSTLEN_NONCOMPRESSED, 31 | 32 | TOTAL 33 | }; 34 | 35 | 36 | class Packer_i; 37 | struct Settings_t; 38 | 39 | Packer_i * CreatePackerMva32 ( const Settings_t & tSettings, const std::string & sName ); 40 | Packer_i * CreatePackerMva64 ( const Settings_t & tSettings, const std::string & sName ); 41 | Packer_i * CreatePackerFloatVec ( const Settings_t & tSettings, const std::string & sName ); 42 | Packer_i * CreatePackerFloatVecKnn ( const Settings_t & tSettings, const std::string & sName ); 43 | 44 | } // namespace columnar 45 | -------------------------------------------------------------------------------- /util/delta.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "util_private.h" 20 | 21 | namespace util 22 | { 23 | 24 | FORCE_INLINE void ComputeDeltas ( uint32_t * pData, int iLength, bool bAsc ); 25 | FORCE_INLINE void ComputeDeltas ( uint64_t * pData, int iLength, bool bAsc ); 26 | FORCE_INLINE void ComputeInverseDeltas ( Span_T<uint32_t> & dData, bool bAsc ); 27 | FORCE_INLINE void ComputeInverseDeltas ( Span_T<uint64_t> & dData, bool bAsc ); 28 | FORCE_INLINE void ComputeInverseDeltas ( std::vector<uint32_t> & dData, bool bAsc ); 29 | FORCE_INLINE void ComputeInverseDeltas ( std::vector<uint64_t> & dData, bool bAsc ); 30 | FORCE_INLINE void ComputeInverseDeltasAsc ( Span_T<uint32_t> & dData ); 31 | FORCE_INLINE void ComputeInverseDeltasAsc ( Span_T<uint64_t> & dData ); 32 | 33 | } // namespace util 34 | 35 | #include "delta_impl.h" -------------------------------------------------------------------------------- /columnar/builder/builderint.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "columnar.h" 20 | 21 | namespace columnar 22 | { 23 | 24 | enum class IntDeltaPacking_e : uint8_t 25 | { 26 | DELTA_ASC, 27 | DELTA_DESC 28 | }; 29 | 30 | 31 | enum class IntPacking_e : uint32_t 32 | { 33 | CONST, 34 | TABLE, 35 | DELTA, 36 | GENERIC, 37 | HASH, 38 | 39 | TOTAL 40 | }; 41 | 42 | class Packer_i; 43 | struct Settings_t; 44 | 45 | Packer_i * CreatePackerUint32 ( const Settings_t & tSettings, const std::string & sName ); 46 | Packer_i * CreatePackerInt64 ( const Settings_t & tSettings, const std::string & sName ); 47 | Packer_i * CreatePackerHash ( const Settings_t & tSettings, const std::string & sName, common::StringHash_fn fnCalcHash ); 48 | Packer_i * CreatePackerFloat ( const Settings_t & tSettings, const std::string & sName ); 49 | 50 | } // namespace columnar 51 | -------------------------------------------------------------------------------- /pgm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required ( VERSION 3.17 ) 2 | project ( PiecewiseGeometricModelIndex 3 | VERSION 1.0 4 | HOMEPAGE_URL https://github.com/gvinciguerra/PGM-index ) 5 | 6 | set ( CMAKE_CXX_STANDARD 17 ) 7 | 8 | # PGM-index library 9 | add_library ( pgmindexlib INTERFACE ) 10 | target_include_directories ( pgmindexlib INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>$<INSTALL_INTERFACE:include> ) 11 | 12 | #find_package ( OpenMP ) 13 | if (OpenMP_CXX_FOUND) 14 | message ( STATUS "OpenMP found" ) 15 | target_link_libraries ( pgmindexlib INTERFACE OpenMP::OpenMP_CXX ) 16 | endif () 17 | target_compile_features ( pgmindexlib INTERFACE cxx_std_17 ) 18 | 19 | # installation stuff 20 | set ( EXPORT_CMAKE_DIR "lib/cmake/PGM" ) 21 | 22 | install ( DIRECTORY "include/" DESTINATION include ) 23 | install ( TARGETS pgmindexlib EXPORT pgmexport ) 24 | install ( EXPORT pgmexport FILE "PGMTargets.cmake" DESTINATION "${EXPORT_CMAKE_DIR}" NAMESPACE "PGM::" ) 25 | 26 | # below is to support find_package(PGM) 27 | include ( 
CMakePackageConfigHelpers ) 28 | 29 | set ( pkgconfin "${CMAKE_CURRENT_BINARY_DIR}/PGMConfig.cmake.in" ) 30 | file ( WRITE "${pkgconfin}" "@PACKAGE_INIT@ 31 | 32 | if(NOT TARGET PGM::pgmindexlib) 33 | include(\"\${CMAKE_CURRENT_LIST_DIR}/PGMTargets.cmake\") 34 | endif()" ) 35 | 36 | configure_package_config_file ( "${pkgconfin}" "${CMAKE_CURRENT_BINARY_DIR}/PGMConfig.cmake" INSTALL_DESTINATION "${EXPORT_CMAKE_DIR}" ) 37 | install ( FILES "${CMAKE_CURRENT_BINARY_DIR}/PGMConfig.cmake" DESTINATION "${EXPORT_CMAKE_DIR}" ) -------------------------------------------------------------------------------- /cmake/init_cache_settings.cmake: -------------------------------------------------------------------------------- 1 | if (__init_columnar_cache_settings_included) 2 | return () 3 | endif () 4 | set ( __init_columnar_cache_settings_included YES ) 5 | 6 | # bundle - contains sources (tarballs) of 3-rd party libs. If not provided, try path 'bundle' aside sources. 7 | # if it is provided anyway (via cmake var, or via env var) and NOT absolute - point it into binary (build) dir. 8 | if (DEFINED ENV{LIBS_BUNDLE}) 9 | set ( LIBS_BUNDLE "$ENV{LIBS_BUNDLE}" ) 10 | endif () 11 | 12 | if (NOT LIBS_BUNDLE) 13 | get_filename_component ( LIBS_BUNDLE "${columnar_SOURCE_DIR}/../bundle" ABSOLUTE ) 14 | endif () 15 | 16 | if (NOT IS_ABSOLUTE ${LIBS_BUNDLE}) 17 | set ( LIBS_BUNDLE "${columnar_BINARY_DIR}/${LIBS_BUNDLE}" ) 18 | endif () 19 | 20 | set ( LIBS_BUNDLE "${LIBS_BUNDLE}" CACHE PATH "Choose the path to the dir which contains downloaded sources for libs like re2, icu, stemmer, etc." FORCE ) 21 | 22 | # cacheb (means 'cache binary') - contains unpacked sources and builds of 3-rd party libs, alive between rebuilds. 23 | # if not provided, set to folder 'cache' aside bundle. If not absolute, point it into binary (build) dir. 
24 | if (DEFINED ENV{CACHEB}) 25 | set ( CACHEB "$ENV{CACHEB}" ) 26 | endif () 27 | 28 | if (NOT DEFINED CACHEB) 29 | get_filename_component ( CACHEB "${LIBS_BUNDLE}/../cache" ABSOLUTE ) 30 | endif () 31 | 32 | if (NOT IS_ABSOLUTE ${CACHEB}) 33 | set ( CACHEB "${columnar_BINARY_DIR}/${CACHEB}" ) 34 | endif () 35 | 36 | if (DEFINED CACHEB) 37 | set ( CACHEB "${CACHEB}" CACHE PATH "Cache dir where unpacked sources and builds found." ) 38 | endif () 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: 🌟 Feature Request 2 | description: Submit a proposal for a new Manticore Columnar Library feature or enhancement 3 | body: 4 | - type: textarea 5 | id: proposal 6 | attributes: 7 | label: "Proposal:" 8 | description: > 9 | Please describe your proposal in detail. 10 | Include why you believe this feature should be added to Manticore Columnar Library and what use cases it supports. 11 | If applicable, add any examples or code snippets inside triple backticks to clarify your proposal. 12 | validations: 13 | required: true 14 | - type: markdown 15 | attributes: 16 | value: "## Thank you for completing the form! If you are interested in sponsoring the development of this feature, consider our [professional services](https://manticoresearch.com/services/)." 17 | - type: textarea 18 | id: checklist 19 | attributes: 20 | label: "Checklist:" 21 | description: > 22 | **For Manticore Team Use Only** — Please do not edit this section. This checklist will be completed by the Manticore team as they manage the issue. 23 | value: | 24 | To be completed by the assignee. Check off tasks that have been completed or are not applicable. 25 |
26 | 27 | - [ ] Implementation completed 28 | - [ ] Tests developed 29 | - [ ] Documentation updated 30 | - [ ] Documentation reviewed 31 | - [x] OpenAPI YAML updated and issue created to rebuild clients 32 | 33 |
34 | validations: 35 | required: true 36 | -------------------------------------------------------------------------------- /common/blockiterator.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 
18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION in columnar.h or secondary.h 19 | 20 | #pragma once 21 | 22 | #include "util/util.h" 23 | 24 | namespace common 25 | { 26 | 27 | struct IteratorDesc_t 28 | { 29 | std::string m_sAttr; 30 | std::string m_sType; 31 | }; 32 | 33 | 34 | class BlockIterator_i 35 | { 36 | public: 37 | virtual ~BlockIterator_i() = default; 38 | 39 | virtual bool HintRowID ( uint32_t tRowID ) = 0; 40 | virtual bool GetNextRowIdBlock ( util::Span_T<uint32_t> & dRowIdBlock ) = 0; 41 | virtual int64_t GetNumProcessed() const = 0; 42 | 43 | virtual void SetCutoff ( int iCutoff ) = 0; 44 | virtual bool WasCutoffHit() const = 0; 45 | 46 | virtual void AddDesc ( std::vector<IteratorDesc_t> & dDesc ) const = 0; 47 | }; 48 | 49 | 50 | } // namespace common 51 | -------------------------------------------------------------------------------- /common/schema.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 
18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION in columnar.h or secondary.h 19 | 20 | #pragma once 21 | 22 | #include <cstdint> 23 | #include <string> 24 | #include <vector> 25 | 26 | namespace common 27 | { 28 | 29 | enum class AttrType_e : uint32_t 30 | { 31 | NONE, 32 | UINT32, 33 | TIMESTAMP, 34 | INT64, 35 | UINT64, 36 | BOOLEAN, 37 | FLOAT, 38 | STRING, 39 | UINT32SET, 40 | INT64SET, 41 | FLOATVEC, 42 | 43 | TOTAL 44 | }; 45 | 46 | using StringHash_fn = uint64_t (*)( const uint8_t * pStr, int iLen, uint64_t uPrev ); 47 | 48 | struct SchemaAttr_t 49 | { 50 | std::string m_sName; 51 | AttrType_e m_eType = AttrType_e::NONE; 52 | StringHash_fn m_fnCalcHash = nullptr; 53 | bool m_bKNN = false; 54 | }; 55 | 56 | using Schema_t = std::vector<SchemaAttr_t>; 57 | 58 | } // namespace common 59 | -------------------------------------------------------------------------------- /secondary/builder.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 
18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION in secondary.h 19 | 20 | #pragma once 21 | 22 | #include "util/util.h" 23 | #include "common/schema.h" 24 | 25 | namespace SI 26 | { 27 | 28 | class Builder_i 29 | { 30 | public: 31 | virtual ~Builder_i() = default; 32 | 33 | virtual void SetRowID ( uint32_t tRowID ) = 0; 34 | virtual void SetAttr ( int iAttr, int64_t tAttr ) = 0; 35 | virtual void SetAttr ( int iAttr, const uint8_t * pData, int iLength ) = 0; 36 | virtual void SetAttr ( int iAttr, const int64_t * pData, int iLength ) = 0; 37 | 38 | virtual bool Done ( std::string & sError ) = 0; 39 | }; 40 | 41 | } // namespace SI 42 | 43 | 44 | extern "C" 45 | { 46 | DLLEXPORT SI::Builder_i * CreateBuilder ( const common::Schema_t & tSchema, size_t tMemoryLimit, const std::string & sFile, size_t tBufferSize, std::string & sError ); 47 | } -------------------------------------------------------------------------------- /cmake/GetFastPFOR.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | set ( FP_GITHUB "https://github.com/manticoresoftware/FastPFor/archive/refs/heads/simde.zip" ) 18 | set ( FP_BUNDLEZIP "${LIBS_BUNDLE}/FastPFor-simde.zip" ) 19 | 20 | cmake_minimum_required ( VERSION 3.17 FATAL_ERROR ) 21 | include ( update_bundle ) 22 | 23 | # determine destination folder where we expect pre-built fastpfor 24 | find_package ( FastPFOR QUIET CONFIG ) 25 | return_if_target_found ( FastPFOR::FastPFOR "ready (no need to build)" ) 26 | 27 | # not found. Populate and prepare sources 28 | select_nearest_url ( FP_PLACE fastpfor ${FP_BUNDLEZIP} ${FP_GITHUB} ) 29 | fetch_sources ( fastpfor ${FP_PLACE} FASTPFOR_SRC ) 30 | execute_process ( COMMAND ${CMAKE_COMMAND} -E copy_if_different "${columnar_SOURCE_DIR}/libfastpfor/CMakeLists.txt" "${FASTPFOR_SRC}/CMakeLists.txt" ) 31 | 32 | # build external project 33 | get_build ( FASTPFOR_BUILD fastpfor ) 34 | external_build ( FastPFOR FASTPFOR_SRC FASTPFOR_BUILD ) 35 | 36 | find_package ( FastPFOR CONFIG REQUIRED ) -------------------------------------------------------------------------------- /columnar/accessor/accessorint.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "builder.h" 20 | 21 | namespace common 22 | { 23 | struct Filter_t; 24 | } 25 | 26 | namespace util 27 | { 28 | class FileReader_c; 29 | } 30 | 31 | namespace columnar 32 | { 33 | 34 | class Iterator_i; 35 | class Analyzer_i; 36 | class Checker_i; 37 | class AttributeHeader_i; 38 | 39 | Iterator_i * CreateIteratorUint32 ( const AttributeHeader_i & tHeader, uint32_t uVersion, util::FileReader_c * pReader ); 40 | Iterator_i * CreateIteratorUint64 ( const AttributeHeader_i & tHeader, uint32_t uVersion, util::FileReader_c * pReader ); 41 | 42 | Analyzer_i * CreateAnalyzerInt ( const AttributeHeader_i & tHeader, uint32_t uVersion, util::FileReader_c * pReader, const common::Filter_t & tSettings, bool bHaveMatchingBlocks ); 43 | 44 | Checker_i * CreateCheckerInt ( const AttributeHeader_i & tHeader, util::FileReader_c * pReader, Reporter_fn & fnProgress, Reporter_fn & fnError ); 45 | 46 | } // namespace columnar 47 | -------------------------------------------------------------------------------- /cmake/GetStreamvbyte.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | set ( SVB_GITHUB "https://github.com/manticoresoftware/streamvbyte/archive/efdd9dac.zip" ) 18 | set ( SVB_BUNDLEZIP "${LIBS_BUNDLE}/streamvbyte.zip" ) 19 | 20 | cmake_minimum_required ( VERSION 3.17 FATAL_ERROR ) 21 | include ( update_bundle ) 22 | 23 | # determine destination folder where we expect pre-built lib 24 | find_package ( streamvbyte QUIET CONFIG ) 25 | return_if_target_found ( streamvbyte::streamvbyte "ready (no need to build)" ) 26 | 27 | # not found. Populate and prepare sources 28 | select_nearest_url ( SVB_PLACE streamvbyte ${SVB_BUNDLEZIP} ${SVB_GITHUB} ) 29 | fetch_sources ( streamvbyte ${SVB_PLACE} STREAMVBYTE_SRC ) 30 | execute_process ( COMMAND ${CMAKE_COMMAND} -E copy_if_different "${columnar_SOURCE_DIR}/streamvbyte/CMakeLists.txt" "${STREAMVBYTE_SRC}/CMakeLists.txt" ) 31 | 32 | # build external project 33 | get_build ( STREAMVBYTE_BUILD streamvbyte ) 34 | external_build ( streamvbyte STREAMVBYTE_SRC STREAMVBYTE_BUILD ) 35 | 36 | find_package ( streamvbyte REQUIRED CONFIG ) -------------------------------------------------------------------------------- /cmake/GetPGM.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | set ( PGM_GITHUB "https://github.com/manticoresoftware/PGM-index/archive/refs/heads/pgm_2022_08_02.zip" ) 18 | set ( PGM_BUNDLEZIP "${LIBS_BUNDLE}/pgm_2022_08_02.zip" ) 19 | 20 | cmake_minimum_required ( VERSION 3.17 FATAL_ERROR ) 21 | include ( update_bundle ) 22 | 23 | # not mandatory, but if available, pgm will depend on it 24 | # find_package ( OpenMP ) 25 | 26 | # determine destination folder where we expect pre-built pgm 27 | find_package ( PGM QUIET CONFIG ) 28 | return_if_target_found ( PGM::pgmindexlib "ready (no need to build)" ) 29 | 30 | # not found. Populate and prepare sources 31 | select_nearest_url ( PGM_PLACE pgm ${PGM_BUNDLEZIP} ${PGM_GITHUB} ) 32 | fetch_sources ( pgm ${PGM_PLACE} PGM_SRC ) 33 | execute_process ( COMMAND ${CMAKE_COMMAND} -E copy_if_different "${columnar_SOURCE_DIR}/pgm/CMakeLists.txt" "${PGM_SRC}/CMakeLists.txt" ) 34 | 35 | # build external project 36 | get_build ( PGM_BUILD pgm ) 37 | external_build ( PGM PGM_SRC PGM_BUILD ) 38 | 39 | find_package ( PGM CONFIG REQUIRED ) 40 | -------------------------------------------------------------------------------- /cmake/external-build.cmake.in: -------------------------------------------------------------------------------- 1 | cmake_minimum_required ( VERSION 3.17 ) 2 | 3 | project ( @module@-prebuild NONE ) 4 | 5 | include ( ExternalProject ) 6 | 7 | set ( DEVMODE @DISTR_BUILD@ ) 8 | 9 | set ( BUILD_TYPE @CMAKE_BUILD_TYPE@ ) 10 | if (NOT BUILD_TYPE) 11 | set ( BUILD_TYPE "RelWithDebInfo" ) 12 | endif () 13 | 14 | if (NOT DEVMODE) 15 | message ( STATUS "Build RelWithDebInfo and Debug for developing" ) 16 | ExternalProject_Add ( @module@_populate 17 | SOURCE_DIR @MODULE_SRC@ 18 | @CMAKE_ARGS@ 19 | BUILD_COMMAND "@CMAKE_COMMAND@" -E echo "Starting build config RelWithDebInfo" 20 | COMMAND "@CMAKE_COMMAND@" -DCMAKE_BUILD_TYPE=RelWithDebInfo . 21 | COMMAND "@CMAKE_COMMAND@" --build . --config RelWithDebInfo 22 | COMMAND "@CMAKE_COMMAND@" --install .
--config RelWithDebInfo --prefix "@MODULE_BUILD@" 23 | COMMAND "@CMAKE_COMMAND@" -E echo "Starting build config Debug" 24 | COMMAND "@CMAKE_COMMAND@" -DCMAKE_BUILD_TYPE=Debug . 25 | COMMAND "@CMAKE_COMMAND@" --build . --config Debug --clean-first 26 | COMMAND "@CMAKE_COMMAND@" --install . --config Debug --prefix "@MODULE_BUILD@" 27 | INSTALL_COMMAND "" 28 | TEST_COMMAND "" 29 | ) 30 | else () 31 | message ( STATUS "Build only Release for releasing" ) 32 | ExternalProject_Add ( @module@_populate 33 | SOURCE_DIR @MODULE_SRC@ 34 | @CMAKE_ARGS@ 35 | BUILD_COMMAND "@CMAKE_COMMAND@" -E echo "Starting build config Release" 36 | COMMAND "@CMAKE_COMMAND@" -DCMAKE_BUILD_TYPE=Release . 37 | COMMAND "@CMAKE_COMMAND@" --build . --config Release 38 | COMMAND "@CMAKE_COMMAND@" --install . --config Release --prefix "@MODULE_BUILD@" 39 | INSTALL_COMMAND "" 40 | TEST_COMMAND "" 41 | ) 42 | endif () 43 | 44 | # file configured from cmake/external-build.cmake.in 45 | -------------------------------------------------------------------------------- /util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | cmake_minimum_required ( VERSION 3.17 ) 19 | 20 | add_library ( util OBJECT 21 | util_private.cpp 22 | version.cpp 23 | reader.cpp 24 | codec.cpp 25 | util.h 26 | util_private.h 27 | delta.h 28 | delta_impl.h 29 | reader.h 30 | codec.h 31 | bitvec.h 32 | ) 33 | 34 | include ( CheckFunctionExists ) 35 | check_function_exists ( pread HAVE_PREAD ) 36 | set_source_files_properties ( reader.cpp PROPERTIES COMPILE_DEFINITIONS HAVE_PREAD=${HAVE_PREAD} ) 37 | 38 | target_link_libraries ( util PRIVATE FastPFOR::FastPFOR streamvbyte::streamvbyte columnar_root ) 39 | set_property ( TARGET util PROPERTY POSITION_INDEPENDENT_CODE ON ) 40 | 41 | message ( STATUS "Version ${CMAKE_PROJECT_VERSION} ${GIT_COMMIT_ID}@${GIT_TIMESTAMP_ID}, ${GIT_BRANCH_ID}" ) 42 | configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/version.h.in ${columnar_BINARY_DIR}/config/gen_version.h ) 43 | 44 | target_include_directories ( util PUBLIC ${columnar_BINARY_DIR}/config ) 45 | target_include_directories ( util INTERFACE ${CMAKE_CURRENT_SOURCE_DIR} ) 46 | -------------------------------------------------------------------------------- /embeddings/src/ffi.rs: -------------------------------------------------------------------------------- 1 | use crate::model::text_model_wrapper::{ 2 | FloatVecResult, StringItem, TextModelResult, TextModelWrapper, 3 | }; 4 | use std::os::raw::c_char; 5 | 6 | type LoadModelFn = extern "C" fn( 7 | *const c_char, 8 | usize, 9 | *const c_char, 10 | usize, 11 | *const c_char, 12 | usize, 13 | bool, 14 | ) -> TextModelResult; 15 | 16 | type FreeModelResultFn = extern "C" fn(TextModelResult); 17 | 18 | type MakeVectEmbeddingsFn = 19 | extern "C" fn(&TextModelWrapper, *const StringItem, usize) -> FloatVecResult; 20 | 21 | type FreeVecResultFn = extern "C" fn(FloatVecResult); 22 | 23 | type GetLenFn = extern "C" fn(&TextModelWrapper) -> usize; 24 | 25 | #[repr(C)] 26 | pub struct EmbedLib { 27 | version: usize, 28 | version_str: *const c_char, 29 | load_model: LoadModelFn, 
30 | free_model_result: FreeModelResultFn, 31 | make_vect_embeddings: MakeVectEmbeddingsFn, 32 | free_vec_result: FreeVecResultFn, 33 | get_hidden_size: GetLenFn, 34 | get_max_input_size: GetLenFn, 35 | } 36 | const LIB: EmbedLib = EmbedLib { 37 | version: 2usize, 38 | version_str: { 39 | let version_bytes = b"1.0.1\0"; 40 | version_bytes.as_ptr() as *const c_char 41 | }, 42 | load_model: TextModelWrapper::load_model, 43 | free_model_result: TextModelWrapper::free_model_result, 44 | make_vect_embeddings: TextModelWrapper::make_vect_embeddings, 45 | free_vec_result: TextModelWrapper::free_vec_result, 46 | get_hidden_size: TextModelWrapper::get_hidden_size, 47 | get_max_input_size: TextModelWrapper::get_max_input_len, 48 | }; 49 | 50 | #[no_mangle] 51 | pub extern "C" fn GetLibFuncs() -> *const EmbedLib { 52 | std::panic::set_hook(Box::new(|_| {})); 53 | &LIB 54 | } 55 | -------------------------------------------------------------------------------- /gitsync_columnar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # To be run from gitlab CI runner. 4 | 5 | autotag() { 6 | API_VER=$(grep -Po 'LIB_VERSION.* \K\d+' columnar/columnar.h) 7 | SI_API_VER=$(grep -Po 'LIB_VERSION.* \K\d+' secondary/secondary.h) 8 | KNN_API_VER=$(grep -Po 'LIB_VERSION.* \K\d+' knn/knn.h) 9 | AUTO_TAG="c$API_VER-s$SI_API_VER-k$KNN_API_VER" 10 | NUMS=3 11 | 12 | # check whether all the numbers are available 13 | if [[ $(echo $AUTO_TAG | grep -Po '[0-9]+' | wc -l) = $NUMS ]]; then 14 | # tag is correct 15 | if [[ ! $(git ls-remote github | grep $AUTO_TAG) ]]; then 16 | echo "no tag - will add $AUTO_TAG" 17 | echo "> git tag $AUTO_TAG" 18 | git tag "$AUTO_TAG" 19 | echo "> git push github $AUTO_TAG" 20 | git push github "$AUTO_TAG" 21 | echo "> git status" 22 | git status 23 | else 24 | echo "repo github already has tag $AUTO_TAG, exiting..." 
25 | fi 26 | else 27 | echo "generated tag $AUTO_TAG is not valid, do nothing" 28 | fi 29 | } 30 | 31 | echo "> rm -fr gitlab_github_sync" 32 | rm -fr gitlab_github_sync 33 | echo "> git clone git@gitlab.com:manticoresearch/columnar.git gitlab_github_sync" 34 | git clone git@gitlab.com:manticoresearch/columnar.git gitlab_github_sync 35 | echo "> cd gitlab_github_sync" 36 | cd gitlab_github_sync 37 | echo "> git checkout $CI_COMMIT_BRANCH" 38 | git checkout $CI_COMMIT_BRANCH 39 | echo "> git remote add github git@github.com:manticoresoftware/columnar.git" 40 | git remote add github git@github.com:manticoresoftware/columnar.git 41 | echo "> git fetch github" 42 | git fetch github 43 | echo "> git push -u github $CI_COMMIT_BRANCH" 44 | git push -u github "$CI_COMMIT_BRANCH" 45 | if [[ $CI_COMMIT_BRANCH == "master" ]]; then autotag; fi 46 | -------------------------------------------------------------------------------- /columnar/builder.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 
18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION in columnar.h 19 | 20 | #pragma once 21 | 22 | #include "columnar.h" 23 | #include "common/schema.h" 24 | 25 | namespace columnar 26 | { 27 | 28 | static const uint32_t STORAGE_VERSION = 13; 29 | 30 | inline bool StorageVersionWrong ( uint32_t uVer ) noexcept 31 | { 32 | if ( uVer < 10 ) 33 | return true; 34 | 35 | return uVer > STORAGE_VERSION; 36 | } 37 | 38 | class Builder_i 39 | { 40 | public: 41 | virtual ~Builder_i() = default; 42 | 43 | virtual void SetAttr ( int iAttr, int64_t tAttr ) = 0; 44 | virtual void SetAttr ( int iAttr, const uint8_t * pData, int iLength ) = 0; 45 | virtual void SetAttr ( int iAttr, const int64_t * pData, int iLength ) = 0; 46 | virtual bool Done ( std::string & sError ) = 0; 47 | }; 48 | 49 | } // namespace columnar 50 | 51 | extern "C" 52 | { 53 | DLLEXPORT columnar::Builder_i * CreateColumnarBuilder ( const common::Schema_t & tSchema, const std::string & sFile, size_t tBufferSize, std::string & sError ); 54 | } 55 | -------------------------------------------------------------------------------- /embeddings/manticoresearch_text_embeddings.h: -------------------------------------------------------------------------------- 1 | // Auto-generated file. Do not edit. 
2 | 3 | #ifndef MANTICORESEARCH_TEXT_EMBEDDINGS_H 4 | #define MANTICORESEARCH_TEXT_EMBEDDINGS_H 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | struct TextModelResult { 13 | void *m_pModel; 14 | char *m_szError; 15 | }; 16 | 17 | using LoadModelFn = TextModelResult(*)(const char*, 18 | uintptr_t, 19 | const char*, 20 | uintptr_t, 21 | const char*, 22 | uintptr_t, 23 | bool); 24 | 25 | using FreeModelResultFn = void(*)(TextModelResult); 26 | 27 | struct FloatVec { 28 | const float *ptr; 29 | uintptr_t len; 30 | uintptr_t cap; 31 | }; 32 | 33 | struct FloatVecResult { 34 | char *m_szError; 35 | const FloatVec *m_tEmbedding; 36 | uintptr_t len; 37 | uintptr_t cap; 38 | }; 39 | 40 | using TextModelWrapper = void*; 41 | 42 | struct StringItem { 43 | const char *ptr; 44 | uintptr_t len; 45 | }; 46 | 47 | using MakeVectEmbeddingsFn = FloatVecResult(*)(const TextModelWrapper*, const StringItem*, uintptr_t); 48 | 49 | using FreeVecResultFn = void(*)(FloatVecResult); 50 | 51 | using GetLenFn = uintptr_t(*)(const TextModelWrapper*); 52 | 53 | struct EmbedLib { 54 | uintptr_t version; 55 | const char *version_str; 56 | LoadModelFn load_model; 57 | FreeModelResultFn free_model_result; 58 | MakeVectEmbeddingsFn make_vect_embeddings; 59 | FreeVecResultFn free_vec_result; 60 | GetLenFn get_hidden_size; 61 | GetLenFn get_max_input_size; 62 | }; 63 | 64 | extern "C" { 65 | 66 | const EmbedLib *GetLibFuncs(); 67 | 68 | } // extern "C" 69 | 70 | #endif // MANTICORESEARCH_TEXT_EMBEDDINGS_H 71 | -------------------------------------------------------------------------------- /secondary/iterator.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION in secondary.h 19 | 20 | #pragma once 21 | 22 | #include "util/reader.h" 23 | #include "util/codec.h" 24 | #include "common/filter.h" 25 | #include "common/blockiterator.h" 26 | #include "blockreader.h" 27 | #include "builder.h" 28 | #include 29 | 30 | namespace SI 31 | { 32 | 33 | class BlockIteratorWithSetup_i : public common::BlockIterator_i 34 | { 35 | public: 36 | virtual void Setup ( Packing_e eType, uint64_t uStartOffset, uint32_t uMinRowID, uint32_t uMaxRowID, uint32_t uCount ) = 0; 37 | }; 38 | 39 | BlockIteratorWithSetup_i * CreateRowidIterator ( const std::string & sAttr, Packing_e eType, uint64_t uStartOffset, uint32_t uMinRowID, uint32_t uMaxRowID, uint32_t uCount, uint32_t uRowidsPerBlock, std::shared_ptr & pSharedReader, std::shared_ptr & pCodec, const common::RowidRange_t * pBounds, bool bBitmap ); 40 | bool SetupRowidIterator ( BlockIteratorWithSetup_i * pIterator, Packing_e eType, uint64_t uStartOffset, uint32_t uMinRowID, uint32_t uMaxRowID, uint32_t uCount, const common::RowidRange_t * pBounds ); 41 | 42 | } 43 | -------------------------------------------------------------------------------- /columnar/accessor/attributeheader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed 
under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #pragma once 18 | 19 | #include "columnar.h" 20 | 21 | namespace util 22 | { 23 | class FileReader_c; 24 | } 25 | 26 | namespace columnar 27 | { 28 | 29 | struct Settings_t; 30 | 31 | class AttributeHeader_i 32 | { 33 | public: 34 | virtual ~AttributeHeader_i() = default; 35 | 36 | virtual const std::string & GetName() const = 0; 37 | virtual common::AttrType_e GetType() const = 0; 38 | virtual float GetComplexity() const = 0; 39 | virtual const Settings_t & GetSettings() const = 0; 40 | 41 | virtual uint32_t GetNumDocs() const = 0; 42 | virtual int GetNumBlocks() const = 0; 43 | virtual uint32_t GetNumDocs ( int iBlock ) const = 0; 44 | virtual uint64_t GetBlockOffset ( int iBlock ) const = 0; 45 | 46 | virtual int GetNumMinMaxLevels() const = 0; 47 | virtual int GetNumMinMaxBlocks ( int iLevel ) const = 0; 48 | virtual std::pair GetMinMax ( int iLevel, int iBlock ) const = 0; 49 | 50 | virtual bool Load ( util::FileReader_c & tReader, std::string & sError ) = 0; 51 | virtual bool Check ( util::FileReader_c & tReader, Reporter_fn & fnError ) = 0; 52 | }; 53 | 54 | 55 | AttributeHeader_i * CreateAttributeHeader ( common::AttrType_e eType, uint32_t uTotalDocs, std::string & sError ); 56 | 57 | } // namespace columnar 58 | -------------------------------------------------------------------------------- /cmake/builds/build_windows.cmake: 
-------------------------------------------------------------------------------- 1 | # ---------- windows ---------- 2 | # Above line is mandatory! 3 | # rules to build windows zip archive 4 | 5 | message ( STATUS "Will create windows ZIP" ) 6 | 7 | set ( CPACK_NSIS_INSTALL_ROOT "c:" ) 8 | set ( CPACK_NSIS_DISPLAY_NAME "Manticore columnar" ) 9 | set ( CPACK_NSIS_PACKAGE_NAME "Manticore libraries" ) 10 | set ( CPACK_PACKAGE_INSTALL_DIRECTORY manticore ) 11 | set ( CPACK_ARCHIVE_COMPONENT_INSTALL ON ) 12 | 13 | set ( CPACK_GENERATOR "ZIP;NSIS" ) 14 | set ( CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_FILE_NAME}-x64" ) 15 | 16 | set ( CPACK_COMPONENT_COLUMNAR_GROUP libs ) 17 | set ( CPACK_COMPONENT_SECONDARY_GROUP libs ) 18 | set ( CPACK_COMPONENT_KNN_GROUP libs ) 19 | set ( CPACK_COMPONENT_EMBEDDINGS_GROUP libs ) 20 | 21 | set ( CPACK_COMPONENT_GROUP_LIBS_DISPLAY_NAME "Manticore modules" ) 22 | set ( CPACK_COMPONENT_COLUMNAR_DISPLAY_NAME "Columnar storage library") 23 | set ( CPACK_COMPONENT_COLUMNAR_DESCRIPTION "${CPACK_PACKAGE_DESCRIPTION_SUMMARY}" ) 24 | set ( CPACK_COMPONENT_SECONDARY_DISPLAY_NAME "Secondary index library" ) 25 | set ( CPACK_COMPONENT_SECONDARY_DESCRIPTION "Secondary index" ) 26 | set ( CPACK_COMPONENT_KNN_DISPLAY_NAME "KNN search library" ) 27 | set ( CPACK_COMPONENT_KNN_DESCRIPTION "KNN search support" ) 28 | set ( CPACK_COMPONENT_EMBEDDINGS_DISPLAY_NAME "Text embeddings library" ) 29 | set ( CPACK_COMPONENT_EMBEDDINGS_DESCRIPTION "Text embeddings support for KNN" ) 30 | 31 | 32 | set ( CPACK_COMPONENT_DBGSYMBOLS_DISPLAY_NAME "Debug symbols" ) 33 | set ( CPACK_COMPONENT_DBGSYMBOLS_DISABLED ON ) 34 | set ( CPACK_COMPONENT_DBGSYMBOLS_DOWNLOADED ON ) 35 | 36 | # base where installer will download the archives 37 | set ( DISTR_URL "https://repo.manticoresearch.com/repository/manticoresearch_windows" ) 38 | 39 | if (DEV_BUILD) 40 | set ( CPACK_DOWNLOAD_SITE "${DISTR_URL}/dev/x64/" ) 41 | else () 42 | set ( CPACK_DOWNLOAD_SITE "${DISTR_URL}/release/x64/" 
) 43 | endif () 44 | 45 | set ( CPACK_NSIS_MODIFY_PATH OFF ) 46 | set ( CPACK_NSIS_UNINSTALL_NAME Uninstall-columnar ) 47 | 48 | # HKLM/SOFTWARE/Wow6432Node/Manticore Software LTD/manticore -------------------------------------------------------------------------------- /.github/workflows/clt_tests.yml: -------------------------------------------------------------------------------- 1 | name: CLT tests 2 | on: 3 | workflow_call: 4 | inputs: 5 | docker_image: 6 | required: true 7 | type: string 8 | description: "Docker image to use for tests" 9 | artifact_name: 10 | required: false 11 | type: string 12 | description: "Name of the docker image artifact" 13 | repository: 14 | required: false 15 | type: string 16 | description: "Repository to checkout" 17 | ref: 18 | required: false 19 | type: string 20 | description: "Ref to checkout" 21 | continue_on_error: 22 | required: false 23 | type: boolean 24 | description: "Continue on error" 25 | default: false 26 | secrets: 27 | OPENAI_API_KEY: 28 | required: true 29 | description: "OpenAI API key for CLT tests" 30 | VOYAGE_API_KEY: 31 | required: true 32 | description: "Voyage API key for CLT tests" 33 | JINA_API_KEY: 34 | required: true 35 | description: "Jina API key for CLT tests" 36 | 37 | jobs: 38 | clt: 39 | name: CLT 40 | runs-on: ubuntu-22.04 41 | timeout-minutes: 30 42 | continue-on-error: ${{ inputs.continue_on_error }} 43 | env: 44 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 45 | VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }} 46 | JINA_API_KEY: ${{ secrets.JINA_API_KEY }} 47 | strategy: 48 | fail-fast: false 49 | matrix: 50 | test-suite: 51 | - name: MCL 52 | test_prefix: test/clt-tests/mcl/ 53 | 54 | steps: 55 | - uses: manticoresoftware/clt@0.7.3 56 | with: 57 | artifact: ${{ inputs.artifact_name }} 58 | image: ${{ inputs.docker_image }} 59 | repository: ${{ inputs.repository }} 60 | ref: ${{ inputs.ref }} 61 | test_prefix: ${{ matrix.test-suite.test_prefix }} 62 | comment_mode: failures 63 | run_args: "-e 
OPENAI_API_KEY -e VOYAGE_API_KEY -e JINA_API_KEY" 64 | ui_host: "https://clt.manticoresearch.com" 65 | -------------------------------------------------------------------------------- /.github/workflows/mirror.yml: -------------------------------------------------------------------------------- 1 | name: 🪞 Mirror 2 | run-name: 🪞 Mirror ${{ github.sha }} 3 | 4 | on: 5 | push: 6 | branches: 7 | - master 8 | - maintenance-release 9 | - 'columnar-*' 10 | 11 | # waits until the previous workflow run to finish when a new one appears 12 | concurrency: 13 | group: mirror_${{ github.ref }} 14 | 15 | jobs: 16 | to_gitlab: 17 | name: Sync to GitLab 18 | if: github.repository == 'manticoresoftware/columnar' 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v3 23 | with: 24 | token: ${{ secrets.GITHUB_TOKEN }} 25 | fetch-depth: 0 26 | - name: Sync to Gitlab 27 | run: | 28 | echo "${{ secrets.GITLAB_SSH_KEY }}" > /tmp/ssh.key 29 | chmod 600 /tmp/ssh.key 30 | git remote add gitlab git@gitlab.com:manticoresearch/columnar.git 31 | git checkout ${{ github.ref }} 32 | 33 | API_VER=$(grep -Po 'LIB_VERSION.* \K\d+' columnar/columnar.h) 34 | SI_API_VER=$(grep -Po 'LIB_VERSION.* \K\d+' secondary/secondary.h) 35 | KNN_API_VER=$(grep -Po 'LIB_VERSION.* \K\d+' knn/knn.h) 36 | AUTO_TAG="c$API_VER-s$SI_API_VER-k$KNN_API_VER" 37 | NUMS=3 38 | 39 | if [[ ! 
$(git tag | grep $AUTO_TAG) ]]; then 40 | git tag "$AUTO_TAG" 41 | git push origin --tags 42 | git status 43 | else 44 | echo "Tag $AUTO_TAG already exists" 45 | fi 46 | 47 | GIT_SSH_COMMAND='ssh -i /tmp/ssh.key -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -F /dev/null' git fetch gitlab 48 | GIT_SSH_COMMAND='ssh -i /tmp/ssh.key -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -F /dev/null' git push -u gitlab ${{ github.ref }} 49 | GIT_SSH_COMMAND='ssh -i /tmp/ssh.key -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -F /dev/null' git push -u gitlab ${{ github.ref }} --tags --force 50 | shell: bash 51 | -------------------------------------------------------------------------------- /cmake/helpers.cmake: -------------------------------------------------------------------------------- 1 | if (__helpers_columnar_included) 2 | return () 3 | endif () 4 | set ( __helpers_columnar_included YES ) 5 | 6 | include ( printers ) 7 | macro ( return_if_target_found TRG LEGEND ) 8 | if (TARGET ${TRG}) 9 | diagst ( ${TRG} "${LEGEND}" ) 10 | return () 11 | endif () 12 | endmacro () 13 | 14 | # this macro is needed for cross-compiling. If we just add the path to CMAKE_PREFIX_PATH, it will NOT work with an active root path, 15 | # if the search strategy is set to 'only'. So, we add the path to the root path in this case instead.
16 | macro ( APPEND_PREFIX PATH ) 17 | if (CMAKE_FIND_ROOT_PATH_MODE_PACKAGE STREQUAL ONLY) 18 | diags ( "CMAKE_FIND_ROOT_PATH before inclusion of update_bundle was ${CMAKE_FIND_ROOT_PATH}" ) 19 | list ( APPEND CMAKE_FIND_ROOT_PATH "${PATH}" ) 20 | diags ( "CMAKE_FIND_ROOT_PATH refreshed from update_bundle and is ${CMAKE_FIND_ROOT_PATH}" ) 21 | # endif () 22 | else () 23 | diags ( "CMAKE_PREFIX_PATH before inclusion of update_bundle was ${CMAKE_PREFIX_PATH}" ) 24 | list ( APPEND CMAKE_PREFIX_PATH "${PATH}" ) 25 | diags ( "CMAKE_PREFIX_PATH refreshed from update_bundle and is ${CMAKE_PREFIX_PATH}" ) 26 | endif () 27 | endmacro () 28 | 29 | # helper vars to shorten generate lines 30 | set ( CLANGCXX "$" ) 31 | set ( GNUCXX "$" ) 32 | set ( GNUCLANGCXX "$" ) 33 | set ( CLANGC "$" ) 34 | set ( GNUC "$" ) 35 | set ( GNUCLANGC "$" ) 36 | set ( GNUC_CXX "$" ) 37 | set ( CLANGC_CXX "$" ) 38 | set ( GNUCLANGC_CXX "$" ) 39 | if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") 40 | set ( CLANG_CL 1 ) 41 | set ( ONLYGNUCLANGCXX 0 ) 42 | set ( ONLYGNUCLANGC_CXX 0 ) 43 | set ( MSCXX 1 ) 44 | else () 45 | set ( CLANG_CL 0 ) 46 | set ( ONLYGNUCLANGCXX "${GNUCLANGCXX}" ) 47 | set ( ONLYGNUCLANGC_CXX "${GNUCLANGC_CXX}" ) 48 | set ( MSCXX "$" ) 49 | endif () 50 | -------------------------------------------------------------------------------- /cmake/revcheck.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required ( VERSION 3.17 ) 2 | 3 | # this file is included by the cpack config in order to detect whether the configured and built versions are the same 4 | # guess version strings from current git repo 5 | function ( guess_from_git ) 6 | if (NOT EXISTS "${SOURCE_DIR}/.git") 7 | return () 8 | endif () 9 | 10 | find_package ( Git QUIET ) 11 | if (NOT GIT_FOUND) 12 | return () 13 | endif () 14 | 15 | # extract short hash as CHECK_GIT_COMMIT_ID 16 | execute_process ( COMMAND "${GIT_EXECUTABLE}" log
-1 --format=%h 17 | WORKING_DIRECTORY "${SOURCE_DIR}" 18 | RESULT_VARIABLE res 19 | OUTPUT_VARIABLE CHECK_GIT_COMMIT_ID 20 | ERROR_QUIET 21 | OUTPUT_STRIP_TRAILING_WHITESPACE ) 22 | set ( CHECK_GIT_COMMIT_ID "${CHECK_GIT_COMMIT_ID}" PARENT_SCOPE ) 23 | endfunction () 24 | 25 | # guess version strings from template header file (git archive marks it there) 26 | function ( extract_from_git_slug HEADER ) 27 | if (EXISTS "${HEADER}") 28 | file ( STRINGS "${HEADER}" _CONTENT ) 29 | foreach (LINE ${_CONTENT}) 30 | # match definitions like - // GIT_*_ID "VALUE" 31 | if ("${LINE}" MATCHES "^//[ \t]+(GIT_.*_ID)[ \t]\"(.*)\"") 32 | set ( ${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" ) 33 | endif () 34 | endforeach () 35 | if (GIT_COMMIT_ID STREQUAL "$Format:%h$") 36 | return () # no slug 37 | endif () 38 | # commit id 39 | set ( CHECK_GIT_COMMIT_ID "${GIT_COMMIT_ID}" PARENT_SCOPE ) 40 | endif () 41 | endfunction () 42 | 43 | # function definitions finished, execution starts from here 44 | ################################## 45 | 46 | # first try to use binary git 47 | guess_from_git () 48 | 49 | # 2nd try - if we build from a git archive. The correct hash and date are provided then, but no branch 50 | if (NOT CHECK_GIT_COMMIT_ID) 51 | extract_from_git_slug ( "${SOURCE_DIR}/util/version.h.in" ) 52 | endif () 53 | 54 | if (NOT CHECK_GIT_COMMIT_ID) 55 | set ( CHECK_GIT_COMMIT_ID "deadbeef" ) 56 | endif () 57 | 58 | if (NOT CHECK_GIT_COMMIT_ID STREQUAL "${CONFIGURED_GIT_COMMIT_ID}") 59 | message ( FATAL_ERROR "Current commit ${CHECK_GIT_COMMIT_ID} is differs from stored ${CONFIGURED_GIT_COMMIT_ID}. Run 'cmake .'
in build dir to fix" ) 60 | endif () -------------------------------------------------------------------------------- /columnar/builder/buildertraits.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #include "buildertraits.h" 18 | 19 | namespace columnar 20 | { 21 | 22 | using namespace util; 23 | using namespace common; 24 | 25 | 26 | AttributeHeaderBuilder_c::AttributeHeaderBuilder_c ( const Settings_t & tSettings, const std::string & sName, AttrType_e eType ) 27 | : m_sName ( sName ) 28 | , m_eType ( eType ) 29 | , m_tSettings ( tSettings ) 30 | {} 31 | 32 | 33 | bool AttributeHeaderBuilder_c::Save ( FileWriter_c & tWriter, int64_t & tBaseOffset, std::string & sError ) 34 | { 35 | m_tSettings.Save(tWriter); 36 | 37 | tWriter.Write_string(m_sName); 38 | 39 | // store base offset to correct it later 40 | tBaseOffset = tWriter.GetPos(); 41 | 42 | tWriter.Write_uint64 ( 0 ); // stub 43 | tWriter.Pack_uint32 ( (uint32_t)m_dBlocks.size() ); 44 | int64_t tPrevOffset = 0; 45 | 46 | // no offset for 1st block 47 | for ( size_t i=1; i < m_dBlocks.size(); i++ ) 48 | { 49 | tWriter.Pack_uint64 ( m_dBlocks[i].first - tPrevOffset ); 50 | tPrevOffset = m_dBlocks[i].first; 51 | } 52 | 53 | uint32_t uMaxPacking = 0; 54 
| for ( const auto & i : m_dBlocks ) 55 | uMaxPacking = std::max ( i.second, uMaxPacking ); 56 | 57 | std::vector dPackings ( uMaxPacking+1, 0 ); 58 | for ( const auto & i : m_dBlocks ) 59 | dPackings[i.second]++; 60 | 61 | tWriter.Pack_uint32 ( dPackings.size() ); 62 | for ( auto i : dPackings ) 63 | tWriter.Pack_uint32(i); 64 | 65 | return !tWriter.IsError(); 66 | } 67 | 68 | } // namespace columnar -------------------------------------------------------------------------------- /columnar/accessor/accessor.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #include "columnar.h" 20 | #include "builder.h" 21 | #include "attributeheader.h" 22 | 23 | namespace columnar 24 | { 25 | 26 | // fixme: add bitmaps 27 | class MatchingBlocks_c 28 | { 29 | public: 30 | MatchingBlocks_c() { m_dBlocks.reserve(1024); } 31 | 32 | FORCE_INLINE void Add ( int iBlock ) { m_dBlocks.push_back(iBlock); } 33 | FORCE_INLINE int GetBlock ( int iBlock ) const { return m_dBlocks[iBlock]; } 34 | FORCE_INLINE int GetNumBlocks() const { return (int)m_dBlocks.size(); } 35 | FORCE_INLINE int Find ( int iStartBlock, int iValue ); 36 | 37 | private: 38 | std::vector m_dBlocks; 39 | }; 40 | 41 | // Binary search (std::lower_bound) over m_dBlocks, starting at position iStartBlock: returns the index of the first stored entry that is not less than iValue, or the number of stored blocks when there is no such entry. NOTE(review): lower_bound needs the searched range to be ordered, so this presumably relies on blocks being Add()-ed in ascending order - confirm against callers. 42 | int MatchingBlocks_c::Find ( int iStartBlock, int iValue ) 43 | { 44 | auto tFound = std::lower_bound ( m_dBlocks.begin()+iStartBlock, m_dBlocks.end(), iValue ); 45 | if ( tFound==m_dBlocks.end() ) 46 | return (int)m_dBlocks.size(); 47 | 48 | return tFound-m_dBlocks.begin(); 49 | } 50 | 51 | 52 | using SharedBlocks_c = std::shared_ptr; 53 | 54 | class Analyzer_i : public common::BlockIterator_i 55 | { 56 | public: 57 | virtual void Setup ( SharedBlocks_c & pBlocks, uint32_t uTotalDocs ) = 0; 58 | }; 59 | 60 | 61 | class Checker_i 62 | { 63 | public: 64 | virtual ~Checker_i() = default; 65 | 66 | virtual bool Check() = 0; 67 | }; 68 | 69 | 70 | bool CheckEmptySpan ( uint32_t * pRowID, uint32_t * pRowIdStart, util::Span_T & dRowIdBlock ); 71 | 72 | } // namespace columnar 73 | -------------------------------------------------------------------------------- /NOTICE-binary: -------------------------------------------------------------------------------- 1 | Copyright 2017-2025 Manticore Software, Ltd. 2 | 3 | ------------------------------------------------------------------------------- 4 | The binary distribution of this product bundles binaries of 5 | FastPFor https://github.com/lemire/FastPFor, which contains file AUTHORS.txt with the following list of names: 6 | 7 | 1. Daniel Lemire 8 | 2. Leonid Boytsov 9 | 3.
Owen Kaser 10 | 4. Maxime Caron 11 | 5. Louis Dionne 12 | 6. Michel Lemay 13 | 7. Erik Kruus 14 | 8. Andrea Bedini 15 | 9. Matthias Petri 16 | 10. Robson Braga Araujo 17 | 11. Patrick Damme 18 | 12. Xiening Dai 19 | 13. Pavel Pavlov 20 | 21 | ------------------------------------------------------------------------------- 22 | The binary distribution of this product bundles binaries of PGM index https://github.com/manticoresoftware/PGM-index, 23 | which has the following notices: 24 | 25 | If you use the library please put a link to the website (https://pgm.di.unipi.it/) and cite the following paper: 26 | 27 | Paolo Ferragina and Giorgio Vinciguerra. The PGM-index: a fully-dynamic compressed learned index with provable worst-case bounds. PVLDB, 13(8): 1162-1175, 2020. 28 | 29 | @article{Ferragina:2020pgm, 30 | Author = {Paolo Ferragina and Giorgio Vinciguerra}, 31 | Title = {The {PGM-index}: a fully-dynamic compressed learned index with provable worst-case bounds}, 32 | Year = {2020}, 33 | Volume = {13}, 34 | Number = {8}, 35 | Pages = {1162--1175}, 36 | Doi = {10.14778/3389133.3389135}, 37 | Url = {https://pgm.di.unipi.it}, 38 | Issn = {2150-8097}, 39 | Journal = {{PVLDB}}} 40 | 41 | ------------------------------------------------------------------------------- 42 | The binary distribution of this product bundles binaries of simde 43 | https://github.com/simd-everywhere/simde, which has the 44 | following notices: 45 | 46 | Copyright (c) 2017 Evan Nemerson 47 | ------------------------------------------------------------------------------- 48 | The binary distribution of this product bundles binaries of HNSW index https://github.com/manticoresoftware/hnswlib, 49 | which is a fork of https://github.com/nmslib/hnswlib -------------------------------------------------------------------------------- /cmake/CommonInfo.cmake:
-------------------------------------------------------------------------------- 1 | # Common informational variables for CPack 2 | 3 | set ( CPACK_PACKAGE_VENDOR "Manticore Software LTD" ) 4 | 5 | set ( CMAKE_PROJECT_HOMEPAGE_URL "https://github.com/manticoresoftware/columnar/" ) 6 | 7 | set ( CPACK_PACKAGE_RELOCATABLE ON ) 8 | 9 | set ( CPACK_PACKAGE_CONTACT "Manticore Team " ) 10 | 11 | set ( CPACK_PACKAGE_URL "https://github.com/manticoresoftware/columnar/" ) 12 | 13 | set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "Manticore Columnar Library is a column-oriented storage library, aiming to provide decent performance with low memory footprint at big data volume" ) 14 | 15 | set ( CPACK_PACKAGE_DESCRIPTION "Manticore Columnar Library is a column-oriented storage library, aiming to provide decent performance with low memory footprint at big data volume. When used in combination with Manticore Search can be beneficial for faster / lower resource consumption log/metrics analytics and running log / metric analytics in docker / kubernetes" ) 16 | 17 | set ( CPACK_RPM_PACKAGE_DESCRIPTION "${CPACK_PACKAGE_DESCRIPTION}" ) # the description will default to the default one (This is an installer created using CPack..." otherwise, i.e. 
it doesn't take CPACK_PACKAGE_DESCRIPTION as a default 18 | 19 | ## Don't be confused; there are NO generic *_LICENSE variables in cmake/cpack, only for rpm and resource file 20 | #set ( CPACK_PACKAGE_LICENSE "Apache-2.0" ) 21 | set ( CPACK_RPM_PACKAGE_LICENSE "Apache-2.0" ) 22 | #set ( CPACK_DEBIAN_PACKAGE_LICENSE "Apache-2.0" ) 23 | set ( CPACK_RESOURCE_FILE_LICENSE "${columnar_SOURCE_DIR}/LICENSE" ) 24 | 25 | if (DEFINED ENV{PACKAGE_VERSION} AND NOT "$ENV{PACKAGE_VERSION}" STREQUAL "") 26 | set ( CPACK_PACKAGE_VERSION "$ENV{PACKAGE_VERSION}" ) 27 | else() 28 | set ( CPACK_PACKAGE_VERSION "${PROJECT_VERSION}-${GIT_TIMESTAMP_ID}-${GIT_COMMIT_ID}" ) 29 | endif() 30 | if (DEFINED ENV{RPM_PACKAGE_VERSION} AND NOT "$ENV{RPM_PACKAGE_VERSION}" STREQUAL "") 31 | set ( CPACK_RPM_PACKAGE_VERSION "$ENV{RPM_PACKAGE_VERSION}" ) 32 | else() 33 | set ( CPACK_RPM_PACKAGE_VERSION "${PROJECT_VERSION}_${GIT_TIMESTAMP_ID}.${GIT_COMMIT_ID}" ) 34 | endif() 35 | 36 | set ( CPACK_PACKAGE_NAME "manticore-columnar-lib" ) 37 | set ( CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}" ) 38 | -------------------------------------------------------------------------------- /cmake/SetBuildType.cmake: -------------------------------------------------------------------------------- 1 | # Helper script - scans cmake/builds for concrete distr rules. 2 | # Rules for each distr must be placed in a separate .cmake file. 3 | # Each file with a rule must be named build_*.cmake (that pattern is used to find them). 4 | # Name of the distr must be in a comment on the first line. The name 5 | # must not be too long, since it can be referred to not only from the GUI menu, 6 | # but also from the command line, like -DDISTR=rhel7, etc., so 7 | # defs like -DDISTR="my super-puper os" are not a good idea. 8 | 9 | # This makes it possible to add more choices without touching the rest of the files.
10 | 11 | set ( MENUDISTR ) 12 | set ( FILESDISTR ) 13 | file ( GLOB _BUILDS "cmake/builds/build_*.cmake" ) 14 | # list among all build_...cmake, take first line and extract name from it 15 | foreach (CHOICE ${_BUILDS}) 16 | file ( READ ${CHOICE} CHOICE_NAME LIMIT 1024 ) 17 | string ( REGEX REPLACE "\n.*" "" CHOICE_NAME "${CHOICE_NAME}" ) 18 | string ( REGEX REPLACE "^# -* " "" CHOICE_NAME "${CHOICE_NAME}" ) 19 | string ( REGEX REPLACE " -*$" "" CHOICE_NAME "${CHOICE_NAME}" ) 20 | list ( APPEND MENUDISTR "${CHOICE_NAME}" ) 21 | list ( APPEND FILESDISTR "${CHOICE}" ) 22 | endforeach (CHOICE ${_BUILDS}) 23 | 24 | if (NOT DISTR) 25 | set ( DISTR "" CACHE STRING "Choose the distr." ) 26 | endif () 27 | 28 | if (NOT DISTR) 29 | message ( STATUS "Provide distr with -DDISTR=, one of: ${MENUDISTR}" ) 30 | 31 | if (WIN32) 32 | set ( SUGGEST_GENERATOR "ZIP" ) 33 | elseif (APPLE) 34 | set ( SUGGEST_GENERATOR "TGZ" ) 35 | elseif (EXISTS "/etc/redhat-release") 36 | set ( SUGGEST_GENERATOR "RPM" ) 37 | set ( DISTR "rhel" ) 38 | elseif (EXISTS "/etc/debian_version") 39 | set ( SUGGEST_GENERATOR "DEB" ) 40 | set ( DISTR "debian" ) 41 | endif () 42 | 43 | if (NOT CPACK_GENERATOR) 44 | set ( CPACK_GENERATOR "${SUGGEST_GENERATOR}" ) 45 | endif () 46 | 47 | endif () 48 | 49 | set_property ( CACHE DISTR PROPERTY STRINGS ${MENUDISTR} ) 50 | 51 | # find back the file by name of the distr and include it 52 | list ( FIND MENUDISTR "${DISTR}" _idistr ) 53 | if (_idistr GREATER -1) 54 | list ( GET FILESDISTR ${_idistr} _RULES ) 55 | include ( ${_RULES} ) 56 | message ( STATUS "Building for ${DISTR}" ) 57 | else () 58 | message ( STATUS "no distr selected " ) 59 | endif () 60 | -------------------------------------------------------------------------------- /util/codec.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed 
under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #pragma once 18 | 19 | #include "util_private.h" 20 | 21 | namespace util 22 | { 23 | 24 | class IntCodec_i 25 | { 26 | public: 27 | virtual ~IntCodec_i() = default; 28 | 29 | virtual void Encode ( const util::Span_T & dUncompressed, std::vector & dCompressed ) = 0; 30 | virtual void EncodeDelta ( util::Span_T & dUncompressed, std::vector & dCompressed ) = 0; 31 | 32 | virtual void Encode ( const util::Span_T & dUncompressed, std::vector & dCompressed ) = 0; 33 | virtual void EncodeDelta ( util::Span_T & dUncompressed, std::vector & dCompressed ) = 0; 34 | 35 | virtual void Decode ( const util::Span_T & dCompressed, util::SpanResizeable_T & dDecompressed ) = 0; 36 | virtual void DecodeDelta ( const util::Span_T & dCompressed, util::SpanResizeable_T & dDecompressed ) = 0; 37 | 38 | virtual void Decode ( const util::Span_T & dCompressed, util::SpanResizeable_T & dDecompressed ) = 0; 39 | virtual void DecodeDelta ( const util::Span_T & dCompressed, util::SpanResizeable_T & dDecompressed ) = 0; 40 | }; 41 | 42 | 43 | void BitPack ( const std::vector & dValues, std::vector & dPacked, int iBits ); 44 | void BitUnpack ( const std::vector & dPacked, std::vector & dValues, int iBits ); 45 | void BitUnpack ( const util::Span_T & dPacked, util::Span_T & dValues, int iBits ); 46 | 47 | IntCodec_i * CreateIntCodec ( const std::string & sCodec32, const std::string & sCodec64 ); 48 | 49 | } // 
namespace util 50 | -------------------------------------------------------------------------------- /common/filter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION in columnar.h or secondary.h 19 | 20 | #pragma once 21 | 22 | #include "schema.h" 23 | #include 24 | 25 | namespace common 26 | { 27 | 28 | enum class FilterType_e 29 | { 30 | NONE, 31 | VALUES, 32 | RANGE, 33 | FLOATRANGE, 34 | STRINGS, 35 | NOTNULL 36 | }; 37 | 38 | 39 | enum class MvaAggr_e 40 | { 41 | NONE, 42 | ALL, 43 | ANY 44 | }; 45 | 46 | using StringCmp_fn = int (*) ( std::pair tStrA, std::pair tStrB, bool bPacked ); 47 | 48 | struct Filter_t 49 | { 50 | std::string m_sName; 51 | bool m_bExclude = false; 52 | FilterType_e m_eType = FilterType_e::NONE; 53 | MvaAggr_e m_eMvaAggr = MvaAggr_e::NONE; 54 | int64_t m_iMinValue = 0; 55 | int64_t m_iMaxValue = 0; 56 | float m_fMinValue = 0.0f; 57 | float m_fMaxValue = 0.0f; 58 | bool m_bLeftUnbounded = false; 59 | bool m_bRightUnbounded = false; 60 | bool m_bLeftClosed = true; 61 | bool m_bRightClosed = true; 62 | 63 | StringHash_fn m_fnCalcStrHash = 
nullptr; 64 | StringCmp_fn m_fnStrCmp = nullptr; 65 | 66 | std::vector m_dValues; 67 | std::vector> m_dStringValues; 68 | }; 69 | 70 | struct RowidRange_t 71 | { 72 | uint32_t m_uMin { std::numeric_limits::min() }; 73 | uint32_t m_uMax{ std::numeric_limits::max() }; 74 | }; 75 | 76 | void FixupFilterSettings ( Filter_t & tFilter, AttrType_e eAttrType ); 77 | Filter_t StringFilterToHashFilter ( const Filter_t & tFilter, bool bGenerateName ); 78 | std::string GenerateHashAttrName ( const std::string & sAttr ); 79 | 80 | } // namespace common 81 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: 🐞 Bug Report 2 | description: Submit a bug report for Manticore Columnar Library 3 | labels: bug 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thank you for submitting a bug report. We appreciate your effort to provide detailed information. Please answer the following questions to help us identify and fix the bug. Thank you! 9 | - type: textarea 10 | id: proposal 11 | attributes: 12 | label: "Bug Description:" 13 | description: > 14 | Describe the bug in detail. Include a [Minimal Reproducible Example](https://en.wikipedia.org/wiki/Minimal_reproducible_example) (MRE) if possible. Place any code blocks within triple backticks: 15 | value: | 16 | ```bash 17 | # Example code block; replace with your code if applicable 18 | ``` 19 | validations: 20 | required: true 21 | - type: input 22 | id: version 23 | attributes: 24 | label: "Manticore Search Version:" 25 | description: > 26 | Provide the version of Manticore Search you are using. Execute `searchd -v` in the command line to find this information. 27 | validations: 28 | required: true 29 | - type: input 30 | id: os 31 | attributes: 32 | label: "Operating System Version:" 33 | description: > 34 | Specify the version of your operating system. 
35 | validations: 36 | required: true 37 | - type: dropdown 38 | id: dev 39 | attributes: 40 | label: "Have you tried the latest development version?" 41 | multiple: false 42 | options: 43 | - "Yes" 44 | - "No" 45 | - type: markdown 46 | attributes: 47 | value: "## Thank you for completing the form! For an expedited solution, consider our [professional services](https://manticoresearch.com/services/)." 48 | - type: textarea 49 | id: checklist 50 | attributes: 51 | label: "Internal Checklist:" 52 | description: > 53 | **For Manticore Team Use Only** — Please do not edit this section. This checklist will be completed by the Manticore team as they manage the issue. 54 | value: | 55 | To be completed by the assignee. Check off tasks that have been completed or are not applicable. 56 |
57 | 58 | - [ ] Implementation completed 59 | - [ ] Tests developed 60 | - [ ] Documentation updated 61 | - [ ] Documentation reviewed 62 | 63 |
64 | validations: 65 | required: true 66 | -------------------------------------------------------------------------------- /cmake/build_embeddings.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | cmake_minimum_required ( VERSION 3.17 ) 17 | 18 | if (__build_embeddings_included) 19 | return () 20 | endif () 21 | set ( __build_embeddings_included YES ) 22 | # build_embeddings_lib(): runs `cargo build --lib --release` on ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml at configure time, with ${CMAKE_CURRENT_BINARY_DIR}/embeddings as the target dir, then renames the produced library (and its .pdb, if one exists) to the lib_${EMBEDDINGS_LIB_NAME} form. Reads EMBEDDINGS_LIB_NAME from the calling scope, so it must already be set when this is called. Stops with FATAL_ERROR when cargo cannot be found or the build returns non-zero. 23 | function(build_embeddings_lib) 24 | message ( STATUS "building embeddings locally..." ) 25 | 26 | # Set platform-specific library file names 27 | if(WIN32) 28 | set(EMBEDDINGS_LIB_FILE_SRC "${EMBEDDINGS_LIB_NAME}.dll") 29 | set(EMBEDDINGS_LIB_FILE_DST "lib_${EMBEDDINGS_LIB_NAME}.dll") 30 | elseif(APPLE) 31 | set(EMBEDDINGS_LIB_FILE_SRC "lib${EMBEDDINGS_LIB_NAME}.dylib") 32 | set(EMBEDDINGS_LIB_FILE_DST "lib_${EMBEDDINGS_LIB_NAME}.dylib") 33 | else() 34 | set(EMBEDDINGS_LIB_FILE_SRC "lib${EMBEDDINGS_LIB_NAME}.so") 35 | set(EMBEDDINGS_LIB_FILE_DST "lib_${EMBEDDINGS_LIB_NAME}.so") 36 | endif() 37 | 38 | if (NOT DEFINED CARGO_COMMAND) 39 | find_program ( CARGO_COMMAND cargo ) 40 | if (NOT CARGO_COMMAND) 41 | message ( FATAL_ERROR "Cargo command not found. Please install Rust and ensure cargo is in your PATH."
) 42 | endif () 43 | endif () 44 | # run the cargo binary resolved above, so a user-supplied -DCARGO_COMMAND=... (or the path found by find_program) is actually used instead of whatever `cargo` happens to be first on PATH 45 | execute_process ( 46 | COMMAND ${CARGO_COMMAND} build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings 47 | RESULT_VARIABLE CMD_RESULT 48 | ) 49 | 50 | if (NOT CMD_RESULT EQUAL 0) 51 | message ( FATAL_ERROR "Failed to build: ${CMD_RESULT}" ) 52 | endif () 53 | 54 | file(RENAME "${CMAKE_CURRENT_BINARY_DIR}/embeddings/release/${EMBEDDINGS_LIB_FILE_SRC}" "${CMAKE_CURRENT_BINARY_DIR}/embeddings/release/${EMBEDDINGS_LIB_FILE_DST}" ) 55 | if ( EXISTS "${CMAKE_CURRENT_BINARY_DIR}/embeddings/release/${EMBEDDINGS_LIB_NAME}.pdb" ) 56 | file(RENAME "${CMAKE_CURRENT_BINARY_DIR}/embeddings/release/${EMBEDDINGS_LIB_NAME}.pdb" "${CMAKE_CURRENT_BINARY_DIR}/embeddings/release/lib_${EMBEDDINGS_LIB_NAME}.pdb") 57 | endif() 58 | endfunction () 59 | 60 | -------------------------------------------------------------------------------- /knn/quantizer.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION.
19 | 20 | #pragma once 21 | 22 | #include "knn.h" 23 | #include 24 | 25 | namespace knn 26 | { 27 | 28 | struct QuantizationSettings_t 29 | { 30 | float m_fMin = 0.0f; 31 | float m_fMax = 0.0f; 32 | float m_fK = 0.0f; 33 | float m_fB = 0.0f; 34 | 35 | std::vector m_dCentroid; 36 | }; 37 | 38 | struct Binary4BitFactors_t 39 | { 40 | float m_fQuantizedSum; 41 | float m_fDistanceToCentroidSq; 42 | float m_fMin; 43 | float m_fRange; 44 | float m_fVecMinusCentroidNorm; 45 | float m_fVecDotCentroid; 46 | }; 47 | 48 | struct Binary1BitFactorsL2_t 49 | { 50 | float m_fDistanceToCentroid; 51 | float m_fVectorMagnitude; 52 | float m_fPopCnt; 53 | }; 54 | 55 | struct Binary1BitFactorsIP_t 56 | { 57 | float m_fQuality; 58 | float m_fVecMinusCentroidNorm; 59 | float m_fVecDocCentroid; 60 | float m_fPopCnt; 61 | }; 62 | 63 | class ScalarQuantizer_i 64 | { 65 | public: 66 | virtual ~ScalarQuantizer_i() = default; 67 | 68 | virtual void Train ( const util::Span_T & dPoint ) = 0; 69 | virtual bool FinalizeTraining ( std::string & sError ) = 0; 70 | virtual void Encode ( uint32_t uRowID, const util::Span_T & dPoint, std::vector & dQuantized ) = 0; 71 | virtual void FinalizeEncoding() = 0; 72 | virtual const QuantizationSettings_t & GetSettings() = 0; 73 | 74 | virtual std::function GetPoolFetcher() const = 0; 75 | }; 76 | 77 | ScalarQuantizer_i * CreateQuantizer ( Quantization_e eQuantization, const QuantizationSettings_t & tQuantSettings, HNSWSimilarity_e eSimilarity ); 78 | ScalarQuantizer_i * CreateQuantizer ( Quantization_e eQuantization, HNSWSimilarity_e eSimilarity, int64_t iNumElements, const std::string & sTmpFilename ); 79 | 80 | } // namespace knn 81 | -------------------------------------------------------------------------------- /knn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2025, Manticore Software LTD (https://manticoresearch.com) 2 | # All rights reserved 3 | # 4 | # 5 | # Licensed under 
the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | cmake_minimum_required ( VERSION 3.21 ) # because of IMPORTED_RUNTIME_ARTIFACTS 18 | 19 | include ( GetHNSW ) 20 | 21 | add_library ( knn_lib MODULE knn.cpp knn.h iterator.cpp iterator.h embeddings.cpp embeddings.h quantizer.cpp quantizer.h space.cpp space.h ${CMAKE_SOURCE_DIR}/embeddings/manticoresearch_text_embeddings.h ) 22 | target_include_directories(knn_lib PRIVATE ${CMAKE_SOURCE_DIR}/embeddings) 23 | 24 | target_link_libraries ( knn_lib PRIVATE hnswlib::hnswlib columnar_root util common ) 25 | set_target_properties( knn_lib PROPERTIES PREFIX "" OUTPUT_NAME lib_manticore_knn${lib_arch_suffix} ) 26 | 27 | # Try to find manticoresearch text embeddings library 28 | message(STATUS "Looking for manticoresearch text embeddings library...") 29 | 30 | # EMBEDDINGS_LIB_NAME has to be defined before build_embeddings_lib() runs: that function reads it from this scope to derive the names of the library files it renames 31 | set(EMBEDDINGS_LIB_NAME manticore_knn_embeddings) 32 | 33 | if ( BUILD_EMBEDDINGS_LOCALLY ) 34 | include (build_embeddings) 35 | build_embeddings_lib() 36 | endif() 37 | 38 | # find embeddings library 39 | set ( CMAKE_FIND_LIBRARY_SUFFIXES .so .dylib .dll ) 40 | FIND_LIBRARY ( EMBEDDINGS_LIB NAMES _${EMBEDDINGS_LIB_NAME} ${EMBEDDINGS_LIB_NAME} lib_${EMBEDDINGS_LIB_NAME} HINTS 41 | "${CMAKE_SOURCE_DIR}/embeddings/target/debug" 42 | "${CMAKE_SOURCE_DIR}/embeddings/target/release" 43 | "${CMAKE_CURRENT_BINARY_DIR}/embeddings/release" 44 | NO_CMAKE_FIND_ROOT_PATH 45 | NO_DEFAULT_PATH 46 | ) 47 | 48 | if (PACK AND NOT EMBEDDINGS_LIB) 49 | message(FATAL_ERROR
"Could not find manticoresearch text embeddings library '${EMBEDDINGS_LIB_NAME}' in ${CMAKE_SOURCE_DIR}/embeddings/target/debug, ${CMAKE_SOURCE_DIR}/embeddings/target/release or ${CMAKE_CURRENT_BINARY_DIR}/embeddings/release") 50 | endif() 51 | 52 | # Add embeddings library as a component to be included in packages 53 | add_library ( embeddings MODULE IMPORTED ) 54 | set_target_properties ( embeddings PROPERTIES IMPORTED_LOCATION "${EMBEDDINGS_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/embeddings/") 55 | 56 | if ( NOT EXTERNAL_LIB ) # install only once, not from external_project build 57 | install ( IMPORTED_RUNTIME_ARTIFACTS embeddings RUNTIME DESTINATION ${MODULES_DIR} LIBRARY DESTINATION ${MODULES_DIR} COMPONENT embeddings ) 58 | endif() 59 | 60 | # only claim success when FIND_LIBRARY actually located the library; otherwise EMBEDDINGS_LIB holds a -NOTFOUND value 61 | if ( EMBEDDINGS_LIB ) 62 | message ( STATUS "Found manticoresearch text embeddings library: ${EMBEDDINGS_LIB}" ) 63 | else () 64 | message ( STATUS "manticoresearch text embeddings library not found" ) 65 | endif () 66 | -------------------------------------------------------------------------------- /columnar/accessor/check.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License.
16 | 17 | #pragma once 18 | 19 | #include "accessortraits.h" 20 | 21 | namespace columnar 22 | { 23 | 24 | class Checker_c : public Checker_i 25 | { 26 | public: 27 | Checker_c ( const AttributeHeader_i & tHeader, util::FileReader_c * pReader, Reporter_fn & fnProgress, Reporter_fn & fnError ); 28 | 29 | bool Check() override; 30 | 31 | protected: 32 | const AttributeHeader_i & m_tHeader; 33 | std::unique_ptr m_pReader; 34 | Reporter_fn & m_fnProgress; 35 | Reporter_fn & m_fnError; 36 | uint32_t m_uBlockId = INVALID_BLOCK_ID; 37 | uint32_t m_uChecked = 0; 38 | 39 | virtual bool CheckBlockHeader ( uint32_t uBlockId ) = 0; 40 | }; 41 | 42 | 43 | void CheckStorage ( const std::string & sFilename, uint32_t uNumRows, Reporter_fn & fnError, Reporter_fn & fnProgress ); 44 | 45 | bool CheckString ( util::FileReader_c & tReader, int iMinLength, int iMaxLength, const std::string & sMessage, Reporter_fn & fnError ); 46 | bool CheckUint8 ( util::FileReader_c & tReader, uint8_t uMin, uint8_t uMax, const std::string & sMessage, Reporter_fn & fnError ); 47 | bool CheckUint8 ( util::FileReader_c & tReader, uint8_t uMin, uint8_t uMax, const std::string & sMessage, uint8_t & uValue, Reporter_fn & fnError ); 48 | bool CheckInt32 ( util::FileReader_c & tReader, int iMin, int iMax, const std::string & sMessage, Reporter_fn & fnError ); 49 | bool CheckInt32 ( util::FileReader_c & tReader, int iMin, int iMax, const std::string & sMessage, int & iValue, Reporter_fn & fnError ); 50 | bool CheckInt32Packed ( util::FileReader_c & tReader, int iMin, int iMax, const std::string & sMessage, Reporter_fn & fnError ); 51 | bool CheckInt32Packed ( util::FileReader_c & tReader, int iMin, int iMax, const std::string & sMessage, int & iValue, Reporter_fn & fnError ); 52 | bool CheckInt64 ( util::FileReader_c & tReader, int64_t iMin, int64_t iMax, const std::string & sMessage, Reporter_fn & fnError ); 53 | bool CheckInt64 ( util::FileReader_c & tReader, int64_t iMin, int64_t iMax, const std::string & 
sMessage, int64_t & iValue, Reporter_fn & fnError ); 54 | 55 | } // namespace columnar 56 | -------------------------------------------------------------------------------- /secondary/secondary.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION. 
19 | 20 | #pragma once 21 | 22 | #include "util/util.h" 23 | #include "common/schema.h" 24 | 25 | namespace util 26 | { 27 | class FileReader_c; 28 | class FileWriter_c; 29 | } 30 | 31 | namespace common 32 | { 33 | struct Filter_t; 34 | struct RowidRange_t; 35 | class BlockIterator_i; 36 | } 37 | 38 | namespace SI 39 | { 40 | 41 | static const int LIB_VERSION = 18; 42 | static const uint32_t STORAGE_VERSION = 9; 43 | 44 | struct IndexAttrInfo_t 45 | { 46 | std::string m_sName; 47 | common::AttrType_e m_eType; 48 | bool m_bEnabled; 49 | }; 50 | 51 | struct IndexSettings_t 52 | { 53 | uint64_t m_uBlockCacheSize = 0; 54 | }; 55 | 56 | struct IteratorSettings_t 57 | { 58 | const common::RowidRange_t * m_pBounds = nullptr; 59 | uint32_t m_uMaxValues = 0; 60 | int64_t m_iRsetSize = 0; 61 | int m_iCutoff = 0; 62 | bool m_bUseCache = false; 63 | }; 64 | 65 | class Index_i 66 | { 67 | public: 68 | virtual ~Index_i() = default; 69 | 70 | virtual bool CreateIterators ( std::vector & dIterators, const common::Filter_t & tFilter, const IteratorSettings_t & tSettings, std::string & sWarning, std::string & sError ) const = 0; 71 | virtual bool CalcCount ( uint32_t & uCount, const common::Filter_t & tFilter, uint32_t uMaxValues, std::string & sError ) const = 0; 72 | virtual uint32_t GetNumIterators ( const common::Filter_t & tFilter ) const = 0; 73 | virtual bool IsEnabled ( const std::string & sName ) const = 0; 74 | virtual int64_t GetCountDistinct ( const std::string & sName ) const = 0; 75 | virtual bool SaveMeta ( std::string & sError ) = 0; 76 | virtual void ColumnUpdated ( const char * sName ) = 0; 77 | virtual void GetAttrInfo ( std::vector & dAttrs ) const = 0; 78 | }; 79 | 80 | class Builder_i; 81 | 82 | } // namespace SI 83 | 84 | extern "C" 85 | { 86 | DLLEXPORT SI::Index_i * CreateSecondaryIndex ( const char * sFile, const SI::IndexSettings_t & tSettings, std::string & sError ); 87 | 88 | DLLEXPORT int GetSecondaryLibVersion(); 89 | DLLEXPORT const char * 
GetSecondaryLibVersionStr(); 90 | } 91 | -------------------------------------------------------------------------------- /common/filter.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #include "filter.h" 18 | #include "util/util_private.h" 19 | 20 | namespace common 21 | { 22 | 23 | using namespace util; 24 | 25 | void FixupFilterSettings ( Filter_t & tFilter, AttrType_e eAttrType ) 26 | { 27 | switch ( eAttrType ) 28 | { 29 | case AttrType_e::UINT32: 30 | case AttrType_e::UINT32SET: 31 | case AttrType_e::TIMESTAMP: 32 | // clamp to min and max values from a wider type 33 | if ( tFilter.m_iMinValue<0 ) 34 | { 35 | tFilter.m_iMinValue = INT64_MIN; 36 | tFilter.m_bLeftUnbounded = true; 37 | tFilter.m_bLeftClosed = false; 38 | } 39 | 40 | if ( tFilter.m_iMaxValue>UINT_MAX ) 41 | { 42 | tFilter.m_iMaxValue = INT64_MAX; 43 | tFilter.m_bRightUnbounded = true; 44 | tFilter.m_bRightClosed = false; 45 | } 46 | break; 47 | 48 | case AttrType_e::FLOAT: 49 | // this is basically the same stuff we do when we create filters, but we don't have access to previously modified filter settings 50 | // that's why we need to do it all over again 51 | if ( tFilter.m_eType==FilterType_e::VALUES && tFilter.m_dValues.size()==1 ) 
52 | { 53 | tFilter.m_eType = FilterType_e::FLOATRANGE; 54 | tFilter.m_fMinValue = tFilter.m_fMaxValue = (float)tFilter.m_dValues[0]; 55 | } 56 | 57 | if ( tFilter.m_eType==FilterType_e::RANGE ) 58 | { 59 | tFilter.m_eType = FilterType_e::FLOATRANGE; 60 | tFilter.m_fMinValue = (float)tFilter.m_iMinValue; 61 | tFilter.m_fMaxValue = (float)tFilter.m_iMaxValue; 62 | } 63 | break; 64 | 65 | default: 66 | break; 67 | } 68 | } 69 | 70 | 71 | std::string GenerateHashAttrName ( const std::string & sAttr ) 72 | { 73 | return FormatStr ( "$%s_HASH", sAttr.c_str() ); 74 | } 75 | 76 | 77 | Filter_t StringFilterToHashFilter ( const Filter_t & tFilter, bool bGenerateName ) 78 | { 79 | assert ( tFilter.m_eType==FilterType_e::STRINGS ); 80 | Filter_t tRes; 81 | 82 | tRes.m_eType = FilterType_e::VALUES; 83 | tRes.m_bExclude = tFilter.m_bExclude; 84 | tRes.m_sName = bGenerateName ? GenerateHashAttrName ( tFilter.m_sName ) : tFilter.m_sName; 85 | 86 | for ( const auto & i : tFilter.m_dStringValues ) 87 | tRes.m_dValues.push_back ( i.empty() ? 
0 : tFilter.m_fnCalcStrHash ( i.data(), i.size(), STR_HASH_SEED ) ); 88 | 89 | return tRes; 90 | } 91 | 92 | } // namespace common 93 | -------------------------------------------------------------------------------- /embeddings/src/error.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, PartialEq, Eq, Hash)] 2 | pub enum LibError { 3 | HuggingFaceApiBuildFailed, 4 | ModelConfigFetchFailed, 5 | ModelConfigReadFailed, 6 | ModelConfigParseFailed, 7 | ModelTokenizerFetchFailed, 8 | ModelTokenizerLoadFailed, 9 | ModelTokenizerConfigurationFailed, 10 | ModelTokenizerEncodeFailed, 11 | ModelWeightsFetchFailed, 12 | ModelWeightsLoadFailed, 13 | ModelHiddenSizeGetFailed, 14 | ModelMaxInputLenGetFailed, 15 | ModelLoadFailed, 16 | DeviceCudaInitFailed, 17 | RemoteUnsupportedModel, 18 | RemoteInvalidAPIKey, 19 | RemoteRequestSendFailed, 20 | RemoteResponseParseFailed, 21 | } 22 | 23 | // Implement std::error::Error for LibError 24 | impl std::error::Error for LibError {} 25 | 26 | // Implement Display for LibError 27 | impl std::fmt::Display for LibError { 28 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 29 | match self { 30 | LibError::HuggingFaceApiBuildFailed => { 31 | write!(f, "Failed to set up the Hugging Face API connection") 32 | } 33 | LibError::ModelConfigFetchFailed => write!(f, "Failed to download model configuration"), 34 | LibError::ModelConfigReadFailed => write!(f, "Failed to read model configuration file"), 35 | LibError::ModelConfigParseFailed => write!(f, "Failed to parse model configuration"), 36 | LibError::ModelTokenizerFetchFailed => write!(f, "Failed to download model tokenizer"), 37 | LibError::ModelTokenizerLoadFailed => { 38 | write!(f, "Failed to load model tokenizer to memory") 39 | } 40 | LibError::ModelTokenizerConfigurationFailed => { 41 | write!(f, "Failed to configure model tokenizer") 42 | } 43 | LibError::ModelTokenizerEncodeFailed => write!(f, "Failed to encode 
text for model"), 44 | LibError::ModelWeightsFetchFailed => write!(f, "Failed to download model weights"), 45 | LibError::ModelWeightsLoadFailed => write!(f, "Failed to load model weights to memory"), 46 | LibError::ModelLoadFailed => write!(f, "Failed to create an instance of the model"), 47 | LibError::ModelHiddenSizeGetFailed => write!(f, "Failed to get model hidden size"), 48 | LibError::ModelMaxInputLenGetFailed => { 49 | write!(f, "Failed to get model max input length") 50 | } 51 | LibError::DeviceCudaInitFailed => write!(f, "Failed to initialize CUDA device"), 52 | LibError::RemoteUnsupportedModel => write!(f, "Unsupported remote model given"), 53 | LibError::RemoteInvalidAPIKey => write!(f, "Invalid API key for remote model"), 54 | LibError::RemoteRequestSendFailed => { 55 | write!(f, "Failed to send request to remote model") 56 | } 57 | LibError::RemoteResponseParseFailed => { 58 | write!(f, "Failed to parse response from remote model") 59 | } 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /embeddings/src/model/mod.rs: -------------------------------------------------------------------------------- 1 | mod jina; 2 | mod local; 3 | mod openai; 4 | pub mod text_model_wrapper; 5 | mod voyage; 6 | 7 | #[cfg(test)] 8 | mod openai_test; 9 | 10 | #[cfg(test)] 11 | mod voyage_test; 12 | 13 | #[cfg(test)] 14 | mod jina_test; 15 | 16 | #[cfg(test)] 17 | mod local_test; 18 | 19 | #[cfg(test)] 20 | mod ffi_test; 21 | 22 | use std::error::Error; 23 | use std::path::PathBuf; 24 | 25 | pub trait TextModel { 26 | fn predict(&self, texts: &[&str]) -> Result>, Box>; 27 | fn get_hidden_size(&self) -> usize; 28 | fn get_max_input_len(&self) -> usize; 29 | } 30 | 31 | #[repr(C)] 32 | pub struct ModelOptions { 33 | pub model_id: String, 34 | pub cache_path: Option, 35 | pub api_key: Option, 36 | pub use_gpu: Option, 37 | } 38 | 39 | #[repr(C)] 40 | pub enum Model { 41 | OpenAI(Box), 42 | Voyage(Box), 43 | Jina(Box), 44 
| Local(Box), 45 | } 46 | 47 | impl TextModel for Model { 48 | fn predict(&self, texts: &[&str]) -> Result>, Box> { 49 | match self { 50 | Model::OpenAI(m) => m.predict(texts), 51 | Model::Voyage(m) => m.predict(texts), 52 | Model::Jina(m) => m.predict(texts), 53 | Model::Local(m) => m.predict(texts), 54 | } 55 | } 56 | 57 | fn get_hidden_size(&self) -> usize { 58 | match self { 59 | Model::OpenAI(m) => m.get_hidden_size(), 60 | Model::Voyage(m) => m.get_hidden_size(), 61 | Model::Jina(m) => m.get_hidden_size(), 62 | Model::Local(m) => m.get_hidden_size(), 63 | } 64 | } 65 | 66 | fn get_max_input_len(&self) -> usize { 67 | match self { 68 | Model::OpenAI(m) => m.get_max_input_len(), 69 | Model::Voyage(m) => m.get_max_input_len(), 70 | Model::Jina(m) => m.get_max_input_len(), 71 | Model::Local(m) => m.get_max_input_len(), 72 | } 73 | } 74 | } 75 | 76 | pub fn create_model(options: ModelOptions) -> Result> { 77 | let model_id = options.model_id.as_str(); 78 | if model_id.starts_with("openai/") { 79 | let model = 80 | openai::OpenAIModel::new(model_id, options.api_key.unwrap_or_default().as_str())?; 81 | 82 | Ok(Model::OpenAI(Box::new(model))) 83 | } else if model_id.starts_with("voyage/") { 84 | let model = 85 | voyage::VoyageModel::new(model_id, options.api_key.unwrap_or_default().as_str())?; 86 | 87 | Ok(Model::Voyage(Box::new(model))) 88 | } else if model_id.starts_with("jina/") { 89 | let model = jina::JinaModel::new(model_id, options.api_key.unwrap_or_default().as_str())?; 90 | 91 | Ok(Model::Jina(Box::new(model))) 92 | } else { 93 | let model = local::LocalModel::new( 94 | model_id, 95 | PathBuf::from( 96 | options 97 | .cache_path 98 | .unwrap_or(String::from(".cache/manticore")), 99 | ), 100 | options.use_gpu.unwrap_or(false), 101 | )?; 102 | 103 | Ok(Model::Local(Box::new(model))) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /common/interval.h: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #pragma once 18 | 19 | #include "filter.h" 20 | 21 | namespace common 22 | { 23 | 24 | template 25 | FORCE_INLINE bool ValueInInterval ( T tValue, T tMin, T tMax ) 26 | { 27 | if constexpr ( LEFT_UNBOUNDED && RIGHT_UNBOUNDED ) 28 | return true; 29 | 30 | if constexpr ( LEFT_UNBOUNDED ) 31 | return RIGHT_CLOSED ? ( tValue<=tMax ) : ( tValue=tMin ) : ( tValue>tMin ); 35 | 36 | return ( LEFT_CLOSED ? ( tValue>=tMin ) : ( tValue>tMin ) ) && ( RIGHT_CLOSED ? ( tValue<=tMax ) : ( tValue 40 | FORCE_INLINE bool ValueInInterval ( T tValue, const Filter_t & tFilter ) 41 | { 42 | T tMin = (T)tFilter.m_iMinValue; 43 | T tMax = (T)tFilter.m_iMaxValue; 44 | 45 | if ( tFilter.m_bLeftUnbounded ) 46 | return tFilter.m_bRightClosed ? ( tValue<=tMax ) : ( tValue=tMin ) : ( tValue>tMin ); 50 | 51 | return ( tFilter.m_bLeftClosed ? ( tValue>=tMin ) : ( tValue>tMin ) ) && ( tFilter.m_bRightClosed ? ( tValue<=tMax ) : ( tValue 55 | FORCE_INLINE bool ValueInInterval ( float fVal, const Filter_t & tFilter ) 56 | { 57 | float tMin = tFilter.m_fMinValue; 58 | float tMax = tFilter.m_fMaxValue; 59 | 60 | if ( tFilter.m_bLeftUnbounded ) 61 | return tFilter.m_bRightClosed ? 
( fVal<=tMax ) : ( fVal=tMin ) : ( fVal>tMin ); 65 | 66 | return ( tFilter.m_bLeftClosed ? ( fVal>=tMin ) : ( fVal>tMin ) ) && ( tFilter.m_bRightClosed ? ( fVal<=tMax ) : ( fVal 70 | struct Interval_T 71 | { 72 | T m_tStart; 73 | T m_tEnd; 74 | 75 | Interval_T() = default; 76 | Interval_T ( T tStart, T tEnd ) 77 | : m_tStart ( tStart ) 78 | , m_tEnd ( tEnd ) 79 | {} 80 | 81 | FORCE_INLINE bool operator < ( const Interval_T & tOther ) const 82 | { 83 | return ( m_tStart 93 | FORCE_INLINE bool Interval_T::operator < ( const Interval_T & tOther ) const 94 | { 95 | return ( m_tStart" ) 36 | set_property ( TEST "${test}" APPEND PROPERTY ENVIRONMENT "LIB_MANTICORE_SECONDARY=$" ) 37 | set_property ( TEST "${test}" APPEND PROPERTY ENVIRONMENT "LIB_MANTICORE_KNN=$" ) 38 | endfunction () 39 | 40 | # this will switch off pure manticore-specific tests: google, api, keyword consistency and benches (we don't need them here) 41 | set ( TEST_SPECIAL_EXTERNAL ON ) 42 | 43 | message ( STATUS "Checking MANTICORE_LOCATOR sources..." 
) 44 | if (DEFINED ENV{MANTICORE_LOCATOR} AND NOT "$ENV{MANTICORE_LOCATOR}" STREQUAL "") 45 | set ( MANTICORE_LOCATOR $ENV{MANTICORE_LOCATOR} ) 46 | message ( STATUS "Using MANTICORE_LOCATOR from environment: '${MANTICORE_LOCATOR}'" ) 47 | elseif (EXISTS "${columnar_SOURCE_DIR}/local_manticore_src.txt") 48 | file ( READ "${columnar_SOURCE_DIR}/local_manticore_src.txt" MANTICORE_LOCATOR ) 49 | message ( STATUS "Using MANTICORE_LOCATOR from local_manticore_src.txt: '${MANTICORE_LOCATOR}'" ) 50 | else () 51 | file ( READ "${columnar_SOURCE_DIR}/manticore_src.txt" MANTICORE_LOCATOR ) 52 | message ( STATUS "Using MANTICORE_LOCATOR from manticore_src.txt: '${MANTICORE_LOCATOR}'" ) 53 | endif () 54 | 55 | message ( STATUS "MANTICORE_LOCATOR before configure: '${MANTICORE_LOCATOR}'" ) 56 | string ( CONFIGURE "${MANTICORE_LOCATOR}" MANTICORE_LOCATOR ) # that is to expand possible inside variables 57 | message ( STATUS "MANTICORE_LOCATOR after configure: '${MANTICORE_LOCATOR}'" ) 58 | 59 | file ( WRITE "${columnar_BINARY_DIR}/manticore-get.cmake" "FetchContent_Declare ( manticore ${MANTICORE_LOCATOR} )\n" ) 60 | message ( STATUS "Written to ${columnar_BINARY_DIR}/manticore-get.cmake: 'FetchContent_Declare ( manticore ${MANTICORE_LOCATOR} )'" ) 61 | 62 | include ( FetchContent ) 63 | include ( "${columnar_BINARY_DIR}/manticore-get.cmake" ) 64 | 65 | # add manticore sources to the tree. All testing will be done on manticore side; necessary additional tests/properties will 66 | # be set by cb functions defined above. 
67 | FetchContent_MakeAvailable ( manticore ) -------------------------------------------------------------------------------- /knn/iterator.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #include "iterator.h" 18 | #include "knn.h" 19 | 20 | #include 21 | 22 | namespace knn 23 | { 24 | 25 | using namespace util; 26 | 27 | class RowidIteratorKNN_c : public Iterator_i 28 | { 29 | public: 30 | RowidIteratorKNN_c ( KNNIndex_i & tIndex, const Span_T & dData, int64_t iResults, int iEf ); 31 | 32 | bool HintRowID ( uint32_t tRowID ) override; 33 | bool GetNextRowIdBlock ( Span_T & dRowIdBlock ) override; 34 | int64_t GetNumProcessed() const override { return m_iIndex; } 35 | void SetCutoff ( int iCutoff ) override {} 36 | bool WasCutoffHit() const override { return false; } 37 | void AddDesc ( std::vector & dDesc ) const override {} 38 | 39 | Span_T GetData() const override { return Span_T ( m_dCollected.data(), m_dCollected.size() ); } 40 | 41 | private: 42 | static const int DOCS_PER_CHUNK = 1000; 43 | 44 | std::vector m_dRowIDs; 45 | std::vector m_dCollected; 46 | std::vector m_dQuantized; 47 | int m_iIndex = 0; 48 | }; 49 | 50 | 51 | RowidIteratorKNN_c::RowidIteratorKNN_c ( KNNIndex_i & tIndex, const Span_T 
& dData, int64_t iResults, int iEf ) 52 | { 53 | tIndex.Search ( m_dCollected, dData, iResults, iEf, m_dQuantized ); 54 | std::sort ( m_dCollected.begin(), m_dCollected.end(), []( const auto & a, const auto & b ) { return a.m_tRowID=(int)m_dCollected.size() ) 62 | return false; 63 | 64 | auto tEnd = m_dCollected.end(); 65 | auto tFound = std::lower_bound ( m_dCollected.begin() + m_iIndex, tEnd, tRowID, []( auto & tEntry, uint32_t tValue ){ return tEntry.m_tRowID < tValue; } ); 66 | if ( tFound==tEnd ) 67 | { 68 | m_iIndex = (int)m_dCollected.size(); 69 | return false; 70 | } 71 | 72 | m_iIndex = tFound - m_dCollected.begin(); 73 | return true; 74 | } 75 | 76 | 77 | bool RowidIteratorKNN_c::GetNextRowIdBlock ( Span_T & dRowIdBlock ) 78 | { 79 | int iCollected = std::max ( std::min ( int(m_dCollected.size()) - m_iIndex, DOCS_PER_CHUNK ), 0 ); 80 | DocDist_t * pStart = m_dCollected.data() + m_iIndex; 81 | DocDist_t * pMax = pStart + iCollected; 82 | m_iIndex += iCollected; 83 | 84 | if ( pStart==pMax ) 85 | return false; 86 | 87 | DocDist_t * pDoc = pStart; 88 | uint32_t * pRowID = m_dRowIDs.data(); 89 | while ( pDocm_tRowID; 92 | pRowID++; 93 | pDoc++; 94 | } 95 | 96 | dRowIdBlock = Span_T ( m_dRowIDs.data(), pRowID-m_dRowIDs.data() ); 97 | return true; 98 | } 99 | 100 | ///////////////////////////////////////////////////////////////////// 101 | 102 | Iterator_i * CreateIterator ( KNNIndex_i & tIndex, const util::Span_T & dData, int64_t iResults, int iEf ) 103 | { 104 | return new RowidIteratorKNN_c ( tIndex, dData, iResults, iEf ); 105 | } 106 | 107 | } // namespace knn 108 | -------------------------------------------------------------------------------- /columnar/columnar.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may 
not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION. 19 | 20 | #pragma once 21 | 22 | #include "util/util.h" 23 | #include "common/blockiterator.h" 24 | #include "common/filter.h" 25 | #include "common/schema.h" 26 | #include 27 | 28 | namespace columnar 29 | { 30 | 31 | static const int LIB_VERSION = 27; 32 | 33 | class Iterator_i 34 | { 35 | public: 36 | virtual ~Iterator_i() = default; 37 | 38 | virtual int64_t Get ( uint32_t tRowID ) = 0; 39 | virtual void Fetch ( const util::Span_T & dRowIDs, util::Span_T & dValues ) = 0; 40 | 41 | virtual int Get ( uint32_t tRowID, const uint8_t * & pData ) = 0; 42 | virtual uint8_t * GetPacked ( uint32_t tRowID ) = 0; 43 | virtual int GetLength ( uint32_t tRowID ) = 0; 44 | 45 | virtual void AddDesc ( std::vector & dDesc ) const = 0; 46 | }; 47 | 48 | 49 | using MinMaxVec_t = std::vector>; 50 | 51 | class BlockTester_i 52 | { 53 | public: 54 | virtual ~BlockTester_i() = default; 55 | 56 | virtual bool Test ( const MinMaxVec_t & dMinMax ) const = 0; 57 | }; 58 | 59 | 60 | struct IteratorHints_t 61 | { 62 | bool m_bNeedStringHashes = false; 63 | bool m_bBuffered = true; 64 | }; 65 | 66 | 67 | struct IteratorCapabilities_t 68 | { 69 | bool m_bStringHashes = false; 70 | }; 71 | 72 | using Reporter_fn = std::function; 73 | 74 | struct AttrInfo_t 75 | { 76 | int m_iId = -1; 77 | common::AttrType_e m_eType = 
common::AttrType_e::NONE; 78 | float m_fComplexity = 0.0f; 79 | }; 80 | 81 | 82 | class Columnar_i 83 | { 84 | public: 85 | virtual ~Columnar_i() = default; 86 | 87 | virtual Iterator_i * CreateIterator ( const std::string & sName, const IteratorHints_t & tHints, columnar::IteratorCapabilities_t * pCapabilities, std::string & sError ) const = 0; 88 | virtual std::vector CreateAnalyzerOrPrefilter ( const std::vector & dFilters, std::vector & dDeletedFilters, const BlockTester_i & tBlockTester ) const = 0; 89 | virtual int64_t EstimateMinMax ( const common::Filter_t & tFilter, const BlockTester_i & tBlockTester ) const = 0; 90 | virtual bool GetAttrInfo ( const std::string & sName, AttrInfo_t & tInfo ) const = 0; 91 | 92 | virtual bool EarlyReject ( const std::vector & dFilters, const BlockTester_i & tBlockTester ) const = 0; 93 | virtual bool IsFilterDegenerate ( const common::Filter_t & tFilter ) const = 0; 94 | }; 95 | 96 | } // namespace columnar 97 | 98 | 99 | extern "C" 100 | { 101 | DLLEXPORT columnar::Columnar_i * CreateColumnarStorageReader ( const std::string & sFilename, uint32_t uTotalDocs, std::string & sError ); 102 | DLLEXPORT void CheckColumnarStorage ( const std::string & sFilename, uint32_t uNumRows, columnar::Reporter_fn & fnError, columnar::Reporter_fn & fnProgress ); 103 | DLLEXPORT int GetColumnarLibVersion(); 104 | DLLEXPORT const char * GetColumnarLibVersionStr(); 105 | } 106 | -------------------------------------------------------------------------------- /cmake/printers.cmake: -------------------------------------------------------------------------------- 1 | if (__cmake_columnar_printers_included) 2 | return () 3 | endif () 4 | set ( __cmake_columnar_printers_included YES ) 5 | 6 | if (DEFINED ENV{DIAGNOSTIC}) 7 | set ( DIAGNOSTIC "$ENV{DIAGNOSTIC}" ) 8 | endif () 9 | 10 | if (DIAGNOSTIC) 11 | set ( CMAKE_MESSAGE_LOG_LEVEL DEBUG ) 12 | endif () 13 | 14 | # different print diagnostic stuff 15 | # 16 | # diag - print one or many variables 
17 | # diags - print message in DEBUG scope, if DIAGNOSTIC defined 18 | # infomsg - make message STATUS if not CMAKE_REQUIRED_QUIET 19 | # debugmsg - make message DEBUG 20 | # trace - traces target properties 21 | # diagst - promptly output if target found and where 22 | 23 | include ( CMakePrintHelpers ) 24 | function ( DIAG ) 25 | if (DIAGNOSTIC) 26 | cmake_print_variables ( ${ARGN} ) 27 | endif () 28 | endfunction () 29 | 30 | function ( DIAGS MSG ) 31 | if (DIAGNOSTIC) 32 | message ( DEBUG "${MSG}" ) 33 | endif () 34 | endfunction () 35 | 36 | function ( infomsg MSG ) 37 | if (NOT CMAKE_REQUIRED_QUIET) 38 | message ( STATUS "${MSG}" ) 39 | endif () 40 | endfunction () 41 | 42 | function ( debugmsg MSG ) 43 | message ( DEBUG "${MSG}" ) 44 | endfunction () 45 | 46 | function ( tracep LIB PROPERTY ) 47 | get_property ( _prp TARGET ${LIB} PROPERTY ${PROPERTY} ) 48 | if (_prp) 49 | diags ( "${PROPERTY} = '${_prp}'" ) 50 | endif () 51 | endfunction () 52 | 53 | function ( trace LIB ) 54 | if (NOT TARGET ${LIB}) 55 | diags ( "=========== ${LIB} is not found" ) 56 | return () 57 | endif () 58 | 59 | diags ( "=========== Trace properties for ${LIB} =============" ) 60 | foreach (_prop 61 | TYPE 62 | INTERFACE_AUTOUIC_OPTIONS 63 | INTERFACE_COMPILE_DEFINITIONS 64 | INTERFACE_COMPILE_FEATURES 65 | INTERFACE_COMPILE_OPTIONS 66 | INTERFACE_INCLUDE_DIRECTORIES 67 | INTERFACE_LINK_LIBRARIES 68 | INTERFACE_LINK_DEPENDS 69 | INTERFACE_LINK_DIRECTORIES 70 | INTERFACE_LINK_OPTIONS 71 | INTERFACE_PRECOMPILE_HEADERS 72 | INTERFACE_POSITION_INDEPENDENT_CODE 73 | INTERFACE_SOURCES 74 | INTERFACE_SYSTEM_INCLUDE_DIRECTORIES 75 | IMPORTED_CONFIGURATIONS 76 | IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG 77 | IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE 78 | IMPORTED_LINK_INTERFACE_LANGUAGES_RELWITHDEBINFO 79 | IMPORTED_LOCATION_DEBUG 80 | IMPORTED_LOCATION_RELEASE 81 | IMPORTED_LOCATION_RELWITHDEBINFO 82 | IMPORTED_LOCATION 83 | IMPORTED_IMPLIB_DEBUG 84 | IMPORTED_IMPLIB_RELEASE 85 | 
IMPORTED_IMPLIB_RELWITHDEBINFO 86 | IMPORTED_IMPLIB 87 | MAP_IMPORTED_CONFIG_DEBUG 88 | MAP_IMPORTED_CONFIG_RELEASE 89 | MAP_IMPORTED_CONFIG_RELWITHDEBINFO 90 | MAP_IMPORTED_CONFIG_MINSIZEREL 91 | DL_LIBRARY 92 | LOCATION 93 | ) 94 | tracep ( ${LIB} ${_prop} ) 95 | endforeach () 96 | diags ( "=========== Trace properties for ${LIB} finished =============" ) 97 | endfunction () 98 | 99 | function ( trace_internal LIB ) 100 | diags ( "=========== Trace properties for internal ${LIB} =============" ) 101 | foreach (_prop 102 | TYPE 103 | INTERFACE_COMPILE_DEFINITIONS 104 | INTERFACE_COMPILE_FEATURES 105 | INTERFACE_COMPILE_OPTIONS 106 | INTERFACE_INCLUDE_DIRECTORIES 107 | INTERFACE_LINK_LIBRARIES 108 | INTERFACE_LINK_DEPENDS 109 | INTERFACE_LINK_DIRECTORIES 110 | INTERFACE_LINK_OPTIONS 111 | INTERFACE_PRECOMPILE_HEADERS 112 | INTERFACE_POSITION_INDEPENDENT_CODE 113 | INTERFACE_SOURCES 114 | INTERFACE_SYSTEM_INCLUDE_DIRECTORIES 115 | ) 116 | tracep ( ${LIB} ${_prop} ) 117 | endforeach () 118 | diags ( "=========== Trace properties for internal ${LIB} finished =============" ) 119 | endfunction () 120 | 121 | function ( diagst TRG LEGEND ) 122 | if (TARGET ${TRG}) 123 | get_target_property ( LOC ${TRG} LOCATION ) 124 | diags ( "${TRG} ${LEGEND} at ${LOC}" ) 125 | else () 126 | diags ( "${TRG} is not found" ) 127 | endif () 128 | endfunction () 129 | -------------------------------------------------------------------------------- /util/util.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION in columnar.h or secondary.h 19 | 20 | #pragma once 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | namespace util 34 | { 35 | 36 | #ifdef _MSC_VER 37 | #define DLLEXPORT __declspec(dllexport) 38 | #else 39 | #define O_BINARY 0 40 | #define DLLEXPORT 41 | #endif 42 | 43 | #ifndef FORCE_INLINE 44 | #ifndef NDEBUG 45 | #define FORCE_INLINE inline 46 | #else 47 | #ifdef _MSC_VER 48 | #define FORCE_INLINE __forceinline 49 | #else 50 | #if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ 51 | #ifdef __GNUC__ 52 | #define FORCE_INLINE inline __attribute__((always_inline)) 53 | #else 54 | #define FORCE_INLINE inline 55 | #endif 56 | #else 57 | #define FORCE_INLINE 58 | #endif 59 | #endif 60 | #endif 61 | #endif 62 | 63 | template 64 | class Span_T 65 | { 66 | public: 67 | Span_T() = default; 68 | 69 | Span_T ( T * pData, size_t tLength ) 70 | : m_pData ( pData ) 71 | , m_tLength ( tLength ) 72 | {} 73 | 74 | Span_T ( std::vector & dVec ) 75 | : m_pData ( dVec.data() ) 76 | , m_tLength ( dVec.size() ) 77 | {} 78 | 79 | T * data() const { return m_pData; } 80 | T & front() const { return *m_pData; } 81 | T & back() const { return *(m_pData+m_tLength-1); } 82 | T * begin() const { return m_pData; } 83 | T * end() const { return 
m_pData+m_tLength; } 84 | size_t size() const { return m_tLength; } 85 | bool empty() const { return m_tLength==0; } 86 | FORCE_INLINE T & operator [] ( size_t i ) 87 | { 88 | assert ( i < m_tLength ); 89 | return m_pData[i]; 90 | } 91 | 92 | FORCE_INLINE const T & operator [] ( size_t i ) const 93 | { 94 | assert ( i < m_tLength ); 95 | return m_pData[i]; 96 | } 97 | 98 | protected: 99 | T * m_pData = nullptr; 100 | size_t m_tLength = 0; 101 | }; 102 | 103 | 104 | template 105 | class SpanResizeable_T : public Span_T 106 | { 107 | using BASE = Span_T; 108 | 109 | public: 110 | FORCE_INLINE void resize ( size_t tLength ) 111 | { 112 | if ( tLength>m_tMaxLength ) 113 | { 114 | m_tMaxLength = tLength; 115 | m_dData.resize(m_tMaxLength); 116 | BASE::m_pData = m_dData.data(); 117 | } 118 | 119 | BASE::m_tLength = tLength; 120 | } 121 | 122 | FORCE_INLINE void resize ( size_t tLength, T tValue ) 123 | { 124 | if ( tLength>m_tMaxLength ) 125 | { 126 | m_tMaxLength = tLength; 127 | m_dData.resize ( m_tMaxLength, tValue ); 128 | BASE::m_pData = m_dData.data(); 129 | } 130 | else 131 | std::fill(BASE::m_pData + tLength, BASE::m_pData + m_tMaxLength, T{}); 132 | 133 | BASE::m_tLength = tLength; 134 | } 135 | 136 | size_t capacity() const { return m_tMaxLength; } 137 | 138 | private: 139 | std::vector m_dData; 140 | size_t m_tMaxLength = 0; 141 | }; 142 | 143 | } // namespace util 144 | -------------------------------------------------------------------------------- /secondary/blockreader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #pragma once 18 | 19 | #include "util/codec.h" 20 | #include "common/filter.h" 21 | #include "common/blockiterator.h" 22 | #include "builder.h" 23 | #include 24 | 25 | namespace util 26 | { 27 | class FileReader_c; 28 | class FileWriter_c; 29 | } 30 | 31 | namespace SI 32 | { 33 | 34 | class BlockIteratorSize_i; 35 | struct ApproxPos_t; 36 | 37 | struct BlockIter_t 38 | { 39 | uint64_t m_uVal { 0 }; 40 | 41 | uint64_t m_iPos { 0 }; 42 | uint64_t m_iStart { 0 }; 43 | uint64_t m_iLast { 0 }; 44 | 45 | BlockIter_t() = default; 46 | BlockIter_t ( const ApproxPos_t & tFrom, uint64_t uVal, uint64_t uBlocksCount, uint32_t uValuesPerBlock ); 47 | }; 48 | 49 | 50 | class BlockReader_i 51 | { 52 | public: 53 | virtual ~BlockReader_i() = default; 54 | 55 | virtual void CreateBlocksIterator ( const std::vector & dIt, const common::Filter_t & tFilter, std::vector & dRes ) = 0; 56 | virtual void CreateBlocksIterator ( const BlockIter_t & tIt, const common::Filter_t & tFilter, std::vector & dRes ) = 0; 57 | virtual uint32_t CalcValueCount ( const std::vector & dIt ) = 0; 58 | virtual uint32_t CalcValueCount ( const BlockIter_t & tIt, const common::Filter_t & tVal ) = 0; 59 | }; 60 | 61 | 62 | class BlockCache_i 63 | { 64 | public: 65 | virtual ~BlockCache_i() = default; 66 | 67 | virtual uint64_t GetMaxSize() const = 0; 68 | virtual bool IsCacheFull() const = 0; 69 | virtual float GetReuseRatio() const = 0; 70 | }; 71 | 72 | 73 | struct RsetInfo_t 74 | { 75 | int64_t m_iNumIterators = 0; 76 | uint32_t m_uRowsCount = 0; 
77 | int64_t m_iRsetSize = 0; 78 | }; 79 | 80 | 81 | enum class Packing_e : uint32_t 82 | { 83 | ROW, 84 | ROW_BLOCK, 85 | ROW_BLOCKS_LIST, 86 | 87 | TOTAL 88 | }; 89 | 90 | 91 | struct ColumnInfo_t 92 | { 93 | common::AttrType_e m_eType = common::AttrType_e::NONE; 94 | std::string m_sName; 95 | std::string m_sJsonParentName; 96 | uint32_t m_uCountDistinct = 0; 97 | uint64_t m_tMin = 0; 98 | uint64_t m_tMax = 0; 99 | bool m_bEnabled = true; 100 | 101 | void Load ( util::FileReader_c & tReader, uint32_t uVersion ); 102 | void Save ( util::FileWriter_c & tWriter ) const; 103 | }; 104 | 105 | 106 | struct Settings_t 107 | { 108 | std::string m_sCompressionUINT32 = "libstreamvbyte"; 109 | std::string m_sCompressionUINT64 = "fastpfor256"; 110 | 111 | void Load ( util::FileReader_c & tReader, uint32_t uVersion ); 112 | void Save ( util::FileWriter_c & tWriter ) const; 113 | }; 114 | 115 | 116 | class ReaderFactory_c 117 | { 118 | public: 119 | ColumnInfo_t m_tCol; 120 | Settings_t m_tSettings; 121 | RsetInfo_t m_tRsetInfo; 122 | int m_iFD = -1; 123 | uint32_t m_uVersion = 0; 124 | uint64_t m_uBlockBaseOff = 0; 125 | uint64_t m_uBlocksCount = 0; 126 | uint32_t m_uValuesPerBlock = 1; 127 | uint32_t m_uRowidsPerBlock = 1; 128 | const common::RowidRange_t * m_pBounds = nullptr; 129 | int m_iCutoff = 0; 130 | BlockCache_i * m_pBlockCache = nullptr; 131 | 132 | BlockReader_i * CreateBlockReader(); 133 | BlockReader_i * CreateRangeReader(); 134 | }; 135 | 136 | 137 | BlockCache_i * CreateBlockCache ( common::AttrType_e eType, uint32_t uBlocksCount, uint64_t uMaxSize ); 138 | 139 | } // namespace SI 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Manticore Search Logo 4 | 5 |

6 | 7 |

8 | Manticore Columnar Library 9 |

10 | 11 | Manticore Columnar Library is a column-oriented storage and secondary indexing library, aiming to provide **decent performance with low memory footprint at big data volume**. 12 | When used in combination with [Manticore Search](https://github.com/manticoresoftware/manticoresearch), it can be beneficial for those looking for: 13 | 1. log analytics including rich free text search capabilities (which are missing in e.g. [Clickhouse](https://github.com/ClickHouse/ClickHouse) - great tool for metrics analytics) 14 | 2. faster / low resource consumption log/metrics analytics. Since the library and Manticore Search are both written in C++ with low level optimizations in mind, in many cases the performance / RAM consumption is better than in Lucene / SOLR / Elasticsearch 15 | 3. running log / metric analytics in docker / kubernetes. Manticore Search + the library can work with as little as 30 megabytes of RAM which Elasticsearch / Clickhouse can't. It also starts in less than a second or a few seconds in the worst case. Since the overhead is so little you can afford having more nodes of Manticore Search + the library than Elasticsearch. More nodes and quicker start mean higher availability and agility. 16 | 4. powerful SQL for logs/metrics analytics and everything else [Manticore Search](https://github.com/manticoresoftware/manticoresearch) can give you 17 | 18 | ## Getting started 19 | 20 | ### Requirements 21 | 22 | 1. CPU: with SSE >= 4.2 23 | 2. Architecture: arm64 or x86_64 24 | 3. 
OS: Debian-based OS (Debian, Ubuntu, Mint), RHEL-based (RHEL, Centos, Alma, Oracle Linux, Amazon Linux) OS, Windows, MacOS 25 | 26 | ### Installation from yum/apt repositories 27 | #### Ubuntu, Debian: 28 | ```bash 29 | wget https://repo.manticoresearch.com/manticore-repo.noarch.deb 30 | sudo dpkg -i manticore-repo.noarch.deb 31 | sudo apt update 32 | sudo apt install manticore-columnar-lib 33 | ``` 34 | 35 | #### Centos: 36 | ```bash 37 | sudo yum install https://repo.manticoresearch.com/manticore-repo.noarch.rpm 38 | sudo yum install manticore-columnar-lib 39 | ``` 40 | 41 | `searchd -v` should include `columnar x.y.z`, e.g.: 42 | ```bash 43 | root@srv# searchd -v 44 | Manticore 6.0.2 89c7a5139@230210 (columnar 2.0.0 a7c703d@230130) (secondary 2.0.0 a7c703d@230130) 45 | ``` 46 | 47 | ### Basic usage: 48 | 1. Read https://manual.manticoresearch.com/Creating_a_table/Data_types#Row-wise-and-columnar-attribute-storages 49 | 2. Create plain or real-time table specifying that the columnar storage should be used 50 | 51 | ## Benchmarks 52 | 53 | ### Log analytics - 6x faster than Elasticsearch 54 | 55 | https://db-benchmarks.com/test-logs10m/#elasticsearch-tuned-vs-manticore-search-columnar-storage 56 | 57 | ![logs_es_msc](https://db-benchmarks.com/test-logs10m/est_msc.png) 58 | 59 | ### Log analytics - 1.4x faster than Clickhouse 60 | 61 | https://db-benchmarks.com/test-logs10m/#clickhouse-vs-manticore-search-columnar-storage 62 | 63 | ![logs_es_ch](https://db-benchmarks.com/test-logs10m/ch_msc.png) 64 | 65 | ### Medium data - 110M Hackernews comments - 5x faster than Elasticsearch 66 | 67 | https://db-benchmarks.com/test-hn/#manticore-search-columnar-storage-vs-elasticsearch 68 | 69 | ![hn_es_msc](https://db-benchmarks.com/test-hn/msc_es.png) 70 | 71 | ### Medium data - 110M Hackernews comments - 11x faster than Clickhouse 72 | 73 | https://db-benchmarks.com/test-hn/#manticore-search-columnar-storage-vs-clickhouse 74 | 75 | 
![hn_msc_ch](https://db-benchmarks.com/test-hn/msc_ch.png) 76 | 77 | ### Big data - 1.7B NYC taxi rides - 4x faster than Elasticsearch 78 | 79 | https://db-benchmarks.com/test-taxi/#manticore-search-vs-elasticsearch 80 | 81 | ![taxi_ms_es](https://db-benchmarks.com/test-taxi/ms_es.png) 82 | 83 | ### Big data - 1.7B NYC taxi rides - 1.8x faster than Clickhouse 84 | 85 | https://db-benchmarks.com/test-taxi/#manticore-search-vs-clickhouse 86 | 87 | ![taxi_ms_ch](https://db-benchmarks.com/test-taxi/ms_ch.png) 88 | 89 | 90 | -------------------------------------------------------------------------------- /embeddings/src/model/openai.rs: -------------------------------------------------------------------------------- 1 | use super::TextModel; 2 | use crate::LibError; 3 | use reqwest::blocking::Client; 4 | 5 | #[derive(Debug)] 6 | pub struct OpenAIModel { 7 | pub client: Client, 8 | pub model: String, 9 | pub api_key: String, 10 | } 11 | 12 | pub fn validate_model(model: &str) -> Result<(), String> { 13 | match model { 14 | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" => Ok(()), 15 | _ => Err(format!("Invalid model: {}", model)), 16 | } 17 | } 18 | 19 | pub fn validate_api_key(api_key: &str) -> Result<(), String> { 20 | if api_key.is_empty() { 21 | return Err("API key is required".to_string()); 22 | } 23 | 24 | // Trim whitespace and check 25 | let trimmed = api_key.trim(); 26 | if trimmed != api_key { 27 | return Err("API key must not have leading or trailing whitespace".to_string()); 28 | } 29 | 30 | // now match that it starts with sk- and has content after 31 | if !api_key.starts_with("sk-") || api_key.len() <= 3 { 32 | return Err("API key must start with sk- and have content".to_string()); 33 | } 34 | 35 | Ok(()) 36 | } 37 | 38 | impl OpenAIModel { 39 | pub fn new(model_id: &str, api_key: &str) -> Result> { 40 | let model = model_id.trim_start_matches("openai/").to_string(); 41 | validate_model(&model).map_err(|_| 
LibError::RemoteUnsupportedModel)?; 42 | validate_api_key(api_key).map_err(|_| LibError::RemoteInvalidAPIKey)?; 43 | Ok(Self { 44 | client: Client::new(), 45 | model, 46 | api_key: api_key.to_string(), 47 | }) 48 | } 49 | } 50 | 51 | impl TextModel for OpenAIModel { 52 | fn predict(&self, texts: &[&str]) -> Result>, Box> { 53 | let url = "https://api.openai.com/v1/embeddings"; 54 | 55 | let request_body = serde_json::json!({ 56 | "input": texts, 57 | "model": self.model, 58 | }); 59 | 60 | let response = self 61 | .client 62 | .post(url) 63 | .header("Authorization", format!("Bearer {}", self.api_key)) 64 | .header("Content-Type", "application/json") 65 | .json(&request_body) 66 | .send() 67 | .map_err(|_| LibError::RemoteRequestSendFailed)?; 68 | 69 | let response_body: serde_json::Value = response 70 | .json() 71 | .map_err(|_| LibError::RemoteResponseParseFailed)?; 72 | 73 | // Check if there's an error in the response - proper szError pattern handling 74 | if let Some(error) = response_body.get("error") { 75 | let error_code = error 76 | .get("code") 77 | .and_then(|c| c.as_str()) 78 | .unwrap_or("unknown_error"); 79 | 80 | // Map OpenAI error codes to appropriate LibError types 81 | let lib_error = match error_code { 82 | "invalid_api_key" => LibError::RemoteInvalidAPIKey, 83 | "model_not_found" => LibError::RemoteUnsupportedModel, 84 | "insufficient_quota" | "rate_limit_exceeded" => LibError::RemoteRequestSendFailed, 85 | _ => LibError::RemoteResponseParseFailed, 86 | }; 87 | 88 | return Err(Box::new(lib_error)); 89 | } 90 | 91 | let embeddings: Vec> = response_body["data"] 92 | .as_array() 93 | .unwrap_or(&Vec::new()) 94 | .iter() 95 | .map(|item| { 96 | item["embedding"] 97 | .as_array() 98 | .unwrap_or(&Vec::new()) 99 | .iter() 100 | .map(|v| v.as_f64().unwrap() as f32) 101 | .collect() 102 | }) 103 | .collect(); 104 | 105 | Ok(embeddings) 106 | } 107 | 108 | fn get_hidden_size(&self) -> usize { 109 | match self.model.as_str() { 110 | 
"text-embedding-ada-002" => 1536, // Fixed: was 768, should be 1536 111 | "text-embedding-3-small" => 1536, 112 | "text-embedding-3-large" => 3072, 113 | _ => panic!("Unknown model"), 114 | } 115 | } 116 | 117 | fn get_max_input_len(&self) -> usize { 118 | 8192 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /knn/knn.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // This file is a part of the common headers (API). 18 | // If you make any significant changes to this file, you MUST bump the LIB_VERSION. 
19 | 20 | #pragma once 21 | 22 | #include "util/util.h" 23 | #include "common/schema.h" 24 | #include "common/blockiterator.h" 25 | 26 | namespace knn 27 | { 28 | 29 | static const int LIB_VERSION = 9; 30 | static const uint32_t STORAGE_VERSION = 3; 31 | 32 | enum class HNSWSimilarity_e 33 | { 34 | L2, 35 | IP, 36 | COSINE 37 | }; 38 | 39 | enum class Quantization_e 40 | { 41 | NONE, 42 | BIT1, 43 | BIT1SIMPLE, 44 | BIT4, // no longer supported 45 | BIT8 46 | }; 47 | 48 | struct IndexSettings_t 49 | { 50 | int m_iDims = 0; 51 | HNSWSimilarity_e m_eHNSWSimilarity = HNSWSimilarity_e::L2; 52 | Quantization_e m_eQuantization = Quantization_e::NONE; 53 | int m_iHNSWM = 16; 54 | int m_iHNSWEFConstruction = 200; 55 | }; 56 | 57 | struct ModelSettings_t 58 | { 59 | std::string m_sModelName; 60 | std::string m_sCachePath; 61 | std::string m_sAPIKey; 62 | bool m_bUseGPU = false; 63 | }; 64 | 65 | struct AttrWithSettings_t : public common::SchemaAttr_t, public IndexSettings_t {}; 66 | using Schema_t = std::vector; 67 | 68 | struct DocDist_t 69 | { 70 | uint32_t m_tRowID; 71 | float m_fDist; 72 | }; 73 | 74 | class Distance_i 75 | { 76 | public: 77 | virtual ~Distance_i() = default; 78 | 79 | virtual float CalcDist ( const util::Span_T & dPoint1, const util::Span_T & dPoint2 ) const = 0; 80 | }; 81 | 82 | class Iterator_i : public common::BlockIterator_i 83 | { 84 | public: 85 | virtual util::Span_T GetData() const = 0; 86 | }; 87 | 88 | class KNN_i 89 | { 90 | public: 91 | virtual ~KNN_i() = default; 92 | 93 | virtual bool Load ( const std::string & sFilename, std::string & sError ) = 0; 94 | virtual Iterator_i * CreateIterator ( const std::string & sName, const util::Span_T & dData, int iResults, int iEf, std::string & sError ) = 0; 95 | }; 96 | 97 | class Builder_i 98 | { 99 | public: 100 | virtual ~Builder_i() = default; 101 | 102 | virtual void Train ( int iAttr, uint32_t uRowID, const util::Span_T & dData ) = 0; 103 | virtual bool SetAttr ( int iAttr, uint32_t uRowID, 
const util::Span_T & dData ) = 0; 104 | virtual bool Save ( const std::string & sFilename, size_t tBufferSize, std::string & sError ) = 0; 105 | virtual const std::string & GetError() const = 0; 106 | }; 107 | 108 | class TextToEmbeddings_i 109 | { 110 | public: 111 | virtual ~TextToEmbeddings_i() = default; 112 | 113 | virtual bool Convert ( const std::vector & dTexts, std::vector> & dEmbeddings, std::string & sError ) const = 0; 114 | virtual int GetDims() const = 0; 115 | }; 116 | 117 | class EmbeddingsLib_i 118 | { 119 | public: 120 | virtual ~EmbeddingsLib_i() = default; 121 | 122 | virtual TextToEmbeddings_i * CreateTextToEmbeddings ( const knn::ModelSettings_t & tSettings, std::string & sError ) const = 0; 123 | virtual const std::string & GetVersionStr() const = 0; 124 | virtual int GetVersion() const = 0; 125 | }; 126 | 127 | } // namespace knn 128 | 129 | extern "C" 130 | { 131 | DLLEXPORT knn::Distance_i * CreateDistanceCalc ( const knn::IndexSettings_t & tSettings ); 132 | DLLEXPORT knn::KNN_i * CreateKNN(); 133 | DLLEXPORT knn::Builder_i * CreateKNNBuilder ( const knn::Schema_t & tSchema, int64_t iNumElements, const std::string & sTmpFilename ); 134 | DLLEXPORT knn::EmbeddingsLib_i * LoadEmbeddingsLib ( const std::string & sLibPath, std::string & sError ); 135 | DLLEXPORT int GetKNNLibVersion(); 136 | DLLEXPORT const char * GetKNNLibVersionStr(); 137 | } 138 | -------------------------------------------------------------------------------- /cmake/rev.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required ( VERSION 3.17 ) 2 | 3 | # guess version strings from current git repo 4 | function ( guess_from_git ) 5 | if (NOT EXISTS "${columnar_SOURCE_DIR}/.git") 6 | return () 7 | endif () 8 | 9 | find_package ( Git QUIET ) 10 | if (NOT GIT_FOUND) 11 | return () 12 | endif () 13 | 14 | # without this in some environments you can get error "detected dubious ownership in repository" 15 | # `git config 
--global --add safe.directory '*'` in the docker image it runs in may not help. TODO: check why 16 | execute_process ( COMMAND "${GIT_EXECUTABLE}" config --global --add safe.directory "${columnar_SOURCE_DIR}") 17 | 18 | # extract short hash as GIT_COMMIT_ID 19 | execute_process ( COMMAND "${GIT_EXECUTABLE}" log -1 --format=%h 20 | WORKING_DIRECTORY "${columnar_SOURCE_DIR}" 21 | RESULT_VARIABLE res 22 | OUTPUT_VARIABLE GIT_COMMIT_ID 23 | ERROR_QUIET 24 | OUTPUT_STRIP_TRAILING_WHITESPACE ) 25 | set ( GIT_COMMIT_ID "${GIT_COMMIT_ID}" PARENT_SCOPE ) 26 | 27 | # extract timestamp and make number YYMMDD from it 28 | execute_process ( COMMAND "${GIT_EXECUTABLE}" log -1 --date=format-local:"%y%m%d%H" --format=%cd 29 | WORKING_DIRECTORY "${columnar_SOURCE_DIR}" 30 | RESULT_VARIABLE res 31 | OUTPUT_VARIABLE GIT_TIMESTAMP_ID 32 | OUTPUT_STRIP_TRAILING_WHITESPACE ) 33 | 34 | string ( SUBSTRING "${GIT_TIMESTAMP_ID}" 1 8 GIT_TIMESTAMP_ID ) 35 | set ( GIT_TIMESTAMP_ID ${GIT_TIMESTAMP_ID} PARENT_SCOPE ) 36 | 37 | # timestamp for reproducable packages 38 | execute_process ( COMMAND "${GIT_EXECUTABLE}" log -1 --pretty=%ct 39 | WORKING_DIRECTORY "${columnar_SOURCE_DIR}" 40 | RESULT_VARIABLE res 41 | OUTPUT_VARIABLE GIT_EPOCH_ID 42 | ERROR_QUIET 43 | OUTPUT_STRIP_TRAILING_WHITESPACE ) 44 | set ( ENV{SOURCE_DATE_EPOCH} ${GIT_EPOCH_ID} ) 45 | 46 | # extract branch name (top of 'git status -s -b'), throw out leading '## ' 47 | execute_process ( COMMAND "${GIT_EXECUTABLE}" status -s -b 48 | WORKING_DIRECTORY "${columnar_SOURCE_DIR}" 49 | RESULT_VARIABLE res 50 | OUTPUT_VARIABLE GIT_BRANCH_ID 51 | ERROR_QUIET 52 | OUTPUT_STRIP_TRAILING_WHITESPACE ) 53 | string ( REGEX REPLACE "\n.*$" "" GIT_BRANCH_ID "${GIT_BRANCH_ID}" ) 54 | string ( REPLACE "## " "" GIT_BRANCH_ID "${GIT_BRANCH_ID}" ) 55 | set ( GIT_BRANCH_ID "git branch ${GIT_BRANCH_ID}" PARENT_SCOPE ) 56 | endfunction () 57 | 58 | # guess version strings from template header file (git archive mark it there) 59 | function ( 
extract_from_git_slug HEADER ) 60 | if (EXISTS "${HEADER}") 61 | file ( STRINGS "${HEADER}" _CONTENT ) 62 | foreach (LINE ${_CONTENT}) 63 | # match definitions like - // GIT_*_ID VALUE 64 | if ("${LINE}" MATCHES "^//[ \t]+(GIT_.*_ID)[ \t]\"(.*)\"") 65 | set ( ${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" ) 66 | endif () 67 | endforeach () 68 | if (GIT_COMMIT_ID STREQUAL "$Format:%h$") 69 | return () # no slug 70 | endif () 71 | # commit id 72 | set ( GIT_COMMIT_ID "${GIT_COMMIT_ID}" PARENT_SCOPE ) 73 | # timestamp 74 | string ( REPLACE "-" "" GIT_TIMESTAMP_ID "${GIT_TIMESTAMP_ID}" ) 75 | string ( SUBSTRING "${GIT_TIMESTAMP_ID}" 2 6 GIT_TIMESTAMP_ID ) 76 | set ( GIT_TIMESTAMP_ID "${GIT_TIMESTAMP_ID}" PARENT_SCOPE ) 77 | # epoch for packaging 78 | set ( ENV{SOURCE_DATE_EPOCH} ${GIT_EPOCH_ID} ) 79 | # branch id 80 | set ( GIT_BRANCH_ID "from tarball" PARENT_SCOPE ) 81 | endif () 82 | endfunction () 83 | 84 | # function definitions finished, execution starts from here 85 | ################################## 86 | 87 | # first try to use binary git 88 | guess_from_git () 89 | 90 | # 2-nd try - if we build from git archive. 
Correct hash and date provided then, but no branch 91 | if (NOT GIT_COMMIT_ID) 92 | extract_from_git_slug ( "${columnar_SOURCE_DIR}/util/version.h.in" ) 93 | endif () 94 | 95 | # determine build as even/odd value of patch version 96 | math ( EXPR oddvalue "${PROJECT_VERSION_PATCH} % 2" OUTPUT_FORMAT DECIMAL ) 97 | 98 | if (oddvalue) 99 | set ( DEV_BUILD ON ) 100 | endif () 101 | 102 | # nothing found 103 | if (NOT GIT_COMMIT_ID) 104 | message ( STATUS "Dev mode, no guess, using predefined version" ) 105 | set ( GIT_TIMESTAMP_ID "000000" ) 106 | set ( GIT_COMMIT_ID "deadbeef" ) 107 | set ( GIT_BRANCH_ID "developer version" ) 108 | set ( ENV{SOURCE_DATE_EPOCH} "1607089638" ) 109 | set ( DEV_BUILD ON ) 110 | endif () 111 | 112 | # configure packaging 113 | configure_file ( "${columnar_SOURCE_DIR}/cmake/CPackOptions.cmake.in" "${columnar_BINARY_DIR}/config/CPackOptions.cmake" @ONLY ) 114 | -------------------------------------------------------------------------------- /util/bitvec.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | 17 | #pragma once 18 | 19 | #ifdef _MSC_VER 20 | #include 21 | #endif 22 | 23 | namespace util 24 | { 25 | 26 | template 27 | class BitVec_T 28 | { 29 | public: 30 | explicit BitVec_T ( int iSize ) { Resize(iSize); } 31 | 32 | FORCE_INLINE bool BitGet ( int iBit ) 33 | { 34 | if ( !GetDataLen() ) 35 | return false; 36 | 37 | assert ( iBit>=0 && iBit>SHIFT ] & ( ( (T)1 )<<( iBit&MASK ) ) )!=0 ); 39 | } 40 | 41 | FORCE_INLINE void BitSet ( int iBit ) 42 | { 43 | if ( !GetDataLen() ) 44 | return; 45 | 46 | assert ( iBit>=0 && iBit>SHIFT ] |= ( (T)1 )<<( iBit&MASK ); 48 | } 49 | 50 | void Invert ( int iMinBit=-1, int iMaxBit=-1 ) 51 | { 52 | if ( !m_iSize ) 53 | return; 54 | 55 | if ( iMinBit<0 ) 56 | iMinBit = 0; 57 | 58 | if ( iMaxBit<0 ) 59 | iMaxBit = m_iSize-1; 60 | 61 | int iMinId = iMinBit>>SHIFT; 62 | int iMaxId = (iMaxBit+SIZEBITS)>>SHIFT; 63 | for ( int i = iMinId; i 80 | void Fetch ( int & iIterator, int iBase, RESULT * & pRes, RESULT * pMax ) 81 | { 82 | assert ( sizeof(T)==8 ); // this func should not be used with 32-bit based bitmaps 83 | 84 | const T * pDataStart = &m_dData.front(); 85 | const T * pData = pDataStart + iIterator; 86 | const T * pDataMax = pDataStart + GetDataLen(); 87 | 88 | pMax -= SIZEBITS; 89 | assert ( pMax>=pRes ); 90 | 91 | RESULT tOutStart = ( iIterator << SHIFT ) + iBase; 92 | for ( ; pRes=m_iSize ) 117 | return m_iSize; 118 | 119 | const T * pData = &m_dData.front(); 120 | int iIndex = iStart>>SHIFT; 121 | T uMask = ~( ( T(1)<<( iStart&MASK ) )-1 ); 122 | if ( pData[iIndex] & uMask ) 123 | return (iIndex<=(int)GetDataLen() ) 130 | return m_iSize; 131 | 132 | return (iIndex< m_dData.size() ) 141 | m_dData = std::vector ( m_iDataLen, 0 ); 142 | } 143 | 144 | int GetLength() const { return m_iSize; } 145 | const std::vector & GetData() const { return m_dData; } 146 | 147 | private: 148 | static const size_t SIZEBITS = sizeof(T)*8; 149 | static const T MASK = T(sizeof(T)*8 - 1); 150 | static constexpr T SHIFT = 
T(Log2(SIZEBITS)-1); 151 | 152 | std::vector m_dData; 153 | int m_iSize = 0; 154 | int m_iDataLen = 0; 155 | 156 | FORCE_INLINE int ScanBit ( T tData, int iStart ) 157 | { 158 | for ( int i = iStart; i < SIZEBITS; i++ ) 159 | if ( tData & ( (T)1<; 169 | 170 | } // namespace util 171 | -------------------------------------------------------------------------------- /cmake/citest.cmake: -------------------------------------------------------------------------------- 1 | # That is to be run from CI. For local tests use /smoke.sh or ctest over local build. 2 | # Initialize global vars with values came from outside (from gitlab-ci) 3 | # This is main test suite which runs all the tests. 4 | set ( CI_PROJECT_DIR "$ENV{CI_PROJECT_DIR}" ) 5 | set ( CTEST_BUILD_NAME "$ENV{CI_COMMIT_REF_NAME}" ) 6 | set ( CTEST_CONFIGURATION_TYPE "$ENV{CTEST_CONFIGURATION_TYPE}" ) 7 | set ( CTEST_CMAKE_GENERATOR "$ENV{CTEST_CMAKE_GENERATOR}" ) 8 | set ( LIBS_BUNDLE "$ENV{LIBS_BUNDLE}" ) 9 | set ( CTEST_REGEX "$ENV{CTEST_REGEX}" ) 10 | set ( CTEST_EXCLUDE_REGEX "$ENV{CTEST_EXCLUDE_REGEX}" ) 11 | set ( CTEST_START "$ENV{CTEST_START}" ) 12 | set ( CTEST_END "$ENV{CTEST_END}" ) 13 | set ( SEARCHD_CLI_EXTRA "$ENV{SEARCHD_CLI_EXTRA}" ) 14 | set ( WITH_COVERAGE "$ENV{WITH_COVERAGE}" ) 15 | set ( NO_TESTS "$ENV{NO_TESTS}" ) 16 | set ( NO_BUILD "$ENV{NO_BUILD}" ) 17 | set_property ( GLOBAL PROPERTY Label P$ENV{CI_PIPELINE_ID} J$ENV{CI_JOB_ID} ) 18 | 19 | # how may times try the test before it is considered failed 20 | set ( RETRIES 5 ) 21 | 22 | if (NOT CTEST_CMAKE_GENERATOR) 23 | set ( CTEST_CMAKE_GENERATOR "Unix Makefiles" ) 24 | endif () 25 | 26 | # platform specific options 27 | set ( CTEST_SITE "$ENV{CI_SERVER_NAME} ${CTEST_BUILD_CONFIGURATION}" ) 28 | 29 | # fallback to run without ctest 30 | if (NOT CTEST_SOURCE_DIRECTORY) 31 | set ( CTEST_SOURCE_DIRECTORY ".." 
) 32 | endif () 33 | 34 | # common test options 35 | set ( CONFIG_OPTIONS "WITH_ODBC=0;WITH_POSTGRESQL=0;WITH_SSL=0;WITH_RE2=1;WITH_STEMMER=1;WITH_EXPAT=1" ) 36 | set ( CTEST_BINARY_DIRECTORY "build" ) 37 | 38 | if (WITH_COVERAGE) 39 | find_program ( CTEST_COVERAGE_COMMAND NAMES gcov ) 40 | list ( APPEND CONFIG_OPTIONS "COVERAGE_TEST=1" ) 41 | list ( APPEND CTEST_CUSTOM_COVERAGE_EXCLUDE "_deps/.*" ) 42 | endif () 43 | 44 | if (LIBS_BUNDLE) 45 | list ( APPEND CONFIG_OPTIONS "LIBS_BUNDLE=${LIBS_BUNDLE}" ) 46 | endif () 47 | 48 | if (SEARCHD_CLI_EXTRA) 49 | list ( APPEND CONFIG_OPTIONS "SEARCHD_CLI_EXTRA=${SEARCHD_CLI_EXTRA}" ) 50 | endif () 51 | 52 | set ( CTEST_START_WITH_EMPTY_BINARY_DIRECTORY TRUE ) 53 | #ctest_empty_binary_directory(${CTEST_BINARY_DIRECTORY}) 54 | 55 | ####################################################################### 56 | file ( WRITE "${CTEST_BINARY_DIRECTORY}/CTestConfig.cmake" " 57 | set ( CTEST_PROJECT_NAME \"Manticore columnar\" ) 58 | set ( CTEST_NIGHTLY_START_TIME \"01:00:00 UTC\" ) 59 | set ( CTEST_DROP_SITE_CDASH TRUE ) 60 | " ) 61 | 62 | # configure memcheck 63 | set ( WITH_MEMCHECK FALSE ) 64 | #find_program(CTEST_MEMORYCHECK_COMMAND NAMES valgrind) 65 | #set(CTEST_MEMORYCHECK_SUPPRESSIONS_FILE ${CTEST_SOURCE_DIRECTORY}/tests/valgrind.supp) 66 | 67 | # configure update (will log git rev id) 68 | find_program ( CTEST_GIT_COMMAND NAMES git ) 69 | set ( CTEST_UPDATE_COMMAND "${CTEST_GIT_COMMAND}" ) 70 | set ( CTEST_UPDATE_VERSION_ONLY ON ) 71 | 72 | set ( CMAKE_CALL "\"${CMAKE_COMMAND}\" \"-G${CTEST_CMAKE_GENERATOR}\" -DCMAKE_BUILD_TYPE:STRING=${CTEST_CONFIGURATION_TYPE}" ) 73 | foreach (OPTION ${CONFIG_OPTIONS}) 74 | set ( CMAKE_CALL "${CMAKE_CALL} -D${OPTION}" ) 75 | endforeach () 76 | set ( CTEST_CONFIGURE_COMMAND "${CMAKE_CALL} \"${CTEST_SOURCE_DIRECTORY}\"" ) 77 | 78 | # will not write and count warnings in auto-generated files of lexer 79 | set ( CTEST_CUSTOM_WARNING_EXCEPTION ".*flexsphinx.*" ) 80 | message ( STATUS 
"CTEST_CONFIGURE_COMMAND is ${CTEST_CONFIGURE_COMMAND}" ) 81 | 82 | # Do the test suite 83 | ctest_start ( "Continuous" ) 84 | #ctest_update () 85 | ctest_configure () 86 | 87 | if (NOT NO_BUILD) 88 | include ( ProcessorCount ) 89 | ProcessorCount ( N ) 90 | if (NOT N EQUAL 0) 91 | if (NOT CTEST_CMAKE_GENERATOR STREQUAL "Visual Studio 16 2019") 92 | set ( CTEST_BUILD_FLAGS -j${N} ) 93 | endif () 94 | set ( ctest_test_args ${ctest_test_args} PARALLEL_LEVEL ${N} ) 95 | endif () 96 | 97 | ctest_build ( ${ctest_test_args} ) 98 | endif () 99 | 100 | if (NO_TESTS) 101 | return () 102 | endif () 103 | 104 | if ( CTEST_REGEX ) 105 | ctest_test ( RETURN_VALUE retcode INCLUDE "${CTEST_REGEX}" EXCLUDE "${CTEST_EXCLUDE_REGEX}" REPEAT UNTIL_PASS:${RETRIES}) 106 | else() 107 | if ( CTEST_START AND CTEST_END ) 108 | ctest_test ( START ${CTEST_START} END ${CTEST_END} EXCLUDE "${CTEST_EXCLUDE_REGEX}" RETURN_VALUE retcode REPEAT UNTIL_PASS:${RETRIES}) 109 | else() 110 | ctest_test ( EXCLUDE "${CTEST_EXCLUDE_REGEX}" RETURN_VALUE retcode REPEAT UNTIL_PASS:${RETRIES}) 111 | endif() 112 | endif() 113 | 114 | #ctest_test ( START 24 END 25 RETURN_VALUE retcode ) 115 | #ctest_test ( STRIDE 50 ) 116 | #ctest_test ( STRIDE 50 EXCLUDE_LABEL RT RETURN_VALUE retcode ) 117 | 118 | if (WITH_COVERAGE AND CTEST_COVERAGE_COMMAND) 119 | ctest_coverage () 120 | endif (WITH_COVERAGE AND CTEST_COVERAGE_COMMAND) 121 | 122 | if (WITH_MEMCHECK AND CTEST_MEMORYCHECK_COMMAND) 123 | ctest_memcheck () 124 | endif (WITH_MEMCHECK AND CTEST_MEMORYCHECK_COMMAND) 125 | 126 | #ctest_submit () 127 | 128 | if (retcode) 129 | message ( FATAL_ERROR "tests failed with ${retcode} code" ) 130 | endif () 131 | -------------------------------------------------------------------------------- /Changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | # Version 2.2.4 4 | 5 | ### Minor changes 6 | 7 | * [Commit 
c6db](https://github.com/manticoresoftware/columnar/commit/c6dbbcbf277ac35f398637980bb57398a4434dbc) Implemented better analyzer rewinding 8 | 9 | ### Bugfixes 10 | * [Commit 357e](https://github.com/manticoresoftware/columnar/commit/357eab2d7b93759e31927b1bdf62b119ed2d2db2) Fixed bitmap union selection logic 11 | * [Commit 4c90](https://github.com/manticoresoftware/columnar/commit/4c90bc0f11b8b5dddc2db365f4197e3812f20356) Fixup integer filters before creating integer analyzers 12 | * [Commit fea4](https://github.com/manticoresoftware/columnar/commit/fea449a36f45a436712f581f1589111b8ef637a1) Added an analyzer fastpath when all table values pass the filter 13 | 14 | # Version 2.2.0 15 | 16 | ### Major new features 17 | * Added the ability to fetch the number of documents corresponding to a given filter without using iterators 18 | * Significantly improved the performance of secondary indexes with rowid filtering 19 | * Added cutoff support to analyzers 20 | * Significantly improved the performance of secondary indexes with non-selective range filters 21 | 22 | ### Minor changes 23 | * Changed PGM resolution for better estimates 24 | 25 | * [Commit 0abc](https://github.com/manticoresoftware/columnar/commit/0abc7246) Inlined some functions 26 | * [Commit c45d](https://github.com/manticoresoftware/columnar/commit/c45ddf7b) Added inlines; changed codec interface; changed default 64bit codec to fastpfor256 27 | * [Commit 86b3](https://github.com/manticoresoftware/columnar/commit/86b3af30) Added exclude filters to CalcCount 28 | * [Commit 5ccf](https://github.com/manticoresoftware/columnar/commit/5ccffa0c) Changed columnar iterator interface to single-call 29 | * [Commit f7f5](https://github.com/manticoresoftware/columnar/commit/f7f54d93) Reduced partial minmax eval depth 30 | 31 | ### Bugfixes 32 | * [Commit 4f42](https://github.com/manticoresoftware/columnar/commit/1310c8af37398c42cfc010c24f07d146793b4f42) Fixed a crash caused by buffer overflow when encoding integer
data 33 | * [Commit 7653](https://github.com/manticoresoftware/columnar/commit/76530db2f74072ea7787cb7d41124b1117ed014f) Fixed a crash caused by using a string filter without a hash function 34 | * [Issue #20](https://github.com/manticoresoftware/columnar/issues/20) Fixed a crash on indexing zero-length MVA attributes 35 | * [Commit 102d](https://github.com/manticoresoftware/columnar/commit/102d67c3) Bitmap iterator now rewinds only forward 36 | * [Commit 24e7](https://github.com/manticoresoftware/columnar/commit/24e76dd9) Fixed float range filters vs negative values 37 | * [Commit e447](https://github.com/manticoresoftware/columnar/commit/e447ec88) Fixed header integrity checks 38 | * [Commit 3c0b](https://github.com/manticoresoftware/columnar/commit/3c0b089c) Fixed bitmap iterator description on empty result sets 39 | * [Commit 4b21](https://github.com/manticoresoftware/columnar/commit/4b21f461) Clamp iterator estimates for FilterType_e::VALUES 40 | 41 | # Version 2.0.4 42 | 43 | ### Bugfixes 44 | 45 | * [Issue #1054](https://github.com/manticoresoftware/manticoresearch/issues/1054) Bug on empty string condition 46 | 47 | # Version 4.0.0 48 | 49 | - [Commit 89ed74a3](https://github.com/manticoresoftware/columnar/commit/89ed74a3d767a4a9dfdfe20d7c954fbc36c5ab72) - Fixed a crash caused by mismatched filter and secondary index types. 50 | - [Commit 4223c525](https://github.com/manticoresoftware/columnar/commit/4223c525aed2cfb704ae9a0b439e5fac034913d0) - Implemented the NOTNULL filter type for secondary indexes. 51 | - [Commit 020c82ed](https://github.com/manticoresoftware/columnar/commit/020c82ede0903f898a685cae0b5d8fcb19027771) - Fixed exclude filter handling in columnar accessor for table encoding. 52 | - [Commit 3fb88e65](https://github.com/manticoresoftware/columnar/commit/3fb88e65fa6575a40d80cbf96b45ad3383b39c46) - Fixed issues with full-scan (NOTNULL) filters on strings.
53 | - [Commit b707d5b0](https://github.com/manticoresoftware/columnar/commit/b707d5b0eec0383cdae12730d36eb8a25bc26ce2) - Added native exclude filter handling using bitmaps. 54 | - [Commit bd59d083](https://github.com/manticoresoftware/columnar/commit/bd59d083eec5f6debcf190b69cedc303683553da) - Fixed issues with bitmap inversion. 55 | - [Commit ba9e283b](https://github.com/manticoresoftware/columnar/commit/ba9e283b2f0e8a60756af69b0a0d8c21e2263099) - Switched to the hnsw library to fix issues when loading multiple KNN indexes. 56 | - [Commit 89120fa7](https://github.com/manticoresoftware/columnar/commit/89120fa7ead9b2770f7ddc3912807e6e6bcca1f3) - Resolved another bitmap inversion issue. 57 | - [Commit edadc694](https://github.com/manticoresoftware/columnar/commit/edadc694c68d6022bdd13134263667430a42cc1d) - Addressed additional issues with bitmap inversion. 58 | - [Commit 3ff21a80](https://github.com/manticoresoftware/columnar/commit/3ff21a80357dcca80b021b4827524d9ba63f11e6) - Fixed incorrectly enabled secondary indexes for JSON attribute fields affected by updates. 59 | - [Commit 47da6760](https://github.com/manticoresoftware/columnar/commit/47da6760aa8b32b2ef9d82f3a55666e7d0dbdf30) - Added support for fetching index metadata. -------------------------------------------------------------------------------- /secondary/pgm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com) 2 | // All rights reserved 3 | // 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | #ifndef _sipgm_ 18 | #define _sipgm_ 19 | 20 | #include "reader.h" 21 | #include "pgm/pgm_index.hpp" 22 | 23 | namespace SI 24 | { 25 | struct ApproxPos_t 26 | { 27 | size_t m_iPos; ///< The approximate position of the key. 28 | size_t m_iLo; ///< The lower bound of the range. 29 | size_t m_iHi; ///< The upper bound of the range. 30 | }; 31 | 32 | class PGM_i 33 | { 34 | public: 35 | virtual ~PGM_i() = default; 36 | 37 | virtual size_t Save ( std::vector & dData ) const = 0; 38 | virtual void Load ( util::FileReader_c & tRd ) = 0; 39 | virtual ApproxPos_t Search ( uint64_t uVal ) const = 0; 40 | virtual bool IsEmpty() const = 0; 41 | }; 42 | 43 | template 44 | class PGM_T : public pgm::PGMIndex, public PGM_i 45 | { 46 | using BASE = pgm::PGMIndex; 47 | using BASE::BASE; 48 | 49 | public: 50 | template 51 | PGM_T ( RandomIt tBegin, RandomIt tEnd ) 52 | : BASE ( tBegin, tEnd ) 53 | {} 54 | 55 | size_t Save ( std::vector & dData ) const final 56 | { 57 | size_t uOff = dData.size(); 58 | 59 | util::MemWriter_c tWr ( dData ); 60 | 61 | tWr.Pack_uint32 ( (int)this->n ); 62 | WriteTypedKey ( tWr, this->first_key ); 63 | 64 | tWr.Pack_uint32 ( (int)this->segments.size() ); 65 | for ( const auto & tSeg : this->segments ) 66 | { 67 | WriteTypedKey ( tWr, tSeg.key ); 68 | tWr.Pack_uint32 ( util::FloatToUint ( tSeg.slope ) ); 69 | tWr.Pack_uint32 ( tSeg.intercept ); 70 | } 71 | 72 | tWr.Pack_uint32 ( (int)this->levels_sizes.size() ); 73 | for ( const size_t & tLvl : this->levels_sizes ) 74 | tWr.Pack_uint64 ( tLvl ); 
75 | 76 | tWr.Pack_uint32 ( (int)this->levels_offsets.size() ); 77 | for ( const size_t & tOff : this->levels_offsets ) 78 | tWr.Pack_uint64 ( tOff ); 79 | 80 | return uOff; 81 | } 82 | 83 | void Load ( util::FileReader_c & tRd ) final 84 | { 85 | this->n = tRd.Unpack_uint32(); 86 | LoadTypedKey ( tRd, this->first_key ); 87 | 88 | this->segments.resize ( tRd.Unpack_uint32() ); 89 | for ( auto & tSeg : this->segments ) 90 | { 91 | LoadTypedKey ( tRd, tSeg.key ); 92 | tSeg.slope = util::UintToFloat ( tRd.Unpack_uint32() ); 93 | tSeg.intercept = tRd.Unpack_uint32(); 94 | } 95 | 96 | this->levels_sizes.resize ( tRd.Unpack_uint32() ); 97 | for ( size_t & tLvl : this->levels_sizes ) 98 | tLvl = tRd.Unpack_uint64(); 99 | 100 | this->levels_offsets.resize ( tRd.Unpack_uint32() ); 101 | for ( size_t & tOff : this->levels_offsets ) 102 | tOff = tRd.Unpack_uint64(); 103 | } 104 | 105 | ApproxPos_t Search ( uint64_t uVal ) const final; 106 | void WriteTypedKey ( util::MemWriter_c & tWr, VALUE tVal ) const { tWr.Pack_uint64 ( (uint64_t)tVal ); } 107 | void LoadTypedKey ( util::FileReader_c & tRd, VALUE & tVal ) const { tVal = tRd.Unpack_uint64(); } 108 | bool IsEmpty () const final { return this->n==0; } 109 | }; 110 | 111 | template<> 112 | inline void PGM_T::WriteTypedKey ( util::MemWriter_c & tWr, float tVal ) const 113 | { 114 | tWr.Pack_uint32 ( util::FloatToUint ( tVal ) ); 115 | } 116 | 117 | template<> 118 | inline void PGM_T::LoadTypedKey ( util::FileReader_c & tRd, float & tVal ) const 119 | { 120 | tVal = util::UintToFloat ( tRd.Unpack_uint32() ); 121 | } 122 | 123 | static ApproxPos_t GetPos ( pgm::ApproxPos tPos ) 124 | { 125 | ApproxPos_t tIt; 126 | tIt.m_iPos = tPos.pos; 127 | tIt.m_iLo = tPos.lo; 128 | tIt.m_iHi = tPos.hi; 129 | return tIt; 130 | } 131 | 132 | template<> 133 | inline ApproxPos_t PGM_T::Search ( uint64_t uVal ) const 134 | { 135 | return GetPos ( this->search ( (uint32_t)uVal ) ); 136 | } 137 | 138 | template<> 139 | inline ApproxPos_t 
PGM_T::Search ( uint64_t uVal ) const 140 | { 141 | return GetPos ( this->search ( uVal ) ); 142 | } 143 | 144 | template<> 145 | inline ApproxPos_t PGM_T::Search ( uint64_t uVal ) const 146 | { 147 | return GetPos ( this->search ( (int64_t)uVal ) ); 148 | } 149 | 150 | template<> 151 | inline ApproxPos_t PGM_T::Search ( uint64_t uVal ) const 152 | { 153 | return GetPos ( this->search ( util::UintToFloat ( uVal ) ) ); 154 | } 155 | 156 | } // namespace SI 157 | 158 | 159 | #endif // _sipgm_ -------------------------------------------------------------------------------- /cmake/update_bundle.cmake: -------------------------------------------------------------------------------- 1 | if (__update_bundle_columnar_included) 2 | return () 3 | endif () 4 | set ( __update_bundle_columnar_included YES ) 5 | 6 | IF (POLICY CMP0135) 7 | CMAKE_POLICY ( SET CMP0135 NEW ) 8 | ENDIF () 9 | 10 | # env WRITEB (as bool) means that we can store downloaded stuff to our bundle (that's to refresh the bundle) 11 | # env CACHEB may provide path to persistent folder where we will build heavy stuff (unpacked sources, builds) 12 | include ( helpers ) 13 | diag ( DIAGNOSTIC ) 14 | 15 | if ( USE_AVX2 ) 16 | set ( SUFF "${CMAKE_SYSTEM_NAME}-x86_64-v3" ) 17 | else() 18 | set ( SUFF "${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}" ) 19 | endif() 20 | string ( TOLOWER "${SUFF}" SUFF ) 21 | diag (SUFF) 22 | 23 | # SUFF is line like 'darwin-x86_64' (system-arch) 24 | 25 | # special cache folder where artefacts keep. Make it absolute also 26 | if (DEFINED CACHEB) 27 | if (NOT EXISTS ${CACHEB}) 28 | get_filename_component ( REL_BBUILD "${CACHEB}" REALPATH BASE_DIR "${columnar_BINARY_DIR}" ) 29 | file ( MAKE_DIRECTORY ${REL_BBUILD} ) 30 | endif () 31 | # get_filename_component(CACHEB "${CACHEB}" ABSOLUTE) 32 | diag ( CACHEB ) 33 | set ( HAVE_BBUILD TRUE ) 34 | endif () 35 | 36 | # HAVE_BBUILD means we will build in aside folder (inside CACHEB) and then store the result for future. 
37 | 
38 | # make libs_bundle absolute, if any
39 | if (DEFINED LIBS_BUNDLE)
40 | 	get_filename_component ( LIBS_BUNDLE "${LIBS_BUNDLE}" ABSOLUTE )
41 | endif ()
42 | 
43 | unset ( WRITEB )
44 | set ( WRITEB "$ENV{WRITEB}" )
45 | if (WRITEB)
46 | 	message ( STATUS "========================================================" )
47 | 	message ( STATUS "WRITEB is set, will modify bundle, will collect stuff..." )
48 | 	message ( STATUS "${LIBS_BUNDLE}" )
49 | 	message ( STATUS "========================================================" )
50 | 	file ( MAKE_DIRECTORY ${LIBS_BUNDLE} )
51 | else ()
52 | 	message ( STATUS "WRITEB is not set, bundle will NOT be modified..." )
53 | endif ()
54 | 
55 | diag ( WRITEB )
56 | diag ( LIBS_BUNDLE )
57 | diag ( CACHEB )
58 | diag ( HAVE_BBUILD )
59 | 
60 | if (HAVE_BBUILD)
61 | 	set ( CACHE_BUILDS "${CACHEB}/${SUFF}" )
62 | else ()
63 | 	set ( CACHE_BUILDS "${columnar_BINARY_DIR}/cache" )
64 | endif ()
65 | 
66 | # add the cache folder to the cmake prefix path (done once)
67 | append_prefix ( "${CACHE_BUILDS}" )
68 | 
69 | # get the path of the build folder for NAME: it is always ${CACHE_BUILDS}/${NAME}; without HAVE_BBUILD it is reported as 'local'
70 | function ( GET_BUILD RESULT NAME )
71 | 	if (NOT HAVE_BBUILD)
72 | 		set ( detail "local " )
73 | 	endif ()
74 | 	diags ( "${NAME} build will be set to ${detail}${CACHE_BUILDS}/${NAME}" )
75 | 	set ( ${RESULT} "${CACHE_BUILDS}/${NAME}" PARENT_SCOPE )
76 | endfunction ()
77 | 
78 | # set PLACE to the path in the bundle if the file exists there, otherwise to the external url.
79 | # if WRITEB is active, an absent file is first downloaded from the external url into the bundle
80 | function ( select_nearest_url PLACE NAME BUNDLE_URL REMOTE_URL )
81 | 	if (NOT EXISTS "${BUNDLE_URL}" AND WRITEB)
82 | 		diags ( "fetch ${REMOTE_URL} into ${BUNDLE_URL}..." )
83 | 		file ( DOWNLOAD ${REMOTE_URL} ${BUNDLE_URL} SHOW_PROGRESS STATUS DOWNLOAD_STATUS )
84 | 		list ( GET DOWNLOAD_STATUS 0 DOWNLOAD_CODE )
85 | 		if (DOWNLOAD_CODE EQUAL 0)
86 | 			message ( STATUS "Absent ${NAME} put into ${BUNDLE_URL}" )
87 | 		else ()
88 | 			# a failed download leaves an empty or partial file behind; remove it so it is not taken for a valid bundle entry
89 | 			list ( GET DOWNLOAD_STATUS 1 DOWNLOAD_ERROR )
90 | 			message ( WARNING "Failed to fetch ${REMOTE_URL} into ${BUNDLE_URL}: ${DOWNLOAD_ERROR}" )
91 | 			file ( REMOVE "${BUNDLE_URL}" )
92 | 		endif ()
93 | 	endif ()
94 | 
95 | 	if (EXISTS "${BUNDLE_URL}")
96 | 		set ( ${PLACE} "${BUNDLE_URL}" PARENT_SCOPE )
97 | 	else ()
98 | 		set ( ${PLACE} "${REMOTE_URL}" PARENT_SCOPE )
99 | 	endif ()
100 | 
101 | 	diag ( NAME )
102 | 	diag ( BUNDLE_URL )
103 | 	diag ( REMOTE_URL )
104 | endfunction ()
105 | 
106 | # populate sources of NAME from URL via FetchContent (once) and return the source dir in OUTDIR
107 | function ( fetch_sources NAME URL OUTDIR )
108 | 	include ( FetchContent )
109 | 	FetchContent_Declare ( ${NAME} URL "${URL}" )
110 | 	FetchContent_GetProperties ( ${NAME} )
111 | 	if (NOT ${NAME}_POPULATED)
112 | 		message ( STATUS "Populate ${NAME} from ${URL}" )
113 | 		FetchContent_Populate ( ${NAME} )
114 | 	endif ()
115 | 
116 | 	string ( TOUPPER "${NAME}" UNAME )
117 | 	mark_as_advanced ( FETCHCONTENT_SOURCE_DIR_${UNAME} FETCHCONTENT_UPDATES_DISCONNECTED_${UNAME} )
118 | 	set ( ${OUTDIR} "${${NAME}_SOURCE_DIR}" PARENT_SCOPE )
119 | endfunction ()
120 | 
121 | # set RESULT to TRUE if CMAKE_SYSTEM_PROCESSOR is x86_64/amd64 (case-insensitive), otherwise to FALSE
122 | function ( is_amd64 RESULT )
123 | 	string ( TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSTEM_PROCESSOR_LOWER )
124 | 	if (SYSTEM_PROCESSOR_LOWER STREQUAL x86_64 OR SYSTEM_PROCESSOR_LOWER STREQUAL amd64)
125 | 		set ( ${RESULT} TRUE PARENT_SCOPE )
126 | 	else ()
127 | 		# always write the result: otherwise a same-named variable inherited from an outer scope would be read instead
128 | 		set ( ${RESULT} FALSE PARENT_SCOPE )
129 | 	endif ()
130 | endfunction ()
131 | 
132 | # set RESULT to the compiler arch flag matching USE_AVX2 on amd64, or to an empty string on any other arch
133 | function ( get_avx_flags RESULT )
134 | 	is_amd64 (AMD)
135 | 	if (NOT AMD)
136 | 		# clear the result explicitly, so that a stale value visible to the caller is never mistaken for flags
137 | 		set ( ${RESULT} "" PARENT_SCOPE )
138 | 		return()
139 | 	endif()
140 | 
141 | 	if (USE_AVX2)
142 | 		message ( STATUS "Add AVX2 flags to compiler flags for ${CMAKE_SYSTEM_PROCESSOR} arch" )
143 | 		if (MSVC OR CLANG_CL)
144 | 			set ( ${RESULT} "/arch:AVX2" PARENT_SCOPE )
145 | 		else()
146 | 			set ( ${RESULT} "-march=x86-64-v3" PARENT_SCOPE )
147 | 		endif()
148 | 	else ()
149 | 		message ( STATUS "Add SSE flags to compiler flags for ${CMAKE_SYSTEM_PROCESSOR} arch" )
150 | 		if (MSVC OR CLANG_CL)
151 | 			set ( ${RESULT} "/arch:AVX" PARENT_SCOPE )
152 | 		else ()
153 | 			set ( ${RESULT} "-march=x86-64-v2" PARENT_SCOPE )
154 | 		endif ()
155 | 	endif ()
156 | endfunction ( )
157 | 
158 | # configure and build an external module at configure time: generate <module>-build/CMakeLists.txt from the
159 | # external-build.cmake.in template, then run cmake on it; MODULE_SRC_NAME / MODULE_BUILD_NAME are names of variables
160 | function ( external_build module MODULE_SRC_NAME MODULE_BUILD_NAME )
161 | 	get_avx_flags ( FLAGS )
162 | 	if (FLAGS)
163 | 		set ( ENV{CXXFLAGS} ${FLAGS} )
164 | 		set ( ENV{CFLAGS} ${FLAGS} )
165 | 	endif ()
166 | 
167 | 	set ( CMAKE_ARGS "" )
168 | 	set ( MODULE_SRC "${${MODULE_SRC_NAME}}" )
169 | 	set ( MODULE_BUILD "${${MODULE_BUILD_NAME}}" )
170 | 	configure_file ( ${columnar_SOURCE_DIR}/cmake/external-build.cmake.in ${module}-build/CMakeLists.txt @ONLY )
171 | 	execute_process ( COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${module}-build RESULT_VARIABLE CONFIGURE_RESULT )
172 | 	if (NOT CONFIGURE_RESULT EQUAL 0)
173 | 		message ( WARNING "Configuring external module ${module} failed: ${CONFIGURE_RESULT}" )
174 | 	endif ()
175 | 	execute_process ( COMMAND ${CMAKE_COMMAND} --build . WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${module}-build RESULT_VARIABLE BUILD_RESULT )
176 | 	if (NOT BUILD_RESULT EQUAL 0)
177 | 		message ( WARNING "Building external module ${module} failed: ${BUILD_RESULT}" )
178 | 	endif ()
179 | endfunction ()
180 | 
--------------------------------------------------------------------------------
/columnar/builder/builderminmax.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020-2025, Manticore Software LTD (https://manticoresearch.com)
2 | // All rights reserved
3 | //
4 | //
5 | // Licensed under the Apache License, Version 2.0 (the "License");
6 | // you may not use this file except in compliance with the License.
7 | // You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing, software
12 | // distributed under the License is distributed on an "AS IS" BASIS,
13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | // See the License for the specific language governing permissions and
15 | // limitations under the License.
16 | 
17 | #pragma once
18 | 
19 | #include "attributeheader.h"
20 | 
21 | // NOTE(review): the template argument lists in this header were lost when the file was flattened;
22 | // they are reconstructed from how each body uses them - confirm against the upstream header.
23 | 
24 | namespace columnar
25 | {
26 | 
27 | /// Collects per-subblock min/max pairs of values converted to T and saves them as a tree
28 | /// in which every upper level merges pairs of entries of the level below.
29 | template <typename T>
30 | class MinMaxBuilder_T
31 | {
32 | public:
33 | 	MinMaxBuilder_T ( const Settings_t & tSettings );
34 | 
35 | 	/// Adds one single-valued entry to the current subblock.
36 | 	void Add ( int64_t tValue );
37 | 	/// Adds one multi-valued entry; an entry with iNumValues==0 is counted but contributes no min/max.
38 | 	void Add ( const int64_t * pValues, int iNumValues );
39 | 	/// Flushes the pending subblock, builds the tree and writes it; returns false if the writer reports an error.
40 | 	bool Save ( util::FileWriter_c & tWriter, std::string & sError );
41 | 
42 | private:
43 | 	Settings_t m_tSettings;
44 | 
45 | 	using TreeLevel_t = std::vector<std::pair<T,T>>;
46 | 	std::vector<TreeLevel_t> m_dTreeLevels; ///< level 0 holds one (min,max) pair per subblock
47 | 	int m_iCollected = 0; ///< entries added to the current subblock
48 | 	bool m_bHaveNonEmpty = false; ///< true once the current subblock received at least one value
49 | 	T m_tMin = T();
50 | 	T m_tMax = T();
51 | 
52 | 	void Flush();
53 | 	void BuildTree();
54 | 
55 | 	inline bool SaveTreeLevels ( util::FileWriter_c & tWriter ) const;
56 | };
57 | 
58 | template <typename T>
59 | MinMaxBuilder_T<T>::MinMaxBuilder_T ( const Settings_t & tSettings )
60 | 	: m_tSettings ( tSettings )
61 | {
62 | 	m_dTreeLevels.resize(1);
63 | }
64 | 
65 | template <typename T>
66 | void MinMaxBuilder_T<T>::Add ( int64_t tValue )
67 | {
68 | 	if ( m_iCollected==m_tSettings.m_iSubblockSize )
69 | 		Flush();
70 | 
71 | 	T tConverted = util::to_type<T>(tValue);
72 | 
73 | 	// seed min/max from the first actual value of the subblock. Keyed on m_bHaveNonEmpty (as the array overload does)
74 | 	// rather than on m_iCollected, which is also advanced by empty entries that leave min/max untouched
75 | 	if ( !m_bHaveNonEmpty )
76 | 	{
77 | 		m_tMin = tConverted;
78 | 		m_tMax = tConverted;
79 | 	}
80 | 	else
81 | 	{
82 | 		m_tMin = std::min ( m_tMin, tConverted );
83 | 		m_tMax = std::max ( m_tMax, tConverted );
84 | 	}
85 | 
86 | 	m_bHaveNonEmpty = true;
87 | 	m_iCollected++;
88 | }
89 | 
90 | template <typename T>
91 | void MinMaxBuilder_T<T>::Add ( const int64_t * pValues, int iNumValues )
92 | {
93 | 	if ( m_iCollected==m_tSettings.m_iSubblockSize )
94 | 		Flush();
95 | 
96 | 	// an empty entry still occupies a slot in the subblock, but has nothing to fold into min/max
97 | 	if ( !iNumValues )
98 | 	{
99 | 		m_iCollected++;
100 | 		return;
101 | 	}
102 | 
103 | 	T tMin, tMax;
104 | 	for ( int i = 0; i < iNumValues; i++ )
105 | 	{
106 | 		T tConverted = util::to_type<T>(pValues[i]);
107 | 		if ( i )
108 | 		{
109 | 			tMin = std::min ( tMin, tConverted );
110 | 			tMax = std::max ( tMax, tConverted );
111 | 		}
112 | 		else
113 | 			tMin = tMax = tConverted;
114 | 	}
115 | 
116 | 	if ( !m_bHaveNonEmpty )
117 | 	{
118 | 		m_tMin = tMin;
119 | 		m_tMax = tMax;
120 | 	}
121 | 	else
122 | 	{
123 | 		m_tMin = std::min ( m_tMin, tMin );
124 | 		m_tMax = std::max ( m_tMax, tMax );
125 | 	}
126 | 
127 | 	m_bHaveNonEmpty = true;
128 | 	m_iCollected++;
129 | }
130 | 
131 | template <typename T>
132 | void MinMaxBuilder_T<T>::Flush()
133 | {
134 | 	if ( !m_iCollected )
135 | 		return;
136 | 
137 | 	// fixme! this will give false positives for queries like ANY()>=0
138 | 	if ( !m_bHaveNonEmpty )
139 | 	{
140 | 		m_tMin = (T)0;
141 | 		m_tMax = (T)0;
142 | 	}
143 | 
144 | 	m_dTreeLevels[0].push_back ( { m_tMin, m_tMax } );
145 | 	m_iCollected=0;
146 | 	m_bHaveNonEmpty = false;
147 | }
148 | 
149 | // NOTE(review): the loop header, the first statements of the loop body and the pair-merging branch were lost
150 | // in extraction; they are reconstructed from the surviving fragments and must be confirmed against upstream.
151 | template <typename T>
152 | void MinMaxBuilder_T<T>::BuildTree()
153 | {
154 | 	if ( m_dTreeLevels[0].size()<=1 )
155 | 		return;
156 | 
157 | 	do
158 | 	{
159 | 		m_dTreeLevels.push_back ( TreeLevel_t() );
160 | 		auto & dNewBlocks = m_dTreeLevels.back();
161 | 		auto & dBlocks = m_dTreeLevels[m_dTreeLevels.size()-2];
162 | 
163 | 		for ( int i = 0; i<(int)dBlocks.size(); i+=2 )
164 | 		{
165 | 			dNewBlocks.push_back ( dBlocks[i] );
166 | 			std::pair<T,T> & tMinMax = dNewBlocks.back();
167 | 			if ( i+1<(int)dBlocks.size() )
168 | 			{
169 | 				tMinMax.first = std::min ( tMinMax.first, dBlocks[i+1].first );
170 | 				tMinMax.second = std::max ( tMinMax.second, dBlocks[i+1].second );
171 | 			}
172 | 		}
173 | 	}
174 | 	while ( m_dTreeLevels.back().size()>1 );
175 | }
176 | 
177 | template <typename T>
178 | bool MinMaxBuilder_T<T>::Save ( util::FileWriter_c & tWriter, std::string & sError )
179 | {
180 | 	Flush();
181 | 	BuildTree();
182 | 
183 | 	// now save the tree: number of levels, then the size of every level starting from the last built one
184 | 	tWriter.Pack_uint32 ( (uint32_t)m_dTreeLevels.size() );
185 | 	for ( int i = (int)m_dTreeLevels.size()-1; i>=0; i-- )
186 | 		tWriter.Pack_uint32 ( (uint32_t)m_dTreeLevels[i].size() );
187 | 
188 | 	return SaveTreeLevels(tWriter);
189 | }
190 | 
191 | // generic layout: every pair is stored as (min, max-min)
192 | template <typename T>
193 | inline bool MinMaxBuilder_T<T>::SaveTreeLevels ( util::FileWriter_c & tWriter ) const
194 | {
195 | 	for ( int i = (int)m_dTreeLevels.size()-1; i>=0; i-- )
196 | 		for ( auto & tMinMax : m_dTreeLevels[i] )
197 | 		{
198 | 			tWriter.Pack_uint64 ( (uint64_t)tMinMax.first );
199 | 			tWriter.Pack_uint64 ( uint64_t ( tMinMax.second-tMinMax.first ) );
200 | 		}
201 | 
202 | 	return !tWriter.IsError();
203 | }
204 | 
205 | // values below 2 only: min and max are packed into the two low bits of a single byte
206 | template<>
207 | inline bool MinMaxBuilder_T<uint8_t>::SaveTreeLevels ( util::FileWriter_c & tWriter ) const
208 | {
209 | 	for ( int i = (int)m_dTreeLevels.size()-1; i>=0; i-- )
210 | 		for ( auto & tMinMax : m_dTreeLevels[i] )
211 | 		{
212 | 			assert ( tMinMax.first<2 && tMinMax.second<2 );
213 | 			tWriter.Write_uint8 ( ( tMinMax.first << 1 ) | tMinMax.second );
214 | 		}
215 | 
216 | 	return !tWriter.IsError();
217 | }
218 | 
219 | // floats: min and max are both stored via util::FloatToUint (no delta encoding)
220 | template<>
221 | inline bool MinMaxBuilder_T<float>::SaveTreeLevels ( util::FileWriter_c & tWriter ) const
222 | {
223 | 	for ( int i = (int)m_dTreeLevels.size()-1; i>=0; i-- )
224 | 		for ( auto & tMinMax : m_dTreeLevels[i] )
225 | 		{
226 | 			tWriter.Pack_uint32 ( util::FloatToUint ( tMinMax.first ) );
227 | 			tWriter.Pack_uint32 ( util::FloatToUint ( tMinMax.second ) );
228 | 		}
229 | 
230 | 	return !tWriter.IsError();
231 | }
232 | 
233 | } // namespace columnar
234 | 
--------------------------------------------------------------------------------
/knn/space.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2025, Manticore Software LTD (https://manticoresearch.com)
2 | // All rights reserved
3 | //
4 | //
5 | // Licensed under the Apache License, Version 2.0 (the "License");
6 | // you may not use this file except in compliance with the License.
7 | // You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing, software
12 | // distributed under the License is distributed on an "AS IS" BASIS,
13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | // See the License for the specific language governing permissions and
15 | // limitations under the License.
16 | 17 | #pragma once 18 | 19 | #include "hnswlib.h" 20 | #include "quantizer.h" 21 | #include 22 | 23 | namespace knn 24 | { 25 | 26 | struct QuantizationSettings_t; 27 | 28 | class Space_i : public hnswlib::SpaceInterface 29 | { 30 | public: 31 | virtual void SetQuantizationSettings ( ScalarQuantizer_i & tQuantizer ) {} 32 | }; 33 | 34 | class Space_c : public Space_i 35 | { 36 | using Dist_fn = hnswlib::DISTFUNC; 37 | 38 | public: 39 | Space_c ( size_t uDim ) : m_uDim ( uDim ) {} 40 | 41 | Dist_fn get_dist_func() override { return m_fnDist; } 42 | void * get_dist_func_param() override { return &m_uDim; } 43 | 44 | protected: 45 | Dist_fn m_fnDist = nullptr; 46 | size_t m_uDim = 0; 47 | }; 48 | 49 | /////////////////////////////////////////////////////////////////////////////// 50 | 51 | class L2Space32BitFloat_c : public Space_i 52 | { 53 | using Dist_fn = hnswlib::DISTFUNC; 54 | 55 | public: 56 | L2Space32BitFloat_c ( size_t uDim ) : m_tL2S(uDim) {} 57 | 58 | size_t get_data_size() override { return m_tL2S.get_data_size(); } 59 | Dist_fn get_dist_func() override { return m_tL2S.get_dist_func(); } 60 | void * get_dist_func_param() override { return m_tL2S.get_dist_func_param(); } 61 | 62 | private: 63 | hnswlib::L2Space m_tL2S; 64 | }; 65 | 66 | 67 | struct DistFuncParamL2_t 68 | { 69 | size_t m_uDim; 70 | float m_fA; 71 | }; 72 | 73 | 74 | class L2Space8BitFloat_c : public Space_c 75 | { 76 | public: 77 | L2Space8BitFloat_c ( size_t uDim ); 78 | 79 | void * get_dist_func_param() override { return &m_tDistFuncParam; } 80 | size_t get_data_size() override { return m_uDim; } 81 | 82 | void SetQuantizationSettings ( ScalarQuantizer_i & tQuantizer ) override; 83 | 84 | protected: 85 | DistFuncParamL2_t m_tDistFuncParam; 86 | 87 | virtual float CalcAlpha ( const QuantizationSettings_t & tSettings ) const { return ( tSettings.m_fMax-tSettings.m_fMin ) / 255.0; } 88 | }; 89 | 90 | class L2Space1BitFloat_c : public L2Space8BitFloat_c 91 | { 92 | public: 93 | 
L2Space1BitFloat_c ( size_t uDim ); 94 | 95 | size_t get_data_size() override { return (m_uDim+7)>>3; } 96 | 97 | protected: 98 | float CalcAlpha ( const QuantizationSettings_t & tSettings ) const override { return tSettings.m_fMax-tSettings.m_fMin; } 99 | }; 100 | 101 | /////////////////////////////////////////////////////////////////////////////// 102 | 103 | class IPSpace32BitFloat_c : public Space_i 104 | { 105 | using Dist_fn = hnswlib::DISTFUNC; 106 | 107 | public: 108 | IPSpace32BitFloat_c ( size_t uDim ) : m_tIPS(uDim) {} 109 | 110 | size_t get_data_size() override { return m_tIPS.get_data_size(); } 111 | Dist_fn get_dist_func() override { return m_tIPS.get_dist_func(); } 112 | void * get_dist_func_param() override { return m_tIPS.get_dist_func_param(); } 113 | 114 | private: 115 | hnswlib::InnerProductSpace m_tIPS; 116 | }; 117 | 118 | 119 | struct DistFuncParamIP_t 120 | { 121 | size_t m_uDim; 122 | float m_fK; 123 | float m_fB; 124 | 125 | FORCE_INLINE float CalcIP ( int iDotProduct ) const; 126 | }; 127 | 128 | class IPSpace8BitFloat_c : public Space_c 129 | { 130 | public: 131 | IPSpace8BitFloat_c ( size_t uDim ); 132 | 133 | void * get_dist_func_param() override { return &m_tDistFuncParam; } 134 | size_t get_data_size() override { return m_uDim + sizeof(float); } 135 | 136 | void SetQuantizationSettings ( ScalarQuantizer_i & tQuantizer ) override; 137 | 138 | private: 139 | DistFuncParamIP_t m_tDistFuncParam; 140 | }; 141 | 142 | 143 | class IPSpace1BitFloat_c : public IPSpace8BitFloat_c 144 | { 145 | public: 146 | IPSpace1BitFloat_c ( size_t uDim ); 147 | 148 | size_t get_data_size() override { return (m_uDim+7)>>3; } 149 | }; 150 | 151 | 152 | struct DistFuncParamBinary_t 153 | { 154 | size_t m_uDim = 0; 155 | std::function m_fnFetcher; 156 | float m_fCentroidDotCentroid = 0.0f; 157 | float m_fSqrtDim = 0.0f; 158 | float m_fInvSqrtDim = 0.0f; 159 | float m_fDoubleInvSqrtDim = 0.0f; 160 | float m_fMaxError = 0.0f; 161 | 162 | DistFuncParamBinary_t ( 
size_t uDim ) 163 | { 164 | m_uDim = uDim; 165 | m_fSqrtDim = sqrt(uDim); 166 | m_fInvSqrtDim = 1.0f / m_fSqrtDim; 167 | m_fDoubleInvSqrtDim = 2.0f * m_fInvSqrtDim; 168 | 169 | int iDimPadded = CalcPadding ( m_uDim, 64 ); 170 | m_fMaxError = (float) ( 1.9f / sqrt ( float(iDimPadded) - 1.0f ) ); 171 | } 172 | 173 | static int CalcPadding ( int iValue, int iPad ) 174 | { 175 | return ( ( iValue + iPad - 1 ) / iPad ) * iPad; 176 | } 177 | }; 178 | 179 | 180 | class IPSpaceBinaryFloat_c : public Space_c 181 | { 182 | public: 183 | IPSpaceBinaryFloat_c ( size_t uDim, bool bBuild ); 184 | 185 | void * get_dist_func_param() override { return &m_tDistFuncParam; } 186 | size_t get_data_size() override { return ( (m_uDim+7)>>3 ) + sizeof(float)*4; } 187 | 188 | void SetQuantizationSettings ( ScalarQuantizer_i & tQuantizer ) override; 189 | 190 | private: 191 | DistFuncParamBinary_t m_tDistFuncParam; 192 | }; 193 | 194 | class L2SpaceBinaryFloat_c : public Space_c 195 | { 196 | public: 197 | L2SpaceBinaryFloat_c ( size_t uDim, bool bBuild ); 198 | 199 | void * get_dist_func_param() override { return &m_tDistFuncParam; } 200 | size_t get_data_size() override { return ( (m_uDim+7)>>3 ) + sizeof(float)*3; } 201 | 202 | void SetQuantizationSettings ( ScalarQuantizer_i & tQuantizer ) override; 203 | 204 | private: 205 | DistFuncParamBinary_t m_tDistFuncParam; 206 | }; 207 | 208 | 209 | 210 | } // namespace knn 211 | -------------------------------------------------------------------------------- /embeddings/src/model/voyage.rs: -------------------------------------------------------------------------------- 1 | use super::TextModel; 2 | use crate::LibError; 3 | use reqwest::blocking::Client; 4 | 5 | #[derive(Debug)] 6 | pub struct VoyageModel { 7 | pub client: Client, 8 | pub model: String, 9 | pub api_key: String, 10 | } 11 | 12 | pub fn validate_model(model: &str) -> Result<(), String> { 13 | match model { 14 | "voyage-3-large" | "voyage-3.5" | "voyage-3.5-lite" | 
"voyage-code-3" 15 | | "voyage-finance-2" | "voyage-law-2" | "voyage-code-2" => Ok(()), 16 | _ => Err(format!("Invalid model: {}", model)), 17 | } 18 | } 19 | 20 | pub fn validate_api_key(api_key: &str) -> Result<(), String> { 21 | if api_key.is_empty() { 22 | return Err("API key is required".to_string()); 23 | } 24 | 25 | // Trim whitespace and check 26 | let trimmed = api_key.trim(); 27 | if trimmed != api_key { 28 | return Err("API key must not have leading or trailing whitespace".to_string()); 29 | } 30 | 31 | // Voyage API keys typically start with "pa-" prefix 32 | if !api_key.starts_with("pa-") || api_key.len() <= 3 { 33 | return Err("API key must start with pa- and have content".to_string()); 34 | } 35 | 36 | Ok(()) 37 | } 38 | 39 | impl VoyageModel { 40 | pub fn new(model_id: &str, api_key: &str) -> Result> { 41 | let model = model_id.trim_start_matches("voyage/").to_string(); 42 | validate_model(&model).map_err(|_| LibError::RemoteUnsupportedModel)?; 43 | validate_api_key(api_key).map_err(|_| LibError::RemoteInvalidAPIKey)?; 44 | Ok(Self { 45 | client: Client::new(), 46 | model, 47 | api_key: api_key.to_string(), 48 | }) 49 | } 50 | } 51 | 52 | impl TextModel for VoyageModel { 53 | fn predict(&self, texts: &[&str]) -> Result>, Box> { 54 | let url = "https://api.voyageai.com/v1/embeddings"; 55 | 56 | let request_body = serde_json::json!({ 57 | "input": texts, 58 | "model": self.model, 59 | }); 60 | 61 | let response = self 62 | .client 63 | .post(url) 64 | .header("Authorization", format!("Bearer {}", self.api_key)) 65 | .header("Content-Type", "application/json") 66 | .json(&request_body) 67 | .send() 68 | .map_err(|_| LibError::RemoteRequestSendFailed)?; 69 | 70 | let response_body: serde_json::Value = response 71 | .json() 72 | .map_err(|_| LibError::RemoteResponseParseFailed)?; 73 | 74 | // Check if there's an error in the response - proper szError pattern handling 75 | if let Some(error) = response_body.get("error") { 76 | let error_code = error 77 | 
.get("code") 78 | .and_then(|c| c.as_str()) 79 | .unwrap_or("unknown_error"); 80 | 81 | // Map Voyage error codes to appropriate LibError types 82 | let lib_error = match error_code { 83 | "invalid_api_key" | "authentication_error" => LibError::RemoteInvalidAPIKey, 84 | "model_not_found" | "invalid_model" => LibError::RemoteUnsupportedModel, 85 | "rate_limit_exceeded" | "quota_exceeded" => LibError::RemoteRequestSendFailed, 86 | _ => LibError::RemoteResponseParseFailed, 87 | }; 88 | 89 | return Err(Box::new(lib_error)); 90 | } 91 | 92 | // Check for alternative error format (some APIs use different structures) 93 | if let Some(detail) = response_body.get("detail") { 94 | if detail.is_string() { 95 | return Err(Box::new(LibError::RemoteResponseParseFailed)); 96 | } 97 | } 98 | 99 | let embeddings: Vec> = response_body["data"] 100 | .as_array() 101 | .unwrap_or(&Vec::new()) 102 | .iter() 103 | .map(|item| { 104 | item["embedding"] 105 | .as_array() 106 | .unwrap_or(&Vec::new()) 107 | .iter() 108 | .map(|v| v.as_f64().unwrap_or(0.0) as f32) 109 | .collect() 110 | }) 111 | .collect(); 112 | 113 | // Validate that we got embeddings - never return empty vectors 114 | if embeddings.is_empty() { 115 | return Err(Box::new(LibError::RemoteResponseParseFailed)); 116 | } 117 | 118 | // Validate embedding dimensions and handle empty individual embeddings 119 | let expected_dim = self.get_hidden_size(); 120 | for embedding in embeddings.iter() { 121 | if embedding.is_empty() { 122 | return Err(Box::new(LibError::RemoteResponseParseFailed)); 123 | } 124 | if embedding.len() != expected_dim { 125 | // Some models might return different dimensions, but we should validate 126 | // For now, we'll be lenient but could add stricter validation later 127 | } 128 | } 129 | 130 | Ok(embeddings) 131 | } 132 | 133 | fn get_hidden_size(&self) -> usize { 134 | match self.model.as_str() { 135 | "voyage-3-large" => 1024, // Default 1024, supports 256, 512, 2048 136 | "voyage-3.5" => 1024, // 
Default 1024, supports 256, 512, 2048 137 | "voyage-3.5-lite" => 1024, // Default 1024, supports 256, 512, 2048 138 | "voyage-code-3" => 1024, // Default 1024, supports 256, 512, 2048 139 | "voyage-finance-2" => 1024, 140 | "voyage-law-2" => 1024, 141 | "voyage-code-2" => 1536, 142 | _ => panic!("Unknown model"), 143 | } 144 | } 145 | 146 | fn get_max_input_len(&self) -> usize { 147 | match self.model.as_str() { 148 | "voyage-3-large" => 32000, 149 | "voyage-3.5" => 32000, 150 | "voyage-3.5-lite" => 32000, 151 | "voyage-code-3" => 32000, 152 | "voyage-finance-2" => 32000, 153 | "voyage-law-2" => 16000, 154 | "voyage-code-2" => 16000, 155 | _ => 8192, // Default fallback 156 | } 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /.github/workflows/test_template.yml: -------------------------------------------------------------------------------- 1 | name: Test Template 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | CTEST_CONFIGURATION_TYPE: 7 | required: true 8 | type: string 9 | CTEST_START: 10 | required: false 11 | type: number 12 | default: 1 13 | CTEST_END: 14 | required: false 15 | type: number 16 | default: 999999 17 | artifact_name: 18 | required: true 19 | type: string 20 | build_artifact_name: 21 | required: false 22 | type: string 23 | default: "" 24 | timeout: 25 | required: false 26 | type: number 27 | default: 60 28 | xml_command: 29 | required: false 30 | type: string 31 | default: "cd build; cp -r Testing/2*/Test.xml .; xsltproc -o junit_tests.xml ../misc/junit/ctest2junit.xsl Test.xml" 32 | MANTICORE_LOCATOR: 33 | required: false 34 | type: string 35 | default: "" 36 | USE_AVX2: 37 | required: false 38 | type: boolean 39 | default: true 40 | 41 | jobs: 42 | test: 43 | name: ${{ inputs.CTEST_CONFIGURATION_TYPE }}_${{ inputs.CTEST_START }}_${{ inputs.CTEST_END }} 44 | runs-on: ubuntu-22.04 45 | timeout-minutes: ${{ inputs.timeout }} 46 | continue-on-error: true 47 | defaults: 48 | run: 49 | shell: 
bash 50 | container: 51 | image: manticoresearch/ubertests_ctest:3263_mar_2024 52 | env: 53 | DIAGNOSTIC: 1 54 | CACHEB: ../cache 55 | NO_BUILD: 1 56 | CTEST_START: ${{ inputs.CTEST_START }} 57 | CTEST_END: ${{ inputs.CTEST_END }} 58 | MANTICORE_LOCATOR: ${{ inputs.MANTICORE_LOCATOR }} 59 | # The following is useful to test a specific test, just uncomment it, no need to disable CTEST_START/END 60 | # CTEST_REGEX: test_234 61 | steps: 62 | - name: Checkout repository 63 | uses: actions/checkout@v3 64 | with: 65 | token: ${{ secrets.GITHUB_TOKEN }} 66 | 67 | - name: Print Manticore Locator 68 | run: | 69 | echo "Using MANTICORE_LOCATOR: '${{ inputs.MANTICORE_LOCATOR }}'" 70 | if [[ -n "${{ inputs.MANTICORE_LOCATOR }}" ]]; then 71 | echo "Manticore locator is set and will be used for testing" 72 | else 73 | echo "Manticore locator is empty, using default manticore sources" 74 | fi 75 | 76 | - name: Determine branch name for cache 77 | id: branch 78 | run: | 79 | if [ "${{ github.event_name }}" = "pull_request" ]; then 80 | echo "branch_name=${{ github.head_ref }}" >> $GITHUB_OUTPUT 81 | else 82 | echo "branch_name=${{ github.ref_name }}" >> $GITHUB_OUTPUT 83 | fi 84 | 85 | - name: Tests container entrypoint 86 | run: bash /entry_point.sh & 87 | 88 | - name: Set build artifact name 89 | id: build_artifact 90 | run: | 91 | if [ -n "${{ inputs.build_artifact_name }}" ]; then 92 | echo "name=${{ inputs.build_artifact_name }}" >> $GITHUB_OUTPUT 93 | else 94 | echo "name=build_jammy_${{ inputs.CTEST_CONFIGURATION_TYPE }}_x86_64" >> $GITHUB_OUTPUT 95 | fi 96 | 97 | - name: Download build artifacts 98 | uses: manticoresoftware/download_artifact_with_retries@v4 99 | with: 100 | name: ${{ steps.build_artifact.outputs.name }} 101 | path: . 
102 | 
103 |       - name: Download embeddings lib
104 |         uses: manticoresoftware/download_artifact_with_retries@v4
105 |         continue-on-error: true
106 |         with:
107 |           name: embeddings_linux_x86_64
108 |           path: ./embeddings-lib/
109 | 
110 |       - name: Initialization of embeddings lib
111 |         run: |
112 |           mkdir -p embeddings/target/release
113 |           # the download above is allowed to fail, so only move the library when it actually arrived
114 |           if [ -d ./embeddings-lib/build ]; then
115 |             mv ./embeddings-lib/build/* embeddings/target/release/
116 |           else
117 |             echo "embeddings lib artifact is missing, continuing without it"
118 |           fi
119 |           rm -fr ./embeddings-lib
120 | 
121 |       - name: Check out main cache before building
122 |         uses: actions/cache@v4
123 |         with:
124 |           path: cache
125 |           enableCrossOsArchive: true
126 |           key: build_linux_x86_64_${{ steps.branch.outputs.branch_name }}
127 | 
128 |       - name: Check out deps cache before building
129 |         uses: actions/cache@v4
130 |         with:
131 |           path: build/_deps/cache
132 |           enableCrossOsArchive: true
133 |           key: build_linux_x86_64_deps_${{ steps.branch.outputs.branch_name }}
134 | 
135 |       - name: List files
136 |         run: find .
137 | 
138 |       - name: List of modules
139 |         run: find / -name "*.so" 2>/dev/null || echo "No *.so files found"
140 | 
141 |       - name: Replace standard libs to AVX2
142 |         if: ${{ inputs.USE_AVX2 == true }}
143 |         run: |
144 |           if [ -d /__w/columnar/columnar/build/columnar ]; then
145 |             echo "Move linux based libs"
146 |             mv /__w/columnar/columnar/build/columnar/lib_manticore_columnar_avx2.so /__w/columnar/columnar/build/columnar/lib_manticore_columnar.so
147 |             mv /__w/columnar/columnar/build/secondary/lib_manticore_secondary_avx2.so /__w/columnar/columnar/build/secondary/lib_manticore_secondary.so
148 |           fi
149 | 
150 |       - name: 🚀 Test
151 |         id: test
152 |         # --timeout may not work, see https://gitlab.kitware.com/cmake/cmake/-/issues/23979
153 |         run: ctest -VV -S cmake/citest.cmake --no-compress-output --timeout 60
154 |         continue-on-error: true
155 | 
156 |       - name: Remember status
157 |         if: always()
158 |         run: echo "${{ steps.test.outcome }}" > build/status_${{ inputs.artifact_name }}
159 | 
160 |       - name: Prepare test report xmls
161 |         if: always()
162 |         continue-on-error: true
163 |         run: ${{ inputs.xml_command }}
164 | 
165 |       - name: Upload test artifacts
166 |         if: always()
167 |         continue-on-error: true
168 |         uses: manticoresoftware/upload_artifact_with_retries@v4
169 |         with:
170 |           name: ${{ inputs.artifact_name }}
171 |           path: "build/junit*.xml build/_deps/manticore-build/test/test_*/report.* build/_deps/manticore-build/test/error*.txt build/_deps/manticore-build/test/*log build/status*"
172 | 
--------------------------------------------------------------------------------