├── .github └── workflows │ └── build.yml ├── .gitignore ├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE.txt ├── README.md ├── examples ├── citus │ ├── CMakeLists.txt │ └── example.cpp ├── cohere │ ├── CMakeLists.txt │ └── example.cpp ├── disco │ ├── CMakeLists.txt │ └── example.cpp ├── hybrid │ ├── CMakeLists.txt │ └── example.cpp ├── loading │ ├── CMakeLists.txt │ └── example.cpp ├── openai │ ├── CMakeLists.txt │ └── example.cpp ├── rdkit │ ├── CMakeLists.txt │ └── example.cpp └── sparse │ ├── CMakeLists.txt │ └── example.cpp ├── include └── pgvector │ ├── halfvec.hpp │ ├── pqxx.hpp │ ├── sparsevec.hpp │ └── vector.hpp └── test ├── halfvec_test.cpp ├── main.cpp ├── pqxx_test.cpp ├── sparsevec_test.cpp └── vector_test.cpp /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-24.04 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: ankane/setup-postgres@v1 9 | with: 10 | database: pgvector_cpp_test 11 | dev-files: true 12 | - run: | 13 | cd /tmp 14 | git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git 15 | cd pgvector 16 | make 17 | sudo make install 18 | 19 | - run: cmake -S . -B build -DBUILD_TESTING=ON -DCMAKE_CXX_STANDARD=17 20 | - run: cmake --build build 21 | - run: build/test 22 | 23 | - run: cmake -S . -B build -DBUILD_TESTING=ON -DCMAKE_CXX_STANDARD=20 24 | - run: cmake --build build 25 | - run: build/test 26 | 27 | - run: | 28 | sudo apt-get install valgrind 29 | valgrind --leak-check=yes build/test 30 | 31 | # test install 32 | - run: rm -r build 33 | - run: cmake -S . 
-B build 34 | - run: cmake --build build 35 | - run: sudo cmake --install build 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.2.2 (2025-02-23) 2 | 3 | - Added map constructor to `SparseVector` 4 | - Added `std::span` constructor to `HalfVector` and `SparseVector` 5 | 6 | ## 0.2.1 (2025-01-15) 7 | 8 | - Added `std::span` constructor to `Vector` 9 | - Added support for CMake and FetchContent 10 | 11 | ## 0.2.0 (2024-07-10) 12 | 13 | - Added support for `halfvec` and `sparsevec` types 14 | - Fixed error with MSVC 15 | 16 | ## 0.1.1 (2022-11-13) 17 | 18 | - Added default constructor 19 | 20 | ## 0.1.0 (2021-09-10) 21 | 22 | - First release 23 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(pgvector VERSION 0.2.2 LANGUAGES CXX) 4 | 5 | include(GNUInstallDirs) 6 | 7 | add_library(pgvector INTERFACE) 8 | add_library(pgvector::pgvector ALIAS pgvector) 9 | 10 | target_compile_features(pgvector INTERFACE cxx_std_17) 11 | 12 | target_include_directories( 13 | pgvector 14 | INTERFACE 15 | "${PROJECT_SOURCE_DIR}/include/" 16 | ) 17 | 18 | install( 19 | DIRECTORY "${PROJECT_SOURCE_DIR}/include/" 20 | DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" 21 | ) 22 | 23 | if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) 24 | option(BUILD_TESTING "" OFF) 25 | 26 | if(BUILD_TESTING) 27 | include(FetchContent) 28 | 29 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 30 | FetchContent_MakeAvailable(libpqxx) 31 | 32 | 
add_executable(test test/halfvec_test.cpp test/main.cpp test/pqxx_test.cpp test/sparsevec_test.cpp test/vector_test.cpp) 33 | target_link_libraries(test PRIVATE libpqxx::pqxx pgvector::pgvector) 34 | if(NOT MSVC) 35 | target_compile_options(test PRIVATE -Wall -Wextra -Wpedantic -Werror) 36 | endif() 37 | endif() 38 | endif() 39 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021-2025 Andrew Kane 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pgvector-cpp 2 | 3 | [pgvector](https://github.com/pgvector/pgvector) support for C++ 4 | 5 | Supports [libpqxx](https://github.com/jtv/libpqxx) 6 | 7 | [![Build Status](https://github.com/pgvector/pgvector-cpp/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-cpp/actions) 8 | 9 | ## Installation 10 | 11 | Add [the headers](https://github.com/pgvector/pgvector-cpp/tree/v0.2.2/include) to your project (supports C++17 and greater). 12 | 13 | There is also support for CMake and FetchContent: 14 | 15 | ```cmake 16 | include(FetchContent) 17 | 18 | FetchContent_Declare(pgvector GIT_REPOSITORY https://github.com/pgvector/pgvector-cpp.git GIT_TAG v0.2.2) 19 | FetchContent_MakeAvailable(pgvector) 20 | 21 | target_link_libraries(app PRIVATE pgvector::pgvector) 22 | ``` 23 | 24 | ## Getting Started 25 | 26 | Follow the instructions for your database library: 27 | 28 | - [libpqxx](#libpqxx) 29 | 30 | Or check out some examples: 31 | 32 | - [Embeddings](https://github.com/pgvector/pgvector-cpp/blob/master/examples/openai/example.cpp) with OpenAI 33 | - [Binary embeddings](https://github.com/pgvector/pgvector-cpp/blob/master/examples/cohere/example.cpp) with Cohere 34 | - [Hybrid search](https://github.com/pgvector/pgvector-cpp/blob/master/examples/hybrid/example.cpp) with llama.cpp (Reciprocal Rank Fusion) 35 | - [Sparse search](https://github.com/pgvector/pgvector-cpp/blob/master/examples/sparse/example.cpp) with Text Embeddings Inference 36 | - [Morgan fingerprints](https://github.com/pgvector/pgvector-cpp/blob/master/examples/rdkit/example.cpp) with RDKit 37 | - [Recommendations](https://github.com/pgvector/pgvector-cpp/blob/master/examples/disco/example.cpp) with Disco 38 | - [Horizontal 
scaling](https://github.com/pgvector/pgvector-cpp/blob/master/examples/citus/example.cpp) with Citus 39 | - [Bulk loading](https://github.com/pgvector/pgvector-cpp/blob/master/examples/loading/example.cpp) with `COPY` 40 | 41 | ## libpqxx 42 | 43 | Include the header 44 | 45 | ```cpp 46 | #include <pgvector/pqxx.hpp> 47 | ``` 48 | 49 | Enable the extension 50 | 51 | ```cpp 52 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 53 | ``` 54 | 55 | Create a table 56 | 57 | ```cpp 58 | tx.exec("CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))"); 59 | ``` 60 | 61 | Insert a vector 62 | 63 | ```cpp 64 | auto embedding = pgvector::Vector({1, 2, 3}); 65 | tx.exec("INSERT INTO items (embedding) VALUES ($1)", {embedding}); 66 | ``` 67 | 68 | Get the nearest neighbors 69 | 70 | ```cpp 71 | pqxx::result r = tx.exec("SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", {embedding}); 72 | ``` 73 | 74 | Retrieve a vector 75 | 76 | ```cpp 77 | auto row = tx.exec("SELECT embedding FROM items LIMIT 1").one_row(); 78 | auto embedding = row[0].as<pgvector::Vector>(); 79 | ``` 80 | 81 | Use `std::optional<pgvector::Vector>` if the value could be `NULL` 82 | 83 | ## Reference 84 | 85 | ### Vectors 86 | 87 | Create a vector from a `std::vector<float>` 88 | 89 | ```cpp 90 | auto vec = pgvector::Vector({1, 2, 3}); 91 | ``` 92 | 93 | Convert to a `std::vector<float>` 94 | 95 | ```cpp 96 | auto float_vec = static_cast<std::vector<float>>(vec); 97 | ``` 98 | 99 | ### Half Vectors 100 | 101 | Create a half vector from a `std::vector<float>` 102 | 103 | ```cpp 104 | auto vec = pgvector::HalfVector({1, 2, 3}); 105 | ``` 106 | 107 | Convert to a `std::vector<float>` 108 | 109 | ```cpp 110 | auto float_vec = static_cast<std::vector<float>>(vec); 111 | ``` 112 | 113 | ### Sparse Vectors 114 | 115 | Create a sparse vector from a `std::vector<float>` 116 | 117 | ```cpp 118 | auto vec = pgvector::SparseVector({1, 0, 2, 0, 3, 0}); 119 | ``` 120 | 121 | Or a map of non-zero elements 122 | 123 | ```cpp 124 | std::unordered_map<int, float> map = {{0, 1}, {2, 2}, {4, 3}}; 125 | auto vec = pgvector::SparseVector(map, 6); 126 
| ``` 127 | 128 | Get the number of dimensions 129 | 130 | ```cpp 131 | int dim = vec.dimensions(); 132 | ``` 133 | 134 | Get the indices of non-zero elements 135 | 136 | ```cpp 137 | auto indices = vec.indices(); 138 | ``` 139 | 140 | Get the values of non-zero elements 141 | 142 | ```cpp 143 | auto values = vec.values(); 144 | ``` 145 | 146 | ## History 147 | 148 | View the [changelog](https://github.com/pgvector/pgvector-cpp/blob/master/CHANGELOG.md) 149 | 150 | ## Contributing 151 | 152 | Everyone is encouraged to help improve this project. Here are a few ways you can help: 153 | 154 | - [Report bugs](https://github.com/pgvector/pgvector-cpp/issues) 155 | - Fix bugs and [submit pull requests](https://github.com/pgvector/pgvector-cpp/pulls) 156 | - Write, clarify, or fix documentation 157 | - Suggest or add new features 158 | 159 | To get started with development: 160 | 161 | ```sh 162 | git clone https://github.com/pgvector/pgvector-cpp.git 163 | cd pgvector-cpp 164 | createdb pgvector_cpp_test 165 | cmake -S . -B build -DBUILD_TESTING=ON 166 | cmake --build build 167 | build/test 168 | ``` 169 | 170 | To run an example: 171 | 172 | ```sh 173 | cd examples/loading 174 | createdb pgvector_example 175 | cmake -S . -B build 176 | cmake --build build 177 | build/example 178 | ``` 179 | -------------------------------------------------------------------------------- /examples/citus/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(example) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | include(FetchContent) 8 | 9 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 10 | FetchContent_MakeAvailable(libpqxx) 11 | 12 | add_subdirectory("${PROJECT_SOURCE_DIR}/../.." 
pgvector) 13 | 14 | add_executable(example example.cpp) 15 | target_link_libraries(example PRIVATE libpqxx::pqxx pgvector::pgvector) 16 | -------------------------------------------------------------------------------- /examples/citus/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | std::vector> random_embeddings(int rows, int dimensions) { 10 | std::random_device rd; 11 | std::mt19937_64 prng(rd()); 12 | std::uniform_real_distribution dist(0, 1); 13 | 14 | std::vector> embeddings; 15 | embeddings.reserve(rows); 16 | for (int i = 0; i < rows; i++) { 17 | std::vector embedding; 18 | embedding.reserve(dimensions); 19 | for (int j = 0; j < dimensions; j++) { 20 | embedding.push_back(dist(prng)); 21 | } 22 | embeddings.push_back(embedding); 23 | } 24 | return embeddings; 25 | } 26 | 27 | std::vector random_categories(int rows) { 28 | std::random_device rd; 29 | std::mt19937_64 prng(rd()); 30 | std::uniform_int_distribution dist(1, 100); 31 | 32 | std::vector categories; 33 | categories.reserve(rows); 34 | for (int i = 0; i < rows; i++) { 35 | categories.push_back(dist(prng)); 36 | } 37 | return categories; 38 | } 39 | 40 | int main() { 41 | // generate random data 42 | int rows = 100000; 43 | int dimensions = 128; 44 | auto embeddings = random_embeddings(rows, dimensions); 45 | auto categories = random_categories(rows); 46 | auto queries = random_embeddings(10, dimensions); 47 | 48 | // enable extensions 49 | pqxx::connection conn("dbname=pgvector_citus"); 50 | pqxx::nontransaction tx(conn); 51 | tx.exec("CREATE EXTENSION IF NOT EXISTS citus"); 52 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 53 | 54 | // GUC variables set on the session do not propagate to Citus workers 55 | // https://github.com/citusdata/citus/issues/462 56 | // you can either: 57 | // 1. set them on the system, user, or database and reconnect 58 | // 2. 
set them for a transaction with SET LOCAL 59 | tx.exec("ALTER DATABASE pgvector_citus SET maintenance_work_mem = '512MB'"); 60 | tx.exec("ALTER DATABASE pgvector_citus SET hnsw.ef_search = 20"); 61 | conn.close(); 62 | 63 | // reconnect for updated GUC variables to take effect 64 | pqxx::connection conn2("dbname=pgvector_citus"); 65 | pqxx::nontransaction tx2(conn2); 66 | 67 | std::cout << "Creating distributed table" << std::endl; 68 | tx2.exec("DROP TABLE IF EXISTS items"); 69 | tx2.exec("CREATE TABLE items (id bigserial, embedding vector(128), category_id bigint, PRIMARY KEY (id, category_id))"); 70 | tx2.exec("SET citus.shard_count = 4"); 71 | tx2.exec("SELECT create_distributed_table('items', 'category_id')"); 72 | 73 | // libpqxx does not support binary COPY 74 | std::cout << "Loading data in parallel" << std::endl; 75 | auto stream = pqxx::stream_to::table(tx2, {"items"}, {"embedding", "category_id"}); 76 | for (size_t i = 0; i < embeddings.size(); i++) { 77 | stream << std::make_tuple(pgvector::Vector(embeddings[i]), categories[i]); 78 | } 79 | stream.complete(); 80 | 81 | std::cout << "Creating index in parallel" << std::endl; 82 | tx2.exec("CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)"); 83 | 84 | std::cout << "Running distributed queries" << std::endl; 85 | for (auto& query : queries) { 86 | pqxx::result result = tx2.exec( 87 | "SELECT id FROM items ORDER BY embedding <-> $1 LIMIT 10", 88 | pqxx::params{pgvector::Vector(query)} 89 | ); 90 | for (const auto& row : result) { 91 | std::cout << row[0].as() << " "; 92 | } 93 | std::cout << std::endl; 94 | } 95 | 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /examples/cohere/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(example) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | include(FetchContent) 8 | 9 | FetchContent_Declare(cpr 
GIT_REPOSITORY https://github.com/libcpr/cpr.git GIT_TAG 1.11.1) 10 | FetchContent_Declare(json GIT_REPOSITORY https://github.com/nlohmann/json.git GIT_TAG v3.11.3) 11 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 12 | FetchContent_MakeAvailable(cpr json libpqxx) 13 | 14 | add_subdirectory("${PROJECT_SOURCE_DIR}/../.." pgvector) 15 | 16 | add_executable(example example.cpp) 17 | target_link_libraries(example PRIVATE cpr::cpr libpqxx::pqxx nlohmann_json::nlohmann_json pgvector::pgvector) 18 | -------------------------------------------------------------------------------- /examples/cohere/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using json = nlohmann::json; 13 | 14 | // https://docs.cohere.com/reference/embed 15 | std::vector embed(const std::vector& texts, const std::string& input_type, char *api_key) { 16 | std::string url = "https://api.cohere.com/v2/embed"; 17 | json data = { 18 | {"texts", texts}, 19 | {"model", "embed-v4.0"}, 20 | {"input_type", input_type}, 21 | {"embedding_types", {"ubinary"}} 22 | }; 23 | 24 | cpr::Response r = cpr::Post( 25 | cpr::Url{url}, 26 | cpr::Body{data.dump()}, 27 | cpr::Bearer{api_key}, 28 | cpr::Header{{"Content-Type", "application/json"}} 29 | ); 30 | if (r.status_code != 200) { 31 | throw std::runtime_error("Bad status: " + std::to_string(r.status_code)); 32 | } 33 | json response = json::parse(r.text); 34 | 35 | std::vector embeddings; 36 | for (auto& v : response["embeddings"]["ubinary"]) { 37 | std::stringstream buf; 38 | for (uint8_t c : v) { 39 | std::bitset<8> b{c}; 40 | buf << b.to_string(); 41 | } 42 | embeddings.emplace_back(buf.str()); 43 | } 44 | return embeddings; 45 | } 46 | 47 | int main() { 48 | char *api_key = std::getenv("CO_API_KEY"); 49 | if (!api_key) { 50 | std::cout << "Set 
CO_API_KEY" << std::endl; 51 | return 1; 52 | } 53 | 54 | pqxx::connection conn("dbname=pgvector_example"); 55 | 56 | pqxx::nontransaction tx(conn); 57 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 58 | tx.exec("DROP TABLE IF EXISTS documents"); 59 | tx.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1536))"); 60 | 61 | std::vector input = { 62 | "The dog is barking", 63 | "The cat is purring", 64 | "The bear is growling" 65 | }; 66 | auto embeddings = embed(input, "search_document", api_key); 67 | for (size_t i = 0; i < input.size(); i++) { 68 | tx.exec("INSERT INTO documents (content, embedding) VALUES ($1, $2)", pqxx::params{input[i], embeddings[i]}); 69 | } 70 | 71 | std::string query = "forest"; 72 | auto query_embedding = embed({query}, "search_query", api_key)[0]; 73 | pqxx::result result = tx.exec("SELECT content FROM documents ORDER BY embedding <~> $1 LIMIT 5", pqxx::params{query_embedding}); 74 | for (const auto& row : result) { 75 | std::cout << row[0].as() << std::endl; 76 | } 77 | 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /examples/disco/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(example) 4 | 5 | set(CMAKE_CXX_STANDARD 20) 6 | 7 | include(FetchContent) 8 | 9 | FetchContent_Declare(disco GIT_REPOSITORY https://github.com/ankane/disco-cpp.git GIT_TAG v0.1.3) 10 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 11 | FetchContent_MakeAvailable(disco libpqxx) 12 | 13 | add_subdirectory("${PROJECT_SOURCE_DIR}/../.." 
pgvector) 14 | 15 | add_executable(example example.cpp) 16 | target_link_libraries(example PRIVATE disco::disco libpqxx::pqxx pgvector::pgvector) 17 | -------------------------------------------------------------------------------- /examples/disco/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | using disco::Dataset; 15 | using disco::Recommender; 16 | // Re-encodes str as UTF-8, treating every input byte as one ISO-8859-1 (Latin-1) code point; bytes are read as unsigned char so the result does not depend on whether plain char is signed. Returns the converted string. 17 | std::string convert_to_utf8(const std::string& str) { 18 | std::string out; 19 | for (unsigned char byte : str) { 20 | if (byte < 0x80) { 21 | out += static_cast<char>(byte); // ASCII maps to itself 22 | } else { 23 | out += static_cast<char>(0xC0 | (byte >> 6)); out += static_cast<char>(0x80 | (byte & 0x3F)); // 0x80-0xFF become a two-byte sequence (lead byte 0xC2 or 0xC3) 24 | } 25 | } 26 | return out; 27 | } 28 | 29 | Dataset load_movielens(const std::string& path) { 30 | std::string line; 31 | 32 | // read movies 33 | std::unordered_map movies; 34 | std::ifstream movies_file(path + "/u.item"); 35 | assert(movies_file.is_open()); 36 | while (std::getline(movies_file, line)) { 37 | std::string::size_type n = line.find('|'); 38 | std::string::size_type n2 = line.find('|', n + 1); 39 | movies.emplace(std::make_pair(line.substr(0, n), convert_to_utf8(line.substr(n + 1, n2 - n - 1)))); 40 | } 41 | 42 | // read ratings and create dataset 43 | auto data = Dataset(); 44 | std::ifstream ratings_file(path + "/u.data"); 45 | assert(ratings_file.is_open()); 46 | while (std::getline(ratings_file, line)) { 47 | std::string::size_type n = line.find('\t'); 48 | std::string::size_type n2 = line.find('\t', n + 1); 49 | std::string::size_type n3 = line.find('\t', n2 + 1); 50 | data.push( 51 | std::stoi(line.substr(0, n)), 52 | movies.at(line.substr(n + 1, n2 - n - 1)), 53 | std::stof(line.substr(n2 + 1, n3 - n2 - 1)) 54 | ); 55 | } 56 | 57 | return data; 58 | } 59 | 60 | int main() { 61 | // https://grouplens.org/datasets/movielens/100k/ 62 | char *movielens_path = 
std::getenv("MOVIELENS_100K_PATH"); 63 | if (!movielens_path) { 64 | std::cout << "Set MOVIELENS_100K_PATH" << std::endl; 65 | return 1; 66 | } 67 | 68 | pqxx::connection conn("dbname=pgvector_example"); 69 | 70 | pqxx::nontransaction tx(conn); 71 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 72 | tx.exec("DROP TABLE IF EXISTS users"); 73 | tx.exec("DROP TABLE IF EXISTS movies"); 74 | tx.exec("CREATE TABLE users (id integer PRIMARY KEY, factors vector(20))"); 75 | tx.exec("CREATE TABLE movies (name text PRIMARY KEY, factors vector(20))"); 76 | 77 | auto data = load_movielens(movielens_path); 78 | auto recommender = Recommender::fit_explicit(data, { .factors = 20 }); 79 | 80 | for (auto& user_id : recommender.user_ids()) { 81 | auto factors = pgvector::Vector(*recommender.user_factors(user_id)); 82 | tx.exec("INSERT INTO users (id, factors) VALUES ($1, $2)", pqxx::params{user_id, factors}); 83 | } 84 | 85 | for (auto& item_id : recommender.item_ids()) { 86 | auto factors = pgvector::Vector(*recommender.item_factors(item_id)); 87 | tx.exec("INSERT INTO movies (name, factors) VALUES ($1, $2)", pqxx::params{item_id, factors}); 88 | } 89 | 90 | std::string movie = "Star Wars (1977)"; 91 | std::cout << "Item-based recommendations for " << movie << std::endl; 92 | pqxx::result result = tx.exec("SELECT name FROM movies WHERE name != $1 ORDER BY factors <=> (SELECT factors FROM movies WHERE name = $1) LIMIT 5", pqxx::params{movie}); 93 | for (const auto& row : result) { 94 | std::cout << "- " << row[0].as() << std::endl; 95 | } 96 | 97 | int user_id = 123; 98 | std::cout << std::endl << "User-based recommendations for " << user_id << std::endl; 99 | result = tx.exec("SELECT name FROM movies ORDER BY factors <#> (SELECT factors FROM users WHERE id = $1) LIMIT 5", pqxx::params{user_id}); 100 | for (const auto& row : result) { 101 | std::cout << "- " << row[0].as() << std::endl; 102 | } 103 | 104 | return 0; 105 | } 106 | 
-------------------------------------------------------------------------------- /examples/hybrid/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(example) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | include(FetchContent) 8 | 9 | FetchContent_Declare(cpr GIT_REPOSITORY https://github.com/libcpr/cpr.git GIT_TAG 1.11.1) 10 | FetchContent_Declare(json GIT_REPOSITORY https://github.com/nlohmann/json.git GIT_TAG v3.11.3) 11 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 12 | FetchContent_MakeAvailable(cpr json libpqxx) 13 | 14 | add_subdirectory("${PROJECT_SOURCE_DIR}/../.." pgvector) 15 | 16 | add_executable(example example.cpp) 17 | target_link_libraries(example PRIVATE cpr::cpr libpqxx::pqxx nlohmann_json::nlohmann_json pgvector::pgvector) 18 | -------------------------------------------------------------------------------- /examples/hybrid/example.cpp: -------------------------------------------------------------------------------- 1 | // run with 2 | // llama-server -hf nomic-ai/nomic-embed-text-v1.5-GGUF --embedding --pooling mean 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using json = nlohmann::json; 15 | 16 | std::vector> embed(const std::vector& texts, const std::string& taskType) { 17 | // nomic-embed-text-v1.5 uses a task prefix 18 | // https://huggingface.co/nomic-ai/nomic-embed-text-v1.5 19 | std::vector input; 20 | input.reserve(texts.size()); 21 | for (auto& v : texts) { 22 | input.push_back(taskType + ": " + v); 23 | } 24 | 25 | std::string url = "http://localhost:8080/v1/embeddings"; 26 | json data = { 27 | {"input", input} 28 | }; 29 | 30 | cpr::Response r = cpr::Post( 31 | cpr::Url{url}, 32 | cpr::Body{data.dump()}, 33 | cpr::Header{{"Content-Type", "application/json"}} 34 | ); 35 | if (r.status_code != 200) { 36 | 
throw std::runtime_error("Bad status: " + std::to_string(r.status_code)); 37 | } 38 | json response = json::parse(r.text); 39 | 40 | std::vector> embeddings; 41 | for (auto& v : response["data"]) { 42 | embeddings.emplace_back(v["embedding"]); 43 | } 44 | return embeddings; 45 | } 46 | 47 | int main() { 48 | pqxx::connection conn("dbname=pgvector_example"); 49 | 50 | pqxx::nontransaction tx(conn); 51 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 52 | tx.exec("DROP TABLE IF EXISTS documents"); 53 | tx.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(768))"); 54 | tx.exec("CREATE INDEX ON documents USING GIN (to_tsvector('english', content))"); 55 | 56 | std::vector input = { 57 | "The dog is barking", 58 | "The cat is purring", 59 | "The bear is growling" 60 | }; 61 | auto embeddings = embed(input, "search_document"); 62 | 63 | for (size_t i = 0; i < input.size(); i++) { 64 | tx.exec("INSERT INTO documents (content, embedding) VALUES ($1, $2)", pqxx::params{input[i], pgvector::Vector(embeddings[i])}); 65 | } 66 | 67 | std::string sql = R"( 68 | WITH semantic_search AS ( 69 | SELECT id, RANK () OVER (ORDER BY embedding <=> $2) AS rank 70 | FROM documents 71 | ORDER BY embedding <=> $2 72 | LIMIT 20 73 | ), 74 | keyword_search AS ( 75 | SELECT id, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC) 76 | FROM documents, plainto_tsquery('english', $1) query 77 | WHERE to_tsvector('english', content) @@ query 78 | ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC 79 | LIMIT 20 80 | ) 81 | SELECT 82 | COALESCE(semantic_search.id, keyword_search.id) AS id, 83 | COALESCE(1.0 / ($3 + semantic_search.rank), 0.0) + 84 | COALESCE(1.0 / ($3 + keyword_search.rank), 0.0) AS score 85 | FROM semantic_search 86 | FULL OUTER JOIN keyword_search ON semantic_search.id = keyword_search.id 87 | ORDER BY score DESC 88 | LIMIT 5 89 | )"; 90 | std::string query = "growling bear"; 91 | auto query_embedding 
= embed({query}, "search_query")[0]; 92 | double k = 60; 93 | pqxx::result result = tx.exec(sql, pqxx::params{query, pgvector::Vector(query_embedding), k}); 94 | for (const auto& row : result) { 95 | std::cout << "document: " << row[0].as() << ", RRF score: " << row[1].as() << std::endl; 96 | } 97 | 98 | return 0; 99 | } 100 | -------------------------------------------------------------------------------- /examples/loading/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(example) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | include(FetchContent) 8 | 9 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 10 | FetchContent_MakeAvailable(libpqxx) 11 | 12 | add_subdirectory("${PROJECT_SOURCE_DIR}/../.." pgvector) 13 | 14 | add_executable(example example.cpp) 15 | target_link_libraries(example PRIVATE libpqxx::pqxx pgvector::pgvector) 16 | -------------------------------------------------------------------------------- /examples/loading/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | int main() { 9 | // generate random data 10 | int rows = 100000; 11 | int dimensions = 128; 12 | std::vector> embeddings; 13 | embeddings.reserve(rows); 14 | std::mt19937_64 prng; 15 | std::uniform_real_distribution dist(0, 1); 16 | for (int i = 0; i < rows; i++) { 17 | std::vector embedding; 18 | embedding.reserve(dimensions); 19 | for (int j = 0; j < dimensions; j++) { 20 | embedding.push_back(dist(prng)); 21 | } 22 | embeddings.push_back(embedding); 23 | } 24 | 25 | // enable extension 26 | pqxx::connection conn("dbname=pgvector_example"); 27 | pqxx::nontransaction tx(conn); 28 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 29 | 30 | // create table 31 | tx.exec("DROP TABLE IF EXISTS items"); 32 | tx.exec("CREATE 
TABLE items (id bigserial, embedding vector(128))"); 33 | 34 | // load data 35 | // libpqxx does not support binary COPY 36 | std::cout << "Loading " << rows << " rows" << std::endl; 37 | auto stream = pqxx::stream_to::table(tx, {"items"}, {"embedding"}); 38 | for (size_t i = 0; i < embeddings.size(); i++) { 39 | // show progress 40 | if (i % 10000 == 0) { 41 | std::cout << '.' << std::flush; 42 | } 43 | 44 | stream << pgvector::Vector(embeddings[i]); 45 | } 46 | stream.complete(); 47 | std::cout << std::endl << "Success!" << std::endl; 48 | 49 | // create any indexes *after* loading initial data (skipping for this example) 50 | bool create_index = false; 51 | if (create_index) { 52 | std::cout << "Creating index" << std::endl; 53 | tx.exec("SET maintenance_work_mem = '8GB'"); 54 | tx.exec("SET max_parallel_maintenance_workers = 7"); 55 | tx.exec("CREATE INDEX ON items USING hnsw (embedding vector_cosine_ops)"); 56 | } 57 | 58 | // update planner statistics for good measure 59 | tx.exec("ANALYZE items"); 60 | 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /examples/openai/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(example) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | include(FetchContent) 8 | 9 | FetchContent_Declare(cpr GIT_REPOSITORY https://github.com/libcpr/cpr.git GIT_TAG 1.11.1) 10 | FetchContent_Declare(json GIT_REPOSITORY https://github.com/nlohmann/json.git GIT_TAG v3.11.3) 11 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 12 | FetchContent_MakeAvailable(cpr json libpqxx) 13 | 14 | add_subdirectory("${PROJECT_SOURCE_DIR}/../.." 
pgvector) 15 | 16 | add_executable(example example.cpp) 17 | target_link_libraries(example PRIVATE cpr::cpr libpqxx::pqxx nlohmann_json::nlohmann_json pgvector::pgvector) 18 | -------------------------------------------------------------------------------- /examples/openai/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using json = nlohmann::json; 11 | 12 | // https://platform.openai.com/docs/guides/embeddings/how-to-get-embeddings 13 | // input can be an array with 2048 elements 14 | std::vector> embed(const std::vector& input, char *api_key) { 15 | std::string url = "https://api.openai.com/v1/embeddings"; 16 | json data = { 17 | {"input", input}, 18 | {"model", "text-embedding-3-small"} 19 | }; 20 | 21 | cpr::Response r = cpr::Post( 22 | cpr::Url{url}, 23 | cpr::Body{data.dump()}, 24 | cpr::Bearer{api_key}, 25 | cpr::Header{{"Content-Type", "application/json"}} 26 | ); 27 | if (r.status_code != 200) { 28 | throw std::runtime_error("Bad status: " + std::to_string(r.status_code)); 29 | } 30 | json response = json::parse(r.text); 31 | 32 | std::vector> embeddings; 33 | for (auto& v : response["data"]) { 34 | embeddings.emplace_back(v["embedding"]); 35 | } 36 | return embeddings; 37 | } 38 | 39 | int main() { 40 | char *api_key = std::getenv("OPENAI_API_KEY"); 41 | if (!api_key) { 42 | std::cout << "Set OPENAI_API_KEY" << std::endl; 43 | return 1; 44 | } 45 | 46 | pqxx::connection conn("dbname=pgvector_example"); 47 | 48 | pqxx::nontransaction tx(conn); 49 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 50 | tx.exec("DROP TABLE IF EXISTS documents"); 51 | tx.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(1536))"); 52 | 53 | std::vector input = { 54 | "The dog is barking", 55 | "The cat is purring", 56 | "The bear is growling" 57 | }; 58 | auto embeddings = embed(input, 
api_key); 59 | for (size_t i = 0; i < input.size(); i++) { 60 | tx.exec("INSERT INTO documents (content, embedding) VALUES ($1, $2)", pqxx::params{input[i], pgvector::Vector(embeddings[i])}); 61 | } 62 | 63 | std::string query = "forest"; 64 | auto query_embedding = embed({query}, api_key)[0]; 65 | pqxx::result result = tx.exec("SELECT content FROM documents ORDER BY embedding <=> $1 LIMIT 5", pqxx::params{pgvector::Vector(query_embedding)}); 66 | for (const auto& row : result) { 67 | std::cout << row[0].as() << std::endl; 68 | } 69 | 70 | return 0; 71 | } 72 | -------------------------------------------------------------------------------- /examples/rdkit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(example) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | find_package(RDKit REQUIRED) 8 | find_package(Boost COMPONENTS iostreams serialization system REQUIRED) 9 | 10 | include(FetchContent) 11 | 12 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 13 | FetchContent_MakeAvailable(libpqxx) 14 | 15 | add_subdirectory("${PROJECT_SOURCE_DIR}/../.." 
pgvector) 16 | 17 | add_executable(example example.cpp) 18 | target_link_libraries(example PRIVATE libpqxx::pqxx pgvector::pgvector RDKit::Fingerprints RDKit::SmilesParse) 19 | -------------------------------------------------------------------------------- /examples/rdkit/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | std::string generate_fingerprint(const std::string& molecule) { 13 | std::unique_ptr mol(RDKit::SmilesToMol(molecule)); 14 | std::unique_ptr fp(RDKit::MorganFingerprints::getFingerprintAsBitVect(*mol, 3, 2048)); 15 | std::stringstream buf; 16 | for (size_t i = 0; i < fp->getNumBits(); i++) { 17 | buf << (fp->getBit(i) ? '1' : '0'); 18 | } 19 | return buf.str(); 20 | } 21 | 22 | int main() { 23 | pqxx::connection conn("dbname=pgvector_example"); 24 | 25 | pqxx::nontransaction tx(conn); 26 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 27 | tx.exec("DROP TABLE IF EXISTS molecules"); 28 | tx.exec("CREATE TABLE molecules (id text PRIMARY KEY, fingerprint bit(2048))"); 29 | 30 | std::vector molecules = {"Cc1ccccc1", "Cc1ncccc1", "c1ccccn1"}; 31 | for (auto& molecule : molecules) { 32 | auto fingerprint = generate_fingerprint(molecule); 33 | tx.exec("INSERT INTO molecules (id, fingerprint) VALUES ($1, $2)", pqxx::params{molecule, fingerprint}); 34 | } 35 | 36 | std::string query_molecule = "c1ccco1"; 37 | auto query_fingerprint = generate_fingerprint(query_molecule); 38 | pqxx::result result = tx.exec("SELECT id, fingerprint <%> $1 AS distance FROM molecules ORDER BY distance LIMIT 5", pqxx::params{query_fingerprint}); 39 | for (const auto& row : result) { 40 | std::cout << row[0].as() << ": " << row[1].as() << std::endl; 41 | } 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /examples/sparse/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(example) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | include(FetchContent) 8 | 9 | FetchContent_Declare(cpr GIT_REPOSITORY https://github.com/libcpr/cpr.git GIT_TAG 1.11.1) 10 | FetchContent_Declare(json GIT_REPOSITORY https://github.com/nlohmann/json.git GIT_TAG v3.11.3) 11 | FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.1) 12 | FetchContent_MakeAvailable(cpr json libpqxx) 13 | 14 | add_subdirectory("${PROJECT_SOURCE_DIR}/../.." pgvector) 15 | 16 | add_executable(example example.cpp) 17 | target_link_libraries(example PRIVATE cpr::cpr libpqxx::pqxx nlohmann_json::nlohmann_json pgvector::pgvector) 18 | -------------------------------------------------------------------------------- /examples/sparse/example.cpp: -------------------------------------------------------------------------------- 1 | // good resources 2 | // https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/ 3 | // https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1 4 | // 5 | // run with 6 | // text-embeddings-router --model-id opensearch-project/opensearch-neural-sparse-encoding-v1 --pooling splade 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | using json = nlohmann::json; 19 | 20 | std::vector embed(const std::vector& inputs) { 21 | std::string url = "http://localhost:3000/embed_sparse"; 22 | json data = { 23 | {"inputs", inputs} 24 | }; 25 | 26 | cpr::Response r = cpr::Post( 27 | cpr::Url{url}, 28 | cpr::Body{data.dump()}, 29 | cpr::Header{{"Content-Type", "application/json"}} 30 | ); 31 | if (r.status_code != 200) { 32 | throw std::runtime_error("Bad status: " + std::to_string(r.status_code)); 33 | } 34 | json response = json::parse(r.text); 35 | 36 | std::vector embeddings; 
37 | for (auto& item : response) { 38 | std::vector indices; 39 | std::vector values; 40 | for (auto& e : item) { 41 | indices.emplace_back(e["index"]); 42 | values.emplace_back(e["value"]); 43 | } 44 | embeddings.emplace_back(pgvector::SparseVector(30522, indices, values)); 45 | } 46 | return embeddings; 47 | } 48 | 49 | int main() { 50 | pqxx::connection conn("dbname=pgvector_example"); 51 | 52 | pqxx::nontransaction tx(conn); 53 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 54 | tx.exec("DROP TABLE IF EXISTS documents"); 55 | tx.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))"); 56 | 57 | std::vector input = { 58 | "The dog is barking", 59 | "The cat is purring", 60 | "The bear is growling" 61 | }; 62 | auto embeddings = embed(input); 63 | for (size_t i = 0; i < input.size(); i++) { 64 | tx.exec("INSERT INTO documents (content, embedding) VALUES ($1, $2)", pqxx::params{input[i], embeddings[i]}); 65 | } 66 | 67 | std::string query = "forest"; 68 | auto query_embedding = embed({query})[0]; 69 | pqxx::result result = tx.exec("SELECT content FROM documents ORDER BY embedding <#> $1 LIMIT 5", pqxx::params{query_embedding}); 70 | for (const auto& row : result) { 71 | std::cout << row[0].as() << std::endl; 72 | } 73 | 74 | return 0; 75 | } 76 | -------------------------------------------------------------------------------- /include/pgvector/halfvec.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * pgvector-cpp v0.2.2 3 | * https://github.com/pgvector/pgvector-cpp 4 | * MIT License 5 | */ 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #if __cplusplus >= 202002L 14 | #include 15 | #endif 16 | 17 | namespace pgvector { 18 | /// A half vector. 19 | class HalfVector { 20 | public: 21 | /// @private 22 | // TODO remove in 0.3.0 23 | HalfVector() = default; 24 | 25 | /// Creates a half vector from a `std::vector`. 
26 | // TODO add explicit in 0.3.0 27 | HalfVector(const std::vector& value) { 28 | value_ = value; 29 | } 30 | 31 | /// Creates a half vector from a `std::vector`. 32 | // TODO add explicit in 0.3.0 33 | HalfVector(std::vector&& value) { 34 | value_ = std::move(value); 35 | } 36 | 37 | /// Creates a half vector from an array. 38 | HalfVector(const float* value, size_t n) { 39 | value_ = std::vector{value, value + n}; 40 | } 41 | 42 | #if __cplusplus >= 202002L 43 | /// Creates a half vector from a span. 44 | // TODO add explicit in 0.3.0 45 | HalfVector(std::span value) { 46 | value_ = std::vector(value.begin(), value.end()); 47 | } 48 | #endif 49 | 50 | /// Returns the number of dimensions. 51 | size_t dimensions() const { 52 | return value_.size(); 53 | } 54 | 55 | /// Returns the half vector as a `std::vector`. 56 | operator const std::vector() const { 57 | return value_; 58 | } 59 | 60 | friend bool operator==(const HalfVector& lhs, const HalfVector& rhs) { 61 | return lhs.value_ == rhs.value_; 62 | } 63 | 64 | friend std::ostream& operator<<(std::ostream& os, const HalfVector& value) { 65 | os << "["; 66 | for (size_t i = 0; i < value.value_.size(); i++) { 67 | if (i > 0) { 68 | os << ","; 69 | } 70 | os << value.value_[i]; 71 | } 72 | os << "]"; 73 | return os; 74 | } 75 | 76 | private: 77 | // TODO use std::float16_t for C++23 78 | std::vector value_; 79 | }; 80 | } // namespace pgvector 81 | -------------------------------------------------------------------------------- /include/pgvector/pqxx.hpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | * pgvector-cpp v0.2.2 3 | * https://github.com/pgvector/pgvector-cpp 4 | * MIT License 5 | */ 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | #include "halfvec.hpp" 16 | #include "sparsevec.hpp" 17 | #include "vector.hpp" 18 | 19 | /// @cond 20 | 21 | namespace pqxx { 22 | template <> std::string const type_name{"vector"}; 23 | 24 | template <> struct nullness : pqxx::no_null {}; 25 | 26 | template <> struct string_traits { 27 | static constexpr bool converts_to_string{true}; 28 | 29 | static constexpr bool converts_from_string{true}; 30 | 31 | static pgvector::Vector from_string(std::string_view text) { 32 | if (text.front() != '[' || text.back() != ']') { 33 | throw conversion_error("Malformed vector literal"); 34 | } 35 | 36 | // TODO don't copy string 37 | std::vector result; 38 | std::stringstream ss(std::string(text.substr(1, text.size() - 2))); 39 | while (ss.good()) { 40 | std::string substr; 41 | getline(ss, substr, ','); 42 | result.push_back(std::stof(substr)); 43 | } 44 | return pgvector::Vector(result); 45 | } 46 | 47 | static zview to_buf(char* begin, char* end, const pgvector::Vector& value) { 48 | char *const next = into_buf(begin, end, value); 49 | return zview{begin, next - begin - 1}; 50 | } 51 | 52 | static char* into_buf(char* begin, char* end, const pgvector::Vector& value) { 53 | auto ret = string_traits>::into_buf( 54 | begin, end, static_cast>(value)); 55 | // replace array brackets 56 | *begin = '['; 57 | *(ret - 2) = ']'; 58 | return ret; 59 | } 60 | 61 | static size_t size_buffer(const pgvector::Vector& value) noexcept { 62 | return string_traits>::size_buffer( 63 | static_cast>(value)); 64 | } 65 | }; 66 | 67 | template <> std::string const type_name{"halfvec"}; 68 | 69 | template <> struct nullness : pqxx::no_null {}; 70 | 71 | template <> struct string_traits { 72 | static constexpr bool converts_to_string{true}; 73 | 74 | static constexpr bool converts_from_string{true}; 75 | 76 | 
static pgvector::HalfVector from_string(std::string_view text) { 77 | if (text.front() != '[' || text.back() != ']') { 78 | throw conversion_error("Malformed halfvec literal"); 79 | } 80 | 81 | // TODO don't copy string 82 | std::vector result; 83 | std::stringstream ss(std::string(text.substr(1, text.size() - 2))); 84 | while (ss.good()) { 85 | std::string substr; 86 | getline(ss, substr, ','); 87 | result.push_back(std::stof(substr)); 88 | } 89 | return pgvector::HalfVector(result); 90 | } 91 | 92 | static zview to_buf(char* begin, char* end, const pgvector::HalfVector& value) { 93 | char *const next = into_buf(begin, end, value); 94 | return zview{begin, next - begin - 1}; 95 | } 96 | 97 | static char* into_buf(char* begin, char* end, const pgvector::HalfVector& value) { 98 | auto ret = string_traits>::into_buf( 99 | begin, end, static_cast>(value)); 100 | // replace array brackets 101 | *begin = '['; 102 | *(ret - 2) = ']'; 103 | return ret; 104 | } 105 | 106 | static size_t size_buffer(const pgvector::HalfVector& value) noexcept { 107 | return string_traits>::size_buffer( 108 | static_cast>(value)); 109 | } 110 | }; 111 | 112 | template <> std::string const type_name{"sparsevec"}; 113 | 114 | template <> struct nullness : pqxx::no_null {}; 115 | 116 | template <> struct string_traits { 117 | static constexpr bool converts_to_string{true}; 118 | 119 | // TODO add from_string 120 | static constexpr bool converts_from_string{false}; 121 | 122 | static zview to_buf(char* begin, char* end, const pgvector::SparseVector& value) { 123 | char *const next = into_buf(begin, end, value); 124 | return zview{begin, next - begin - 1}; 125 | } 126 | 127 | static char* into_buf(char* begin, char* end, const pgvector::SparseVector& value) { 128 | int dimensions = value.dimensions(); 129 | auto indices = value.indices(); 130 | auto values = value.values(); 131 | size_t nnz = indices.size(); 132 | 133 | // important! 
size_buffer cannot throw an exception on overflow 134 | // so perform this check before writing any data 135 | if (nnz > 16000) { 136 | throw conversion_overrun{"sparsevec cannot have more than 16000 dimensions"}; 137 | } 138 | 139 | char *here = begin; 140 | *here++ = '{'; 141 | 142 | for (size_t i = 0; i < nnz; i++) { 143 | if (i != 0) { 144 | *here++ = ','; 145 | } 146 | 147 | here = string_traits::into_buf(here, end, indices[i] + 1) - 1; 148 | *here++ = ':'; 149 | here = string_traits::into_buf(here, end, values[i]) - 1; 150 | } 151 | 152 | *here++ = '}'; 153 | *here++ = '/'; 154 | here = string_traits::into_buf(here, end, dimensions) - 1; 155 | *here++ = '\0'; 156 | 157 | return here; 158 | } 159 | 160 | static size_t size_buffer(const pgvector::SparseVector& value) noexcept { 161 | int dimensions = value.dimensions(); 162 | auto indices = value.indices(); 163 | auto values = value.values(); 164 | size_t nnz = indices.size(); 165 | 166 | // cannot throw an exception here on overflow 167 | // so throw in into_buf 168 | 169 | size_t size = 4; // {, }, /, and \0 170 | size += string_traits::size_buffer(dimensions); 171 | for (size_t i = 0; i < nnz; i++) { 172 | size += 2; // : and , 173 | size += string_traits::size_buffer(indices[i]); 174 | size += string_traits::size_buffer(values[i]); 175 | } 176 | return size; 177 | } 178 | }; 179 | } // namespace pqxx 180 | 181 | /// @endcond 182 | -------------------------------------------------------------------------------- /include/pgvector/sparsevec.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * pgvector-cpp v0.2.2 3 | * https://github.com/pgvector/pgvector-cpp 4 | * MIT License 5 | */ 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #if __cplusplus >= 202002L 15 | #include 16 | #endif 17 | 18 | namespace pgvector { 19 | /// A sparse vector. 
20 | class SparseVector { 21 | public: 22 | /// @private 23 | // TODO remove in 0.3.0 24 | SparseVector() = default; 25 | 26 | /// @private 27 | SparseVector(int dimensions, const std::vector& indices, const std::vector& values) { 28 | if (values.size() != indices.size()) { 29 | throw std::invalid_argument("indices and values must be the same length"); 30 | } 31 | dimensions_ = dimensions; 32 | indices_ = indices; 33 | values_ = values; 34 | } 35 | 36 | /// Creates a sparse vector from a dense vector. 37 | // TODO add explicit in 0.3.0 38 | SparseVector(const std::vector& value) { 39 | dimensions_ = value.size(); 40 | for (size_t i = 0; i < value.size(); i++) { 41 | float v = value[i]; 42 | if (v != 0) { 43 | indices_.push_back(i); 44 | values_.push_back(v); 45 | } 46 | } 47 | } 48 | 49 | #if __cplusplus >= 202002L 50 | /// Creates a sparse vector from a span. 51 | // TODO add explicit in 0.3.0 52 | SparseVector(std::span value) { 53 | dimensions_ = value.size(); 54 | for (size_t i = 0; i < value.size(); i++) { 55 | float v = value[i]; 56 | if (v != 0) { 57 | indices_.push_back(i); 58 | values_.push_back(v); 59 | } 60 | } 61 | } 62 | #endif 63 | 64 | /// Creates a sparse vector from a map of non-zero elements. 65 | SparseVector(const std::unordered_map& map, int dimensions) { 66 | if (dimensions < 1) { 67 | throw std::invalid_argument("sparsevec must have at least 1 dimension"); 68 | } 69 | dimensions_ = dimensions; 70 | 71 | for (auto [i, v] : map) { 72 | if (i < 0 || i >= dimensions) { 73 | throw std::invalid_argument("sparsevec index out of bounds"); 74 | } 75 | 76 | if (v != 0) { 77 | indices_.push_back(i); 78 | } 79 | } 80 | std::sort(indices_.begin(), indices_.end()); 81 | 82 | values_.reserve(indices_.size()); 83 | for (auto i : indices_) { 84 | values_.push_back(map.at(i)); 85 | } 86 | } 87 | 88 | /// Returns the number of dimensions. 89 | int dimensions() const { 90 | return dimensions_; 91 | } 92 | 93 | /// Returns the non-zero indices. 
94 | const std::vector& indices() const { 95 | return indices_; 96 | } 97 | 98 | /// Returns the non-zero values. 99 | const std::vector& values() const { 100 | return values_; 101 | } 102 | 103 | friend bool operator==(const SparseVector& lhs, const SparseVector& rhs) { 104 | return lhs.dimensions_ == rhs.dimensions_ && lhs.indices_ == rhs.indices_ && lhs.values_ == rhs.values_; 105 | } 106 | 107 | friend std::ostream& operator<<(std::ostream& os, const SparseVector& value) { 108 | os << "{"; 109 | for (size_t i = 0; i < value.indices_.size(); i++) { 110 | if (i > 0) { 111 | os << ","; 112 | } 113 | os << value.indices_[i] + 1; 114 | os << ":"; 115 | os << value.values_[i]; 116 | } 117 | os << "}/"; 118 | os << value.dimensions_; 119 | return os; 120 | } 121 | 122 | private: 123 | int dimensions_; 124 | std::vector indices_; 125 | std::vector values_; 126 | }; 127 | } // namespace pgvector 128 | -------------------------------------------------------------------------------- /include/pgvector/vector.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * pgvector-cpp v0.2.2 3 | * https://github.com/pgvector/pgvector-cpp 4 | * MIT License 5 | */ 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #if __cplusplus >= 202002L 14 | #include 15 | #endif 16 | 17 | namespace pgvector { 18 | /// A vector. 19 | class Vector { 20 | public: 21 | /// @private 22 | // TODO remove in 0.3.0 23 | Vector() = default; 24 | 25 | /// Creates a vector from a `std::vector`. 26 | // TODO add explicit in 0.3.0 27 | Vector(const std::vector& value) { 28 | value_ = value; 29 | } 30 | 31 | /// Creates a vector from a `std::vector`. 32 | // TODO add explicit in 0.3.0 33 | Vector(std::vector&& value) { 34 | value_ = std::move(value); 35 | } 36 | 37 | /// Creates a vector from an array. 
38 | Vector(const float* value, size_t n) { 39 | value_ = std::vector{value, value + n}; 40 | } 41 | 42 | #if __cplusplus >= 202002L 43 | /// Creates a vector from a span. 44 | // TODO add explicit in 0.3.0 45 | Vector(std::span value) { 46 | value_ = std::vector(value.begin(), value.end()); 47 | } 48 | #endif 49 | 50 | /// Returns the number of dimensions. 51 | size_t dimensions() const { 52 | return value_.size(); 53 | } 54 | 55 | /// Returns the vector as a `std::vector`. 56 | operator const std::vector() const { 57 | return value_; 58 | } 59 | 60 | friend bool operator==(const Vector& lhs, const Vector& rhs) { 61 | return lhs.value_ == rhs.value_; 62 | } 63 | 64 | friend std::ostream& operator<<(std::ostream& os, const Vector& value) { 65 | os << "["; 66 | for (size_t i = 0; i < value.value_.size(); i++) { 67 | if (i > 0) { 68 | os << ","; 69 | } 70 | os << value.value_[i]; 71 | } 72 | os << "]"; 73 | return os; 74 | } 75 | 76 | private: 77 | std::vector value_; 78 | }; 79 | } // namespace pgvector 80 | -------------------------------------------------------------------------------- /test/halfvec_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../include/pgvector/halfvec.hpp" 4 | 5 | #if __cplusplus >= 202002L 6 | #include 7 | #endif 8 | 9 | using pgvector::HalfVector; 10 | 11 | static void test_constructor_vector() { 12 | auto vec = HalfVector({1, 2, 3}); 13 | assert(vec.dimensions() == 3); 14 | } 15 | 16 | #if __cplusplus >= 202002L 17 | static void test_constructor_span() { 18 | auto vec = HalfVector(std::span({1, 2, 3})); 19 | assert(vec.dimensions() == 3); 20 | } 21 | #endif 22 | 23 | void test_halfvec() { 24 | test_constructor_vector(); 25 | #if __cplusplus >= 202002L 26 | test_constructor_span(); 27 | #endif 28 | } 29 | -------------------------------------------------------------------------------- /test/main.cpp: 
-------------------------------------------------------------------------------- 1 | void test_vector(); 2 | void test_halfvec(); 3 | void test_sparsevec(); 4 | void test_pqxx(); 5 | 6 | int main() { 7 | test_vector(); 8 | test_halfvec(); 9 | test_sparsevec(); 10 | test_pqxx(); 11 | return 0; 12 | } 13 | -------------------------------------------------------------------------------- /test/pqxx_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "../include/pgvector/pqxx.hpp" 9 | 10 | void setup(pqxx::connection &conn) { 11 | pqxx::nontransaction tx(conn); 12 | tx.exec("CREATE EXTENSION IF NOT EXISTS vector"); 13 | tx.exec("DROP TABLE IF EXISTS items"); 14 | tx.exec("CREATE TABLE items (id serial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))"); 15 | } 16 | 17 | void before_each(pqxx::connection &conn) { 18 | pqxx::nontransaction tx(conn); 19 | tx.exec("TRUNCATE items"); 20 | } 21 | 22 | void test_vector(pqxx::connection &conn) { 23 | before_each(conn); 24 | 25 | pqxx::nontransaction tx(conn); 26 | auto embedding = pgvector::Vector({1, 2, 3}); 27 | assert(embedding.dimensions() == 3); 28 | float arr[] = {4, 5, 6}; 29 | auto embedding2 = pgvector::Vector(arr, 3); 30 | tx.exec("INSERT INTO items (embedding) VALUES ($1), ($2), ($3)", {embedding, embedding2, std::nullopt}); 31 | 32 | pqxx::result res = tx.exec("SELECT embedding FROM items ORDER BY embedding <-> $1", {embedding2}); 33 | assert(res.size() == 3); 34 | assert(res[0][0].as() == embedding2); 35 | assert(res[1][0].as() == embedding); 36 | assert(!res[2][0].as>().has_value()); 37 | } 38 | 39 | void test_halfvec(pqxx::connection &conn) { 40 | before_each(conn); 41 | 42 | pqxx::nontransaction tx(conn); 43 | auto embedding = pgvector::HalfVector({1, 2, 3}); 44 | assert(embedding.dimensions() == 3); 45 | float arr[] = {4, 
5, 6}; 46 | auto embedding2 = pgvector::HalfVector(arr, 3); 47 | tx.exec("INSERT INTO items (half_embedding) VALUES ($1), ($2), ($3)", {embedding, embedding2, std::nullopt}); 48 | 49 | pqxx::result res = tx.exec("SELECT half_embedding FROM items ORDER BY half_embedding <-> $1", {embedding2}); 50 | assert(res.size() == 3); 51 | assert(res[0][0].as() == embedding2); 52 | assert(res[1][0].as() == embedding); 53 | assert(!res[2][0].as>().has_value()); 54 | } 55 | 56 | void test_bit(pqxx::connection &conn) { 57 | before_each(conn); 58 | 59 | pqxx::nontransaction tx(conn); 60 | auto embedding = "101"; 61 | auto embedding2 = "111"; 62 | tx.exec("INSERT INTO items (binary_embedding) VALUES ($1), ($2), ($3)", {embedding, embedding2, std::nullopt}); 63 | 64 | pqxx::result res = tx.exec("SELECT binary_embedding FROM items ORDER BY binary_embedding <~> $1", pqxx::params{embedding2}); 65 | assert(res.size() == 3); 66 | assert(res[0][0].as() == embedding2); 67 | assert(res[1][0].as() == embedding); 68 | assert(!res[2][0].as>().has_value()); 69 | } 70 | 71 | void test_sparsevec(pqxx::connection &conn) { 72 | before_each(conn); 73 | 74 | pqxx::nontransaction tx(conn); 75 | auto embedding = pgvector::SparseVector({1, 2, 3}); 76 | auto embedding2 = pgvector::SparseVector({4, 5, 6}); 77 | tx.exec("INSERT INTO items (sparse_embedding) VALUES ($1), ($2), ($3)", {embedding, embedding2, std::nullopt}); 78 | 79 | pqxx::result res = tx.exec("SELECT sparse_embedding FROM items ORDER BY sparse_embedding <-> $1", {embedding2}); 80 | assert(res.size() == 3); 81 | assert(res[0][0].as() == "{1:4,2:5,3:6}/3"); 82 | assert(res[1][0].as() == "{1:1,2:2,3:3}/3"); 83 | assert(!res[2][0].as>().has_value()); 84 | } 85 | 86 | void test_sparsevec_nnz(pqxx::connection &conn) { 87 | before_each(conn); 88 | 89 | pqxx::nontransaction tx(conn); 90 | std::vector vec(16001, 1); 91 | auto embedding = pgvector::SparseVector(vec); 92 | try { 93 | tx.exec("INSERT INTO items (sparse_embedding) VALUES ($1)", 
{embedding}); 94 | assert(false); 95 | } catch (const pqxx::conversion_overrun& e) { 96 | assert(std::strcmp(e.what(), "sparsevec cannot have more than 16000 dimensions") == 0); 97 | } 98 | } 99 | 100 | void test_stream(pqxx::connection &conn) { 101 | before_each(conn); 102 | 103 | pqxx::nontransaction tx(conn); 104 | auto embedding = pgvector::Vector({1, 2, 3}); 105 | tx.exec("INSERT INTO items (embedding) VALUES ($1)", {embedding}); 106 | int count = 0; 107 | for (auto [id, embedding] : tx.stream("SELECT id, embedding FROM items WHERE embedding IS NOT NULL")) { 108 | assert(embedding.dimensions() == 3); 109 | count++; 110 | } 111 | assert(count == 1); 112 | } 113 | 114 | void test_stream_to(pqxx::connection &conn) { 115 | before_each(conn); 116 | 117 | pqxx::nontransaction tx(conn); 118 | auto stream = pqxx::stream_to::table(tx, {"items"}, {"embedding"}); 119 | stream << pgvector::Vector({1, 2, 3}); 120 | stream << pgvector::Vector({4, 5, 6}); 121 | stream.complete(); 122 | pqxx::result res = tx.exec("SELECT embedding FROM items ORDER BY id"); 123 | assert(res[0][0].as() == "[1,2,3]"); 124 | assert(res[1][0].as() == "[4,5,6]"); 125 | } 126 | 127 | void test_precision(pqxx::connection &conn) { 128 | before_each(conn); 129 | 130 | pqxx::nontransaction tx(conn); 131 | auto embedding = pgvector::Vector({1.23456789, 0, 0}); 132 | tx.exec("INSERT INTO items (embedding) VALUES ($1)", {embedding}); 133 | tx.exec("SET extra_float_digits = 3"); 134 | pqxx::result res = tx.exec("SELECT embedding FROM items ORDER BY id DESC LIMIT 1"); 135 | assert(res[0][0].as() == embedding); 136 | } 137 | 138 | void test_pqxx() { 139 | pqxx::connection conn("dbname=pgvector_cpp_test"); 140 | setup(conn); 141 | 142 | test_vector(conn); 143 | test_halfvec(conn); 144 | test_bit(conn); 145 | test_sparsevec(conn); 146 | test_sparsevec_nnz(conn); 147 | test_stream(conn); 148 | test_stream_to(conn); 149 | test_precision(conn); 150 | } 151 | 
-------------------------------------------------------------------------------- /test/sparsevec_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../include/pgvector/sparsevec.hpp" 5 | 6 | #if __cplusplus >= 202002L 7 | #include 8 | #endif 9 | 10 | using pgvector::SparseVector; 11 | 12 | static void test_constructor_vector() { 13 | auto vec = SparseVector({1, 0, 2, 0, 3, 0}); 14 | assert(vec.dimensions() == 6); 15 | assert(vec.indices() == (std::vector{0, 2, 4})); 16 | assert(vec.values() == (std::vector{1, 2, 3})); 17 | } 18 | 19 | #if __cplusplus >= 202002L 20 | static void test_constructor_span() { 21 | auto vec = SparseVector(std::span({1, 0, 2, 0, 3, 0})); 22 | assert(vec.dimensions() == 6); 23 | } 24 | #endif 25 | 26 | static void test_constructor_map() { 27 | std::unordered_map map = {{2, 2}, {4, 3}, {3, 0}, {0, 1}}; 28 | auto vec = SparseVector(map, 6); 29 | assert(vec.dimensions() == 6); 30 | assert(vec.indices() == (std::vector{0, 2, 4})); 31 | assert(vec.values() == (std::vector{1, 2, 3})); 32 | } 33 | 34 | void test_sparsevec() { 35 | test_constructor_vector(); 36 | #if __cplusplus >= 202002L 37 | test_constructor_span(); 38 | #endif 39 | test_constructor_map(); 40 | } 41 | -------------------------------------------------------------------------------- /test/vector_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../include/pgvector/vector.hpp" 4 | 5 | #if __cplusplus >= 202002L 6 | #include 7 | #endif 8 | 9 | using pgvector::Vector; 10 | 11 | static void test_constructor_vector() { 12 | auto vec = Vector({1, 2, 3}); 13 | assert(vec.dimensions() == 3); 14 | } 15 | 16 | #if __cplusplus >= 202002L 17 | static void test_constructor_span() { 18 | auto vec = Vector(std::span({1, 2, 3})); 19 | assert(vec.dimensions() == 3); 20 | } 21 | #endif 22 | 23 | void test_vector() { 24 | 
test_constructor_vector(); 25 | #if __cplusplus >= 202002L 26 | test_constructor_span(); 27 | #endif 28 | } 29 | --------------------------------------------------------------------------------