├── .github └── workflows │ └── build.yml ├── DBI └── example.R ├── DESCRIPTION ├── LICENSE.txt ├── README.md ├── dbx └── example.R └── examples ├── cohere ├── Description └── example.R ├── fingerprint ├── Description └── example.R ├── openai ├── Description └── example.R └── sparse ├── Description └── example.R /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: r-lib/actions/setup-r@v2 9 | - uses: r-lib/actions/setup-r-dependencies@v2 10 | - uses: ankane/setup-postgres@v1 11 | with: 12 | database: pgvector_r_test 13 | dev-files: true 14 | - run: | 15 | cd /tmp 16 | git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git 17 | cd pgvector 18 | make 19 | sudo make install 20 | - run: Rscript DBI/example.R 21 | - run: Rscript dbx/example.R 22 | -------------------------------------------------------------------------------- /DBI/example.R: --------------------------------------------------------------------------------
# End-to-end pgvector example using DBI + RPostgres: create a table with a
# vector column, insert three embeddings, run a nearest-neighbor query, and
# build an HNSW index.

library(DBI)

db <- dbConnect(RPostgres::Postgres(), dbname = "pgvector_r_test")

invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
invisible(dbExecute(db, "DROP TABLE IF EXISTS items"))
invisible(dbExecute(db, "CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))"))

# Serialize a numeric vector into the bracketed, comma-separated text form
# used for the query parameter and the inserted column,
# e.g. c(1, 2, 3) -> "[1,2,3]". Errors if `vec` is not numeric.
encodeVector <- function(vec) {
  stopifnot(is.numeric(vec))
  paste0("[", paste(vec, collapse = ","), "]")
}

# Inverse of encodeVector(): drop the first and last character (the brackets)
# and parse the comma-separated values as numbers.
decodeVector <- function(str) {
  inner <- substring(str, 2, nchar(str) - 1)
  as.numeric(strsplit(inner, ",", fixed = TRUE)[[1]])
}

embeddings <- list(
  c(1, 1, 1),
  c(2, 2, 2),
  c(1, 1, 2)
)

# vapply() guarantees a character column even when `embeddings` is empty
# (sapply() would return an empty list and produce a column-less data frame).
items <- data.frame(embedding = vapply(embeddings, encodeVector, character(1)))
invisible(dbAppendTable(db, "items", items))

params <- list(encodeVector(c(1, 1, 1)))
result <- dbGetQuery(db, "SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", params = params)
print(lapply(result$embedding, decodeVector))

invisible(dbExecute(db, "CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)"))

# Release the connection explicitly instead of leaving it to process exit.
invisible(dbDisconnect(db))
-------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: pgvector 2 | Version: 0.1.0 3 | Imports: 4 | DBI, 5 | dbx, 6 | RPostgres 7 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2023 Andrew Kane 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pgvector-r 2 | 3 | [pgvector](https://github.com/pgvector/pgvector) examples for R 4 | 5 | Supports [DBI](https://github.com/r-dbi/DBI) and [dbx](https://github.com/ankane/dbx) 6 | 7 | [![Build Status](https://github.com/pgvector/pgvector-r/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-r/actions) 8 | 9 | ## Getting Started 10 | 11 | Follow the instructions for your database library: 12 | 13 | - [DBI](#dbi) 14 | - [dbx](#dbx) 15 | 16 | Or check out an example: 17 | 18 | - [Embeddings](examples/openai/example.R) with OpenAI 19 | - [Binary embeddings](examples/cohere/example.R) with Cohere 20 | - [Sparse search](examples/sparse/example.R) with Text Embeddings Inference 21 | - [Molecular fingerprints](examples/fingerprint/example.R) with ChemmineR 22 | 23 | ## DBI 24 | 25 | Enable the extension 26 | 27 | ```r 28 | dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector") 29 | ``` 30 | 31 | Create a table 32 | 33 | ```r 34 | dbExecute(db, "CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))") 35 | ``` 36 | 37 | Insert vectors 38 | 39 | ```r 40 | encodeVector <- function(vec) { 41 | stopifnot(is.numeric(vec)) 42 | paste0("[", paste(vec, collapse=","), "]") 43 | } 44 | 45 | embeddings <- list( 46 | c(1, 1, 1), 47 | c(2, 2, 2), 48 | c(1, 1, 2) 49 | ) 50 | 51 | items <- data.frame(embedding=sapply(embeddings, encodeVector)) 52 | dbAppendTable(db, "items", items) 53 | ``` 54 | 55 | Get the nearest neighbors 56 | 57 | ```r 58 | params <- list(encodeVector(c(1, 2, 3))) 59 | dbGetQuery(db, "SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", params=params) 60 | ``` 61 | 62 | Add an approximate index 63 | 64 | ```r 65 | dbExecute(db, "CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)") 66 | # or 67 | dbExecute(db, "CREATE INDEX ON items USING 
ivfflat (embedding vector_l2_ops) WITH (lists = 100)") 68 | ``` 69 | 70 | Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance 71 | 72 | See a [full example](DBI/example.R) 73 | 74 | ## dbx 75 | 76 | Enable the extension 77 | 78 | ```r 79 | dbxExecute(db, "CREATE EXTENSION IF NOT EXISTS vector") 80 | ``` 81 | 82 | Create a table 83 | 84 | ```r 85 | dbxExecute(db, "CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))") 86 | ``` 87 | 88 | Insert vectors 89 | 90 | ```r 91 | encodeVector <- function(vec) { 92 | stopifnot(is.numeric(vec)) 93 | paste0("[", paste(vec, collapse=","), "]") 94 | } 95 | 96 | embeddings <- list( 97 | c(1, 1, 1), 98 | c(2, 2, 2), 99 | c(1, 1, 2) 100 | ) 101 | 102 | items <- data.frame(embedding=sapply(embeddings, encodeVector)) 103 | dbxInsert(db, "items", items) 104 | ``` 105 | 106 | Get the nearest neighbors 107 | 108 | ```r 109 | params <- list(encodeVector(c(1, 2, 3))) 110 | dbxSelect(db, "SELECT * FROM items ORDER BY embedding <-> ? LIMIT 5", params=params) 111 | ``` 112 | 113 | Add an approximate index 114 | 115 | ```r 116 | dbxExecute(db, "CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)") 117 | # or 118 | dbxExecute(db, "CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)") 119 | ``` 120 | 121 | Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance 122 | 123 | See a [full example](dbx/example.R) 124 | 125 | ## Contributing 126 | 127 | Everyone is encouraged to help improve this project. 
Here are a few ways you can help: 128 | 129 | - [Report bugs](https://github.com/pgvector/pgvector-r/issues) 130 | - Fix bugs and [submit pull requests](https://github.com/pgvector/pgvector-r/pulls) 131 | - Write, clarify, or fix documentation 132 | - Suggest or add new features 133 | 134 | To get started with development: 135 | 136 | ```sh 137 | git clone https://github.com/pgvector/pgvector-r.git 138 | cd pgvector-r 139 | createdb pgvector_r_test 140 | Rscript -e "install.packages('remotes', repos='https://cloud.r-project.org')" 141 | Rscript -e "remotes::install_deps(dependencies=TRUE)" 142 | Rscript DBI/example.R 143 | Rscript dbx/example.R 144 | ``` 145 | 146 | To run an example: 147 | 148 | ```sh 149 | cd examples/openai 150 | createdb pgvector_example 151 | Rscript -e "remotes::install_deps(dependencies=TRUE)" 152 | Rscript example.R 153 | ``` 154 | -------------------------------------------------------------------------------- /dbx/example.R: --------------------------------------------------------------------------------
# End-to-end pgvector example using dbx: create a table with a vector column,
# insert three embeddings, run a nearest-neighbor query, and build an HNSW
# index.

library(dbx)

db <- dbxConnect(adapter = "postgres", dbname = "pgvector_r_test")

invisible(dbxExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
invisible(dbxExecute(db, "DROP TABLE IF EXISTS items"))
invisible(dbxExecute(db, "CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))"))

# Serialize a numeric vector into the bracketed, comma-separated text form
# used for the query parameter and the inserted column,
# e.g. c(1, 2, 3) -> "[1,2,3]". Errors if `vec` is not numeric.
encodeVector <- function(vec) {
  stopifnot(is.numeric(vec))
  paste0("[", paste(vec, collapse = ","), "]")
}

# Inverse of encodeVector(): drop the first and last character (the brackets)
# and parse the comma-separated values as numbers.
decodeVector <- function(str) {
  inner <- substring(str, 2, nchar(str) - 1)
  as.numeric(strsplit(inner, ",", fixed = TRUE)[[1]])
}

embeddings <- list(
  c(1, 1, 1),
  c(2, 2, 2),
  c(1, 1, 2)
)

# vapply() guarantees a character column even when `embeddings` is empty
# (sapply() would return an empty list and produce a column-less data frame).
items <- data.frame(embedding = vapply(embeddings, encodeVector, character(1)))
invisible(dbxInsert(db, "items", items))

params <- list(encodeVector(c(1, 1, 1)))
result <- dbxSelect(db, "SELECT * FROM items ORDER BY embedding <-> ? LIMIT 5", params = params)
print(lapply(result$embedding, decodeVector))

invisible(dbxExecute(db, "CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)"))

# Release the connection explicitly instead of leaving it to process exit.
invisible(dbxDisconnect(db))
-------------------------------------------------------------------------------- /examples/cohere/Description: -------------------------------------------------------------------------------- 1 | Package: example 2 | Version: 0.1.0 3 | Imports: 4 | DBI, 5 | httr2, 6 | RPostgres 7 | -------------------------------------------------------------------------------- /examples/cohere/example.R: --------------------------------------------------------------------------------
# Binary-embedding search example: embed documents with the Cohere embed API
# (ubinary output), store them in a bit(1536) column, and rank documents with
# the <~> operator.

library(DBI)
library(httr2)

db <- dbConnect(RPostgres::Postgres(), dbname = "pgvector_example")

invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
invisible(dbExecute(db, "DROP TABLE IF EXISTS documents"))
invisible(dbExecute(db, "CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1536))"))

# Turn a sequence of byte values into one string of "0"/"1" characters.
# intToBits() yields the least significant bit first, so the low 8 bits of
# each byte are reversed to put the most significant bit first.
toBits <- function(ubinary) {
  bits <- vapply(
    ubinary,
    function(v) rev(as.integer(intToBits(v)[1:8])),
    integer(8)
  )
  # `bits` is an 8 x n matrix; paste0() walks it column by column, i.e. byte
  # by byte.
  paste0(bits, collapse = "")
}

# Request ubinary embeddings for `texts` from the Cohere API and return one
# bit string per text. `inputType` is sent as the request's `input_type`.
# Errors early if CO_API_KEY is not set.
embed <- function(texts, inputType) {
  url <- "https://api.cohere.com/v2/embed"
  token <- Sys.getenv("CO_API_KEY")
  if (!nzchar(token)) {
    # Fail with a clear message instead of an opaque HTTP authorization error.
    stop("CO_API_KEY is not set", call. = FALSE)
  }
  data <- list(
    texts = texts,
    model = "embed-v4.0",
    input_type = inputType,
    embedding_types = list("ubinary")
  )

  resp <- request(url) |>
    req_auth_bearer_token(token) |>
    req_body_json(data) |>
    req_perform()
  vapply(resp_body_json(resp)$embeddings$ubinary, toBits, character(1))
}

input <- c(
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
)
embeddings <- embed(input, "search_document")
items <- data.frame(content = input, embedding = embeddings)
invisible(dbAppendTable(db, "documents", items))

query <- "forest"
queryEmbedding <- embed(list(query), "search_query")[[1]]
params <- list(queryEmbedding)
result <- dbGetQuery(db, "SELECT content FROM documents ORDER BY embedding <~> $1 LIMIT 5", params = params)
print(result$content)

# Release the connection explicitly instead of leaving it to process exit.
invisible(dbDisconnect(db))
-------------------------------------------------------------------------------- /examples/fingerprint/Description: -------------------------------------------------------------------------------- 1 | Package: example 2 | Version: 0.1.0 3 | Imports: 4 | ChemmineR, 5 | DBI, 6 | RPostgres 7 | Remotes: 8 | bioc::release/ChemmineR 9 | -------------------------------------------------------------------------------- /examples/fingerprint/example.R: --------------------------------------------------------------------------------
# Molecular fingerprint search example: store ChemmineR fingerprints in a
# bit(1024) column and rank molecules against the first one with the <%>
# operator.
#
# good resource
# https://www.bioconductor.org/packages/release/bioc/vignettes/ChemmineR/inst/doc/ChemmineR.html

library(ChemmineR)
library(DBI)

db <- dbConnect(RPostgres::Postgres(), dbname = "pgvector_example")

invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
invisible(dbExecute(db, "DROP TABLE IF EXISTS molecules"))
invisible(dbExecute(db, "CREATE TABLE molecules (id text PRIMARY KEY, fingerprint bit(1024))"))

# Load the sample data set and derive one fingerprint per molecule.
data(sdfsample)
fpset <- desc2fp(sdf2ap(sdfsample))
molecules <- data.frame(
  id = sdfid(sdfsample),
  fingerprint = as.character(fpset)
)
invisible(dbAppendTable(db, "molecules", molecules))

# Use the first molecule's fingerprint as the query.
params <- list(molecules$fingerprint[[1]])
result <- dbGetQuery(db, "SELECT id FROM molecules ORDER BY fingerprint <%> $1 LIMIT 5", params = params)
print(result$id)

# Release the connection explicitly instead of leaving it to process exit.
invisible(dbDisconnect(db))
-------------------------------------------------------------------------------- /examples/openai/Description: -------------------------------------------------------------------------------- 1 | Package: example 2 | Version: 0.1.0 3 | Imports: 4 | DBI, 5 | httr2, 6 | RPostgres 7 | -------------------------------------------------------------------------------- /examples/openai/example.R: 
--------------------------------------------------------------------------------
# Embedding search example: embed documents with the OpenAI embeddings API,
# store them in a vector(1536) column, and rank documents with the <=>
# operator.

library(DBI)
library(httr2)

db <- dbConnect(RPostgres::Postgres(), dbname = "pgvector_example")

invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
invisible(dbExecute(db, "DROP TABLE IF EXISTS documents"))
invisible(dbExecute(db, "CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(1536))"))

# Request embeddings for `input` from the OpenAI API and return a list with
# one numeric vector per entry of the response's `data` array.
# Errors early if OPENAI_API_KEY is not set.
embed <- function(input) {
  url <- "https://api.openai.com/v1/embeddings"
  token <- Sys.getenv("OPENAI_API_KEY")
  if (!nzchar(token)) {
    # Fail with a clear message instead of an opaque HTTP authorization error.
    stop("OPENAI_API_KEY is not set", call. = FALSE)
  }
  data <- list(
    input = input,
    model = "text-embedding-3-small"
  )

  resp <- request(url) |>
    req_auth_bearer_token(token) |>
    req_body_json(data) |>
    req_perform()
  lapply(resp_body_json(resp)$data, function(x) unlist(x$embedding))
}

# Serialize a numeric vector into the bracketed, comma-separated text form
# used for the query parameter and the inserted column,
# e.g. c(1, 2, 3) -> "[1,2,3]". Errors if `vec` is not numeric.
encodeVector <- function(vec) {
  stopifnot(is.numeric(vec))
  paste0("[", paste(vec, collapse = ","), "]")
}

input <- c(
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
)
embeddings <- embed(input)
# vapply() guarantees a character column regardless of how many embeddings
# came back (sapply() changes its return type on empty input).
items <- data.frame(
  content = input,
  embedding = vapply(embeddings, encodeVector, character(1))
)
invisible(dbAppendTable(db, "documents", items))

query <- "forest"
queryEmbedding <- embed(query)[[1]]
params <- list(encodeVector(queryEmbedding))
result <- dbGetQuery(db, "SELECT content FROM documents ORDER BY embedding <=> $1 LIMIT 5", params = params)
print(result$content)

# Release the connection explicitly instead of leaving it to process exit.
invisible(dbDisconnect(db))
-------------------------------------------------------------------------------- /examples/sparse/Description: -------------------------------------------------------------------------------- 1 | Package: example 2 | Version: 0.1.0 3 | Imports: 4 | DBI, 5 | httr2, 6 | RPostgres 7 | -------------------------------------------------------------------------------- /examples/sparse/example.R: --------------------------------------------------------------------------------
# Sparse search example: embed documents with a local sparse-embedding server,
# store them in a sparsevec(30522) column, and rank documents with the <#>
# operator.
#
# good resources
# https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/
# https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1
#
# run with
# text-embeddings-router --model-id opensearch-project/opensearch-neural-sparse-encoding-v1 --pooling splade

library(DBI)
library(httr2)
library(Matrix)

db <- dbConnect(RPostgres::Postgres(), dbname = "pgvector_example")

invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
invisible(dbExecute(db, "DROP TABLE IF EXISTS documents"))
invisible(dbExecute(db, "CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))"))

# Request sparse embeddings for `inputs` from the local server and return a
# list with one Matrix sparseVector (length 30522) per response entry.
embed <- function(inputs) {
  url <- "http://localhost:3000/embed_sparse"
  data <- list(
    inputs = inputs
  )

  resp <- request(url) |>
    req_body_json(data) |>
    req_perform()
  # lapply()/vapply() keep the result shapes fixed; the previous nested
  # sapply() returned list() for an embedding with no entries, which
  # sparseVector() cannot accept.
  lapply(resp_body_json(resp), function(v) {
    indices <- vapply(v, function(e) as.integer(e$index), integer(1))
    values <- vapply(v, function(e) as.numeric(e$value), numeric(1))
    # sparseVector() positions are 1-based, while the server's `index` values
    # are presumably 0-based token ids (NOTE(review): confirm against the
    # router version in use), so shift by one. Without the shift an index of 0
    # is rejected and every dimension is stored one slot too low.
    sparseVector(i = indices + 1L, x = values, length = 30522)
  })
}

# Serialize a sparseVector into the "{index:value,...}/dimensions" text form
# used for the query parameter and the inserted column.
# Errors if `vec` is not a sparseVector.
encodeSparseVector <- function(vec) {
  stopifnot(inherits(vec, "sparseVector"))
  # "%d" keeps indices and the dimension count in plain integer notation
  # (default formatting can switch to e.g. "1e+05"). sprintf() also returns
  # character(0) for a vector without entries, giving "{}/<dimensions>".
  elements <- sprintf("%d:%s", as.integer(vec@i), vec@x)
  dimensions <- sprintf("%d", as.integer(length(vec)))
  paste0("{", paste0(elements, collapse = ","), "}/", dimensions)
}

input <- c(
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
)
embeddings <- embed(input)
# vapply() guarantees a character column regardless of how many embeddings
# came back (sapply() changes its return type on empty input).
items <- data.frame(
  content = input,
  embedding = vapply(embeddings, encodeSparseVector, character(1))
)
invisible(dbAppendTable(db, "documents", items))

query <- "forest"
queryEmbedding <- embed(query)[[1]]
params <- list(encodeSparseVector(queryEmbedding))
result <- dbGetQuery(db, "SELECT content FROM documents ORDER BY embedding <#> $1 LIMIT 5", params = params)
print(result$content)

# Release the connection explicitly instead of leaving it to process exit.
invisible(dbDisconnect(db))
--------------------------------------------------------------------------------