├── .github
    └── workflows
    │   └── build.yml
├── DBI
    └── example.R
├── DESCRIPTION
├── LICENSE.txt
├── README.md
├── dbx
    └── example.R
└── examples
    ├── cohere
        ├── Description
        └── example.R
    ├── fingerprint
        ├── Description
        └── example.R
    ├── openai
        ├── Description
        └── example.R
    └── sparse
        ├── Description
        └── example.R


/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: build
 2 | on: [push, pull_request]  # run for every push and every pull request
 3 | jobs:
 4 |   build:
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |       - uses: actions/checkout@v4
 8 |       - uses: r-lib/actions/setup-r@v2
 9 |       - uses: r-lib/actions/setup-r-dependencies@v2
10 |       - uses: ankane/setup-postgres@v1
11 |         with:
12 |           database: pgvector_r_test  # database name that DBI/example.R and dbx/example.R connect to
13 |           dev-files: true  # presumably the PostgreSQL development files needed to compile pgvector below - confirm against the action's docs
14 |       - run: |  # build pgvector v0.8.0 from source and install it
15 |           cd /tmp
16 |           git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git
17 |           cd pgvector
18 |           make
19 |           sudo make install
20 |       - run: Rscript DBI/example.R  # run the DBI example end to end
21 |       - run: Rscript dbx/example.R  # run the dbx example end to end
22 | 


--------------------------------------------------------------------------------
/DBI/example.R:
--------------------------------------------------------------------------------
 1 | # pgvector with DBI: create a table, insert vectors, run a nearest-neighbor
 2 | # query, and add an approximate index.
 3 | 
 4 | library(DBI)
 5 | 
 6 | db <- dbConnect(RPostgres::Postgres(), dbname="pgvector_r_test")
 7 | 
 8 | invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
 9 | invisible(dbExecute(db, "DROP TABLE IF EXISTS items"))
10 | invisible(dbExecute(db, "CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))"))
11 | 
12 | # Serialize a numeric vector as text, e.g. c(1, 2, 3) -> "[1,2,3]".
13 | # Stops if `vec` is not numeric.
14 | encodeVector <- function(vec) {
15 |   stopifnot(is.numeric(vec))
16 |   paste0("[", paste(vec, collapse=","), "]")
17 | }
18 | 
19 | # Parse text of the form "[1,2,3]" back into a numeric vector.
20 | decodeVector <- function(str) {
21 |   # strip the first and last character (the brackets), then split on commas
22 |   as.numeric(strsplit(substring(str, 2, nchar(str) - 1), ",")[[1]])
23 | }
24 | 
25 | embeddings <- list(
26 |   c(1, 1, 1),
27 |   c(2, 2, 2),
28 |   c(1, 1, 2)
29 | )
30 | 
31 | # vapply() always yields a character vector; sapply() would return a list for empty input
32 | items <- data.frame(embedding=vapply(embeddings, encodeVector, character(1)))
33 | invisible(dbAppendTable(db, "items", items))
34 | 
35 | # nearest neighbors of (1, 1, 1), ordered by the <-> distance operator
36 | params <- list(encodeVector(c(1, 1, 1)))
37 | result <- dbGetQuery(db, "SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", params=params)
38 | print(lapply(result$embedding, decodeVector))
39 | 
40 | invisible(dbExecute(db, "CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)"))
41 | 
42 | # close the connection explicitly instead of leaving it open until the process exits
43 | dbDisconnect(db)
44 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: pgvector
2 | Version: 0.1.0
3 | Imports:
4 |     DBI,
5 |     dbx,
6 |     RPostgres
7 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2023 Andrew Kane
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # pgvector-r
  2 | 
  3 | [pgvector](https://github.com/pgvector/pgvector) examples for R
  4 | 
  5 | Supports [DBI](https://github.com/r-dbi/DBI) and [dbx](https://github.com/ankane/dbx)
  6 | 
  7 | [![Build Status](https://github.com/pgvector/pgvector-r/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-r/actions)
  8 | 
  9 | ## Getting Started
 10 | 
 11 | Follow the instructions for your database library:
 12 | 
 13 | - [DBI](#dbi)
 14 | - [dbx](#dbx)
 15 | 
 16 | Or check out an example:
 17 | 
 18 | - [Embeddings](examples/openai/example.R) with OpenAI
 19 | - [Binary embeddings](examples/cohere/example.R) with Cohere
 20 | - [Sparse search](examples/sparse/example.R) with Text Embeddings Inference
 21 | - [Molecular fingerprints](examples/fingerprint/example.R) with ChemmineR
 22 | 
 23 | ## DBI
 24 | 
 25 | Enable the extension
 26 | 
 27 | ```r
 28 | dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector")
 29 | ```
 30 | 
 31 | Create a table
 32 | 
 33 | ```r
 34 | dbExecute(db, "CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))")
 35 | ```
 36 | 
 37 | Insert vectors
 38 | 
 39 | ```r
 40 | encodeVector <- function(vec) {
 41 |   stopifnot(is.numeric(vec))
 42 |   paste0("[", paste(vec, collapse=","), "]")
 43 | }
 44 | 
 45 | embeddings <- list(
 46 |   c(1, 1, 1),
 47 |   c(2, 2, 2),
 48 |   c(1, 1, 2)
 49 | )
 50 | 
 51 | items <- data.frame(embedding=sapply(embeddings, encodeVector))
 52 | dbAppendTable(db, "items", items)
 53 | ```
 54 | 
 55 | Get the nearest neighbors
 56 | 
 57 | ```r
 58 | params <- list(encodeVector(c(1, 2, 3)))
 59 | dbGetQuery(db, "SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", params=params)
 60 | ```
 61 | 
 62 | Add an approximate index
 63 | 
 64 | ```r
 65 | dbExecute(db, "CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)")
 66 | # or
 67 | dbExecute(db, "CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)")
 68 | ```
 69 | 
 70 | Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
 71 | 
 72 | See a [full example](DBI/example.R)
 73 | 
 74 | ## dbx
 75 | 
 76 | Enable the extension
 77 | 
 78 | ```r
 79 | dbxExecute(db, "CREATE EXTENSION IF NOT EXISTS vector")
 80 | ```
 81 | 
 82 | Create a table
 83 | 
 84 | ```r
 85 | dbxExecute(db, "CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))")
 86 | ```
 87 | 
 88 | Insert vectors
 89 | 
 90 | ```r
 91 | encodeVector <- function(vec) {
 92 |   stopifnot(is.numeric(vec))
 93 |   paste0("[", paste(vec, collapse=","), "]")
 94 | }
 95 | 
 96 | embeddings <- list(
 97 |   c(1, 1, 1),
 98 |   c(2, 2, 2),
 99 |   c(1, 1, 2)
100 | )
101 | 
102 | items <- data.frame(embedding=sapply(embeddings, encodeVector))
103 | dbxInsert(db, "items", items)
104 | ```
105 | 
106 | Get the nearest neighbors
107 | 
108 | ```r
109 | params <- list(encodeVector(c(1, 2, 3)))
110 | dbxSelect(db, "SELECT * FROM items ORDER BY embedding <-> ? LIMIT 5", params=params)
111 | ```
112 | 
113 | Add an approximate index
114 | 
115 | ```r
116 | dbxExecute(db, "CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)")
117 | # or
118 | dbxExecute(db, "CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)")
119 | ```
120 | 
121 | Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
122 | 
123 | See a [full example](dbx/example.R)
124 | 
125 | ## Contributing
126 | 
127 | Everyone is encouraged to help improve this project. Here are a few ways you can help:
128 | 
129 | - [Report bugs](https://github.com/pgvector/pgvector-r/issues)
130 | - Fix bugs and [submit pull requests](https://github.com/pgvector/pgvector-r/pulls)
131 | - Write, clarify, or fix documentation
132 | - Suggest or add new features
133 | 
134 | To get started with development:
135 | 
136 | ```sh
137 | git clone https://github.com/pgvector/pgvector-r.git
138 | cd pgvector-r
139 | createdb pgvector_r_test
140 | Rscript -e "install.packages('remotes', repos='https://cloud.r-project.org')"
141 | Rscript -e "remotes::install_deps(dependencies=TRUE)"
142 | Rscript DBI/example.R
143 | Rscript dbx/example.R
144 | ```
145 | 
146 | To run an example:
147 | 
148 | ```sh
149 | cd examples/openai
150 | createdb pgvector_example
151 | Rscript -e "remotes::install_deps(dependencies=TRUE)"
152 | Rscript example.R
153 | ```
154 | 


--------------------------------------------------------------------------------
/dbx/example.R:
--------------------------------------------------------------------------------
 1 | # pgvector with dbx: create a table, insert vectors, run a nearest-neighbor
 2 | # query, and add an approximate index.
 3 | 
 4 | library(dbx)
 5 | 
 6 | db <- dbxConnect(adapter="postgres", dbname="pgvector_r_test")
 7 | 
 8 | invisible(dbxExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
 9 | invisible(dbxExecute(db, "DROP TABLE IF EXISTS items"))
10 | invisible(dbxExecute(db, "CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))"))
11 | 
12 | # Serialize a numeric vector as text, e.g. c(1, 2, 3) -> "[1,2,3]".
13 | # Stops if `vec` is not numeric.
14 | encodeVector <- function(vec) {
15 |   stopifnot(is.numeric(vec))
16 |   paste0("[", paste(vec, collapse=","), "]")
17 | }
18 | 
19 | # Parse text of the form "[1,2,3]" back into a numeric vector.
20 | decodeVector <- function(str) {
21 |   # strip the first and last character (the brackets), then split on commas
22 |   as.numeric(strsplit(substring(str, 2, nchar(str) - 1), ",")[[1]])
23 | }
24 | 
25 | embeddings <- list(
26 |   c(1, 1, 1),
27 |   c(2, 2, 2),
28 |   c(1, 1, 2)
29 | )
30 | 
31 | # vapply() always yields a character vector; sapply() would return a list for empty input
32 | items <- data.frame(embedding=vapply(embeddings, encodeVector, character(1)))
33 | invisible(dbxInsert(db, "items", items))
34 | 
35 | # nearest neighbors of (1, 1, 1), ordered by the <-> distance operator
36 | params <- list(encodeVector(c(1, 1, 1)))
37 | result <- dbxSelect(db, "SELECT * FROM items ORDER BY embedding <-> ? LIMIT 5", params=params)
38 | print(lapply(result$embedding, decodeVector))
39 | 
40 | invisible(dbxExecute(db, "CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)"))
41 | 
42 | # close the connection explicitly instead of leaving it open until the process exits
43 | dbxDisconnect(db)
44 | 


--------------------------------------------------------------------------------
/examples/cohere/Description:
--------------------------------------------------------------------------------
1 | Package: example
2 | Version: 0.1.0
3 | Imports:
4 |     DBI,
5 |     httr2,
6 |     RPostgres
7 | 


--------------------------------------------------------------------------------
/examples/cohere/example.R:
--------------------------------------------------------------------------------
 1 | # Binary embeddings from Cohere, stored in a bit(1536) column and ranked with
 2 | # the <~> operator.
 3 | 
 4 | library(DBI)
 5 | library(httr2)
 6 | 
 7 | db <- dbConnect(RPostgres::Postgres(), dbname="pgvector_example")
 8 | 
 9 | invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
10 | invisible(dbExecute(db, "DROP TABLE IF EXISTS documents"))
11 | invisible(dbExecute(db, "CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1536))"))
12 | 
13 | # Convert a sequence of byte values into one string of "0"/"1" characters,
14 | # eight per byte, most significant bit first.
15 | toBits <- function(ubinary) {
16 |   # intToBits() lists the least significant bit first, so reverse each byte;
17 |   # vapply() pins the result to an 8-row integer matrix (one column per byte)
18 |   bits <- vapply(ubinary, function(v) { rev(as.integer(intToBits(v)[1:8])) }, integer(8))
19 |   paste0(bits, collapse="")
20 | }
21 | 
22 | # Request "ubinary" embeddings for `texts` from the Cohere embed endpoint and
23 | # return them as a character vector of bit strings, one per text.
24 | # `inputType` is sent as the request's input_type field.
25 | # Stops if the CO_API_KEY environment variable is empty or unset.
26 | embed <- function(texts, inputType) {
27 |   url <- "https://api.cohere.com/v2/embed"
28 |   token <- Sys.getenv("CO_API_KEY")
29 |   # fail with a clear message rather than sending an empty bearer token
30 |   if (!nzchar(token)) {
31 |     stop("CO_API_KEY is not set", call.=FALSE)
32 |   }
33 |   data <- list(
34 |     texts=texts,
35 |     model="embed-v4.0",
36 |     input_type=inputType,
37 |     embedding_types=list("ubinary")
38 |   )
39 | 
40 |   resp <- request(url) |> req_auth_bearer_token(token) |> req_body_json(data) |> req_perform()
41 |   vapply((resp |> resp_body_json())$embeddings$ubinary, toBits, character(1))
42 | }
43 | 
44 | input <- c(
45 |   "The dog is barking",
46 |   "The cat is purring",
47 |   "The bear is growling"
48 | )
49 | embeddings <- embed(input, "search_document")
50 | items <- data.frame(content=input, embedding=embeddings)
51 | invisible(dbAppendTable(db, "documents", items))
52 | 
53 | query <- "forest"
54 | # list() keeps the single query as a JSON array in the request body
55 | queryEmbedding <- embed(list(query), "search_query")[[1]]
56 | params <- list(queryEmbedding)
57 | result <- dbGetQuery(db, "SELECT content FROM documents ORDER BY embedding <~> $1 LIMIT 5", params=params)
58 | print(result$content)
59 | 
60 | # close the connection explicitly instead of leaving it open until the process exits
61 | dbDisconnect(db)
62 | 


--------------------------------------------------------------------------------
/examples/fingerprint/Description:
--------------------------------------------------------------------------------
1 | Package: example
2 | Version: 0.1.0
3 | Imports:
4 |     ChemmineR,
5 |     DBI,
6 |     RPostgres
7 | Remotes:
8 |     bioc::release/ChemmineR
9 | 


--------------------------------------------------------------------------------
/examples/fingerprint/example.R:
--------------------------------------------------------------------------------
 1 | # good resource
 2 | # https://www.bioconductor.org/packages/release/bioc/vignettes/ChemmineR/inst/doc/ChemmineR.html
 3 | 
 4 | library(ChemmineR)
 5 | library(DBI)
 6 | 
 7 | db <- dbConnect(RPostgres::Postgres(), dbname="pgvector_example")
 8 | 
 9 | invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
10 | invisible(dbExecute(db, "DROP TABLE IF EXISTS molecules"))
11 | invisible(dbExecute(db, "CREATE TABLE molecules (id text PRIMARY KEY, fingerprint bit(1024))"))
12 | 
13 | # build fingerprints for the sample molecule set and store them as text
14 | data(sdfsample)
15 | fpset <- desc2fp(sdf2ap(sdfsample))
16 | molecules <- data.frame(id=sdfid(sdfsample), fingerprint=as.character(fpset))
17 | invisible(dbAppendTable(db, "molecules", molecules))
18 | 
19 | # rank all molecules against the first molecule's fingerprint using the <%> operator
20 | params <- list(molecules$fingerprint[[1]])
21 | result <- dbGetQuery(db, "SELECT id FROM molecules ORDER BY fingerprint <%> $1 LIMIT 5", params=params)
22 | print(result$id)
23 | 
24 | # close the connection explicitly instead of leaving it open until the process exits
25 | dbDisconnect(db)
26 | 


--------------------------------------------------------------------------------
/examples/openai/Description:
--------------------------------------------------------------------------------
1 | Package: example
2 | Version: 0.1.0
3 | Imports:
4 |     DBI,
5 |     httr2,
6 |     RPostgres
7 | 


--------------------------------------------------------------------------------
/examples/openai/example.R:
--------------------------------------------------------------------------------
 1 | # OpenAI embeddings stored in a vector(1536) column and ranked with the <=>
 2 | # operator.
 3 | 
 4 | library(DBI)
 5 | library(httr2)
 6 | 
 7 | db <- dbConnect(RPostgres::Postgres(), dbname="pgvector_example")
 8 | 
 9 | invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
10 | invisible(dbExecute(db, "DROP TABLE IF EXISTS documents"))
11 | invisible(dbExecute(db, "CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(1536))"))
12 | 
13 | # Request embeddings for `input` from the OpenAI embeddings endpoint and return
14 | # a list with one vector per entry of the response's `data` field.
15 | # Stops if the OPENAI_API_KEY environment variable is empty or unset.
16 | embed <- function(input) {
17 |   url <- "https://api.openai.com/v1/embeddings"
18 |   token <- Sys.getenv("OPENAI_API_KEY")
19 |   # fail with a clear message rather than sending an empty bearer token
20 |   if (!nzchar(token)) {
21 |     stop("OPENAI_API_KEY is not set", call.=FALSE)
22 |   }
23 |   data <- list(
24 |     input=input,
25 |     model="text-embedding-3-small"
26 |   )
27 | 
28 |   resp <- request(url) |> req_auth_bearer_token(token) |> req_body_json(data) |> req_perform()
29 |   # each embedding arrives as a JSON array (an R list); flatten it to a vector
30 |   lapply((resp |> resp_body_json())$data, function(x) { unlist(x$embedding) })
31 | }
32 | 
33 | # Serialize a numeric vector as text, e.g. c(1, 2, 3) -> "[1,2,3]".
34 | # Stops if `vec` is not numeric.
35 | encodeVector <- function(vec) {
36 |   stopifnot(is.numeric(vec))
37 |   paste0("[", paste(vec, collapse=","), "]")
38 | }
39 | 
40 | input <- c(
41 |   "The dog is barking",
42 |   "The cat is purring",
43 |   "The bear is growling"
44 | )
45 | embeddings <- embed(input)
46 | # vapply() always yields a character vector; sapply() would return a list for empty input
47 | items <- data.frame(content=input, embedding=vapply(embeddings, encodeVector, character(1)))
48 | invisible(dbAppendTable(db, "documents", items))
49 | 
50 | query <- "forest"
51 | queryEmbedding <- embed(c(query))[[1]]
52 | params <- list(encodeVector(queryEmbedding))
53 | result <- dbGetQuery(db, "SELECT content FROM documents ORDER BY embedding <=> $1 LIMIT 5", params=params)
54 | print(result$content)
55 | 
56 | # close the connection explicitly instead of leaving it open until the process exits
57 | dbDisconnect(db)
58 | 
41 | 


--------------------------------------------------------------------------------
/examples/sparse/Description:
--------------------------------------------------------------------------------
1 | Package: example
2 | Version: 0.1.0
3 | Imports:
4 |     DBI,
5 |     httr2,
6 |     Matrix,
7 |     RPostgres
8 | 


--------------------------------------------------------------------------------
/examples/sparse/example.R:
--------------------------------------------------------------------------------
 1 | # good resources
 2 | # https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/
 3 | # https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1
 4 | #
 5 | # run with
 6 | # text-embeddings-router --model-id opensearch-project/opensearch-neural-sparse-encoding-v1 --pooling splade
 7 | 
 8 | library(DBI)
 9 | library(httr2)
10 | library(Matrix)
11 | 
12 | db <- dbConnect(RPostgres::Postgres(), dbname="pgvector_example")
13 | 
14 | invisible(dbExecute(db, "CREATE EXTENSION IF NOT EXISTS vector"))
15 | invisible(dbExecute(db, "DROP TABLE IF EXISTS documents"))
16 | invisible(dbExecute(db, "CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))"))
17 | 
18 | # Request sparse embeddings for `inputs` from the local embedding server and
19 | # return a list with one sparseVector of length 30522 per response entry.
20 | embed <- function(inputs) {
21 |   url <- "http://localhost:3000/embed_sparse"
22 |   data <- list(
23 |     inputs=inputs
24 |   )
25 | 
26 |   resp <- request(url) |> req_body_json(data) |> req_perform()
27 |   # lapply() always returns a list; the elements are S4 objects, which cannot be simplified
28 |   lapply((resp |> resp_body_json()), function(v) {
29 |     indices <- vapply(v, function(e) { as.integer(e$index) }, integer(1))
30 |     values <- vapply(v, function(e) { e$value }, numeric(1))
31 |     # the server reports zero-based token indices, while sparseVector positions
32 |     # are one-based, so shift by one
33 |     sparseVector(i=indices + 1L, x=values, length=30522)
34 |   })
35 | }
36 | 
37 | # Serialize a sparseVector as text of the form "{index:value,...}/length".
38 | # Stops if `vec` is not a sparseVector.
39 | encodeSparseVector <- function(vec) {
40 |   stopifnot(inherits(vec, "sparseVector"))
41 |   # paste0() is vectorized over the index (@i) and value (@x) slots
42 |   elements <- paste0(vec@i, ":", vec@x)
43 |   paste0("{", paste0(elements, collapse=","), "}/", length(vec))
44 | }
45 | 
46 | input <- c(
47 |   "The dog is barking",
48 |   "The cat is purring",
49 |   "The bear is growling"
50 | )
51 | embeddings <- embed(input)
52 | # vapply() always yields a character vector; sapply() would return a list for empty input
53 | items <- data.frame(content=input, embedding=vapply(embeddings, encodeSparseVector, character(1)))
54 | invisible(dbAppendTable(db, "documents", items))
55 | 
56 | query <- "forest"
57 | queryEmbedding <- embed(c(query))[[1]]
58 | params <- list(encodeSparseVector(queryEmbedding))
59 | result <- dbGetQuery(db, "SELECT content FROM documents ORDER BY embedding <#> $1 LIMIT 5", params=params)
60 | print(result$content)
61 | 
62 | # close the connection explicitly instead of leaving it open until the process exits
63 | dbDisconnect(db)
64 | 
52 | 


--------------------------------------------------------------------------------