├── .vscode └── settings.json ├── LICENSE ├── README.md └── supabase ├── .gitignore ├── config.toml ├── functions ├── _utils │ ├── env.ts │ └── errors.ts ├── import_map.json └── vector-search │ └── index.ts ├── migrations └── 20230525043454_init.sql └── seed.sql /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "deno.enablePaths": ["./supabase/functions"], 3 | "deno.importMap": "./supabase/functions/import_map.json" 4 | } 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Supabase 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Headless Vector Search 2 | 3 | Provides a vector/similarity search for any documentation site. It's [headless](https://en.wikipedia.org/wiki/Headless_software), so that you can integrate it into your existing website. 4 | 5 | #### How it works: 6 | 7 | - This repo initializes a new `docs` schema inside your database. 8 | - The accompanying [GitHub Action](https://github.com/supabase/supabase-vector-embeddings-github-action) ingests your markdown docs into your database as embeddings. 9 | - This repo provides an Edge Function that handles user queries, converting them into ChatGPT-like responses. 10 | 11 | #### Tech stack: 12 | 13 | - Supabase: Database & Edge Functions. 14 | - OpenAI: Embeddings and completions. 15 | - GitHub Actions: for ingesting your markdown docs. 16 | 17 | ## Set-up 18 | 19 | Start by creating a new Supabase Project: [database.new](https://database.new). 20 | 21 | 1. Clone this repo 22 | 2. Link the repo to your remote project: `supabase link --project-ref XXX` 23 | 3. Apply the database migrations: `supabase db push` 24 | 4. Set your OpenAI key as a secret: `supabase secrets set OPENAI_API_KEY=sk-xxx` 25 | 5. Deploy the Edge Functions: `supabase functions deploy --no-verify-jwt` 26 | 6. Expose `docs` schema via API in Supabase Dashboard [settings](https://app.supabase.com/project/_/settings/api) > `API Settings` > `Exposed schemas` 27 | 7. [Setup](https://github.com/supabase/supabase-vector-embeddings-github-action#use) `supabase-vector-embeddings` GitHub action in your Knowledge Base repo. You will see the embeddings populated in your database after the GitHub Action has run. 28 | 29 | ## Usage 30 | 31 | 1. Find the URL for the `vector-search` Edge Function in the [Functions section](https://app.supabase.com/project/_/functions) of the Dashboard. 32 | 2. 
Inside your application, you can send the user queries to this endpoint to receive a streamed response from OpenAI. 33 | 34 |
35 | See cURL example 36 | 37 | ```bash 38 | curl -i --location --request GET 'https://your-project-ref.functions.supabase.co/vector-search?query=What%27s+Supabase%3F' 39 | ``` 40 | 41 |
42 | 43 |
44 | See EventSource example 45 | 46 | 47 | ```ts 48 | const onSubmit = (e: Event) => { 49 | e.preventDefault() 50 | answer.value = "" 51 | isLoading.value = true 52 | 53 | const query = new URLSearchParams({ query: inputRef.current!.value }) 54 | const projectUrl = `https://your-project-ref.functions.supabase.co` 55 | const queryURL = `${projectUrl}/vector-search?${query}` 56 | const eventSource = new EventSource(queryURL) 57 | 58 | eventSource.addEventListener("error", (err) => { 59 | isLoading.value = false 60 | console.error(err) 61 | }) 62 | 63 | eventSource.addEventListener("message", (e: MessageEvent) => { 64 | isLoading.value = false 65 | 66 | if (e.data === "[DONE]") { 67 | eventSource.close() 68 | return 69 | } 70 | 71 | const completionResponse: CreateCompletionResponse = JSON.parse(e.data) 72 | const text = completionResponse.choices[0].text 73 | 74 | answer.value += text 75 | }); 76 | 77 | isLoading.value = true 78 | } 79 | ``` 80 | 81 |
82 | 83 | 84 | ## Showcase 85 | 86 | - [docs.supabase.com](https://supabase.com/docs) - Use cmd+k to access. 87 | 88 | ## License 89 | 90 | MIT 91 | -------------------------------------------------------------------------------- /supabase/.gitignore: -------------------------------------------------------------------------------- 1 | # Supabase 2 | .branches 3 | .temp 4 | -------------------------------------------------------------------------------- /supabase/config.toml: -------------------------------------------------------------------------------- 1 | # A string used to distinguish different Supabase projects on the same host. Defaults to the working 2 | # directory name when running `supabase init`. 3 | project_id = "headless-supabase-vector-search" 4 | 5 | [api] 6 | # Port to use for the API URL. 7 | port = 54321 8 | # Schemas to expose in your API. Tables, views and stored procedures in this schema will get API 9 | # endpoints. public and storage are always included. 10 | schemas = ["public", "storage", "graphql_public"] 11 | # Extra schemas to add to the search_path of every request. public is always included. 12 | extra_search_path = ["public", "extensions"] 13 | # The maximum number of rows returns from a view, table, or stored procedure. Limits payload size 14 | # for accidental or malicious requests. 15 | max_rows = 1000 16 | 17 | [db] 18 | # Port to use for the local database URL. 19 | port = 54322 20 | # The database major version to use. This has to be the same as your remote database's. Run `SHOW 21 | # server_version;` on the remote database to check. 22 | major_version = 15 23 | 24 | [studio] 25 | # Port to use for Supabase Studio. 26 | port = 54323 27 | 28 | # Email testing server. Emails sent with the local dev setup are not actually sent - rather, they 29 | # are monitored, and you can view the emails that would have been sent from the web interface. 30 | [inbucket] 31 | # Port to use for the email testing server web interface. 
32 | port = 54324 33 | smtp_port = 54325 34 | pop3_port = 54326 35 | 36 | [storage] 37 | # The maximum file size allowed (e.g. "5MB", "500KB"). 38 | file_size_limit = "50MiB" 39 | 40 | [auth] 41 | # The base URL of your website. Used as an allow-list for redirects and for constructing URLs used 42 | # in emails. 43 | site_url = "http://localhost:3000" 44 | # A list of *exact* URLs that auth providers are permitted to redirect to post authentication. 45 | additional_redirect_urls = ["https://localhost:3000"] 46 | # How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 seconds (one 47 | # week). 48 | jwt_expiry = 3600 49 | # Allow/disallow new user signups to your project. 50 | enable_signup = true 51 | 52 | [auth.email] 53 | # Allow/disallow new user signups via email to your project. 54 | enable_signup = true 55 | # If enabled, a user will be required to confirm any email change on both the old, and new email 56 | # addresses. If disabled, only the new email is required to confirm. 57 | double_confirm_changes = true 58 | # If enabled, users need to confirm their email address before signing in. 59 | enable_confirmations = false 60 | 61 | # Use an external OAuth provider. The full list of providers are: `apple`, `azure`, `bitbucket`, 62 | # `discord`, `facebook`, `github`, `gitlab`, `google`, `keycloak`, `linkedin`, `notion`, `twitch`, 63 | # `twitter`, `slack`, `spotify`, `workos`, `zoom`. 64 | [auth.external.apple] 65 | enabled = false 66 | client_id = "" 67 | secret = "" 68 | # Overrides the default auth redirectUrl. 69 | redirect_uri = "" 70 | # Overrides the default auth provider URL. Used to support self-hosted gitlab, single-tenant Azure, 71 | # or any other third-party OIDC providers. 72 | url = "" 73 | 74 | [analytics] 75 | enabled = false 76 | port = 54327 77 | vector_port = 54328 78 | # Setup BigQuery project to enable log viewer on local development stack. 
79 | # See: https://supabase.com/docs/guides/getting-started/local-development#enabling-local-logging 80 | gcp_project_id = "" 81 | gcp_project_number = "" 82 | gcp_jwt_path = "supabase/gcloud.json" 83 | -------------------------------------------------------------------------------- /supabase/functions/_utils/env.ts: -------------------------------------------------------------------------------- 1 | import { assert } from "std/testing/asserts.ts"; 2 | 3 | // Throws with an assertion error if the specified environment variable is not defined 4 | export function ensureGetEnv(key: string) { 5 | const value = Deno.env.get(key); 6 | assert(value !== undefined, `Missing ${key} environment variable`); 7 | return value; 8 | } 9 | -------------------------------------------------------------------------------- /supabase/functions/_utils/errors.ts: -------------------------------------------------------------------------------- 1 | export class ApplicationError extends Error { 2 | // deno-lint-ignore no-explicit-any 3 | constructor(message: string, public data: Record = {}) { 4 | super(message); 5 | } 6 | } 7 | 8 | export class UserError extends ApplicationError {} 9 | -------------------------------------------------------------------------------- /supabase/functions/import_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "imports": { 3 | "commmon-tags": "https://esm.sh/common-tags@1.8.2", 4 | "gpt3-tokenizer": "https://esm.sh/gpt3-tokenizer@1.1.5", 5 | "openai": "https://esm.sh/openai@3.2.1", 6 | "preact": "https://esm.sh/preact@10.13.2", 7 | "preact/": "https://esm.sh/preact@10.13.2/", 8 | "preact-render-to-string": "https://esm.sh/*preact-render-to-string@5.2.6", 9 | "@preact/signals": "https://esm.sh/*@preact/signals@1.1.3", 10 | "@preact/signals-core": "https://esm.sh/*@preact/signals-core@1.3.0", 11 | "@supabase/supabase-js": "https://esm.sh/@supabase/supabase-js@2.21.0", 12 | "std/": 
"https://deno.land/std@0.184.0/", 13 | "twind": "https://esm.sh/twind@0.16.19", 14 | "twind/": "https://esm.sh/twind@0.16.19/", 15 | "xhr": "https://deno.land/x/xhr@0.3.0/mod.ts", 16 | "github-slugger": "npm:github-slugger@2.0.0", 17 | "mdast-util-from-markdown": "npm:mdast-util-from-markdown@1.3.0", 18 | "mdast-util-mdx": "npm:mdast-util-mdx@2.0.1", 19 | "mdast-util-to-markdown": "npm:mdast-util-to-markdown@1.5.0", 20 | "mdast-util-to-string": "npm:mdast-util-to-string@3.2.0", 21 | "micromark-extension-mdxjs": "npm:micromark-extension-mdxjs@1.0.0", 22 | "unist-builder": "npm:unist-builder@3.0.1", 23 | "unist-util-filter": "npm:unist-util-filter@4.0.1", 24 | "types/mdast": "npm:@types/mdast@3.0.11", 25 | "types/estree": "npm:@types/estree@1.0.0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /supabase/functions/vector-search/index.ts: -------------------------------------------------------------------------------- 1 | import "xhr"; 2 | import { serve } from "std/http/server.ts"; 3 | import { createClient } from "@supabase/supabase-js"; 4 | import { codeBlock, oneLine } from "commmon-tags"; 5 | import GPT3Tokenizer from "gpt3-tokenizer"; 6 | import { Configuration, CreateCompletionRequest, OpenAIApi } from "openai"; 7 | import { ensureGetEnv } from "../_utils/env.ts"; 8 | import { ApplicationError, UserError } from "../_utils/errors.ts"; 9 | 10 | const OPENAI_API_KEY = ensureGetEnv("OPENAI_API_KEY"); 11 | const SUPABASE_URL = ensureGetEnv("SUPABASE_URL"); 12 | const SUPABASE_SERVICE_ROLE_KEY = ensureGetEnv("SUPABASE_SERVICE_ROLE_KEY"); 13 | 14 | const supabaseClient = createClient(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, { 15 | db: { schema: "docs" }, 16 | }); 17 | const openAiConfiguration = new Configuration({ apiKey: OPENAI_API_KEY }); 18 | const openai = new OpenAIApi(openAiConfiguration); 19 | 20 | export const corsHeaders = { 21 | "Access-Control-Allow-Origin": "*", 22 | "Access-Control-Allow-Headers": 23 | 
"authorization, x-client-info, apikey, content-type", 24 | }; 25 | 26 | serve(async (req) => { 27 | try { 28 | // Handle CORS 29 | if (req.method === "OPTIONS") { 30 | return new Response("ok", { headers: corsHeaders }); 31 | } 32 | 33 | const query = new URL(req.url).searchParams.get("query"); 34 | 35 | if (!query) { 36 | throw new UserError("Missing query in request data"); 37 | } 38 | 39 | const sanitizedQuery = query.trim(); 40 | 41 | // Moderate the content to comply with OpenAI T&C 42 | const moderationResponse = await openai.createModeration({ 43 | input: sanitizedQuery, 44 | }); 45 | 46 | const [results] = moderationResponse.data.results; 47 | 48 | if (results.flagged) { 49 | throw new UserError("Flagged content", { 50 | flagged: true, 51 | categories: results.categories, 52 | }); 53 | } 54 | 55 | const embeddingResponse = await openai.createEmbedding({ 56 | model: "text-embedding-ada-002", 57 | input: sanitizedQuery.replaceAll("\n", " "), 58 | }); 59 | 60 | if (embeddingResponse.status !== 200) { 61 | throw new ApplicationError( 62 | "Failed to create embedding for question", 63 | embeddingResponse 64 | ); 65 | } 66 | 67 | const [{ embedding }] = embeddingResponse.data.data; 68 | 69 | const { error: matchError, data: pageSections } = await supabaseClient.rpc( 70 | "match_page_sections", 71 | { 72 | embedding, 73 | match_threshold: 0.78, 74 | match_count: 10, 75 | min_content_length: 50, 76 | } 77 | ); 78 | 79 | if (matchError) { 80 | throw new ApplicationError("Failed to match page sections", matchError); 81 | } 82 | 83 | const tokenizer = new GPT3Tokenizer({ type: "gpt3" }); 84 | let tokenCount = 0; 85 | let contextText = ""; 86 | 87 | for (const pageSection of pageSections) { 88 | const content = pageSection.content; 89 | const encoded = tokenizer.encode(content); 90 | tokenCount += encoded.text.length; 91 | 92 | if (tokenCount >= 1500) { 93 | break; 94 | } 95 | 96 | contextText += `${content.trim()}\n---\n`; 97 | } 98 | 99 | const prompt = codeBlock` 
100 | ${oneLine` 101 | You are a very enthusiastic Supabase representative who loves 102 | to help people! Given the following sections from the Supabase 103 | documentation, answer the question using only that information, 104 | outputted in markdown format. If you are unsure and the answer 105 | is not explicitly written in the documentation, say 106 | "Sorry, I don't know how to help with that." 107 | `} 108 | 109 | Context sections: 110 | ${contextText} 111 | 112 | Question: """ 113 | ${sanitizedQuery} 114 | """ 115 | 116 | Answer as markdown (including related code snippets if available): 117 | `; 118 | 119 | const completionOptions: CreateCompletionRequest = { 120 | model: "gpt-3.5-turbo-instruct", 121 | prompt, 122 | max_tokens: 512, 123 | temperature: 0, 124 | stream: true, 125 | }; 126 | 127 | // The Fetch API allows for easier response streaming over the OpenAI client. 128 | const response = await fetch("https://api.openai.com/v1/completions", { 129 | headers: { 130 | Authorization: `Bearer ${OPENAI_API_KEY}`, 131 | "Content-Type": "application/json", 132 | }, 133 | method: "POST", 134 | body: JSON.stringify(completionOptions), 135 | }); 136 | 137 | if (!response.ok) { 138 | const error = await response.json(); 139 | throw new ApplicationError("Failed to generate completion", error); 140 | } 141 | 142 | // Proxy the streamed SSE response from OpenAI 143 | return new Response(response.body, { 144 | headers: { 145 | ...corsHeaders, 146 | "Content-Type": "text/event-stream", 147 | }, 148 | }); 149 | } catch (err: unknown) { 150 | if (err instanceof UserError) { 151 | return Response.json( 152 | { 153 | error: err.message, 154 | data: err.data, 155 | }, 156 | { 157 | status: 400, 158 | headers: corsHeaders, 159 | } 160 | ); 161 | } else if (err instanceof ApplicationError) { 162 | // Print out application errors with their additional data 163 | console.error(`${err.message}: ${JSON.stringify(err.data)}`); 164 | } else { 165 | // Print out unexpected errors as 
is to help with debugging 166 | console.error(err); 167 | } 168 | 169 | // TODO: include more response info in debug environments 170 | return Response.json( 171 | { 172 | error: "There was an error processing your request", 173 | }, 174 | { 175 | status: 500, 176 | headers: corsHeaders, 177 | } 178 | ); 179 | } 180 | }); 181 | -------------------------------------------------------------------------------- /supabase/migrations/20230525043454_init.sql: -------------------------------------------------------------------------------- 1 | -- Create separate docs schema and grants 2 | create schema if not exists docs; 3 | grant usage on schema docs to postgres, service_role; 4 | 5 | -- Enable pgvector extension 6 | create extension if not exists vector; 7 | 8 | -- Create tables 9 | create table "docs"."page" ( 10 | id bigserial primary key, 11 | parent_page_id bigint references docs.page, 12 | path text not null unique, 13 | checksum text, 14 | meta jsonb, 15 | type text, 16 | source text, 17 | "version" uuid, 18 | "last_refresh" timestamptz 19 | ); 20 | alter table "docs"."page" enable row level security; 21 | 22 | create table "docs"."page_section" ( 23 | id bigserial primary key, 24 | page_id bigint not null references docs.page on delete cascade, 25 | content text, 26 | token_count int, 27 | embedding vector(1536), 28 | slug text, 29 | heading text 30 | ); 31 | alter table "docs"."page_section" enable row level security; 32 | 33 | -- Create embedding similarity search functions 34 | create or replace function "docs"."match_page_sections"(embedding vector(1536), match_threshold float, match_count int, min_content_length int) 35 | returns table (id bigint, page_id bigint, slug text, heading text, content text, similarity float) 36 | language plpgsql 37 | as $$ 38 | #variable_conflict use_variable 39 | begin 40 | return query 41 | select 42 | page_section.id, 43 | page_section.page_id, 44 | page_section.slug, 45 | page_section.heading, 46 | page_section.content, 47 | 
(page_section.embedding <#> embedding) * -1 as similarity 48 | from page_section 49 | 50 | -- We only care about sections that have a useful amount of content 51 | where length(page_section.content) >= min_content_length 52 | 53 | -- The dot product is negative because of a Postgres limitation, so we negate it 54 | and (page_section.embedding <#> embedding) * -1 > match_threshold 55 | 56 | -- OpenAI embeddings are normalized to length 1, so 57 | -- cosine similarity and dot product will produce the same results. 58 | -- Using dot product which can be computed slightly faster. 59 | -- 60 | -- For the different syntaxes, see https://github.com/pgvector/pgvector 61 | order by page_section.embedding <#> embedding 62 | 63 | limit match_count; 64 | end; 65 | $$; 66 | 67 | create or replace function "docs"."get_page_parents"(page_id bigint) 68 | returns table (id bigint, parent_page_id bigint, path text, meta jsonb) 69 | language sql 70 | as $$ 71 | with recursive chain as ( 72 | select * 73 | from docs.page 74 | where id = page_id 75 | 76 | union all 77 | 78 | select child.* 79 | from docs.page as child 80 | join chain on chain.parent_page_id = child.id 81 | ) 82 | select id, parent_page_id, path, meta 83 | from chain; 84 | $$; 85 | 86 | -- Update table grants 87 | ALTER DEFAULT PRIVILEGES IN SCHEMA docs 88 | GRANT ALL ON TABLES TO postgres, service_role; 89 | 90 | GRANT SELECT, INSERT, UPDATE, DELETE 91 | ON ALL TABLES IN SCHEMA docs 92 | TO postgres, service_role; 93 | 94 | GRANT USAGE, SELECT 95 | ON ALL SEQUENCES IN SCHEMA docs 96 | TO postgres, service_role; 97 | 98 | 99 | -------------------------------------------------------------------------------- /supabase/seed.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/supabase/headless-vector-search/a68de46917aeec4dfb41b4e853a40f08712301ac/supabase/seed.sql --------------------------------------------------------------------------------