├── .vscode
└── settings.json
├── LICENSE
├── README.md
└── supabase
├── .gitignore
├── config.toml
├── functions
├── _utils
│ ├── env.ts
│ └── errors.ts
├── import_map.json
└── vector-search
│ └── index.ts
├── migrations
└── 20230525043454_init.sql
└── seed.sql
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "deno.enablePaths": ["./supabase/functions"],
3 | "deno.importMap": "./supabase/functions/import_map.json"
4 | }
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Supabase
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Headless Vector Search
2 |
3 | Provides a vector/similarity search for any documentation site. It's [headless](https://en.wikipedia.org/wiki/Headless_software), so that you can integrate it into your existing website.
4 |
5 | #### How it works:
6 |
7 | - This repo initializes a new `docs` schema inside your database.
8 | - The accompanying [GitHub Action](https://github.com/supabase/supabase-vector-embeddings-github-action) ingests your markdown docs into your database as embeddings.
9 | - This repo provides an Edge Function that handles user queries, converting them into ChatGPT-like responses.
10 |
11 | #### Tech stack:
12 |
13 | - Supabase: Database & Edge Functions.
14 | - OpenAI: Embeddings and completions.
15 | - GitHub Actions: for ingesting your markdown docs.
16 |
17 | ## Set-up
18 |
19 | Start by creating a new Supabase Project: [database.new](https://database.new).
20 |
21 | 1. Clone this repo
22 | 2. Link the repo to your remote project: `supabase link --project-ref XXX`
23 | 3. Apply the database migrations: `supabase db push`
24 | 4. Set your OpenAI key as a secret: `supabase secrets set OPENAI_API_KEY=sk-xxx`
25 | 5. Deploy the Edge Functions: `supabase functions deploy --no-verify-jwt`
26 | 6. Expose `docs` schema via API in Supabase Dashboard [settings](https://app.supabase.com/project/_/settings/api) > `API Settings` > `Exposed schemas`
27 | 7. [Setup](https://github.com/supabase/supabase-vector-embeddings-github-action#use) `supabase-vector-embeddings` GitHub action in your Knowledge Base repo. You will see the embeddings populated in your database after the GitHub Action has run.
28 |
29 | ## Usage
30 |
31 | 1. Find the URL for the `vector-search` Edge Function in the [Functions section](https://app.supabase.com/project/_/functions) of the Dashboard.
32 | 2. Inside your application, you can send the user queries to this endpoint to receive a streamed response from OpenAI.
33 |
34 |
35 | See cURL example
36 |
37 | ```bash
38 | curl -i --location --request GET 'https://your-project-ref.functions.supabase.co/vector-search?query=What%27s+Supabase%3F'
39 | ```
40 |
41 |
42 |
43 |
44 | See EventSource example
45 |
46 |
47 | ```ts
48 | const onSubmit = (e: Event) => {
49 | e.preventDefault()
50 | answer.value = ""
51 | isLoading.value = true
52 |
53 | const query = new URLSearchParams({ query: inputRef.current!.value })
54 | const projectUrl = `https://your-project-ref.functions.supabase.co`
55 | const queryURL = `${projectUrl}/vector-search?${query}`
56 | const eventSource = new EventSource(queryURL)
57 |
58 | eventSource.addEventListener("error", (err) => {
59 | isLoading.value = false
60 | console.error(err)
61 | })
62 |
63 | eventSource.addEventListener("message", (e: MessageEvent) => {
64 | isLoading.value = false
65 |
66 | if (e.data === "[DONE]") {
67 | eventSource.close()
68 | return
69 | }
70 |
71 | const completionResponse: CreateCompletionResponse = JSON.parse(e.data)
72 | const text = completionResponse.choices[0].text
73 |
74 | answer.value += text
75 | });
76 |
77 | isLoading.value = true
78 | }
79 | ```
80 |
81 |
82 |
83 |
84 | ## Showcase
85 |
86 | - [docs.supabase.com](https://supabase.com/docs) - Use cmd+k to access.
87 |
88 | ## License
89 |
90 | MIT
91 |
--------------------------------------------------------------------------------
/supabase/.gitignore:
--------------------------------------------------------------------------------
1 | # Supabase
2 | .branches
3 | .temp
4 |
--------------------------------------------------------------------------------
/supabase/config.toml:
--------------------------------------------------------------------------------
1 | # A string used to distinguish different Supabase projects on the same host. Defaults to the working
2 | # directory name when running `supabase init`.
3 | project_id = "headless-supabase-vector-search"
4 |
5 | [api]
6 | # Port to use for the API URL.
7 | port = 54321
8 | # Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
9 | # endpoints. public and storage are always included.
10 | schemas = ["public", "storage", "graphql_public"]
11 | # Extra schemas to add to the search_path of every request. public is always included.
12 | extra_search_path = ["public", "extensions"]
13 | # The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
14 | # for accidental or malicious requests.
15 | max_rows = 1000
16 |
17 | [db]
18 | # Port to use for the local database URL.
19 | port = 54322
20 | # The database major version to use. This has to be the same as your remote database's. Run `SHOW
21 | # server_version;` on the remote database to check.
22 | major_version = 15
23 |
24 | [studio]
25 | # Port to use for Supabase Studio.
26 | port = 54323
27 |
28 | # Email testing server. Emails sent with the local dev setup are not actually sent - rather, they
29 | # are monitored, and you can view the emails that would have been sent from the web interface.
30 | [inbucket]
31 | # Port to use for the email testing server web interface.
32 | port = 54324
33 | smtp_port = 54325
34 | pop3_port = 54326
35 |
36 | [storage]
37 | # The maximum file size allowed (e.g. "5MB", "500KB").
38 | file_size_limit = "50MiB"
39 |
40 | [auth]
41 | # The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
42 | # in emails.
43 | site_url = "http://localhost:3000"
44 | # A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
45 | additional_redirect_urls = ["https://localhost:3000"]
46 | # How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 seconds (one
47 | # week).
48 | jwt_expiry = 3600
49 | # Allow/disallow new user signups to your project.
50 | enable_signup = true
51 |
52 | [auth.email]
53 | # Allow/disallow new user signups via email to your project.
54 | enable_signup = true
55 | # If enabled, a user will be required to confirm any email change on both the old, and new email
56 | # addresses. If disabled, only the new email is required to confirm.
57 | double_confirm_changes = true
58 | # If enabled, users need to confirm their email address before signing in.
59 | enable_confirmations = false
60 |
61 | # Use an external OAuth provider. The full list of providers is: `apple`, `azure`, `bitbucket`,
62 | # `discord`, `facebook`, `github`, `gitlab`, `google`, `keycloak`, `linkedin`, `notion`, `twitch`,
63 | # `twitter`, `slack`, `spotify`, `workos`, `zoom`.
64 | [auth.external.apple]
65 | enabled = false
66 | client_id = ""
67 | secret = ""
68 | # Overrides the default auth redirectUrl.
69 | redirect_uri = ""
70 | # Overrides the default auth provider URL. Used to support self-hosted gitlab, single-tenant Azure,
71 | # or any other third-party OIDC providers.
72 | url = ""
73 |
74 | [analytics]
75 | enabled = false
76 | port = 54327
77 | vector_port = 54328
78 | # Setup BigQuery project to enable log viewer on local development stack.
79 | # See: https://supabase.com/docs/guides/getting-started/local-development#enabling-local-logging
80 | gcp_project_id = ""
81 | gcp_project_number = ""
82 | gcp_jwt_path = "supabase/gcloud.json"
83 |
--------------------------------------------------------------------------------
/supabase/functions/_utils/env.ts:
--------------------------------------------------------------------------------
1 | import { assert } from "std/testing/asserts.ts";
2 |
3 | // Read the environment variable `key`, throwing an assertion error when it is unset.
4 | export function ensureGetEnv(key: string) {
5 | const envValue = Deno.env.get(key);
6 | assert(envValue !== undefined, `Missing ${key} environment variable`);
7 | return envValue;
8 | }
9 |
--------------------------------------------------------------------------------
/supabase/functions/_utils/errors.ts:
--------------------------------------------------------------------------------
1 | export class ApplicationError extends Error { // internal failure; carries structured context for logging
2 | // deno-lint-ignore no-explicit-any
3 | constructor(message: string, public data: Record<string, any> = {}) {
4 | super(message);
5 | }
6 | }
7 |
8 | export class UserError extends ApplicationError {} // caller mistake; the request handler maps this to HTTP 400
9 |
--------------------------------------------------------------------------------
/supabase/functions/import_map.json:
--------------------------------------------------------------------------------
1 | {
2 | "imports": {
3 | "commmon-tags": "https://esm.sh/common-tags@1.8.2",
4 | "gpt3-tokenizer": "https://esm.sh/gpt3-tokenizer@1.1.5",
5 | "openai": "https://esm.sh/openai@3.2.1",
6 | "preact": "https://esm.sh/preact@10.13.2",
7 | "preact/": "https://esm.sh/preact@10.13.2/",
8 | "preact-render-to-string": "https://esm.sh/*preact-render-to-string@5.2.6",
9 | "@preact/signals": "https://esm.sh/*@preact/signals@1.1.3",
10 | "@preact/signals-core": "https://esm.sh/*@preact/signals-core@1.3.0",
11 | "@supabase/supabase-js": "https://esm.sh/@supabase/supabase-js@2.21.0",
12 | "std/": "https://deno.land/std@0.184.0/",
13 | "twind": "https://esm.sh/twind@0.16.19",
14 | "twind/": "https://esm.sh/twind@0.16.19/",
15 | "xhr": "https://deno.land/x/xhr@0.3.0/mod.ts",
16 | "github-slugger": "npm:github-slugger@2.0.0",
17 | "mdast-util-from-markdown": "npm:mdast-util-from-markdown@1.3.0",
18 | "mdast-util-mdx": "npm:mdast-util-mdx@2.0.1",
19 | "mdast-util-to-markdown": "npm:mdast-util-to-markdown@1.5.0",
20 | "mdast-util-to-string": "npm:mdast-util-to-string@3.2.0",
21 | "micromark-extension-mdxjs": "npm:micromark-extension-mdxjs@1.0.0",
22 | "unist-builder": "npm:unist-builder@3.0.1",
23 | "unist-util-filter": "npm:unist-util-filter@4.0.1",
24 | "types/mdast": "npm:@types/mdast@3.0.11",
25 | "types/estree": "npm:@types/estree@1.0.0"
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/supabase/functions/vector-search/index.ts:
--------------------------------------------------------------------------------
1 | import "xhr";
2 | import { serve } from "std/http/server.ts";
3 | import { createClient } from "@supabase/supabase-js";
4 | import { codeBlock, oneLine } from "commmon-tags";
5 | import GPT3Tokenizer from "gpt3-tokenizer";
6 | import { Configuration, CreateCompletionRequest, OpenAIApi } from "openai";
7 | import { ensureGetEnv } from "../_utils/env.ts";
8 | import { ApplicationError, UserError } from "../_utils/errors.ts";
9 |
10 | const OPENAI_API_KEY = ensureGetEnv("OPENAI_API_KEY");
11 | const SUPABASE_URL = ensureGetEnv("SUPABASE_URL");
12 | const SUPABASE_SERVICE_ROLE_KEY = ensureGetEnv("SUPABASE_SERVICE_ROLE_KEY"); // bypasses RLS; server-side only
13 |
14 | const supabaseClient = createClient(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, {
15 | db: { schema: "docs" }, // all queries target the docs schema created by the init migration
16 | });
17 | const openAiConfiguration = new Configuration({ apiKey: OPENAI_API_KEY });
18 | const openai = new OpenAIApi(openAiConfiguration);
19 |
20 | export const corsHeaders = { // permissive CORS so browser clients on any origin can call this function
21 | "Access-Control-Allow-Origin": "*",
22 | "Access-Control-Allow-Headers":
23 | "authorization, x-client-info, apikey, content-type",
24 | };
25 |
26 | serve(async (req) => { // HTTP entry point: answers GET ?query=... with a streamed completion
27 | try {
28 | // Handle CORS
29 | if (req.method === "OPTIONS") {
30 | return new Response("ok", { headers: corsHeaders });
31 | }
32 |
33 | const query = new URL(req.url).searchParams.get("query");
34 |
35 | if (!query) {
36 | throw new UserError("Missing query in request data");
37 | }
38 |
39 | const sanitizedQuery = query.trim();
40 |
41 | // Moderate the content to comply with OpenAI T&C
42 | const moderationResponse = await openai.createModeration({
43 | input: sanitizedQuery,
44 | });
45 |
46 | const [results] = moderationResponse.data.results;
47 |
48 | if (results.flagged) {
49 | throw new UserError("Flagged content", {
50 | flagged: true,
51 | categories: results.categories,
52 | });
53 | }
54 |
55 | const embeddingResponse = await openai.createEmbedding({
56 | model: "text-embedding-ada-002",
57 | input: sanitizedQuery.replaceAll("\n", " "), // strip newlines from the embedding input
58 | });
59 |
60 | if (embeddingResponse.status !== 200) {
61 | throw new ApplicationError(
62 | "Failed to create embedding for question",
63 | embeddingResponse
64 | );
65 | }
66 |
67 | const [{ embedding }] = embeddingResponse.data.data;
68 |
69 | const { error: matchError, data: pageSections } = await supabaseClient.rpc(
70 | "match_page_sections",
71 | {
72 | embedding,
73 | match_threshold: 0.78, // minimum similarity for a section to count as relevant
74 | match_count: 10,
75 | min_content_length: 50, // skip near-empty sections
76 | }
77 | );
78 |
79 | if (matchError) {
80 | throw new ApplicationError("Failed to match page sections", matchError);
81 | }
82 |
83 | const tokenizer = new GPT3Tokenizer({ type: "gpt3" });
84 | let tokenCount = 0;
85 | let contextText = "";
86 |
87 | for (const pageSection of pageSections) {
88 | const content = pageSection.content;
89 | const encoded = tokenizer.encode(content);
90 | tokenCount += encoded.text.length; // token count taken from the encoded pieces
91 |
92 | if (tokenCount >= 1500) { // cap the context to leave room for the completion
93 | break;
94 | }
95 |
96 | contextText += `${content.trim()}\n---\n`;
97 | }
98 |
99 | const prompt = codeBlock`
100 | ${oneLine`
101 | You are a very enthusiastic Supabase representative who loves
102 | to help people! Given the following sections from the Supabase
103 | documentation, answer the question using only that information,
104 | outputted in markdown format. If you are unsure and the answer
105 | is not explicitly written in the documentation, say
106 | "Sorry, I don't know how to help with that."
107 | `}
108 |
109 | Context sections:
110 | ${contextText}
111 |
112 | Question: """
113 | ${sanitizedQuery}
114 | """
115 |
116 | Answer as markdown (including related code snippets if available):
117 | `;
118 |
119 | const completionOptions: CreateCompletionRequest = {
120 | model: "gpt-3.5-turbo-instruct",
121 | prompt,
122 | max_tokens: 512,
123 | temperature: 0,
124 | stream: true, // ask OpenAI for an SSE stream, proxied below to the client
125 | };
126 |
127 | // The Fetch API allows for easier response streaming over the OpenAI client.
128 | const response = await fetch("https://api.openai.com/v1/completions", {
129 | headers: {
130 | Authorization: `Bearer ${OPENAI_API_KEY}`,
131 | "Content-Type": "application/json",
132 | },
133 | method: "POST",
134 | body: JSON.stringify(completionOptions),
135 | });
136 |
137 | if (!response.ok) {
138 | const error = await response.json();
139 | throw new ApplicationError("Failed to generate completion", error);
140 | }
141 |
142 | // Proxy the streamed SSE response from OpenAI
143 | return new Response(response.body, {
144 | headers: {
145 | ...corsHeaders,
146 | "Content-Type": "text/event-stream",
147 | },
148 | });
149 | } catch (err: unknown) {
150 | if (err instanceof UserError) { // user errors are safe to echo back with their data
151 | return Response.json(
152 | {
153 | error: err.message,
154 | data: err.data,
155 | },
156 | {
157 | status: 400,
158 | headers: corsHeaders,
159 | }
160 | );
161 | } else if (err instanceof ApplicationError) {
162 | // Print out application errors with their additional data
163 | console.error(`${err.message}: ${JSON.stringify(err.data)}`);
164 | } else {
165 | // Print out unexpected errors as is to help with debugging
166 | console.error(err);
167 | }
168 |
169 | // TODO: include more response info in debug environments
170 | return Response.json(
171 | {
172 | error: "There was an error processing your request",
173 | },
174 | {
175 | status: 500,
176 | headers: corsHeaders,
177 | }
178 | );
179 | }
180 | });
181 |
--------------------------------------------------------------------------------
/supabase/migrations/20230525043454_init.sql:
--------------------------------------------------------------------------------
1 | -- Create separate docs schema and grants
2 | create schema if not exists docs;
3 | grant usage on schema docs to postgres, service_role;
4 |
5 | -- Enable pgvector extension
6 | create extension if not exists vector; -- provides the vector type and the <#> inner-product operator
7 |
8 | -- Create tables
9 | create table "docs"."page" (
10 | id bigserial primary key,
11 | parent_page_id bigint references docs.page, -- self-reference forms the page hierarchy
12 | path text not null unique,
13 | checksum text,
14 | meta jsonb,
15 | type text,
16 | source text,
17 | "version" uuid,
18 | "last_refresh" timestamptz
19 | );
20 | alter table "docs"."page" enable row level security; -- no policies in this migration; access is via roles that bypass RLS
21 |
22 | create table "docs"."page_section" (
23 | id bigserial primary key,
24 | page_id bigint not null references docs.page on delete cascade,
25 | content text,
26 | token_count int,
27 | embedding vector(1536), -- dimension of OpenAI text-embedding-ada-002, used by the Edge Function
28 | slug text,
29 | heading text
30 | );
31 | alter table "docs"."page_section" enable row level security;
33 | -- Create embedding similarity search functions
34 | create or replace function "docs"."match_page_sections"(embedding vector(1536), match_threshold float, match_count int, min_content_length int) -- top-k similarity search called via RPC by the vector-search Edge Function
35 | returns table (id bigint, page_id bigint, slug text, heading text, content text, similarity float)
36 | language plpgsql
37 | as $$
38 | #variable_conflict use_variable -- resolve name clashes (e.g. "embedding") in favor of the parameter
39 | begin
40 | return query
41 | select
42 | page_section.id,
43 | page_section.page_id,
44 | page_section.slug,
45 | page_section.heading,
46 | page_section.content,
47 | (page_section.embedding <#> embedding) * -1 as similarity
48 | from page_section
49 |
50 | -- We only care about sections that have a useful amount of content
51 | where length(page_section.content) >= min_content_length
52 |
53 | -- The dot product is negative because of a Postgres limitation, so we negate it
54 | and (page_section.embedding <#> embedding) * -1 > match_threshold
55 |
56 | -- OpenAI embeddings are normalized to length 1, so
57 | -- cosine similarity and dot product will produce the same results.
58 | -- Using dot product which can be computed slightly faster.
59 | --
60 | -- For the different syntaxes, see https://github.com/pgvector/pgvector
61 | order by page_section.embedding <#> embedding
62 |
63 | limit match_count;
64 | end;
65 | $$;
66 |
67 | create or replace function "docs"."get_page_parents"(page_id bigint) -- returns page_id's row followed by its chain of ancestors
68 | returns table (id bigint, parent_page_id bigint, path text, meta jsonb)
69 | language sql
70 | as $$
71 | with recursive chain as (
72 | select *
73 | from docs.page
74 | where id = page_id
75 |
76 | union all
77 |
78 | select child.*
79 | from docs.page as child
80 | join chain on chain.parent_page_id = child.id -- walk upward: each step fetches the current row's parent
81 | )
82 | select id, parent_page_id, path, meta
83 | from chain;
84 | $$;
85 |
86 | -- Update table grants: both future and existing docs objects are limited to postgres and service_role
87 | ALTER DEFAULT PRIVILEGES IN SCHEMA docs
88 | GRANT ALL ON TABLES TO postgres, service_role;
89 |
90 | GRANT SELECT, INSERT, UPDATE, DELETE
91 | ON ALL TABLES IN SCHEMA docs
92 | TO postgres, service_role;
93 |
94 | GRANT USAGE, SELECT
95 | ON ALL SEQUENCES IN SCHEMA docs
96 | TO postgres, service_role;
98 |
99 |
--------------------------------------------------------------------------------
/supabase/seed.sql:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/supabase/headless-vector-search/a68de46917aeec4dfb41b4e853a40f08712301ac/supabase/seed.sql
--------------------------------------------------------------------------------