46 | >;
47 |
48 | /* prettier-ignore-end */
49 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "embeddings-in-convex",
3 | "private": true,
4 | "version": "0.0.0",
5 | "scripts": {
6 | "dev": "npm-run-all --parallel dev:backend dev:frontend",
7 | "build": "tsc && vite build",
8 | "dev:backend": "convex dev",
9 | "dev:frontend": "vite --open --clearScreen false",
10 | "predev": "convex dev --until-success"
11 | },
12 | "dependencies": {
13 | "@phosphor-icons/react": "^2.0.10",
14 | "@rewind-ui/core": "^0.12.2",
15 | "@tailwindcss/forms": "^0.5.3",
16 | "@tailwindcss/typography": "^0.5.9",
17 | "convex": "^1.16.0",
18 | "convex-helpers": "^0.1.58",
19 | "langchain": "^0.0.92",
20 | "openai": "^3.2.1",
21 | "react": "^18.2.0",
22 | "react-dom": "^18.2.0",
23 | "react-hook-form": "^7.44.3",
24 | "tailwind-scrollbar": "^3.0.4"
25 | },
26 | "devDependencies": {
27 | "@types/node": "^20.2.3",
28 | "@types/react": "^18.0.28",
29 | "@types/react-dom": "^18.0.11",
30 | "@typescript-eslint/eslint-plugin": "^5.57.1",
31 | "@typescript-eslint/parser": "^5.57.1",
32 | "@vitejs/plugin-react": "^4.0.0",
33 | "autoprefixer": "^10.4.14",
34 | "eslint": "^8.38.0",
35 | "eslint-plugin-react-hooks": "^4.6.0",
36 | "eslint-plugin-react-refresh": "^0.3.4",
37 | "npm-run-all": "^4.1.5",
38 | "postcss": "^8.4.24",
39 | "tailwindcss": "^3.3.2",
40 | "typescript": "^5.0.2",
41 | "vite": "^4.3.2"
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/convex/lib/embeddings.ts:
--------------------------------------------------------------------------------
1 | import { query } from "../_generated/server";
2 |
/**
 * Fetches OpenAI embeddings for a batch of texts with a single API request.
 *
 * Newlines in every text are replaced with spaces before the request is sent.
 *
 * @param texts - The strings to embed.
 * @returns The embeddings ordered to match `texts`, the total token usage
 *   reported by the API, and the time (ms) spent waiting for the response.
 * @throws Error if `OPENAI_API_KEY` is not set, if the API responds with a
 *   non-2xx status, or if the response does not hold one embedding per input.
 */
export async function fetchEmbeddingBatch(texts: string[]) {
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    // Fail early instead of sending "Bearer undefined" to the API.
    throw new Error("Missing OPENAI_API_KEY environment variable");
  }

  const start = Date.now();
  const result = await fetch("https://api.openai.com/v1/embeddings", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer " + apiKey,
    },

    body: JSON.stringify({
      model: "text-embedding-ada-002",
      input: texts.map((text) => text.replace(/\n/g, " ")),
    }),
  });
  const embeddingMs = Date.now() - start;

  if (!result.ok) {
    // Error responses carry no `data` array; surface the API's own message
    // rather than crashing on `undefined.length` below.
    const details = await result.text();
    throw new Error(
      `OpenAI embeddings request failed (${result.status} ${result.statusText}): ${details}`,
    );
  }

  const jsonresults = await result.json();
  if (
    !Array.isArray(jsonresults.data) ||
    jsonresults.data.length !== texts.length
  ) {
    // Log the parsed payload; the Response object itself is not informative.
    console.error(jsonresults);
    throw new Error("Unexpected number of embeddings");
  }
  const allembeddings = [
    ...(jsonresults.data as { embedding: number[]; index: number }[]),
  ];
  // Restore input order using the index the API attaches to each embedding.
  allembeddings.sort((a, b) => a.index - b.index);
  return {
    embeddings: allembeddings.map(({ embedding }) => embedding),
    totalTokens: jsonresults.usage.total_tokens,
    embeddingMs,
  };
}
35 |
/**
 * Fetches the embedding for a single text.
 *
 * Delegates to `fetchEmbeddingBatch` with a one-element batch and returns the
 * first embedding together with the batch's remaining stats.
 *
 * @param text - The string to embed.
 */
export async function fetchEmbedding(text: string) {
  const batch = await fetchEmbeddingBatch([text]);
  const { embeddings, ...stats } = batch;
  const [embedding] = embeddings;
  return { embedding, ...stats };
}
40 |
/**
 * Query reporting whether the `OPENAI_API_KEY` environment variable is set.
 * Only a boolean is returned; the key itself is never exposed.
 */
export const envCheck = query(async () => {
  const hasOpenAiKey = Boolean(process.env.OPENAI_API_KEY);
  return { OPENAI_API_KEY: hasOpenAiKey };
});
46 |
--------------------------------------------------------------------------------
/scripts/addURL.py:
--------------------------------------------------------------------------------
"""Import web pages into Convex using Langchain document loaders.

Usage:
    python scripts/addURL.py <url> [<url> ...]

Setup:
    !pip install "playwright"
    !pip install "unstructured"
    !pip install "convex"
    !pip install "python-dotenv"
    !pip install "langchain"
    !pip install tiktoken

    !playwright install
"""

import os
import sys

from dotenv import load_dotenv
from convex import ConvexClient
from langchain.document_loaders import PlaywrightURLLoader
from langchain.text_splitter import CharacterTextSplitter


def main(urls):
    """Load each URL, split its text into chunks, and add it as a source.

    Args:
        urls: The page URLs to import; each loaded page becomes one source.

    Raises:
        SystemExit: If no URLs were given.
        KeyError: If VITE_CONVEX_URL is not set in the environment or in the
            .env.local / .env files.
    """
    if not urls:
        raise SystemExit("Usage: python scripts/addURL.py <url> [<url> ...]")

    # Resolve the backend before the (slow) page loading so that a missing
    # configuration fails fast.
    load_dotenv(".env.local")
    load_dotenv()
    backend = os.getenv("VITE_CONVEX_URL")
    if not backend:
        raise KeyError("Missing VITE_CONVEX_URL")
    client = ConvexClient(backend)

    loader = PlaywrightURLLoader(urls=urls, remove_selectors=["header", "footer"])
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=100, chunk_overlap=0
    )

    # Upload every loaded page, not just the first one.
    for document in loader.load():
        texts = text_splitter.split_text(document.page_content)
        print(
            # `sources:add` is invoked as a mutation by the web client
            # (useMutation(api.sources.add)), so call it as one here too.
            client.mutation(
                "sources:add",
                dict(
                    name=document.metadata["source"],
                    chunks=[
                        dict(
                            text=chunk,
                            # TODO: add real line numbers
                            lines={"from": 0, "to": 0},
                        )
                        for chunk in texts
                    ],
                ),
            )
        )


if __name__ == "__main__":
    main(sys.argv[1:])
56 |
--------------------------------------------------------------------------------
/src/Compare.tsx:
--------------------------------------------------------------------------------
1 | import { useQuery } from "convex/react";
2 | import { Button, Text } from "@rewind-ui/core";
3 | import { api } from "../convex/_generated/api";
4 | import { Loading } from "./Loading";
5 | import { AllChunks, Chunks } from "./Chunks";
6 | import { Target, CompareFn } from "./useComparison";
7 |
8 | export function Compare({
9 | target,
10 | compare,
11 | }: {
12 | target?: Target;
13 | compare: CompareFn;
14 | }) {
15 | return (
16 | <>
17 | {target ? (
18 |
19 | ) : (
20 |
21 | )}
22 | >
23 | );
24 | }
25 |
26 | function ComparisonResults({
27 | target: { comparisonId, chunkId },
28 | compare,
29 | }: {
30 | target: Target;
31 | compare: CompareFn;
32 | }) {
33 | const comparison = useQuery(api.comparisons.get, { comparisonId });
34 | return comparison?.relatedChunks.length ? (
35 | <>
36 |
37 |
38 | Results for {comparison.target.sourceName} (
39 | {comparison.target.chunkIndex}): "{comparison.target.text}
40 |
41 |
44 |
45 | {" "}
46 | >
47 | ) : (
48 | <>
49 |
50 | Comparing against "{comparison?.target?.text ?? chunkId}"...
51 |
52 |
53 | >
54 | );
55 | }
56 |
--------------------------------------------------------------------------------
/src/App.tsx:
--------------------------------------------------------------------------------
1 | import "./App.css";
2 |
3 | import { Tabs, Text } from "@rewind-ui/core";
4 | import { AddSource } from "./AddSource";
5 | import { EnvCheck } from "./EnvCheck";
6 | import { Sources } from "./Sources";
7 | import { Search } from "./Search";
8 | import { Compare } from "./Compare";
9 | import { useComparison } from "./useComparison";
10 | import { Prompt } from "./Prompt";
11 | import { useRef } from "react";
12 |
13 | function App() {
14 | const compareRef = useRef(null);
15 | console.log(compareRef);
16 | const [target, compare] = useComparison(compareRef);
17 | return (
18 |
19 |
20 |
21 | Sources
22 | Search
23 |
24 | Compare
25 |
26 |
27 | Prompt
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | Add a source
41 |
42 | This is the data you will be able to search over and compare
43 | semantically.
44 |
45 |
46 |
47 |
48 |
49 |
50 | );
51 | }
52 |
53 | export default App;
54 |
--------------------------------------------------------------------------------
/src/Sources.tsx:
--------------------------------------------------------------------------------
1 | import { useMutation, usePaginatedQuery } from "convex/react";
2 | import { Button, Table } from "@rewind-ui/core";
3 | import { api } from "../convex/_generated/api";
4 |
5 | export function Sources() {
6 | const {
7 | status,
8 | loadMore,
9 | results: sources,
10 | } = usePaginatedQuery(api.sources.paginate, {}, { initialNumItems: 10 });
11 | const deleteSource = useMutation(api.sources.deleteSource);
12 | return (
13 | <>
14 |
15 |
16 |
17 | Source
18 | Chunks
19 | Content
20 | Tokens
21 |
22 |
23 |
24 |
25 | {sources.map((source) => (
26 |
27 | {source.name}
28 | {source.chunkIds.length}
29 |
30 | {source.firstChunkText}
31 |
32 |
33 | {source.saved ? source.totalTokens : "Unsaved"}
34 |
35 |
36 |
42 |
43 |
44 | ))}
45 |
46 |
47 | {status !== "Exhausted" && (
48 |
54 | )}
55 | >
56 | );
57 | }
58 |
--------------------------------------------------------------------------------
/convex/_generated/dataModel.d.ts:
--------------------------------------------------------------------------------
1 | /* prettier-ignore-start */
2 |
3 | /* eslint-disable */
4 | /**
5 | * Generated data model types.
6 | *
7 | * THIS CODE IS AUTOMATICALLY GENERATED.
8 | *
9 | * To regenerate, run `npx convex dev`.
10 | * @module
11 | */
12 |
13 | import type {
14 | DataModelFromSchemaDefinition,
15 | DocumentByName,
16 | TableNamesInDataModel,
17 | SystemTableNames,
18 | } from "convex/server";
19 | import type { GenericId } from "convex/values";
20 | import schema from "../schema.js";
21 |
22 | /**
23 | * The names of all of your Convex tables.
24 | */
25 | export type TableNames = TableNamesInDataModel;
26 |
27 | /**
28 | * The type of a document stored in Convex.
29 | *
30 | * @typeParam TableName - A string literal type of the table name (like "users").
31 | */
32 | export type Doc = DocumentByName<
33 | DataModel,
34 | TableName
35 | >;
36 |
37 | /**
38 | * An identifier for a document in Convex.
39 | *
40 | * Convex documents are uniquely identified by their `Id`, which is accessible
41 | * on the `_id` field. To learn more, see [Document IDs](https://docs.convex.dev/using/document-ids).
42 | *
43 | * Documents can be loaded using `db.get(id)` in query and mutation functions.
44 | *
45 | * IDs are just strings at runtime, but this type can be used to distinguish them from other
46 | * strings when type checking.
47 | *
48 | * @typeParam TableName - A string literal type of the table name (like "users").
49 | */
50 | export type Id =
51 | GenericId;
52 |
53 | /**
54 | * A type describing your Convex data model.
55 | *
56 | * This type includes information about what tables you have, the type of
57 | * documents stored in those tables, and the indexes defined on them.
58 | *
59 | * This type is used to parameterize methods like `queryGeneric` and
60 | * `mutationGeneric` to make them type-safe.
61 | */
62 | export type DataModel = DataModelFromSchemaDefinition;
63 |
64 | /* prettier-ignore-end */
65 |
--------------------------------------------------------------------------------
/src/AddSource.tsx:
--------------------------------------------------------------------------------
1 | import { useEffect, useState } from "react";
2 | import { useMutation } from "convex/react";
3 | import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
4 | import { useForm, Controller } from "react-hook-form";
5 | import { Alert, Input, Textarea, Button } from "@rewind-ui/core";
6 | import { api } from "../convex/_generated/api";
7 |
8 | export function AddSource() {
9 | const createSource = useMutation(api.sources.add);
10 | const [added, setAdded] = useState("");
11 |
12 | const { formState, handleSubmit, control, reset } = useForm<{
13 | name: string;
14 | text: string;
15 | }>({});
16 | const onSubmit = handleSubmit(({ name, text }) => {
17 | const textSplitter = new RecursiveCharacterTextSplitter({
18 | chunkSize: 1000,
19 | });
20 | textSplitter.createDocuments([text]).then((docs) => {
21 | createSource({
22 | name,
23 | chunks: docs.map((doc) => ({
24 | text: doc.pageContent,
25 | lines: doc.metadata.loc.lines,
26 | })),
27 | }).then(() => {
28 | setAdded(name);
29 | setTimeout(
30 | () => setAdded((state) => (state === name ? "" : state)),
31 | 1000
32 | );
33 | });
34 | });
35 | });
36 | useEffect(() => {
37 | if (formState.isSubmitSuccessful) {
38 | reset();
39 | }
40 | }, [formState, reset]);
41 |
42 | return (
43 |
65 | );
66 | }
67 |
--------------------------------------------------------------------------------
/src/Chunks.tsx:
--------------------------------------------------------------------------------
1 | import { Button, Table } from "@rewind-ui/core";
2 | import { Doc } from "../convex/_generated/dataModel";
3 | import { usePaginatedQuery } from "convex/react";
4 | import { api } from "../convex/_generated/api";
5 | import { CompareFn } from "./useComparison";
6 |
7 | export function Chunks({
8 | chunks,
9 | compare,
10 | }: {
11 | chunks: (Doc<"chunks"> & { sourceName: string; score?: number })[];
12 | compare?: CompareFn;
13 | }) {
14 | return (
15 |
16 |
17 |
18 | Source
19 | Index
20 | Content
21 | Score
22 | {compare && Compare}
23 |
24 |
25 |
26 | {chunks.map((chunk) => (
27 |
28 | {chunk.sourceName}
29 | {chunk.chunkIndex}
30 |
31 | {chunk.text}
32 |
33 |
34 | {chunk.score ? `${(chunk.score * 100).toFixed(2)}%` : "-"}
35 |
36 | {compare && (
37 |
38 |
41 |
42 | )}
43 |
44 | ))}
45 |
46 |
47 | );
48 | }
49 |
50 | export function AllChunks({ compare }: { compare: CompareFn }) {
51 | const {
52 | status,
53 | loadMore,
54 | results: chunks,
55 | } = usePaginatedQuery(
56 | api.sources.paginateChunks,
57 | {},
58 | { initialNumItems: 10 }
59 | );
60 | return (
61 | <>
62 |
63 | {status !== "Exhausted" && (
64 |
70 | )}
71 | >
72 | );
73 | }
74 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Embeddings Playground with OpenAI and Convex
2 |
3 | An example of working with embeddings and vector databases in Convex.
4 |
5 | [Embeddings](https://stack.convex.dev/the-magic-of-embeddings) enable all sorts
6 | of use cases, but it's hard to know how they'll perform on comparisons and
7 | queries without playing around with them.
8 |
9 | This project allows you to add source data, generate embeddings via OpenAI,
10 | compare them to each other, and compare semantic and word searches over them.
11 |
12 | You can then use the queried source data to include in a ChatGPT prompt (WIP).
13 |
14 | 
15 |
16 | UI:
17 |
18 | - React
19 | - Tailwindcss
20 | - Rewind-UI
21 | - Vite
22 |
23 | Backend:
24 |
25 | - OpenAI API for creating vector embeddings.
26 | - Convex for storing vectors, application data, and running server-side functions.
27 |
28 | Work planned:
29 |
30 | - [x] Add a python script that scrapes URLs and imports the data.
31 | - [x] Add a node script that imports local files (.pdf, .md, .txt).
32 | - [ ] Allow picking which sources to use in a ChatGPT prompt, and what template to use, to iterate on templates.
33 | - [ ] Configuration to fetch the top 20, 40, or 80 documents when searching (hard-coded to 10 currently).
34 |
35 | ## Setup
36 |
37 | ### Prerequisites:
38 |
39 | 1. A Convex backend: it will be configured automatically on `npm run dev`.
40 | By running this first, you can enter the environment variable for (2) on
41 | the [dashboard](https://dashboard.convex.dev).
42 |
43 | 2. An [OpenAI](https://platform.openai.com/) API key.
44 | Environment variable: `OPENAI_API_KEY` (should start with `sk-`).
45 | Run `npx convex env set OPENAI_API_KEY sk-XXXX # --prod`
46 |
47 | ## Run:
48 |
49 | ```bash
50 | npm install
51 | npm run dev
52 | ```
53 |
54 | ## Upload sources from a URL
55 |
56 | You can add a source from a URL using the scripts/addURL.py python script:
57 |
58 | ```sh
59 | pip install python-dotenv convex langchain playwright unstructured tiktoken
60 | python scripts/addURL.py https://example.com
61 | ```
62 |
63 | ## Upload sources from a folder
64 |
65 | You can add .txt, .md, and .pdf files as sources to your project via:
66 |
67 | ```sh
68 | export VITE_CONVEX_URL= # your backend url - see .env.local (dev) or .env (prod)
69 | npx ts-node-esm scripts/addFiles.ts ./path/to/folder
70 | ```
71 |
72 | By default it'll check in a documents folder at the root of the repo.
73 | It will upload the documents in batches of chunks.
74 |
--------------------------------------------------------------------------------
/scripts/addFiles.ts:
--------------------------------------------------------------------------------
1 | import { api } from "../convex/_generated/api.js";
2 | import path from "path";
3 | import { ConvexHttpClient } from "convex/browser";
4 | import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
5 | import { TextLoader } from "langchain/document_loaders/fs/text";
6 | import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
7 | import { PDFLoader } from "langchain/document_loaders/fs/pdf";
8 |
// Upper bound on the number of chunks sent in one `addBatch` mutation.
const ChunkBatchSize = 100;
// `chunkSize` handed to the text splitter for every document.
const ChunkSize = 1000;

// Directory to import: first CLI argument, falling back to "../documents".
const documentsDir = process.argv[2] || "../documents";
// .txt and .md files are both read as plain text.
const loadAsText = (filePath: string) => new TextLoader(filePath);
// Recursively processes all files with these extensions in the given dir.
const loader = new DirectoryLoader(documentsDir, {
  ".txt": loadAsText,
  ".md": loadAsText,
  ".pdf": (filePath) => new PDFLoader(filePath),
});

// The Convex backend where we're uploading results
const address = process.env.VITE_CONVEX_URL;
if (!address) {
  throw new Error("Specify VITE_CONVEX_URL env variable");
}
const client = new ConvexHttpClient(address);
23 |
/**
 * Loads every document found by `loader`, splits each into chunks, and uploads
 * them to Convex through `api.sources.addBatch`.
 *
 * Documents are accumulated into a pending batch. The batch is flushed before
 * adding a document whose chunks would bring the pending chunk count to
 * `ChunkBatchSize` or more, and once more at the end for any remainder.
 *
 * @returns The string "success" once all batches have been sent.
 */
export const uploadDocuments = async () => {
  type SourceUpload = {
    name: string;
    chunks: { text: string; lines: { from: number; to: number } }[];
  };

  const startedAt = Date.now();
  const docs = await loader.load();
  let pending: SourceUpload[] = [];

  for (const doc of docs) {
    console.log(`Processing document: ${doc.metadata.source}`);
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: ChunkSize,
    });
    const pieces = await splitter.createDocuments([doc.pageContent]);

    // If this doc will put us over the batch limit, send what we have first.
    if (pending.length > 0) {
      let pendingChunkCount = 0;
      for (const source of pending) {
        pendingChunkCount += source.chunks.length;
      }
      if (pendingChunkCount + pieces.length >= ChunkBatchSize) {
        console.log("Sending up a batch:", pending.length);
        await client.mutation(api.sources.addBatch, { batch: pending });
        pending = [];
      }
    }

    const chunks = pieces.map((piece) => ({
      text: piece.pageContent,
      lines: piece.metadata.loc.lines,
    }));
    pending.push({ name: path.parse(doc.metadata.source).base, chunks });
  }

  if (pending.length) {
    console.log("Sending up a final batch:", pending.length);
    await client.mutation(api.sources.addBatch, { batch: pending });
  }
  console.log("Finished embedding documents. ms:", Date.now() - startedAt);
  return "success";
};
63 |
// Run the upload when the script is executed. Report failures explicitly and
// signal them through a non-zero exit code instead of leaving the rejection
// unhandled.
uploadDocuments()
  .then(console.log)
  .catch((error: unknown) => {
    console.error("Failed to upload documents:", error);
    process.exitCode = 1;
  });
65 |
--------------------------------------------------------------------------------
/convex/schema.ts:
--------------------------------------------------------------------------------
1 | import { v } from "convex/values";
2 | import { defineSchema, defineTable } from "convex/server";
3 | import { deprecated } from "convex-helpers/validators";
4 |
// Ranked matches stored on both searches and comparisons: the id of a chunk
// embedding plus an optional score.
const relatedChunks = v.optional(
  v.array(
    v.object({
      id: v.id("chunkEmbeddings"),
      score: v.optional(v.number()),
    }),
  ),
);

// Chunks are one part of a Source, broken up to generate embeddings.
const chunksTable = defineTable({
  // raw text: ~1k bytes or less
  text: v.string(),
  sourceId: v.id("sources"),
  // Where in a larger document is this text.
  chunkIndex: v.number(),
  lines: v.object({
    from: v.number(),
    to: v.number(),
  }),
  // Approx: estimated based on the total batch size.
  tokens: v.optional(v.number()),
  embeddingId: v.optional(v.id("chunkEmbeddings")),
})
  .searchIndex("text", { searchField: "text" })
  .index("embeddingId", ["embeddingId"]);

// Embedding vectors for chunks, kept in their own table.
const chunkEmbeddingsTable = defineTable({
  vector: v.array(v.number()),
}).vectorIndex("vector", { vectorField: "vector", dimensions: 1536 });

// Sources are materials to search over / compare, made of chunks of text.
const sourcesTable = defineTable({
  name: v.string(),
  // Max 1k chunks (otherwise remove this and use an index on sourceId)
  chunkIds: v.array(v.id("chunks")),
  // Whether the embeddings have been saved.
  saved: v.boolean(),
  // stats
  totalTokens: v.optional(v.number()),
  embeddingMs: v.optional(v.number()),
});

// Searches track a comparison between an input string and related chunks.
const searchesTable = defineTable({
  input: v.string(),
  float32Buffer: v.optional(v.bytes()),
  relatedChunks,
  // stats
  count: v.number(),
  inputTokens: v.optional(v.number()),
  embeddingMs: v.optional(v.number()),
  queryMs: v.optional(v.number()),
  saveSearchMs: deprecated,
  embeddingId: v.optional(v.id("searchEmbeddings")),
})
  .index("input", ["input"])
  .index("embeddingId", ["embeddingId"]);

// Embedding vectors for search inputs.
const searchEmbeddingsTable = defineTable({
  vector: v.array(v.number()),
}).vectorIndex("vector", { vectorField: "vector", dimensions: 1536 });

// Comparisons track a comparison between one chunk and other chunks.
const comparisonsTable = defineTable({
  target: v.id("chunks"),
  relatedChunks,
  // stats
  count: v.number(),
  queryMs: v.optional(v.number()),
}).index("target", ["target"]);

export default defineSchema({
  chunks: chunksTable,
  chunkEmbeddings: chunkEmbeddingsTable,
  sources: sourcesTable,
  searches: searchesTable,
  searchEmbeddings: searchEmbeddingsTable,
  comparisons: comparisonsTable,
});
83 |
--------------------------------------------------------------------------------
/convex/README.md:
--------------------------------------------------------------------------------
1 | # Welcome to your Convex functions directory!
2 |
3 | Write your Convex functions here. See
4 | https://docs.convex.dev/using/writing-convex-functions for more.
5 |
6 | A query function that takes two arguments looks like:
7 |
8 | ```ts
9 | // functions.js
10 | import { query } from "./_generated/server";
11 | import { v } from "convex/values";
12 |
13 | export const myQueryFunction = query({
14 | // Validators for arguments.
15 | args: {
16 | first: v.number(),
17 | second: v.string(),
18 | },
19 |
20 | // Function implementation.
21 | handler: async (ctx, args) => {
22 | // Read the database as many times as you need here.
23 | // See https://docs.convex.dev/database/reading-data.
24 | const documents = await ctx.db.query("tablename").collect();
25 |
26 | // Arguments passed from the client are properties of the args object.
27 | console.log(args.first, args.second);
28 |
29 | // Write arbitrary JavaScript here: filter, aggregate, build derived data,
30 | // remove non-public properties, or create new objects.
31 | return documents;
32 | },
33 | });
34 | ```
35 |
36 | Using this query function in a React component looks like:
37 |
38 | ```ts
39 | const data = useQuery(api.functions.myQueryFunction, {
40 | first: 10,
41 | second: "hello",
42 | });
43 | ```
44 |
45 | A mutation function looks like:
46 |
47 | ```ts
48 | // functions.js
49 | import { mutation } from "./_generated/server";
50 | import { v } from "convex/values";
51 |
52 | export const myMutationFunction = mutation({
53 | // Validators for arguments.
54 | args: {
55 | first: v.string(),
56 | second: v.string(),
57 | },
58 |
59 | // Function implementation.
60 | handler: async (ctx, args) => {
61 | // Insert or modify documents in the database here.
62 | // Mutations can also read from the database like queries.
63 | // See https://docs.convex.dev/database/writing-data.
64 | const message = { body: args.first, author: args.second };
65 | const id = await ctx.db.insert("messages", message);
66 |
67 | // Optionally, return a value from your mutation.
68 | return await ctx.db.get(id);
69 | },
70 | });
71 | ```
72 |
73 | Using this mutation function in a React component looks like:
74 |
75 | ```ts
76 | const mutation = useMutation(api.functions.myMutationFunction);
77 | function handleButtonPress() {
78 | // fire and forget, the most common way to use mutations
79 | mutation({ first: "Hello!", second: "me" });
80 | // OR
81 | // use the result once the mutation has completed
82 | mutation({ first: "Hello!", second: "me" }).then((result) =>
83 | console.log(result)
84 | );
85 | }
86 | ```
87 |
88 | Use the Convex CLI to push your functions to a deployment. See everything
89 | the Convex CLI can do by running `npx convex -h` in your project root
90 | directory. To learn more, launch the docs with `npx convex docs`.
91 |
--------------------------------------------------------------------------------
/convex/_generated/server.js:
--------------------------------------------------------------------------------
1 | /* prettier-ignore-start */
2 |
3 | /* eslint-disable */
4 | /**
5 | * Generated utilities for implementing server-side Convex query and mutation functions.
6 | *
7 | * THIS CODE IS AUTOMATICALLY GENERATED.
8 | *
9 | * To regenerate, run `npx convex dev`.
10 | * @module
11 | */
12 |
13 | import {
14 | actionGeneric,
15 | httpActionGeneric,
16 | queryGeneric,
17 | mutationGeneric,
18 | internalActionGeneric,
19 | internalMutationGeneric,
20 | internalQueryGeneric,
21 | } from "convex/server";
22 |
23 | /**
24 | * Define a query in this Convex app's public API.
25 | *
26 | * This function will be allowed to read your Convex database and will be accessible from the client.
27 | *
28 | * @param func - The query function. It receives a {@link QueryCtx} as its first argument.
29 | * @returns The wrapped query. Include this as an `export` to name it and make it accessible.
30 | */
31 | export const query = queryGeneric;
32 |
33 | /**
34 | * Define a query that is only accessible from other Convex functions (but not from the client).
35 | *
36 | * This function will be allowed to read from your Convex database. It will not be accessible from the client.
37 | *
38 | * @param func - The query function. It receives a {@link QueryCtx} as its first argument.
39 | * @returns The wrapped query. Include this as an `export` to name it and make it accessible.
40 | */
41 | export const internalQuery = internalQueryGeneric;
42 |
43 | /**
44 | * Define a mutation in this Convex app's public API.
45 | *
46 | * This function will be allowed to modify your Convex database and will be accessible from the client.
47 | *
48 | * @param func - The mutation function. It receives a {@link MutationCtx} as its first argument.
49 | * @returns The wrapped mutation. Include this as an `export` to name it and make it accessible.
50 | */
51 | export const mutation = mutationGeneric;
52 |
53 | /**
54 | * Define a mutation that is only accessible from other Convex functions (but not from the client).
55 | *
56 | * This function will be allowed to modify your Convex database. It will not be accessible from the client.
57 | *
58 | * @param func - The mutation function. It receives a {@link MutationCtx} as its first argument.
59 | * @returns The wrapped mutation. Include this as an `export` to name it and make it accessible.
60 | */
61 | export const internalMutation = internalMutationGeneric;
62 |
63 | /**
64 | * Define an action in this Convex app's public API.
65 | *
66 | * An action is a function which can execute any JavaScript code, including non-deterministic
67 | * code and code with side-effects, like calling third-party services.
68 | * They can be run in Convex's JavaScript environment or in Node.js using the "use node" directive.
69 | * They can interact with the database indirectly by calling queries and mutations using the {@link ActionCtx}.
70 | *
71 | * @param func - The action. It receives an {@link ActionCtx} as its first argument.
72 | * @returns The wrapped action. Include this as an `export` to name it and make it accessible.
73 | */
74 | export const action = actionGeneric;
75 |
76 | /**
77 | * Define an action that is only accessible from other Convex functions (but not from the client).
78 | *
79 | * @param func - The function. It receives an {@link ActionCtx} as its first argument.
80 | * @returns The wrapped function. Include this as an `export` to name it and make it accessible.
81 | */
82 | export const internalAction = internalActionGeneric;
83 |
84 | /**
85 | * Define a Convex HTTP action.
86 | *
87 | * @param func - The function. It receives an {@link ActionCtx} as its first argument, and a `Request` object
88 | * as its second.
89 | * @returns The wrapped endpoint function. Route a URL path to this function in `convex/http.js`.
90 | */
91 | export const httpAction = httpActionGeneric;
92 |
93 | /* prettier-ignore-end */
94 |
--------------------------------------------------------------------------------
/src/assets/react.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/convex/comparisons.ts:
--------------------------------------------------------------------------------
1 | import { internal } from "./_generated/api";
2 | import {
3 | internalAction,
4 | internalMutation,
5 | mutation,
6 | query,
7 | } from "./_generated/server";
8 | import { v } from "convex/values";
9 | import { pruneNull } from "./lib/utils";
10 |
/**
 * Returns the id of a comparison for `target` covering at least `count`
 * results, creating one and scheduling the vector search if none exists.
 *
 * @param target - The chunk to compare against all other chunks.
 * @param count - How many related chunks to fetch; defaults to 10.
 * @returns The id of the existing or newly created comparison.
 * @throws Error if the chunk is unknown, has no embedding yet, or its
 *   embedding document is missing.
 */
export const upsert = mutation({
  args: { target: v.id("chunks"), count: v.optional(v.number()) },
  handler: async (ctx, { target, count }) => {
    const topK = count || 10;
    // Several comparisons for the same target may satisfy `count >= topK`
    // (e.g. one created with 10 and a later one with 20), so take the first
    // match; `.unique()` would throw in that situation.
    const existing = await ctx.db
      .query("comparisons")
      .withIndex("target", (q) => q.eq("target", target))
      .filter((q) => q.gte(q.field("count"), topK))
      .first();
    if (existing) return existing._id;

    // Validate the target before writing anything.
    const chunk = await ctx.db.get(target);
    if (!chunk) throw new Error("Unknown chunk");
    if (!chunk.embeddingId) throw new Error("Chunk has no embedding yet");
    const embedding = await ctx.db.get(chunk.embeddingId);
    if (!embedding) throw new Error("Unknown embedding");

    const comparisonId = await ctx.db.insert("comparisons", {
      target,
      count: topK,
    });
    // The vector search runs in an action, scheduled to start immediately.
    await ctx.scheduler.runAfter(0, internal.comparisons.compare, {
      vector: embedding.vector,
      comparisonId,
      topK,
    });
    return comparisonId;
  },
});
38 |
/**
 * Runs a vector search over `chunkEmbeddings` for the given vector and, when a
 * `comparisonId` is supplied, stores the matches and the query time on that
 * comparison.
 *
 * @param vector - The embedding to search with.
 * @param comparisonId - Optional comparison document to patch with results.
 * @param topK - Maximum number of matches to request.
 * @returns The matches as `{ id, score }` pairs.
 */
export const compare = internalAction({
  args: {
    vector: v.array(v.number()),
    comparisonId: v.optional(v.id("comparisons")),
    topK: v.number(),
  },
  handler: async (ctx, args) => {
    const { vector, comparisonId, topK } = args;
    const startedAt = Date.now();
    const matches = await ctx.vectorSearch("chunkEmbeddings", "vector", {
      vector,
      limit: topK,
    });
    if (!matches) {
      throw new Error("Vector search matches are empty");
    }
    const relatedChunks = matches.map((match) => ({
      id: match._id,
      score: match._score,
    }));
    const queryMs = Date.now() - startedAt;
    console.log({
      queryMs,
    });
    if (!comparisonId) {
      return relatedChunks;
    }
    await ctx.runMutation(internal.comparisons.patch, {
      id: comparisonId,
      patch: {
        relatedChunks,
        // stats
        queryMs,
      },
    });
    return relatedChunks;
  },
});
73 |
/**
 * Loads a comparison together with its related chunks and target chunk, each
 * annotated with the name of its source.
 *
 * @param comparisonId - The comparison to load.
 * @returns `null` while the comparison has no results yet or when its target
 *   chunk no longer exists; otherwise the comparison with `relatedChunks`
 *   expanded to chunk documents (the target itself is left out) and `target`
 *   expanded to the target chunk document.
 * @throws Error if the comparison is unknown, if an embedding has no chunk, or
 *   if a chunk's source is missing.
 */
export const get = query({
  args: { comparisonId: v.id("comparisons") },
  handler: async (ctx, { comparisonId }) => {
    const comparison = await ctx.db.get(comparisonId);
    if (!comparison) throw new Error("Unknown comparison");
    if (!comparison.relatedChunks) return null;
    const target = await ctx.db.get(comparison.target);
    if (!target) return null;

    const relatedChunks = pruneNull(
      await Promise.all(
        comparison.relatedChunks.map(async ({ id, score }) => {
          const chunk = await ctx.db
            .query("chunks")
            .withIndex("embeddingId", (q) => q.eq("embeddingId", id))
            .unique();
          // The target always matches itself; drop it from the results.
          if (chunk?._id === comparison.target) return null;
          if (!chunk) throw new Error("Unknown chunk for embedding" + id);
          const source = await ctx.db.get(chunk.sourceId);
          if (!source) throw new Error("Unknown source" + chunk.sourceId);
          return { ...chunk, score, sourceName: source.name };
        }),
      ),
    );

    // Guard explicitly instead of a non-null assertion, so a missing source
    // yields a descriptive error rather than a TypeError.
    const targetSource = await ctx.db.get(target.sourceId);
    if (!targetSource) throw new Error("Unknown source" + target.sourceId);

    return {
      ...comparison,
      relatedChunks,
      target: {
        ...target,
        sourceName: targetSource.name,
      },
    };
  },
});
107 |
/**
 * Internal mutation that applies `patch` to the comparison document `id`.
 * The patch is accepted as `v.any()`, so its shape is not validated here.
 */
export const patch = internalMutation({
  args: { id: v.id("comparisons"), patch: v.any() },
  handler: async (ctx, args) => {
    const { id, patch: fields } = args;
    return await ctx.db.patch(id, fields);
  },
});
114 |
--------------------------------------------------------------------------------
/convex/searches.ts:
--------------------------------------------------------------------------------
1 | import { api, internal } from "./_generated/api";
2 | import { action, internalMutation, mutation, query } from "./_generated/server";
3 | import { fetchEmbedding } from "./lib/embeddings";
4 | import { v } from "convex/values";
5 | import { pruneNull } from "./lib/utils";
6 | import { pick } from "convex-helpers";
7 | import schema from "./schema";
8 | import { crud } from "convex-helpers/server/crud";
9 | import { getOrThrow } from "convex-helpers/server/relationships";
10 |
/**
 * Find or create a search for `input`.
 *
 * An existing search is reused when it was stored for the same input with a
 * `count` at least as large as the one requested. Otherwise a new search
 * document is inserted and the `search` action is scheduled to fill it in.
 *
 * @param input - The text to search for.
 * @param count - How many results to request; defaults to 10.
 * @returns The id of the reused or newly created search.
 */
export const upsert = mutation({
  args: { input: v.string(), count: v.optional(v.number()) },
  handler: async (ctx, { input, count: countOpt }) => {
    // `||` on purpose: a count of 0 also falls back to the default.
    const count = countOpt || 10;
    // Several stored searches can match (same input, different counts), so
    // take the first match; `.unique()` would throw on more than one.
    const existing = await ctx.db
      .query("searches")
      .withIndex("input", (q) => q.eq("input", input))
      .filter((q) => q.gte(q.field("count"), count))
      .first();
    if (existing) {
      console.log("Re-using search for", input);
      return existing._id;
    }
    const searchId = await ctx.db.insert("searches", { input, count });
    console.log("Starting search for", input);
    await ctx.scheduler.runAfter(0, api.searches.search, {
      input,
      searchId,
      topK: count,
    });
    return searchId;
  },
});
34 |
/**
 * Embed `input`, run a vector search over "chunkEmbeddings" limited to `topK`
 * matches, and, when `searchId` is given, store the embedding, the matches
 * and the timing stats through `storeEmbedding`.
 *
 * Nothing is stored or logged when `searchId` is omitted.
 */
export const search = action({
  args: {
    input: v.string(),
    topK: v.number(),
    searchId: v.optional(v.id("searches")),
  },
  handler: async (ctx, args) => {
    const { input, topK, searchId } = args;
    const embedded = await fetchEmbedding(input);
    const { embedding, embeddingMs } = embedded;
    const inputTokens = embedded.totalTokens;

    // Time only the vector search itself.
    const start = Date.now();
    const matches = await ctx.vectorSearch("chunkEmbeddings", "vector", {
      vector: embedding,
      limit: topK,
    });
    const relatedChunks = matches.map((match) => ({
      id: match._id,
      score: match._score,
    }));
    const queryMs = Date.now() - start;

    if (!searchId) return;

    await ctx.runMutation(internal.searches.storeEmbedding, {
      embedding,
      searchId,
      relatedChunks,
      // stats
      inputTokens,
      embeddingMs,
      queryMs,
    });
    // Time spent since the vector search finished, i.e. saving the results.
    const saveSearchMs = Date.now() - start - queryMs;
    console.log({
      inputTokens,
      embeddingMs,
      queryMs,
      saveSearchMs,
    });
  },
});
75 |
/**
 * Store the embedding and the result stats of a search.
 *
 * If the search already has an embedding document, its vector is overwritten;
 * otherwise a new "searchEmbeddings" document is inserted and linked from the
 * search. In both cases the remaining fields (`inputTokens`, `embeddingMs`,
 * `queryMs`, `relatedChunks`) are patched onto the search, so re-running a
 * search refreshes its results and stats as well as its vector.
 *
 * @throws if `searchId` does not exist (via `getOrThrow`).
 */
export const storeEmbedding = internalMutation({
  args: {
    searchId: v.id("searches"),
    embedding: v.array(v.number()),
    ...pick(schema.tables.searches.validator.fields, [
      "inputTokens",
      "embeddingMs",
      "queryMs",
      "relatedChunks",
    ]),
  },
  handler: async (ctx, { searchId, embedding, ...patch }) => {
    const search = await getOrThrow(ctx, searchId);
    if (search.embeddingId) {
      await ctx.db.patch(search.embeddingId, { vector: embedding });
      // Previously skipped: the search kept stale results after a re-run.
      await ctx.db.patch(searchId, patch);
    } else {
      const embeddingId = await ctx.db.insert("searchEmbeddings", {
        vector: embedding,
      });
      await ctx.db.patch(searchId, { embeddingId, ...patch });
    }
  },
});
99 |
/**
 * Full-text search over chunk text using the "text" search index.
 *
 * @param input - The search phrase.
 * @param count - Maximum number of chunks to return.
 * @returns The matching chunks, each extended with its `sourceName`.
 * @throws Error when a matching chunk's source document is missing.
 */
export const wordSearch = query({
  args: { input: v.string(), count: v.number() },
  handler: async (ctx, { input, count }) => {
    const matches = await ctx.db
      .query("chunks")
      .withSearchIndex("text", (q) => q.search("text", input))
      .take(count);

    // Attach the owning source's name to a single chunk.
    const withSourceName = async (chunk: (typeof matches)[number]) => {
      const source = await ctx.db.get(chunk.sourceId);
      if (!source) throw new Error("Missing source for chunk " + chunk._id);
      return { ...chunk, sourceName: source.name };
    };

    return Promise.all(matches.map(withSourceName));
  },
});
116 |
/**
 * Resolve the stored vector-search results of a search into chunks.
 *
 * Returns `null` when the search has no `relatedChunks`. Related embeddings
 * whose chunk can no longer be found are skipped.
 *
 * @returns The related chunks with their `score` and `sourceName`.
 * @throws Error when the search is unknown or a chunk's source is missing.
 */
export const semanticSearch = query({
  args: { searchId: v.id("searches") },
  handler: async (ctx, { searchId }) => {
    const search = await ctx.db.get(searchId);
    if (!search) throw new Error("Unknown search " + searchId);
    if (!search.relatedChunks) return null;
    return pruneNull(
      await Promise.all(
        search.relatedChunks.map(async ({ id, score }) => {
          const chunk = await ctx.db
            .query("chunks")
            .withIndex("embeddingId", (q) => q.eq("embeddingId", id))
            .unique();
          if (!chunk) return null;
          const source = await ctx.db.get(chunk.sourceId);
          // Same explicit failure as wordSearch, instead of a TypeError.
          if (!source) throw new Error("Missing source for chunk " + chunk._id);
          return { ...chunk, score, sourceName: source.name };
        })
      )
    );
  },
});
138 |
// Build the convex-helpers CRUD functions for "searches" on top of the public
// `query` builder and expose only the paginated listing.
const searchesCrud = crud(schema, "searches", query);
export const paginate = searchesCrud.paginate;
140 |
--------------------------------------------------------------------------------
/convex/_generated/server.d.ts:
--------------------------------------------------------------------------------
1 | /* prettier-ignore-start */
2 |
3 | /* eslint-disable */
4 | /**
5 | * Generated utilities for implementing server-side Convex query and mutation functions.
6 | *
7 | * THIS CODE IS AUTOMATICALLY GENERATED.
8 | *
9 | * To regenerate, run `npx convex dev`.
10 | * @module
11 | */
12 |
13 | import {
14 | ActionBuilder,
15 | HttpActionBuilder,
16 | MutationBuilder,
17 | QueryBuilder,
18 | GenericActionCtx,
19 | GenericMutationCtx,
20 | GenericQueryCtx,
21 | GenericDatabaseReader,
22 | GenericDatabaseWriter,
23 | } from "convex/server";
24 | import type { DataModel } from "./dataModel.js";
25 |
/**
 * Define a query in this Convex app's public API.
 *
 * This function will be allowed to read your Convex database and will be accessible from the client.
 *
 * @param func - The query function. It receives a {@link QueryCtx} as its first argument.
 * @returns The wrapped query. Include this as an `export` to name it and make it accessible.
 */
export declare const query: QueryBuilder<DataModel, "public">;

/**
 * Define a query that is only accessible from other Convex functions (but not from the client).
 *
 * This function will be allowed to read from your Convex database. It will not be accessible from the client.
 *
 * @param func - The query function. It receives a {@link QueryCtx} as its first argument.
 * @returns The wrapped query. Include this as an `export` to name it and make it accessible.
 */
export declare const internalQuery: QueryBuilder<DataModel, "internal">;

/**
 * Define a mutation in this Convex app's public API.
 *
 * This function will be allowed to modify your Convex database and will be accessible from the client.
 *
 * @param func - The mutation function. It receives a {@link MutationCtx} as its first argument.
 * @returns The wrapped mutation. Include this as an `export` to name it and make it accessible.
 */
export declare const mutation: MutationBuilder<DataModel, "public">;

/**
 * Define a mutation that is only accessible from other Convex functions (but not from the client).
 *
 * This function will be allowed to modify your Convex database. It will not be accessible from the client.
 *
 * @param func - The mutation function. It receives a {@link MutationCtx} as its first argument.
 * @returns The wrapped mutation. Include this as an `export` to name it and make it accessible.
 */
export declare const internalMutation: MutationBuilder<DataModel, "internal">;

/**
 * Define an action in this Convex app's public API.
 *
 * An action is a function which can execute any JavaScript code, including non-deterministic
 * code and code with side-effects, like calling third-party services.
 * They can be run in Convex's JavaScript environment or in Node.js using the "use node" directive.
 * They can interact with the database indirectly by calling queries and mutations using the {@link ActionCtx}.
 *
 * @param func - The action. It receives an {@link ActionCtx} as its first argument.
 * @returns The wrapped action. Include this as an `export` to name it and make it accessible.
 */
export declare const action: ActionBuilder<DataModel, "public">;

/**
 * Define an action that is only accessible from other Convex functions (but not from the client).
 *
 * @param func - The function. It receives an {@link ActionCtx} as its first argument.
 * @returns The wrapped function. Include this as an `export` to name it and make it accessible.
 */
export declare const internalAction: ActionBuilder<DataModel, "internal">;

/**
 * Define an HTTP action.
 *
 * This function will be used to respond to HTTP requests received by a Convex
 * deployment if the requests matches the path and method where this action
 * is routed. Be sure to route your action in `convex/http.js`.
 *
 * @param func - The function. It receives an {@link ActionCtx} as its first argument.
 * @returns The wrapped function. Import this function from `convex/http.js` and route it to hook it up.
 */
export declare const httpAction: HttpActionBuilder;
98 |
/**
 * A set of services for use within Convex query functions.
 *
 * The query context is passed as the first argument to any Convex query
 * function run on the server.
 *
 * This differs from the {@link MutationCtx} because all of the services are
 * read-only.
 */
export type QueryCtx = GenericQueryCtx<DataModel>;

/**
 * A set of services for use within Convex mutation functions.
 *
 * The mutation context is passed as the first argument to any Convex mutation
 * function run on the server.
 */
export type MutationCtx = GenericMutationCtx<DataModel>;

/**
 * A set of services for use within Convex action functions.
 *
 * The action context is passed as the first argument to any Convex action
 * function run on the server.
 */
export type ActionCtx = GenericActionCtx<DataModel>;

/**
 * An interface to read from the database within Convex query functions.
 *
 * The two entry points are {@link DatabaseReader.get}, which fetches a single
 * document by its {@link Id}, or {@link DatabaseReader.query}, which starts
 * building a query.
 */
export type DatabaseReader = GenericDatabaseReader<DataModel>;

/**
 * An interface to read from and write to the database within Convex mutation
 * functions.
 *
 * Convex guarantees that all writes within a single mutation are
 * executed atomically, so you never have to worry about partial writes leaving
 * your data in an inconsistent state. See [the Convex Guide](https://docs.convex.dev/understanding/convex-fundamentals/functions#atomicity-and-optimistic-concurrency-control)
 * for the guarantees Convex provides your functions.
 */
export type DatabaseWriter = GenericDatabaseWriter<DataModel>;
145 |
146 | /* prettier-ignore-end */
147 |
--------------------------------------------------------------------------------
/src/Search.tsx:
--------------------------------------------------------------------------------
1 | import {
2 | useAction,
3 | useMutation,
4 | usePaginatedQuery,
5 | useQuery,
6 | } from "convex/react";
7 | import { useRef, useState } from "react";
8 | import { Dispatch } from "react";
9 | import { MagnifyingGlass } from "@phosphor-icons/react";
10 | import { Accordion, Button, Table, Text, InputGroup } from "@rewind-ui/core";
11 | import { Id } from "../convex/_generated/dataModel";
12 | import { api } from "../convex/_generated/api";
13 | import { Loading } from "./Loading";
14 | import { Chunks } from "./Chunks";
15 | import { CompareFn } from "./useComparison";
16 |
17 | type Target = { text: string; searchId: Id<"searches"> };
18 |
19 | export function Search({ compare }: { compare?: CompareFn }) {
20 | const [target, setTarget] = useState();
21 | const [input, setInput] = useState("");
22 |
23 | return (
24 | <>
25 |
26 | {target && (
27 |
28 |
29 | Semantic
30 |
31 |
32 |
33 |
34 |
35 | Word-Based
36 |
37 |
38 |
39 |
40 |
41 | )}
42 | Search History
43 | {
45 | setTarget(target);
46 | setInput(target.text);
47 | }}
48 | />
49 | >
50 | );
51 | }
52 |
53 | function SearchBar({
54 | setInput,
55 | setTarget,
56 | }: {
57 | setInput: Dispatch;
58 | setTarget: Dispatch;
59 | }) {
60 | const addSearch = useMutation(api.searches.upsert);
61 | const searchRef = useRef(null);
62 | return (
63 |
90 | );
91 | }
92 |
93 | function WordSearch({
94 | input,
95 | compare,
96 | }: {
97 | input: string;
98 | compare?: CompareFn;
99 | }) {
100 | const wordBased = useQuery(api.searches.wordSearch, { input, count: 10 });
101 | return wordBased ? (
102 | <>
103 | Results for "{input}":
104 |
105 | >
106 | ) : (
107 | <>
108 | Searching for "{input}"...
109 |
110 | >
111 | );
112 | }
113 |
114 | function SemanticSearch({
115 | target: { searchId, text },
116 | compare,
117 | }: {
118 | target: Target;
119 | compare?: CompareFn;
120 | }) {
121 | const semantic = useQuery(api.searches.semanticSearch, { searchId });
122 | return (
123 | <>
124 | {semantic ? (
125 | <>
126 | Results for "{text}":
127 |
128 | >
129 | ) : (
130 | <>
131 | Searching for "{text}"...
132 |
133 | >
134 | )}
135 | >
136 | );
137 | }
138 |
139 | export type UseSearchFn = (target: Target) => void;
140 |
141 | export function PreviousSearches({
142 | reuseSearch,
143 | }: {
144 | reuseSearch: UseSearchFn;
145 | }) {
146 | const { status, loadMore, results } = usePaginatedQuery(
147 | api.searches.paginate,
148 | {},
149 | { initialNumItems: 10 },
150 | );
151 | const search = useAction(api.searches.search);
152 | return (
153 | <>
154 |
155 |
156 |
157 | Input
158 | Tokens
159 | Results
160 | Embedding
161 | Search
162 |
163 |
164 |
165 |
166 | {results.map((result) => (
167 |
168 | {result.input}
169 | {result.inputTokens}
170 |
171 | {result.relatedChunks?.length || result.count}
172 |
173 | {result.embeddingMs?.toFixed(0) + " ms"}
174 | {result.queryMs?.toFixed(0) + " ms"}
175 |
176 |
184 |
185 |
199 |
200 |
201 | ))}
202 |
203 |
204 |
205 | {status !== "Exhausted" && (
206 |
212 | )}
213 | >
214 | );
215 | }
216 |
--------------------------------------------------------------------------------
/convex/sources.ts:
--------------------------------------------------------------------------------
1 | import { paginationOptsValidator } from "convex/server";
2 | import { internal } from "./_generated/api";
3 | import {
4 | DatabaseWriter,
5 | internalAction,
6 | internalMutation,
7 | mutation,
8 | query,
9 | } from "./_generated/server";
10 | import { fetchEmbeddingBatch } from "./lib/embeddings";
11 | import { v, Infer } from "convex/values";
12 | import { Doc } from "./_generated/dataModel";
13 | import { crud } from "convex-helpers/server/crud";
14 | import schema from "./schema";
15 | import { getOrThrow } from "convex-helpers/server/relationships";
16 |
/**
 * Validator for one chunk of an uploaded source: its `text` plus a `lines`
 * object with numeric `from` and `to` fields.
 */
const InputChunk = v.object({
  text: v.string(),
  lines: v.object({ from: v.number(), to: v.number() }),
});
/** Static type inferred from the `InputChunk` validator above. */
type InputChunk = Infer<typeof InputChunk>;
22 |
/**
 * Insert a source document and one "chunks" document per input chunk, then
 * record the chunk ids on the source.
 *
 * The source is created with `saved: false` and an empty `chunkIds` list;
 * `chunkIds` is patched in once every chunk has been inserted, in the same
 * order as `chunks`.
 *
 * @returns The source document as re-read after the patch.
 */
async function addSource(
  db: DatabaseWriter,
  name: string,
  chunks: InputChunk[]
) {
  const sourceId = await db.insert("sources", {
    name,
    chunkIds: [],
    saved: false,
  });
  const pendingInserts = chunks.map((inputChunk, chunkIndex) =>
    db.insert("chunks", {
      text: inputChunk.text,
      sourceId,
      chunkIndex,
      lines: inputChunk.lines,
    })
  );
  const chunkIds = await Promise.all(pendingInserts);
  await db.patch(sourceId, { chunkIds });
  const stored = await db.get(sourceId);
  // The document was inserted above in this same call.
  return stored!;
}
46 |
/**
 * Store a named source with its chunks, then schedule `addEmbedding` to
 * compute and store the embeddings for the chunk texts.
 */
export const add = mutation({
  args: { name: v.string(), chunks: v.array(InputChunk) },
  handler: async (ctx, args) => {
    const source = await addSource(ctx.db, args.name, args.chunks);
    const texts = args.chunks.map((inputChunk) => inputChunk.text);
    await ctx.scheduler.runAfter(0, internal.sources.addEmbedding, {
      source,
      texts,
    });
  },
});
58 |
/**
 * Fetch embeddings for all of a source's chunk texts in one batch, store them
 * per chunk, and mark the source as saved together with the token and timing
 * stats.
 *
 * `texts[i]` is paired with `source.chunkIds[i]` when storing the embeddings.
 */
export const addEmbedding = internalAction({
  handler: async (
    ctx,
    { source, texts }: { source: Doc<"sources">; texts: string[] }
  ) => {
    const fetched = await fetchEmbeddingBatch(texts);
    const { totalTokens, embeddingMs } = fetched;
    console.log({
      batchSize: texts.length,
      totalTokens,
      embeddingMs,
    });
    const perChunk = fetched.embeddings.map((embedding, position) => ({
      chunkId: source.chunkIds[position],
      embedding,
    }));
    await ctx.runMutation(internal.sources.storeEmbeddings, {
      embeddings: perChunk,
    });
    await ctx.runMutation(internal.sources.patch, {
      id: source._id,
      patch: { saved: true, totalTokens, embeddingMs },
    });
  },
});
84 |
/**
 * Store several named sources (each with all of its chunks) in one mutation,
 * then schedule `addEmbeddingBatch` to embed all of their texts.
 */
export const addBatch = mutation({
  args: {
    batch: v.array(v.object({ name: v.string(), chunks: v.array(InputChunk) })),
  },
  handler: async (ctx, { batch }) => {
    const prepared = await Promise.all(
      batch.map(async (entry) => {
        const source = await addSource(ctx.db, entry.name, entry.chunks);
        const texts = entry.chunks.map((inputChunk) => inputChunk.text);
        return { source, texts };
      })
    );
    await ctx.scheduler.runAfter(0, internal.sources.addEmbeddingBatch, {
      batch: prepared,
    });
  },
});
103 |
/**
 * Split `items` into consecutive slices of at most `chunkSize` elements.
 *
 * @param items - The array to split; it is not modified.
 * @param chunkSize - Maximum slice length. Omitted or 0 falls back to 100.
 * @returns The slices in order; an empty array when `items` is empty.
 * @throws RangeError when `chunkSize` is negative or a finite non-integer,
 * which would otherwise loop forever or produce overlapping slices.
 */
export function chunk<T>(items: T[], chunkSize?: number): T[][] {
  const size = chunkSize || 100;
  if (size < 0 || (Number.isFinite(size) && !Number.isInteger(size))) {
    throw new RangeError(
      "chunkSize must be a positive integer, got " + chunkSize
    );
  }
  const chunks: T[][] = [];
  for (let i = 0; i < items.length; i += size) {
    chunks.push(items.slice(i, i + size));
  }
  return chunks;
}
112 |
/**
 * Embed the chunk texts of several sources and store the results.
 *
 * All texts are flattened into (text, chunkId) pairs and processed in groups
 * produced by `chunk` (default group size). Each group is embedded with one
 * `fetchEmbeddingBatch` call and stored through `storeEmbeddings`. Finally
 * every source is marked as saved and given a share of the total tokens and
 * embedding time proportional to the length of its texts.
 */
export const addEmbeddingBatch = internalAction(
  async (
    ctx,
    { batch }: { batch: { source: Doc<"sources">; texts: string[] }[] }
  ) => {
    const chunks = chunk(
      batch.flatMap(({ texts, source }) =>
        texts.map((text, i) => ({ text, chunkId: source.chunkIds[i] }))
      )
    );
    let totalTokens_ = 0;
    let embeddingMs_ = 0;
    for (const chunkBatch of chunks) {
      // Calculate the embeddings for this group of texts at once.
      const { embeddings, totalTokens, embeddingMs } =
        await fetchEmbeddingBatch(chunkBatch.map(({ text }) => text));
      totalTokens_ += totalTokens;
      embeddingMs_ += embeddingMs;
      console.log({ batchSize: embeddings.length, totalTokens, embeddingMs });
      await ctx.runMutation(internal.sources.storeEmbeddings, {
        embeddings: embeddings.map((embedding, i) => ({
          chunkId: chunkBatch[i].chunkId,
          embedding,
        })),
      });
    }
    // The length of all strings put together.
    const totalLength = batch.reduce(
      (sum, { texts }) => sum + textLength(texts),
      0
    );
    await Promise.all(
      batch.map(async ({ source, texts }) => {
        const sourceLength = textLength(texts);
        // Avoid 0 / 0 = NaN stats when there is no text at all.
        const portion = totalLength > 0 ? sourceLength / totalLength : 0;
        await ctx.runMutation(internal.sources.patch, {
          id: source._id,
          patch: {
            saved: true,
            embeddingMs: Math.ceil(embeddingMs_ * portion),
            totalTokens: Math.ceil(totalTokens_ * portion),
          },
        });
      })
    );
  }
);
160 |
/**
 * Store one embedding vector per chunk.
 *
 * A chunk that already has an embedding document gets its vector overwritten;
 * otherwise a new "chunkEmbeddings" document is inserted and linked from the
 * chunk through `embeddingId`.
 *
 * @throws if a `chunkId` does not exist (via `getOrThrow`).
 */
export const storeEmbeddings = internalMutation({
  args: {
    embeddings: v.array(
      v.object({ chunkId: v.id("chunks"), embedding: v.array(v.number()) })
    ),
  },
  handler: async (ctx, { embeddings }) => {
    const writes = embeddings.map(async (entry) => {
      const chunkDoc = await getOrThrow(ctx, entry.chunkId);
      if (!chunkDoc.embeddingId) {
        const embeddingId = await ctx.db.insert("chunkEmbeddings", {
          vector: entry.embedding,
        });
        await ctx.db.patch(entry.chunkId, { embeddingId });
        return;
      }
      await ctx.db.patch(chunkDoc.embeddingId, { vector: entry.embedding });
    });
    await Promise.all(writes);
  },
});
183 |
/** Sum of the `length` of every string in `texts`; 0 for an empty list. */
function textLength(texts: string[]) {
  let total = 0;
  for (const text of texts) {
    total += text.length;
  }
  return total;
}
187 |
// Expose the convex-helpers CRUD `update` function for "sources" as `patch`.
const sourcesCrud = crud(schema, "sources");
export const patch = sourcesCrud.update;
189 |
/**
 * Page through sources in descending order, adding to each source the text of
 * its first chunk as `firstChunkText` (an empty string when the source has no
 * chunks or the first chunk cannot be loaded).
 */
export const paginate = query({
  args: { paginationOpts: paginationOptsValidator },
  handler: async (ctx, { paginationOpts }) => {
    const results = await ctx.db
      .query("sources")
      .order("desc")
      .paginate(paginationOpts);

    const page = await Promise.all(
      results.page.map(async (source) => {
        const firstChunk =
          source.chunkIds.length > 0
            ? await ctx.db.get(source.chunkIds[0])
            : null;
        return { ...source, firstChunkText: firstChunk?.text ?? "" };
      })
    );

    return { ...results, page };
  },
});
213 |
/**
 * Delete a source together with its chunks and their embedding documents.
 *
 * Does nothing when the source does not exist. Chunk ids listed on the source
 * that no longer resolve to a document are skipped.
 */
export const deleteSource = mutation({
  args: { id: v.id("sources") },
  handler: async (ctx, { id }) => {
    const source = await ctx.db.get(id);
    if (!source) return;
    await ctx.db.delete(id);
    await Promise.all(
      source.chunkIds.map(async (chunkId) => {
        const chunk = await ctx.db.get(chunkId);
        if (!chunk) return;
        // Await every delete so no write is left as a floating promise when
        // the mutation returns.
        await ctx.db.delete(chunkId);
        if (chunk.embeddingId) {
          await ctx.db.delete(chunk.embeddingId);
        }
      })
    );
  },
});
232 |
/**
 * Page through all chunks, adding the name of each chunk's source as
 * `sourceName`.
 */
export const paginateChunks = query({
  args: { paginationOpts: paginationOptsValidator },
  handler: async (ctx, { paginationOpts }) => {
    const results = await ctx.db.query("chunks").paginate(paginationOpts);

    const page = await Promise.all(
      results.page.map(async (chunkDoc) => {
        const source = await ctx.db.get(chunkDoc.sourceId);
        // Asserted non-null: a missing source fails here at runtime.
        return { ...chunkDoc, sourceName: source!.name };
      })
    );

    return { ...results, page };
  },
});
249 |
/**
 * Fetch a single chunk by id.
 *
 * @throws Error when no chunk with that id exists.
 */
export const getChunk = query({
  args: { id: v.id("chunks") },
  handler: async (ctx, { id }) => {
    const doc = await ctx.db.get(id);
    if (doc) return doc;
    throw new Error("Document not found: " + id);
  },
});
260 |
--------------------------------------------------------------------------------