├── .gitattributes
├── tsconfig.json
├── src
│   ├── answer.ts
│   ├── lib
│   │   ├── db.ts
│   │   ├── scrape.ts
│   │   └── openai.ts
│   └── ingest.ts
├── package.json
├── LICENSE
├── README.md
└── .gitignore

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    "module": "commonjs",
    "esModuleInterop": true,
    "target": "es6",
    "moduleResolution": "node",
    "sourceMap": true,
    "outDir": "dist",
    "lib": ["es2015"]
  }
}
--------------------------------------------------------------------------------
/src/answer.ts:
--------------------------------------------------------------------------------
import { queryDatabase } from "./lib/db";
import { generateEmbedding, generateResponse } from "./lib/openai";

// Embed the question, retrieve the closest chunks from Astra DB, and
// pass them to the chat model as context.
async function askQuestion(question: string) {

  const embedding = await generateEmbedding(question);

  const queryRes = await queryDatabase(embedding.data[0].embedding);

  const response = await generateResponse(question, queryRes.map((doc) => doc.text));

  return response;
}

askQuestion("Why are George Russell and Max Verstappen arguing after Qatar 2024?").then((res) => {
  console.log(res);
});
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "f1-ai",
  "version": "1.0.0",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "ingest": "ts-node src/ingest.ts",
    "answer": "ts-node src/answer.ts"
  },
  "keywords": [],
  "author": "",
  "license": "MIT",
  "description": "",
  "devDependencies": {
    "@types/express": "^4.17.1",
    "typescript": "^5.7.2"
  },
  "dependencies": {
    "@datastax/astra-db-ts": "^1.5.0",
    "cassandra-driver": "^4.7.2",
    "express": "^4.21.2",
    "langchain": "^0.3.6",
    "openai": "^4.76.0",
    "playwright": "^1.49.0",
    "ts-node": "^10.9.2"
  }
}
--------------------------------------------------------------------------------
/src/lib/db.ts:
--------------------------------------------------------------------------------
// Astra DB helpers: create the vector collection, insert embedded chunks,
// and run vector similarity queries.

import { DataAPIClient } from "@datastax/astra-db-ts";

// Paste your Astra DB application token and API endpoint here
// (or load them from environment variables).
const client = new DataAPIClient('YOUR_TOKEN');
const db = client.db('YOUR_DB_URL');
const collection = db.collection('f1gpt');

// Create the collection with 1536-dimensional vectors to match
// OpenAI's text-embedding-ada-002 embeddings.
export async function createCollection() {
  const res = await db.createCollection("f1gpt", {
    vector: {
      dimension: 1536,
      metric: "dot_product"
    }
  });
  return res;
}

// Insert the embedded chunks into the collection.
export async function uploadData(data: {
  $vector: number[],
  text: string,
  source?: string
}[]) {
  return await collection.insertMany(data);
}

// Return the 10 documents closest to the query vector.
export async function queryDatabase(query: number[]) {
  const res = await collection.find({}, {
    sort: {
      $vector: query
    },
    limit: 10
  }).toArray();

  return res;
}
--------------------------------------------------------------------------------
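If you want to see how closely each retrieved chunk matches a question, a variant of `queryDatabase` in `src/lib/db.ts` could also request a similarity score for each match. This is only a rough sketch: the `includeSimilarity` option and the `$similarity` field it adds are assumptions about the `@datastax/astra-db-ts` version in use, so verify them against your driver's documentation.

```typescript
// Sketch: the same vector query as queryDatabase, but also requesting a
// similarity score so results can be inspected or filtered by relevance.
export async function queryDatabaseWithScores(query: number[]) {
  const res = await collection.find({}, {
    sort: { $vector: query },
    limit: 10,
    includeSimilarity: true   // each returned document should then carry $similarity
  }).toArray();

  return res;
}
```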
/src/lib/scrape.ts:
--------------------------------------------------------------------------------
// Use Playwright to scrape the text from a website URL and split it
// into chunks ready for embedding.

import playwright from "playwright";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

export async function scrape(url: string) {

  // Scrape the text from the website

  const browser = await playwright.chromium.launch();
  const context = await browser.newContext();
  const page = await context.newPage();

  await page.goto(url);

  const text = await page.innerText("body");

  // String.prototype.replace returns a new string, so keep the result.
  const cleanedText = text.replace(/\n/g, " ");

  await browser.close();

  // Split the text into overlapping chunks

  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 512,
    chunkOverlap: 100,
  });

  const output = await splitter.createDocuments([cleanedText]);

  return output;
}
--------------------------------------------------------------------------------
/src/lib/openai.ts:
--------------------------------------------------------------------------------
// Generate vector embeddings and chat completions using the OpenAI API.

import OpenAI from 'openai';

// Paste your OpenAI API key here (or load it from an environment variable).
const client = new OpenAI({
  apiKey: "YOUR_API_KEY",
});

// Embed a piece of text with text-embedding-ada-002 (1536 dimensions).
export async function generateEmbedding(text: string) {
  const embedding = await client.embeddings.create({
    model: "text-embedding-ada-002",
    input: text
  });

  return embedding;
}

// Answer the question with GPT-4o, grounding the model in the retrieved context.
export async function generateResponse(question: string, context: string[]) {
  const response = await client.chat.completions.create({
    model: "gpt-4o",
    messages: [{
      role: "user",
      content: `You are an expert in Formula 1 racing.
      You need to answer this question using the context provided.
      Do not mention that you have been provided with the context.
      CONTEXT: ${context.join("\n")}
      QUESTION: ${question}.
      `
    }]
  });

  return response.choices[0].message.content;
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Tom Shaw

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/src/ingest.ts:
--------------------------------------------------------------------------------
import { createCollection, uploadData } from "./lib/db";
import { generateEmbedding } from "./lib/openai";
import { scrape } from "./lib/scrape";

// The pages to scrape, embed, and store.
const urls = [
  "https://en.wikipedia.org/wiki/Formula_One",
  "https://en.wikipedia.org/wiki/George_Russell_(racing_driver)",
];

async function ingest() {

  let chunks: { text: string, $vector: number[], url: string }[] = [];

  // Scrape each URL, embed every chunk, and collect the results.
  await Promise.all(urls.map(async (url) => {
    const data = await scrape(url);

    const embeddings = await Promise.all(data.map(async (doc) => {
      return await generateEmbedding(doc.pageContent);
    }));

    chunks = chunks.concat(data.map((doc, index) => {
      return {
        text: doc.pageContent,
        $vector: embeddings[index].data[0].embedding,
        url: url
      };
    }));
  }));

  // Create the vector collection, then insert the embedded chunks.
  await createCollection();

  await uploadData(chunks.map((doc) => {
    return {
      $vector: doc.$vector,
      text: doc.text,
      source: doc.url
    };
  }));
}

ingest();
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# F1-AI: Retrieval-Augmented Generation (RAG) Application

## Overview

F1-AI is a Retrieval-Augmented Generation (RAG) application that leverages OpenAI's GPT-4o model and a vector database to provide context-aware answers to questions about Formula 1 racing. This project demonstrates how to build a RAG application using TypeScript, OpenAI, DataStax Astra DB, and Playwright.

## Prerequisites

- [Node.js](https://nodejs.org/en/download/)
- [OpenAI API Key](https://beta.openai.com/signup/)
- [DataStax Astra DB](https://astra.datastax.com/register)

## Installation

1. Clone the repository:

```bash
git clone https://github.com/IAmTomShaw/f1-rag-ai.git
```

2. Install the dependencies:

```bash
cd f1-rag-ai
npm install
```

## Configuration

You'll need to paste your OpenAI API key and DataStax Astra DB credentials into `src/lib/openai.ts` and `src/lib/db.ts`, or create a `.env` file in the root directory with the following environment variables:

```bash
OPENAI_API_KEY=your-openai-api-key
ASTRA_DB_APPLICATION_TOKEN=your-astra-db-application-token
ASTRA_DB_API_ENDPOINT=your-astra-db-api-endpoint
```

If you use a `.env` file, make sure these environment variables are loaded and referenced in your code, as in the sketch below.
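For example, using the `dotenv` package (an assumption: it isn't listed in `package.json`, so you'd need to add it with `npm install dotenv`), the clients in `src/lib/openai.ts` and `src/lib/db.ts` could be constructed roughly like this:

```typescript
// Sketch only: load credentials from .env instead of hard-coding them.
import "dotenv/config";
import OpenAI from "openai";
import { DataAPIClient } from "@datastax/astra-db-ts";

// OpenAI client for src/lib/openai.ts
const openaiClient = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

// Astra DB client and collection for src/lib/db.ts
const astraClient = new DataAPIClient(process.env.ASTRA_DB_APPLICATION_TOKEN ?? "");
const db = astraClient.db(process.env.ASTRA_DB_API_ENDPOINT ?? "");
const collection = db.collection("f1gpt");
```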
## Usage

You can modify the list of URLs to scrape in the `src/ingest.ts` file. You can then run the following command to scrape the data:

```bash
npm run ingest
```

This will scrape the data from the URLs, embed it, and store it in Astra DB.

You can then run the following command to test the RAG application using the query defined in the `src/answer.ts` file:

```bash
npm run answer
```

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## Credit

This project was created by [Tom Shaw](https://tomshaw.dev).
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
--------------------------------------------------------------------------------