├── .gitattributes
├── tsconfig.json
├── src
│   ├── answer.ts
│   ├── lib
│   │   ├── db.ts
│   │   ├── scrape.ts
│   │   └── openai.ts
│   └── ingest.ts
├── package.json
├── LICENSE
├── README.md
└── .gitignore

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    "module": "commonjs",
    "esModuleInterop": true,
    "target": "es6",
    "moduleResolution": "node",
    "sourceMap": true,
    "outDir": "dist",
    "lib": ["es2015"]
  }
}
--------------------------------------------------------------------------------
/src/answer.ts:
--------------------------------------------------------------------------------
import { queryDatabase } from "./lib/db";
import { generateEmbedding, generateResponse } from "./lib/openai";

// Embed the question, retrieve the closest chunks from Astra DB, and
// pass them to the chat model as context.
async function askQuestion(question: string) {

  const embedding = await generateEmbedding(question);

  const queryRes = await queryDatabase(embedding.data[0].embedding);

  const response = await generateResponse(question, queryRes.map((doc) => doc.text));

  return response;
}

askQuestion("Why are George Russell and Max Verstappen arguing after Qatar 2024?").then((res) => {
  console.log(res);
});
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "f1-ai",
  "version": "1.0.0",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "ingest": "ts-node src/ingest.ts",
    "answer": "ts-node src/answer.ts"
  },
  "keywords": [],
  "author": "",
  "license": "MIT",
  "description": "",
  "devDependencies": {
    "@types/express": "^4.17.1",
    "typescript": "^5.7.2"
  },
  "dependencies": {
    "@datastax/astra-db-ts": "^1.5.0",
    "cassandra-driver": "^4.7.2",
    "express": "^4.21.2",
    "langchain": "^0.3.6",
    "openai": "^4.76.0",
    "playwright": "^1.49.0",
    "ts-node": "^10.9.2"
  }
}
--------------------------------------------------------------------------------
/src/lib/db.ts:
--------------------------------------------------------------------------------
// Astra DB helpers: create the vector collection, insert embedded chunks,
// and run vector similarity queries.

import { DataAPIClient } from "@datastax/astra-db-ts";

// Paste your Astra DB application token and API endpoint here
// (or load them from environment variables).
const client = new DataAPIClient('YOUR_TOKEN');
const db = client.db('YOUR_DB_URL');
const collection = db.collection('f1gpt');

// Create the collection with 1536-dimensional vectors to match
// OpenAI's text-embedding-ada-002 embeddings.
export async function createCollection() {
  const res = await db.createCollection("f1gpt", {
    vector: {
      dimension: 1536,
      metric: "dot_product"
    }
  });
  return res;
}

// Insert the embedded chunks into the collection.
export async function uploadData(data: {
  $vector: number[],
  text: string,
  source?: string
}[]) {
  return await collection.insertMany(data);
}

// Return the 10 documents closest to the query vector.
export async function queryDatabase(query: number[]) {
  const res = await collection.find({}, {
    sort: {
      $vector: query
    },
    limit: 10
  }).toArray();

  return res;
}
--------------------------------------------------------------------------------
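If you want to see how closely each retrieved chunk matches a question, a variant of `queryDatabase` in `src/lib/db.ts` could also request a similarity score for each match. This is only a rough sketch: the `includeSimilarity` option and the `$similarity` field it adds are assumptions about the `@datastax/astra-db-ts` version in use, so verify them against your driver's documentation.

```typescript
// Sketch: the same vector query as queryDatabase, but also requesting a
// similarity score so results can be inspected or filtered by relevance.
export async function queryDatabaseWithScores(query: number[]) {
  const res = await collection.find({}, {
    sort: { $vector: query },
    limit: 10,
    includeSimilarity: true   // each returned document should then carry $similarity
  }).toArray();

  return res;
}
```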
/src/lib/scrape.ts:
--------------------------------------------------------------------------------
// Use Playwright to scrape the text from a website URL and split it
// into chunks ready for embedding.

import playwright from "playwright";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

export async function scrape(url: string) {

  // Scrape the text from the website

  const browser = await playwright.chromium.launch();
  const context = await browser.newContext();
  const page = await context.newPage();

  await page.goto(url);

  const text = await page.innerText("body");

  // String.prototype.replace returns a new string, so keep the result.
  const cleanedText = text.replace(/\n/g, " ");

  await browser.close();

  // Split the text into overlapping chunks

  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 512,
    chunkOverlap: 100,
  });

  const output = await splitter.createDocuments([cleanedText]);

  return output;
}
--------------------------------------------------------------------------------
/src/lib/openai.ts:
--------------------------------------------------------------------------------
// Generate vector embeddings and chat completions using the OpenAI API.

import OpenAI from 'openai';

// Paste your OpenAI API key here (or load it from an environment variable).
const client = new OpenAI({
  apiKey: "YOUR_API_KEY",
});

// Embed a piece of text with text-embedding-ada-002 (1536 dimensions).
export async function generateEmbedding(text: string) {
  const embedding = await client.embeddings.create({
    model: "text-embedding-ada-002",
    input: text
  });

  return embedding;
}

// Answer the question with GPT-4o, grounding the model in the retrieved context.
export async function generateResponse(question: string, context: string[]) {
  const response = await client.chat.completions.create({
    model: "gpt-4o",
    messages: [{
      role: "user",
      content: `You are an expert in Formula 1 racing.
      You need to answer this question using the context provided.
      Do not mention that you have been provided with the context.
      CONTEXT: ${context.join("\n")}
      QUESTION: ${question}.
      `
    }]
  });

  return response.choices[0].message.content;
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Tom Shaw

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/src/ingest.ts:
--------------------------------------------------------------------------------
import { createCollection, uploadData } from "./lib/db";
import { generateEmbedding } from "./lib/openai";
import { scrape } from "./lib/scrape";

// The pages to scrape, embed, and store.
const urls = [
  "https://en.wikipedia.org/wiki/Formula_One",
  "https://en.wikipedia.org/wiki/George_Russell_(racing_driver)",
];

async function ingest() {

  let chunks: { text: string, $vector: number[], url: string }[] = [];

  // Scrape each URL, embed every chunk, and collect the results.
  await Promise.all(urls.map(async (url) => {
    const data = await scrape(url);

    const embeddings = await Promise.all(data.map(async (doc) => {
      return await generateEmbedding(doc.pageContent);
    }));

    chunks = chunks.concat(data.map((doc, index) => {
      return {
        text: doc.pageContent,
        $vector: embeddings[index].data[0].embedding,
        url: url
      };
    }));
  }));

  // Create the vector collection, then insert the embedded chunks.
  await createCollection();

  await uploadData(chunks.map((doc) => {
    return {
      $vector: doc.$vector,
      text: doc.text,
      source: doc.url
    };
  }));
}

ingest();
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# F1-AI: Retrieval-Augmented Generation (RAG) Application

## Overview

F1-AI is a Retrieval-Augmented Generation (RAG) application that leverages OpenAI's GPT-4o model and a vector database to provide context-aware answers to questions about Formula 1 racing. This project demonstrates how to build a RAG application using TypeScript, OpenAI, DataStax Astra DB, and Playwright.

## Prerequisites

- [Node.js](https://nodejs.org/en/download/)
- [OpenAI API Key](https://beta.openai.com/signup/)
- [DataStax Astra DB](https://astra.datastax.com/register)

## Installation

1. Clone the repository:

```bash
git clone https://github.com/IAmTomShaw/f1-rag-ai.git
```

2. Install the dependencies:

```bash
cd f1-rag-ai
npm install
```

## Configuration

You'll need to paste your OpenAI API key and DataStax Astra DB credentials into `src/lib/openai.ts` and `src/lib/db.ts`, or create a `.env` file in the root directory with the following environment variables:

```bash
OPENAI_API_KEY=your-openai-api-key
ASTRA_DB_APPLICATION_TOKEN=your-astra-db-application-token
ASTRA_DB_API_ENDPOINT=your-astra-db-api-endpoint
```

If you use a `.env` file, make sure these environment variables are loaded and referenced in your code, as in the sketch below.
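For example, using the `dotenv` package (an assumption: it isn't listed in `package.json`, so you'd need to add it with `npm install dotenv`), the clients in `src/lib/openai.ts` and `src/lib/db.ts` could be constructed roughly like this:

```typescript
// Sketch only: load credentials from .env instead of hard-coding them.
import "dotenv/config";
import OpenAI from "openai";
import { DataAPIClient } from "@datastax/astra-db-ts";

// OpenAI client for src/lib/openai.ts
const openaiClient = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

// Astra DB client and collection for src/lib/db.ts
const astraClient = new DataAPIClient(process.env.ASTRA_DB_APPLICATION_TOKEN ?? "");
const db = astraClient.db(process.env.ASTRA_DB_API_ENDPOINT ?? "");
const collection = db.collection("f1gpt");
```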
## Usage

You can modify the list of URLs to scrape in the `src/ingest.ts` file. You can then run the following command to scrape the data:

```bash
npm run ingest
```

This will scrape the data from the URLs, embed it, and store it in Astra DB.

You can then run the following command to test the RAG application using the query defined in the `src/answer.ts` file:

```bash
npm run answer
```

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## Credit

This project was created by [Tom Shaw](https://tomshaw.dev).
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
--------------------------------------------------------------------------------