60 | );
61 | };
62 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Karpathy-GPT
2 |
3 | ## Context
4 |
5 | This app is a template for using LangChain to build an LLM Q+A assistant from any set of YouTube videos.
6 |
7 | We use Karpathy's [course on LLMs](https://www.youtube.com/@AndrejKarpathy/videos) as an example.
8 |
9 | 
10 |
11 | We use LangChain to:
12 |
13 | (1) convert YouTube URLs to text
14 |
15 | (2) feed the text into LangChain [auto-evaluator](https://autoevaluator.langchain.com/) to test different chain parameters
16 |
17 | (3) with our chosen parameters, build a vectorstore retriever back-end with FastAPI (deployed to Railway)
18 |
19 | (4) stream the generated results (answer and retrieved docs) to a front-end (deployed to Vercel)
20 |
21 | ---
22 |
23 | ## Step 1: URLs to text
24 |
25 | See [the notebook](https://github.com/rlancemartin/karpathy-gpt/blob/main/index/youtube_urls_to_vectordb.ipynb) in the `/index` folder:
26 |
27 | * Uses LangChain's `OpenAIWhisperParser` to convert URLs to text in < 10 lines of code (a minimal sketch is shown below)
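
As a rough sketch (import paths follow the LangChain version used here and may differ in newer releases; the example URL is illustrative, and the loader relies on `yt_dlp`/`pydub` plus an `OPENAI_API_KEY` for transcription):

```
# Download audio for each YouTube URL and transcribe it with OpenAI Whisper
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

urls = ["https://youtu.be/kCc8FmEb1nY"]  # e.g., "Let's build GPT"
save_dir = "~/downloads/youtube"         # local folder for the downloaded audio

# GenericLoader pairs the YouTube audio blob loader with the Whisper parser
loader = GenericLoader(YoutubeAudioLoader(urls, save_dir), OpenAIWhisperParser())
docs = loader.load()  # one Document per transcribed audio chunk
```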
28 |
29 | ## Step 2: Testing
30 |
31 | See [the text files](https://github.com/rlancemartin/karpathy-gpt/tree/main/eval) in the `/eval` folder:
32 |
33 | * Feed the text from step 1 and, optionally, an eval set to the [auto-evaluator app](https://autoevaluator.langchain.com/playground)
34 | * We can use this to test different parameters (see the full README in the repo [here](https://github.com/langchain-ai/auto-evaluator))
35 | * Use the UI to run experiments
36 | * Select your best retriever and chain settings (e.g., k, split size, split overlap), LLM, and embeddings
37 |
38 | 
39 |
40 | ## Step 3: Text to VectorDB
41 |
42 | See [the notebook](https://github.com/rlancemartin/karpathy-gpt/blob/main/index/youtube_urls_to_vectordb.ipynb) in the `/index` folder:
43 |
44 | * Split the text from step 1 using the parameters you found in step 2
45 | * Upsert the vectors to a VectorDB (in this example, `Pinecone`) with metadata (see the sketch below)
46 | * See this [PR / notebook](https://github.com/rlancemartin/langchain/blob/e1fa1a41d0b2d7f476627a6798e98f02ebe4a83d/docs/modules/indexes/document_loaders/examples/youtube_audio.ipynb) if you want to run this locally with a different VectorDB
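
A rough sketch of this indexing step (the splitter settings are illustrative placeholders for whatever you chose in step 2; the Pinecone region and index name match `karpathy_app.py`):

```
# Split the transcripts and upsert the chunks into an existing Pinecone index
import os
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = splitter.split_documents(docs)  # `docs` from step 1; metadata is carried along

pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="us-east1-gcp")
vectordb = Pinecone.from_documents(splits, OpenAIEmbeddings(), index_name="karpathy-gpt")
```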
47 |
48 | ## Step 4: Back-end
49 |
50 | See the `karpathy_app.py` file in the `/api` folder:
51 |
52 | * We use LangChain's `load_qa_chain` with a user-specified LLM and prompt (see `default_prompt_template`)
53 | * Given a question, this streams the answer text back to the front-end and also returns the retrieved documents
54 | * We deploy this FastAPI app to Railway
55 | * See README.md in `/api` for local testing instructions
56 |
57 | ## Step 5: Front-end
58 |
59 | See the `/nextjs` directory for the Next.js app:
60 |
61 | * This calls the back-end with the query and fetches the retrieved documents and the answer
62 | * Test the app locally by launching the back-end:
63 | ```
64 | uvicorn karpathy_app:app
65 | ```
66 | * To run the front-end against your locally running back-end, change the URLs passed to `fetchEventSource` [here](https://github.com/rlancemartin/karpathy-gpt/blob/a338ceb8666c02b0ec7e7f47ca0a196d774d1e4d/nextjs/pages/index.tsx#L37) and [here](https://github.com/rlancemartin/karpathy-gpt/blob/a338ceb8666c02b0ec7e7f47ca0a196d774d1e4d/nextjs/pages/index.tsx#L55) to `http://localhost:8000/karpathy-docs` and `http://localhost:8000/karpathy-stream`, respectively
67 | * Then run the front-end locally:
68 | ```
69 | npm run dev
70 | ```
71 |
--------------------------------------------------------------------------------
/eval/test-set.csv:
--------------------------------------------------------------------------------
1 | "question","answer",
2 | "Why do we need to zero out the gradient before backprop at each step?","When we call backward, we fill in the gradients. This will update self.grad, so the gradients will accumulate unless we explicity flush it by setting to zero.",
3 | "What does the gradient tell us and how can we use it to minimize the loss?","The gradient tells us the direction to nudge each parameter in order to increace loss. Each update (to minimize loss) take the negative gradient multplied by a step size.",
4 | "What is the mean squared error loss?","The mean squared error loss is the average of the squared differences between actual values and predicted values",
5 | "What is Makemore?","Makemore is a simple charecter-level langugae model that will predict next char in a sequence given some prior charecters before it.",
6 | "What is log likelihood loss and why do we use the negative log likelihood?","We take the sum of the log probability of the label (correct charecter) for each charecter in the string. If the label has a high probability, such as 1, will have a low loss because the log(1) is 0. If the label has a low probability, such as 0, will have a very low loss because the log(0) is -inf. But, if the label has a low probability, we want it to have a high loss, so we take the negative log.",
7 | "How does cross entropy relate to negative log likelihood?","We use a softmax layer to compute probabilities from raw logits and then compute negative log likelihood loss from raw probabilities. Cross entropy just rolls these steps into 1.",
8 | "What is the problem with extreme values for logits?","The exponentiation of very large positive logits can exceed the dynamic range of floating-point numbers, causing overflow issues.",
9 | "Why do we use batch normalization?","We want roughly gaussian activaitions to avoid vanishing gradients and use a normalization layer to automate this.",
10 | "What context window does a transformer have when predicting the output?","Transformer will never see more than `block_size` when predicting the output.",
11 | "What is the problem with the Bigram model's context window?","The Bigram model is only looking at the last char to predict the next char.",
12 | "How can self-attention improve on the limited context window of the Bigram model?","Self-attention lets all prior tokens to 'talk' to each other when predicting the next token.",
13 | "How are keys and queries generated, and how do they interact?","We embed each token to a Key and Query. Each query does a dot-product with the key at each prior location. If a Key and Query are aligned, they will produce a high value.",
14 | "For any token, what are x, k, v, and q?","x is private information to the token. q is what the token is interested in. k is what the token has. v is what the token will communicate to you if you find it interesting.",
15 | "What is the difference between an encoder and decoder?","The decoder is typically just self-attention (communication) and feed-forward (compute). We condition on the past. It uses triangular mask on future tokens. It has an auto-regressive property where we can sample from it. The encoder can condition on the past or on a seperate source via cross-attention.",
16 | "What are two innovations that improve optimization for deep neural nets?","First, residual connections create a gradient superhighway that goes directly from the supervision all the way to the input, unimpeded. At initialization, residual blocks effectivly allow the gradient to flow unimpeded and, over time, they come online and start to contribute. Second, layer norm will normalize the rows (or examples) in each batch independently."
17 |
--------------------------------------------------------------------------------
/api/karpathy_app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pinecone
3 | import logging.config  # needed for logging.config.fileConfig below; also binds `logging`
4 | import asyncio
5 | from fastapi import FastAPI, Form
6 | from langchain.prompts import PromptTemplate
7 | from langchain.chat_models import ChatOpenAI
8 | from langchain.vectorstores import Pinecone
9 | from sse_starlette.sse import EventSourceResponse
10 | from fastapi.middleware.cors import CORSMiddleware
11 | from langchain.embeddings.openai import OpenAIEmbeddings
12 | from langchain.chains.question_answering import load_qa_chain
13 | from langchain.callbacks import AsyncIteratorCallbackHandler
14 |
15 | # Prompt template for QA
16 | default_prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
17 |
18 | {context}
19 |
20 | Question: {question}
21 | Helpful Answer:"""
22 |
23 | def make_llm(model_version):
24 | """
25 | Make LLM
26 | @param model_version: model_version
27 | @return: llm, callback handler
28 | """
29 |
30 | if model_version not in ("gpt-3.5-turbo", "gpt-4"):
31 |     raise ValueError(f"Unsupported model version: {model_version}")
32 | callback = AsyncIteratorCallbackHandler()
33 | return ChatOpenAI(model_name=model_version, streaming=True, callbacks=[callback], temperature=0), callback
34 |
35 | def make_chain(llm):
36 | """
37 | Make QA chain using specified default_prompt_template
38 | @param llm: llm for answering
39 | @return: qa_chain
40 | """
41 | QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=default_prompt_template)
42 | qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=QA_CHAIN_PROMPT)
43 | return qa_chain
44 |
45 | def make_retriever(logger):
46 | """
47 | Make document retriever
48 | @return: Pinecone
49 | """
50 | logger.info("`Loading retriever ...`")
51 |
52 | # Set embeddings (must match your Pinecone DB)
53 | embedding = OpenAIEmbeddings()
54 | pc_api_key = os.environ.get('PINECONE_API_KEY')
55 | pc_region = "us-east1-gcp"
56 | pc_index = "karpathy-gpt"
57 |
58 | # Set Pinecone
59 | pinecone.init(api_key=str(pc_api_key), environment=str(pc_region))
60 | p = Pinecone.from_existing_index(index_name=str(pc_index), embedding=embedding)
61 | return p
62 |
63 | import json
64 | async def generate_docs(question):
65 | """
66 | @param question: question
67 | @return: docs
68 | """
69 |
70 | # Set up logging
71 | logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
72 | logger = logging.getLogger(__name__)
73 |
74 | # Retriever (no LLM or chain is needed here; this generator only streams the retrieved docs)
75 | retriever = make_retriever(logger)
83 |
84 | # Stream
85 | logger.info("`Getting docs ...`")
86 | docs = retriever.similarity_search(query=question,k=3)
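# Emit each retrieved doc as its own SSE message with a JSON payload the front-end parses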
87 | for doc in docs:
88 | yield json.dumps({"data":{"pageContent": doc.page_content, "metadata": doc.metadata}})
89 |
90 | async def generate_response(question):
91 | """
92 | @param question: question
93 | @return: answer stream
94 | """
95 |
96 | # Set up logging
97 | logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
98 | logger = logging.getLogger(__name__)
99 |
100 | # Model for answering
101 | model = "gpt-3.5-turbo"
102 | llm, callback=make_llm(model)
103 |
104 | # Chain
105 | chain=make_chain(llm)
106 |
107 | # Retriever
108 | retriever=make_retriever(logger)
109 |
110 | # Stream
111 | logger.info("`Generating answer ...`")
112 | docs = retriever.similarity_search(query=question,k=3)
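# Run the QA chain as a background task; AsyncIteratorCallbackHandler receives each token
# as the LLM streams it, and the loop below yields every token as an SSE event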
113 | task = asyncio.create_task(
114 | chain.acall({
115 | "input_documents": docs,
116 | "question": question
117 | }),
118 | )
119 | async for token in callback.aiter():
120 | yield token
121 | await task
122 |
123 | # App
124 | app = FastAPI()
125 |
126 | origins = [
127 | "http://localhost:3000",
128 | "localhost:3000",
129 | ]
130 |
131 | app.add_middleware(
132 | CORSMiddleware,
133 | allow_origins=["*"],  # permissive for the demo; swap in `origins` above to restrict to the front-end
134 | allow_credentials=True,
135 | allow_methods=["*"],
136 | allow_headers=["*"],
137 | )
138 |
139 | @app.get("/")
140 | async def root():
141 | return {"message": "Welcome to Karpathy GPT!"}
142 |
143 | # Docs
144 | @app.post("/karpathy-docs")
145 | async def create_docs_response(
146 | query: str = Form("What is the difference between an encoder and decoder?"),
147 | ):
148 | return EventSourceResponse(generate_docs(query), headers={"Content-Type": "text/event-stream", "Connection": "keep-alive", "Cache-Control": "no-cache"})
149 |
150 | # Answer stream
151 | @app.post("/karpathy-stream")
152 | async def create_response(
153 | query: str = Form("What is the difference between an encoder and decoder?"),
154 | ):
155 | # Return SSE
156 | return EventSourceResponse(generate_response(query), headers={"Content-Type": "text/event-stream", "Connection": "keep-alive", "Cache-Control": "no-cache"})
157 |
--------------------------------------------------------------------------------
/api/README.md:
--------------------------------------------------------------------------------
1 | # `karpathy-gpt-api`
2 |
3 | This is the back-end for Karpathy-GPT.
4 |
5 | ### Test locally
6 |
7 | Set API keys (the retriever in `karpathy_app.py` also reads `PINECONE_API_KEY`):
8 | ```
9 | export OPENAI_API_KEY=xxx
10 | export PINECONE_API_KEY=xxx
11 | ```
11 |
12 | Start local server:
13 | ```
14 | uvicorn karpathy_app:app
15 | ```
16 |
17 | Inputs:
18 | ```
19 | query
20 | ```
21 |
22 | Test doc retrieval:
23 | ```
24 | curl -X POST -F "query=What is makemore" http://localhost:8000/karpathy-docs
25 | ```
26 |
27 | ```
28 | data: page_content="Hi everyone, hope you're well. And next up what I'd like to do is I'd like to build out Makemore. Like Micrograd before it, Makemore is a repository that I have on my GitHub web page. You can look at it. But just like with Micrograd, I'm going to build it out step by step and I'm going to spell everything out. So we're going to build it out slowly and together. Now, what is Makemore? Makemore, as the name suggests, makes more of things that you give it. So here's an example. Names.txt is an example dataset to Makemore. And when you look at Names.txt, you'll find that it's a very large dataset of names. So here's lots of different types of names. In fact, I believe there are 32,000 names that I've sort of found randomly on a government website. And if you train Makemore on this dataset, it will learn to make more of things like this. And in particular, in this case, that will mean more things that sound name-like, but are actually unique names. And maybe if you have a baby and you're trying to assign a name, maybe you're looking for a cool new sounding unique name, Makemore might help you. So here are some example generations from the neural network once we train it on our dataset. So here's some example unique names that it will generate. Don't tell, I rot, Zendi, and so on. And so all these sort of sound name-like, but they're not, of course, names. So under the hood, Makemore is a character-level language model. So what that means is that it is treating every single line" metadata={'id': '02', 'link': 'https://youtu.be/PaCmpygFfXo', 'source': 'The spelled-out intro to language modeling: building makemore 02', 'title': 'The spelled-out intro to language modeling: building makemore'}
29 |
30 | data: page_content="not, of course, names. So under the hood, Makemore is a character-level language model. So what that means is that it is treating every single line here as an example. And within each example, it's treating them all as sequences of individual characters. So R-E-E-S-E is this example, and that's the sequence of characters. And that's the level on which we are building out Makemore. And what it means to be a character-level language model, then, is that it's just sort of modeling those sequences of characters, and it knows how to predict the next character in the sequence. Now, we're actually going to implement a large number of character-level language models in terms of the neural networks that are involved in predicting the next character in a sequence. So very simple bigram and bag-of-word models, multilayered perceptrons, recurrent neural networks, all the way to modern transformers. In fact, the transformer that we will build will be basically the equivalent transformer to GPT-2, if you have heard of GPT. So that's kind of a big deal. It's a modern network, and by the end of the series, you will actually understand how that works on the level of characters. Now, to give you a sense of the extensions here, after characters, we will probably spend some time on the word level, so that we can generate documents of words, not just little segments of characters, but we can generate entire large, much larger documents. And then we're probably going to go into images and" metadata={'id': '02', 'link': 'https://youtu.be/PaCmpygFfXo', 'source': 'The spelled-out intro to language modeling: building makemore 02', 'title': 'The spelled-out intro to language modeling: building makemore'}
31 |
32 | data: page_content="Hi everyone. Today we are continuing our implementation of MakeMore, our favorite character-level language model. Now, you'll notice that the background behind me is different. That's because I am in Kyoto and it is awesome. So I'm in a hotel room here. Now, over the last few lectures, we've built up to this architecture that is a multi-layer perceptron character-level language model. So we see that it receives three previous characters and tries to predict the fourth character in a sequence using a very simple multi-layer perceptron using one hidden layer of neurons with tenational neurons. So what I'd like to do now in this lecture is I'd like to complexify this architecture. In particular, we would like to take more characters in a sequence as an input, not just three. And in addition to that, we don't just want to feed them all into a single hidden layer because that squashes too much information too quickly. Instead, we would like to make a deeper model that progressively fuses this information to make its guess about the next character in a sequence. And so we'll see that as we make this architecture more complex, we're actually going to arrive at something that looks very much like a WaveNet. So WaveNet is this paper published by Dequined in 2016. And it is also a language model, basically, but it tries to predict audio sequences instead of character-level sequences or word-level sequences. But fundamentally, the modeling setup is identical. It is an autoregressive" metadata={'id': '06', 'link': 'htt
33 | ```
34 |
35 | Test answer stream:
36 |
37 | ```
38 | curl -X POST -F "query=What is makemore" http://localhost:8000/karpathy-stream
39 | ```
40 |
41 | ```
42 | data: M
43 |
44 | data: ak
45 |
46 | data: em
47 |
48 | data: ore
49 |
50 | data: is
51 |
52 | data: a
53 |
54 | data: character
55 |
56 | data: -level
57 |
58 | data: language
59 |
60 | data: model
61 | ```
62 |
63 | ### Test the deployed API
64 |
65 | We deploy the API to [Railway](https://railway.app/).
66 |
67 | Test:
68 | ```
69 | curl -X POST -F "query=What is makemore" https://karpathy-gpt-production.up.railway.app/karpathy-stream
70 | ```
71 |
72 | Returns streaming events, as shown above.
--------------------------------------------------------------------------------
/nextjs/pages/index.tsx:
--------------------------------------------------------------------------------
1 | import { Answer } from "@/components/Answer/Answer";
2 | import { Footer } from "@/components/Footer";
3 | import { Navbar } from "@/components/Navbar";
4 | import { LEXChunk } from "@/types";
5 | import { IconArrowRight, IconExternalLink, IconSearch } from "@tabler/icons-react";
6 | import Head from "next/head";
7 | import Image from "next/image";
8 | import { KeyboardEvent, useEffect, useRef, useState } from "react";
9 | import { fetchEventSource } from '@microsoft/fetch-event-source';
10 |
11 | export default function Home() {
12 |
13 | const inputRef = useRef(null);
14 | const [query, setQuery] = useState("");
15 | const [chunks, setChunks] = useState<LEXChunk[]>([]);
16 | const [answer, setAnswer] = useState("");
17 | const [loading, setLoading] = useState(false);
18 | const [showSettings, setShowSettings] = useState(false);
19 |
20 | // Handle answer
21 | const handleAnswer = async () => {
22 |
23 | if (!query) {
24 | alert("Please enter a query.");
25 | return;
26 | }
27 |
28 | setAnswer("");
29 | setChunks([]);
30 | setLoading(true);
31 |
32 | const formData = new FormData();
33 | formData.append("query",query);
34 | console.log(formData)
35 | console.log(query)
36 |
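// Fetch the retrieved source documents from the back-end as server-sent events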
37 | fetchEventSource("https://karpathy-gpt-production.up.railway.app/karpathy-docs", {
38 | method: "POST",
39 | headers: {
40 | Accept: "text/event-stream",
41 | Connection: "keep-alive",
42 | },
43 | body: formData,
44 | onmessage: (event) => {
45 | setLoading(false);
46 | if (event.data === "DONE") {
47 | } else {
48 | const newChunk: LEXChunk = JSON.parse(event.data)?.data;
49 | setChunks((oldChunks) => [...oldChunks, newChunk]);
50 | }
51 | }});
52 |
53 | const ctrl = new AbortController();
54 |
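// Stream the generated answer token-by-token from the back-end as server-sent events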
55 | fetchEventSource("https://karpathy-gpt-production.up.railway.app/karpathy-stream", {
56 | method: "POST",
57 | headers: {
58 | Accept: "text/event-stream",
59 | Connection: "keep-alive",
60 | },
61 | body: formData,
62 | onmessage: (event) => {
63 | setLoading(false);
64 | if (event.data === "DONE") {
65 | } else {
66 | setAnswer((prev) => prev + event.data);
67 | }
68 | }});
69 |
70 | };
71 |
72 | const handleKeyDown = (e: KeyboardEvent) => {
73 | if (e.key === "Enter") {
74 | handleAnswer();
75 | }
76 | };
77 |
78 | // Render page
79 | return (
80 | <>
81 |
82 | Karpathy GPT
83 |
87 |
91 |
95 |
96 |
97 |