├── .gitignore ├── LICENSE ├── README.md ├── api ├── .env ├── README.md ├── Test_Inference.ipynb ├── __pycache__ │ ├── evaluator_app.cpython-38.pyc │ └── text_utils.cpython-38.pyc ├── docs │ ├── gpt3 │ │ ├── 2005.14165.pdf │ │ └── gpt3-eval.csv │ ├── karpathy-lex-pod │ │ ├── karpathy-pod-eval.csv │ │ └── karpathy-pod.txt │ └── transformers-challenge │ │ ├── 2005.14165.pdf │ │ ├── 2112.04426.pdf │ │ ├── 2203.15556.pdf │ │ ├── 2205.06175.pdf │ │ ├── 2302.13971.pdf │ │ └── transformers-eval.csv ├── evaluator_app.py ├── logging.conf ├── railway.json ├── requirements.txt └── text_utils.py ├── nextjs ├── .env.local ├── .gitignore ├── components │ ├── Demo.tsx │ ├── ExperimentSummaryTable.tsx │ ├── HeaderEvaluator.tsx │ ├── PersonCard.tsx │ ├── Playground.tsx │ ├── Sidebar.tsx │ ├── SummaryChart.tsx │ ├── TestFileUploadZone.tsx │ └── tables │ │ ├── ExperimentResultTable.tsx │ │ └── FilesTable.tsx ├── next-env.d.ts ├── next.config.js ├── package.json ├── pages │ ├── _app.tsx │ ├── about │ │ └── index.tsx │ ├── index.tsx │ └── playground │ │ └── index.tsx ├── public │ ├── favicon │ │ ├── about.txt │ │ ├── android-chrome-192x192.png │ │ ├── android-chrome-512x512.png │ │ ├── apple-touch-icon.png │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── favicon.ico │ │ └── site.webmanifest │ ├── github-mark.svg │ ├── slack-mark.svg │ ├── testData │ │ ├── experiments.json │ │ ├── karpathy-pod.json │ │ ├── results.json │ │ └── testDataset.json │ └── twitter-black.svg ├── styles │ ├── global.css │ └── utils.module.css ├── tsconfig.json ├── utils │ ├── renderPassFail.ts │ ├── types.ts │ └── variables.ts └── yarn.lock └── streamlit ├── README.md ├── auto-evaluator.py ├── eval_sets └── lex-pod-eval.json ├── img └── diagnostic.jpg ├── kor_retriever_lex.py ├── prompts.py ├── requirements.txt └── self_query_retriever_lex.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vercel 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Elastic License 2.0 (ELv2) 2 | 3 | **Acceptance** 4 | By using the software, you agree to all of the terms and conditions below. 5 | 6 | **Copyright License** 7 | The licensor grants you a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable license to use, copy, distribute, make available, and prepare derivative works of the software, in each case subject to the limitations and conditions below 8 | 9 | **Limitations** 10 | You may not provide the software to third parties as a hosted or managed service, where the service provides users with access to any substantial set of the features or functionality of the software. 11 | 12 | You may not move, change, disable, or circumvent the license key functionality in the software, and you may not remove or obscure any functionality in the software that is protected by the license key. 13 | 14 | You may not alter, remove, or obscure any licensing, copyright, or other notices of the licensor in the software. Any use of the licensor’s trademarks is subject to applicable law. 15 | 16 | **Patents** 17 | The licensor grants you a license, under any patent claims the licensor can license, or becomes able to license, to make, have made, use, sell, offer for sale, import and have imported the software, in each case subject to the limitations and conditions in this license. 
This license does not cover any patent claims that you cause to be infringed by modifications or additions to the software. If you or your company make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company. 18 | 19 | **Notices** 20 | You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms. 21 | 22 | If you modify the software, you must include in any modified copies of the software prominent notices stating that you have modified the software. 23 | 24 | **No Other Rights** 25 | These terms do not imply any licenses other than those expressly granted in these terms. 26 | 27 | **Termination** 28 | If you use the software in violation of these terms, such use is not licensed, and your licenses will automatically terminate. If the licensor provides you with a notice of your violation, and you cease all violation of this license no later than 30 days after you receive that notice, your licenses will be reinstated retroactively. However, if you violate these terms after such reinstatement, any additional violation of these terms will cause your licenses to terminate automatically and permanently. 29 | 30 | **No Liability** 31 | As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim. 32 | 33 | **Definitions** 34 | The *licensor* is the entity offering these terms, and the *software* is the software the licensor makes available under these terms, including any portion of it. 35 | 36 | *you* refers to the individual or entity agreeing to these terms. 37 | 38 | *your company* is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. *control* means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect. 39 | 40 | *your licenses* are all the licenses granted to you for the software under these terms. 41 | 42 | *use* means anything you do with the software requiring one of your licenses. 43 | 44 | *trademark* means trademarks, service marks, and similar rights. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `Auto-evaluator` :brain: :memo: 2 | 3 | `Context` 4 | 5 | Document [Question-Answering](https://python.langchain.com/en/latest/use_cases/question_answering.html) is a popular LLM use-case. LangChain makes it easy to assemble LLM components (e.g., models and retrievers) into chains that support question-answering: input documents are split into chunks and stored in a retriever, relevant chunks are retrieved given a user `question` and passed to an LLM for synthesis into an `answer`. 
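For orientation, below is a minimal sketch of that flow using the same LangChain components this repo's `api/evaluator_app.py` uses (`RecursiveCharacterTextSplitter`, OpenAI embeddings, a FAISS vector store, and a `RetrievalQA` chain). The document path, chunk settings, and question are illustrative placeholders; it assumes you run from the `api` folder with `OPENAI_API_KEY` set:

```
# Minimal sketch of the split -> store -> retrieve -> synthesize flow described above.
# Mirrors the components used in api/evaluator_app.py; path, settings, and question are placeholders.
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter

text = open("docs/karpathy-lex-pod/karpathy-pod.txt").read()

# 1. Split the input document into chunks
splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_text(text)

# 2. Embed the chunks and store them in a retriever (FAISS vector store)
retriever = FAISS.from_texts(splits, OpenAIEmbeddings()).as_retriever(k=3)

# 3. Retrieve chunks relevant to a question and synthesize an answer with an LLM
qa_chain = RetrievalQA.from_chain_type(
    ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=retriever,
    input_key="question",
)
print(qa_chain({"question": "What does Karpathy say about the Transformer?"})["result"])
```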
6 | 7 | `Challenge` 8 | 9 | The quality of QA systems can vary considerably; for example, [we have seen](https://lancemartin.notion.site/lancemartin/Lex-GPT-a3ad671766d34f4a9a078da7adf9d382) cases of hallucination and poor answer quality due to specific parameter settings. But it is not always obvious how to (1) evaluate the answer quality in a systematic way and (2) use this evaluation to guide improved QA chain settings (e.g., chunk size) or components (e.g., model or retriever choice). 10 | 11 | 12 | `App overview` 13 | 14 | This app aims to address the above limitations. Recent [work](https://arxiv.org/abs/2212.09251) from Anthropic has used model-written evaluation sets. OpenAI and others [have shown](https://github.com/openai/evals/blob/main/evals/registry/modelgraded/closedqa.yaml) that model-graded evaluation is an effective way to evaluate models. This app combines both of these ideas into a single workspace, auto-generating a QA test set and auto-grading the result of the specified QA chain. 15 | 16 | ![image](https://user-images.githubusercontent.com/122662504/235393525-be89ef39-2f72-4b61-b8ee-add6a14796b9.png) 17 | 18 | `Usage` 19 | 20 | The app can be used in two ways: 21 | 22 | - `Demo`: We pre-loaded a document (a [transcript](https://youtu.be/OYsYgzzsdT0) of the Lex Fridman podcast with Andrej Karpathy) and a set of 5 [question-answer pairs](https://github.com/langchain-ai/auto-evaluator/blob/main/api/docs/karpathy-lex-pod/karpathy-pod-eval.csv) from the podcast. You can configure QA chain(s) and run an experiment. 23 | 24 | ![image](https://user-images.githubusercontent.com/122662504/234627824-2304f741-9f7b-4252-bdb4-ef2bdfd8139a.png) 25 | 26 | - `Playground`: Input a set of documents that you want to ask questions about. Optionally, also include your own test set of question-answer pairs related to the documents; see an example [here](https://github.com/langchain-ai/auto-evaluator/tree/main/api/docs/karpathy-lex-pod). If you do not supply a test set, the app will auto-generate one. If the test set is smaller than the desired number of eval questions specified in the top left, the app will auto-generate the remainder. 27 | 28 | ![image](https://user-images.githubusercontent.com/122662504/234629201-4c17b411-f910-476b-9bf6-1246c7c5a307.png) 29 | 30 | `Building the document retriever`: 31 | 32 | - The app will build a [retriever](https://blog.langchain.dev/retrieval/) for the input documents. 33 | - A retriever is a LangChain abstraction that accepts a question and returns a set of relevant documents. 34 | - The retriever can be selected by the user in the drop-down list in the configurations (red panel above). 35 | 36 | `Test set generation`: 37 | 38 | - The app will auto-generate a test set of question-answer pairs from the doc(s). 39 | - To do this, it uses the LangChain `QAGenerationChain` with the default prompt [here](https://github.com/hwchase17/langchain/blob/master/langchain/chains/qa_generation/prompt.py). 40 | 41 | `LLM question-answering`: 42 | 43 | - For each question, we use a `RetrievalQA` chain to answer it. 44 | - This will fetch chunks that are relevant to the question from the `retriever` and pass them to the LLM. 45 | - We expose the `QA_CHAIN_PROMPT` used to pass this context to the LLM [here](https://github.com/langchain-ai/auto-evaluator/blob/main/api/text_utils.py). 
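A minimal sketch of these two steps (mirroring `generate_eval()` and `make_chain()` in `api/evaluator_app.py`) is shown below; it assumes the `text` and `retriever` variables from the earlier sketch, and the window size and model choice are illustrative. The generated pair and the chain's prediction are what the model-graded evaluation described next compares:

```
# Minimal sketch of test-set generation + question-answering, mirroring
# generate_eval() and make_chain() in api/evaluator_app.py. Assumes `text` and
# `retriever` from the earlier sketch; window size and model are illustrative.
import random
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain, RetrievalQA

# Auto-generate one question-answer pair from a random 3000-character window of the doc
start = random.randint(0, len(text) - 3000)
qa_gen = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
eval_pair = qa_gen.run(text[start:start + 3000])[0]  # {"question": ..., "answer": ...}

# Answer the generated question with a RetrievalQA chain over the retriever
qa_chain = RetrievalQA.from_chain_type(
    ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=retriever,
    input_key="question",
)
prediction = qa_chain(eval_pair)  # adds a "result" key holding the chain's answer
```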
46 | 47 | `Model-graded evaluation`: 48 | 49 | - We let the user select from a number of model-graded evaluation prompts: 50 | 51 | (1) The app will evaluate the `relevance of the retrieved documents` relative to the question. 52 | 53 | (2) The app will evaluate the `similarity of the LLM generated answer` relative to the ground truth answer. 54 | 55 | - The prompts for both can be seen [here](https://github.com/dankolesnikov/evaluator-app/blob/main/api/text_utils.py). 56 | - Users can select which grading prompt to use. [Here](https://rlancemartin.notion.site/Auto-Evaluator-Opportunities-7b3459dc2ae34440ae3481fe6f43ba40) are some notes on prompt selection from our experience. 57 | 58 | `Experimental results`: 59 | 60 | - The app will produce a table summarizing the results. 61 | - It shows the question and the ground truth (expected) answer. 62 | - It shows the chain-generated answer. 63 | - It shows the binary score (PASS / FAIL) for retrieval and the answer. 64 | - It shows the latency for retrieval and LLM answer summarization per question. 65 | - It shows the model grader output (the raw output of the grading prompt). 66 | 67 | ![image](https://user-images.githubusercontent.com/122662504/235396320-e392f912-977c-4871-b1d2-bd7a8be128a1.png) 68 | 69 | ## User inputs 70 | 71 | The left panel of the app (shown in red in the above image) has several user-configurable parameters. 72 | 73 | `Number of eval questions` - This is the number of question-answer pairs to auto-generate for the given input documents. As mentioned above, question-answer pair auto-generation will use LangChain's `QAGenerationChain` with the prompt specified [here](https://github.com/hwchase17/langchain/blob/master/langchain/chains/qa_generation/prompt.py). 74 | 75 | `Chunk size` - Number of characters per chunk when the input documents are split. This [can impact answer quality](https://lancemartin.notion.site/lancemartin/Q-A-assistant-limitations-f576bf55b61c44e0970330ac3883315e). Retrievers often use text embedding similarity to select chunks related to the question. If the chunks are too large, each chunk may contain more information unrelated to the question, which may degrade the summarized answer quality. If chunks are too small, important context may be left out of the retrieved chunks. 76 | 77 | `Overlap` - The overlap in characters between chunks. 78 | 79 | `Embedding` - The method used to embed chunks. 80 | 81 | `Retriever` - The method used to [retrieve chunks](https://blog.langchain.dev/retrieval/) that are relevant to the user question. The default vector database used for similarity search is [FAISS](https://github.com/dankolesnikov/evaluator-app/blob/235105642ff1d0ab15be87be7328df71b403268b/api/evaluator_app.py#L131), but support for others is a welcome addition. You can also try other methods, such as [SVM](https://twitter.com/karpathy/status/1647025230546886658) or [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). 82 | 83 | `Number of chunks to retrieve` - Number of chunks retrieved. More chunks can improve performance by giving the LLM more context for answer summarization. 84 | 85 | `Model` - LLM for summarization of retrieved chunks into the answer. 86 | 87 | `Grading prompt style` - The prompt choice for model-graded evaluation. As mentioned above, the prompts can be seen [here](https://github.com/dankolesnikov/evaluator-app/blob/main/api/text_utils.py). More prompts would be a welcome addition. 
For example, with the `Descriptive` prompt, you will see a more detailed output with model grade justification. 88 | 89 | ## Logging experiments 90 | 91 | A user can select the desired configuration and then choose `Re-Run Experiment`. 92 | 93 | This will run the new chain on the existing test set. 94 | 95 | The results from all experiments will be summarized in the table and chart. 96 | 97 | ![image](https://user-images.githubusercontent.com/122662504/235396398-5fbf2821-d47f-4496-b8c7-201c9b7e66bc.png) 98 | 99 | ## Contributing 100 | 101 | Run the backend from `api` folder: 102 | 103 | ``` 104 | pip install -r requirements.txt 105 | uvicorn evaluator_app:app 106 | ``` 107 | 108 | Test the `api` locally: 109 | 110 | ``` 111 | curl -X POST -F "files=@docs/karpathy-lex-pod/karpathy-pod.txt" -F "num_eval_questions=1" -F "chunk_chars=1000" -F "overlap=100" -F "split_method=RecursiveTextSplitter" -F "retriever_type=similarity-search" -F "embeddings=OpenAI" -F "model_version=gpt-3.5-turbo" -F "grade_prompt=Fast" -F "num_neighbors=3" http://localhost:8000/evaluator-stream 112 | ``` 113 | 114 | Run the frontend from `nextjs` folder and view web app at specified URL (e.g., `http://localhost:3000/`): 115 | 116 | ``` 117 | yarn install 118 | yarn dev 119 | ``` 120 | 121 | ### Environment Variables 122 | 123 | Front-end: 124 | 125 | `.env.local` contains the env variables needed to run the project. 126 | 127 | Back-end: 128 | 129 | Specify the API keys for any models that you want to use. 130 | 131 | ``` 132 | OPENAI_API_KEY= 133 | ANTHROPIC_API_KEY= 134 | ``` 135 | 136 | ## Deployment 137 | 138 | The front-end is deployed to [Vercel](https://vercel.com/). 139 | 140 | The back-end is deployed to [Railway](https://railway.app/). 141 | -------------------------------------------------------------------------------- /api/.env: -------------------------------------------------------------------------------- 1 | ENVIRONMENT=development -------------------------------------------------------------------------------- /api/README.md: -------------------------------------------------------------------------------- 1 | # `auto-evaluator-api` 2 | 3 | This API includes much of the functionality of the [auto-evaluator Streamlit app](https://github.com/PineappleExpress808/auto-evaluator). 4 | 5 | And it is the back-end for [the hosted app](https://autoevaluator.langchain.com/). 6 | 7 | ### `Test locally` - 8 | 9 | Set API keys: 10 | ``` 11 | export OPENAI_API_KEY= 12 | export ANTHROPIC_API_KEY= 13 | ``` 14 | 15 | Start local server: 16 | ``` 17 | uvicorn evaluator_app:app 18 | ``` 19 | 20 | `Disclaimer: You will not be able to use all the models unless you have the corresponding API key (e.g., Anthropic).` 21 | 22 | Test: 23 | ``` 24 | curl -X POST -F "files=@docs/karpathy-lex-pod/karpathy-pod.txt" -F "num_eval_questions=1" -F "chunk_chars=1000" -F "overlap=100" -F "split_method=RecursiveTextSplitter" -F "retriever_type=similarity-search" -F "embeddings=OpenAI" -F "model_version=gpt-3.5-turbo" -F "grade_prompt=Fast" -F "num_neighbors=3" http://localhost:8000/evaluator-stream 25 | ``` 26 | 27 | ### `Test deployed API -` 28 | 29 | This API is deployed to [Railway](https://railway.app/). 
30 | 31 | ``` 32 | curl -X POST -F "files=@docs/karpathy-lex-pod/karpathy-pod.txt" -F "num_eval_questions=1" -F "chunk_chars=1000" -F "overlap=100" -F "split_method=RecursiveTextSplitter" -F "retriever_type=similarity-search" -F "embeddings=OpenAI" -F "model_version=gpt-3.5-turbo" -F "grade_prompt=Fast" -F "num_neighbors=3" https://auto-evaluator-production.up.railway.app/evaluator-stream 33 | 34 | ``` -------------------------------------------------------------------------------- /api/Test_Inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2c926d26", 6 | "metadata": {}, 7 | "source": [ 8 | "### Test Vicuna\n", 9 | "\n", 10 | "* `Background`: https://python.langchain.com/en/latest/modules/models/llms/integrations/llamacpp.html\n", 11 | "* Reproduce the logic that happens in API of the `auto-evaluator`" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "b08e9089", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "!pip install llama-cpp-python" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "9d96ad1f", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import glob, os\n", 32 | "from langchain.llms import LlamaCpp\n", 33 | "from langchain.llms import Replicate\n", 34 | "from langchain.chains import RetrievalQA\n", 35 | "from langchain.vectorstores import FAISS\n", 36 | "from langchain import PromptTemplate, LLMChain\n", 37 | "from langchain.callbacks.base import BaseCallbackManager\n", 38 | "from langchain.embeddings.openai import OpenAIEmbeddings\n", 39 | "from langchain.document_loaders import UnstructuredFileLoader\n", 40 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 41 | "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "47f4e440", 47 | "metadata": {}, 48 | "source": [ 49 | "`Load`" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "8fb243c5", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "def load_docs(files):\n", 60 | "\n", 61 | " # Load docs\n", 62 | " # IN: List of upload files (from Streamlit)\n", 63 | " # OUT: str\n", 64 | " # TODO: Support multple docs, Use Langchain loader\n", 65 | "\n", 66 | " all_text = \"\"\n", 67 | " for file_path in files:\n", 68 | " file_extension = os.path.splitext(file_path)[1]\n", 69 | " if file_extension == \".pdf\":\n", 70 | " pdf_reader = pypdf.PdfReader(file_path)\n", 71 | " text = \"\"\n", 72 | " for page in pdf_reader.pages:\n", 73 | " text += page.extract_text()\n", 74 | " all_text += text\n", 75 | " elif file_extension == \".txt\":\n", 76 | " loader = UnstructuredFileLoader(file_path)\n", 77 | " docs = loader.load()\n", 78 | " all_text += docs[0].page_content\n", 79 | " else:\n", 80 | " print('Please provide txt or pdf.')\n", 81 | "\n", 82 | " return all_text\n", 83 | "\n", 84 | "fis = glob.glob(\"docs/karpathy-lex-pod/*txt\")\n", 85 | "text = load_docs(fis)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "d6e75a9c", 91 | "metadata": {}, 92 | "source": [ 93 | "`Split`" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "a3370cd8", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "def split_texts(text, chunk_size, overlap, split_method):\n", 104 | "\n", 105 | " # Split text\n", 106 | 
" # IN: text, chunk size, overlap\n", 107 | " # OUT: list of str splits\n", 108 | " # TODO: Add parameter for splitter type\n", 109 | "\n", 110 | " print(\"`Splitting doc ...`\")\n", 111 | " if split_method == \"RecursiveTextSplitter\":\n", 112 | " text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,\n", 113 | " chunk_overlap=overlap)\n", 114 | " elif split_method == \"CharacterTextSplitter\":\n", 115 | " text_splitter = CharacterTextSplitter(separator=\" \",\n", 116 | " chunk_size=chunk_size,\n", 117 | " chunk_overlap=overlap)\n", 118 | " splits = text_splitter.split_text(text)\n", 119 | " return splits\n", 120 | "\n", 121 | "split_method = \"RecursiveTextSplitter\" \n", 122 | "overlap = 20\n", 123 | "chunk_size = 500\n", 124 | "splits = split_texts(text, chunk_size, overlap, split_method)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "8b3c35fd", 130 | "metadata": {}, 131 | "source": [ 132 | "`Test model`" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "521ab75c", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "### *** update with your local path *** ###\n", 143 | "LLAMA_CPP_PATH = \"/Users/31treehaus/Desktop/AI/llama.cpp\"" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "6264c05d", 150 | "metadata": { 151 | "scrolled": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "# Pass the raw question into the prompt template.\n", 156 | "template = \"\"\"Question: {question}\n", 157 | "Answer: Let's think step by step.\"\"\"\n", 158 | "prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n", 159 | "\n", 160 | "callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])\n", 161 | "llm = LlamaCpp(\n", 162 | " \n", 163 | " model_path=LLAMA_CPP_PATH+\"models/vicuna_13B/ggml-vicuna-13b-4bit.bin\",\n", 164 | " callback_manager=callback_manager,\n", 165 | " verbose=True,\n", 166 | " n_threads=6,\n", 167 | " n_ctx=2048,\n", 168 | " use_mlock=True)\n", 169 | "\n", 170 | "llm_chain = LLMChain(prompt=prompt,llm=llm)\n", 171 | "question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n", 172 | "llm_chain.run(question)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "id": "68a09e14", 178 | "metadata": {}, 179 | "source": [ 180 | "`Make Retrieval Chain`" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "a5d174af", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "def make_retriever(splits, retriever_type, embeddings, num_neighbors):\n", 191 | "\n", 192 | " # Make document retriever\n", 193 | " # IN: list of str splits, retriever type, embedding type, number of neighbors for retrieval\n", 194 | " # OUT: retriever\n", 195 | "\n", 196 | " print(\"`Making retriever ...`\")\n", 197 | " # Set embeddings\n", 198 | " if embeddings == \"OpenAI\":\n", 199 | " embd = OpenAIEmbeddings()\n", 200 | " elif embeddings == \"HuggingFace\":\n", 201 | " embd = HuggingFaceEmbeddings()\n", 202 | "\n", 203 | " # Select retriever\n", 204 | " if retriever_type == \"similarity-search\":\n", 205 | " try:\n", 206 | " vectorstore = FAISS.from_texts(splits, embd)\n", 207 | " except ValueError:\n", 208 | " print(\"`Error using OpenAI embeddings (disallowed TikToken token in the text). 
Using HuggingFace.`\")\n", 209 | " vectorstore = FAISS.from_texts(splits, HuggingFaceEmbeddings())\n", 210 | " retriever = vectorstore.as_retriever(k=num_neighbors)\n", 211 | " elif retriever_type == \"SVM\":\n", 212 | " retriever = SVMRetriever.from_texts(splits,embd)\n", 213 | " elif retriever_type == \"TF-IDF\":\n", 214 | " retriever = TFIDFRetriever.from_texts(splits)\n", 215 | " return retriever\n", 216 | "\n", 217 | "retriever_type = \"similarity-search\"\n", 218 | "embeddings = \"OpenAI\"\n", 219 | "num_neighbors = 3\n", 220 | "retriever = make_retriever(splits, retriever_type, embeddings, num_neighbors)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "7205c92d", 226 | "metadata": {}, 227 | "source": [ 228 | "`Make Prompt`" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "26bed6cd", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "template = \"\"\"Use the following pieces of context to answer the question at the end. Use three sentences maximum. \n", 239 | "{context}\n", 240 | "Question: {question}\n", 241 | "Answer: Think step by step \"\"\"\n", 242 | "\n", 243 | "QA_CHAIN_PROMPT = PromptTemplate(input_variables=[\"context\", \"question\"],template=template,)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "5deb1522", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "def make_llm(model):\n", 254 | " \"\"\"\n", 255 | " Make LLM\n", 256 | " @param model: LLM to use\n", 257 | " @return: LLM\n", 258 | " \"\"\"\n", 259 | "\n", 260 | " if model in (\"gpt-3.5-turbo\", \"gpt-4\"):\n", 261 | " llm = ChatOpenAI(model_name=model, temperature=0)\n", 262 | " elif model == \"anthropic\":\n", 263 | " llm = ChatAnthropic(temperature=0)\n", 264 | " elif model in (\"vicuna-7b\",\"vicuna-13b\"):\n", 265 | " callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])\n", 266 | " if model == \"vicuna-7b\":\n", 267 | " llm = LlamaCpp(\n", 268 | " model_path=LLAMA_CPP_PATH+\"models/vicuna_7B/ggml-vicuna-7b-q4_0.bin\",\n", 269 | " callback_manager=callback_manager,\n", 270 | " verbose=True,\n", 271 | " n_threads=6,\n", 272 | " n_ctx=2048,\n", 273 | " use_mlock=True)\n", 274 | " else:\n", 275 | " llm = LlamaCpp(\n", 276 | " model_path=LLAMA_CPP_PATH+\"models/vicuna_13B/ggml-vicuna-13b-4bit.bin\",\n", 277 | " callback_manager=callback_manager,\n", 278 | " verbose=True,\n", 279 | " n_threads=6,\n", 280 | " n_ctx=2048,\n", 281 | " use_mlock=True)\n", 282 | " return llm\n", 283 | "\n", 284 | "llm = make_llm('vicuna-13b')" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "id": "6f48abfe", 290 | "metadata": {}, 291 | "source": [ 292 | "`Eval Set`" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "f861a780", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "import json, pandas as pd\n", 303 | "test_dataset = pd.read_csv(\"docs/karpathy-lex-pod/karpathy-pod-eval.csv\")\n", 304 | "qus = []\n", 305 | "for i in test_dataset.index:\n", 306 | " question = test_dataset.loc[i, \"question\"]\n", 307 | " answer = test_dataset.loc[i, \"answer\"]\n", 308 | " data = {\n", 309 | " \"question\": question,\n", 310 | " \"answer\": answer\n", 311 | " }\n", 312 | " qus.append(data)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "id": "72e60bb5", 319 | "metadata": { 320 | "scrolled": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "qus[0]" 
325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "id": "6003593a", 330 | "metadata": {}, 331 | "source": [ 332 | "`Run Inference`" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "id": "6f675d67", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "def make_chain(llm, retriever, retriever_type):\n", 343 | " \"\"\"\n", 344 | " Make retrieval chain\n", 345 | " @param llm: model\n", 346 | " @param retriever: retriever\n", 347 | " @param retriever_type: retriever type\n", 348 | " @return: QA chain or Llama-Index retriever, which enables QA\n", 349 | " \"\"\"\n", 350 | "\n", 351 | " chain_type_kwargs = {\"prompt\": QA_CHAIN_PROMPT}\n", 352 | " qa_chain = RetrievalQA.from_chain_type(llm,\n", 353 | " chain_type=\"stuff\",\n", 354 | " retriever=retriever,\n", 355 | " chain_type_kwargs=chain_type_kwargs,\n", 356 | " input_key=\"question\")\n", 357 | " return qa_chain\n", 358 | "\n", 359 | "qa_chain = make_chain(llm, retriever, retriever_type)\n", 360 | "result = qa_chain(qus[0])\n", 361 | "result" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "id": "469143b0", 367 | "metadata": {}, 368 | "source": [ 369 | "`Test Vicuna endpoint on Replicate`\n", 370 | "\n", 371 | "Deployed to `A100` on Replicate.\n", 372 | "\n", 373 | "* `max_length` maximum length of the prompt + the output for a given generation\n", 374 | "* `context window` 2048 tokens\n", 375 | "\n", 376 | "Useful reference:\n", 377 | "https://github.com/replicate/cog-vicuna-13b/issues/3\n", 378 | "\n" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "c7a38076", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "llm = Replicate(model=\"replicate/vicuna-13b:e6d469c2b11008bb0e446c3e9629232f9674581224536851272c54871f84076e\",\n", 389 | " input={\"temperature\": 0.75, \"max_length\": 3000, \"top_p\":0.25})" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "id": "f81af87e", 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "from text_utils import QA_CHAIN_PROMPT, QA_CHAIN_PROMPT_LLAMA\n", 400 | "chain_type_kwargs = {\"prompt\": QA_CHAIN_PROMPT_LLAMA}\n", 401 | "qa_chain = RetrievalQA.from_chain_type(llm,\n", 402 | " chain_type=\"stuff\",\n", 403 | " retriever=retriever,\n", 404 | " chain_type_kwargs=chain_type_kwargs,\n", 405 | " input_key=\"question\")" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "id": "81694537", 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "qa_chain(qus[0])" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "id": "5a8ada58", 421 | "metadata": {}, 422 | "source": [ 423 | "`Test Mosaic`\n", 424 | "\n", 425 | "LangChain docs: \n", 426 | "\n", 427 | "https://python.langchain.com/en/latest/modules/models/text_embedding/examples/mosaicml.html\n", 428 | "\n", 429 | "Args: \n", 430 | "\n", 431 | "https://docs.mosaicml.com/en/latest/inference.html" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "id": "d9371e1a", 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "from langchain.llms import MosaicML" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "402f9fea", 448 | "metadata": { 449 | "scrolled": false 450 | }, 451 | "outputs": [], 452 | "source": [ 453 | "llm = MosaicML(inject_instruction_format=True,model_kwargs={'do_sample': False,'max_length': 3000})" 
454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "id": "dfb1753c", 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "template = \"\"\"Use the following pieces of context to answer the question at the end. Use three sentences maximum. \n", 464 | "{context}\n", 465 | "Question: {question} \"\"\"\n", 466 | "QA_CHAIN_PROMPT = PromptTemplate(input_variables=[\"context\", \"question\"],template=template,)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "id": "07232f14", 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "chain_type_kwargs = {\"prompt\": QA_CHAIN_PROMPT}\n", 477 | "qa_chain = RetrievalQA.from_chain_type(llm,\n", 478 | " chain_type=\"stuff\",\n", 479 | " retriever=retriever,\n", 480 | " chain_type_kwargs=chain_type_kwargs,\n", 481 | " input_key=\"question\")" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "id": "9030598d", 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "qa_chain(qus[0])" 492 | ] 493 | } 494 | ], 495 | "metadata": { 496 | "kernelspec": { 497 | "display_name": "Python 3 (ipykernel)", 498 | "language": "python", 499 | "name": "python3" 500 | }, 501 | "language_info": { 502 | "codemirror_mode": { 503 | "name": "ipython", 504 | "version": 3 505 | }, 506 | "file_extension": ".py", 507 | "mimetype": "text/x-python", 508 | "name": "python", 509 | "nbconvert_exporter": "python", 510 | "pygments_lexer": "ipython3", 511 | "version": "3.9.16" 512 | } 513 | }, 514 | "nbformat": 4, 515 | "nbformat_minor": 5 516 | } 517 | -------------------------------------------------------------------------------- /api/__pycache__/evaluator_app.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/__pycache__/evaluator_app.cpython-38.pyc -------------------------------------------------------------------------------- /api/__pycache__/text_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/__pycache__/text_utils.cpython-38.pyc -------------------------------------------------------------------------------- /api/docs/gpt3/2005.14165.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/gpt3/2005.14165.pdf -------------------------------------------------------------------------------- /api/docs/gpt3/gpt3-eval.csv: -------------------------------------------------------------------------------- 1 | "question","answer", 2 | "What are the limitations of task-specific fine-tuning?", "First, the need for a large dataset of labeled examples for every new task limits the applicability of language models. Second, high capacity models tend to over-fit on narrow fine-tuning datasets and do not generalize well outside of them. Third, humans do not require large supervised datasets to learn most language tasks. 
To be broadly useful, we would someday like our NLP systems to have this same fluidity and generality.", 3 | "What is in-context learning?","In-context learning is an approach to meta-learning, which means the model develops a broad set of skills and pattern recognition abilities at training time, and then uses those abilities at inference time to rapidly adapt to or recognize the desired task when given examples. This involves absorbing many skills and tasks within the parameters of the model.", 4 | "On what NLP tasks does GPT3 report state-of-the-art performance using zero or few shot learning relative to fine-tuned benchmarks?","GPT3 achieves 71.2% on TriviaQA in the few-shot setting, which is state of the art relative to fine-tuned models operating in the same closed-book setting.", 5 | "What are the pros and cons of fine-tuning, zero-shot learning, and few-shot learning?","Fine-tuning involves updating the weights of a pre-trained model by training on a supervised dataset specific to the desired task. It benefits from strong performance on many benchmarks, but requires a new large dataset for every task. Few shot learning gives the model a few demonstrations of the task at inference time as conditioning, but no weight updates are done. It benefits from a major reduction in the need for task-specific data. But results from this method have so far been much worse than state-of-the-art fine-tuned models. In zero-shot learning, the model is only given a natural language instruction describing the task without any examples. It is the most convent and potentially robust approach, but the most challenges (especially for tasks that are difficult to describe).", 6 | "How is the batch size increased over the course of training?","The batch size is increased linearly from a small value (32k tokens) to the full value over the first 4-12 billion tokens of training, depending on the model size.", -------------------------------------------------------------------------------- /api/docs/karpathy-lex-pod/karpathy-pod-eval.csv: -------------------------------------------------------------------------------- 1 | "question","answer", 2 | "Why is the transformer architecture expressive in the forward pass?","The transformer architecture is expressive because it uses a general message passing scheme where nodes get to look at each other, decide what's interesting and then update each other.", 3 | "What design criteria does the Transformer meet?","The transformer is very expressive in a forward pass, optimizable in the backward pass using the techniques that we have such as gradient descent, and it can run efficiently on our hardware such as GPUs.", 4 | "Why is next word prediction an effective training objective?","On a sufficiently large dataset, the task of predicting the next word multi-tasks knowledge of a lot of things, including understanding of chemistry, physics, and human nature. You have to understand a lot about the world to make that prediction on an internet-scale dataset.", 5 | "What was the World Of Bits project and why did it fail?","World Of Bits was an effort to give AI access to tools, such as a keyboard and mouse, in order to complete tasks, such as complete bookings. It failed because it turned out that reinforcement learning is an extremely inefficient way of training neural networks. You take many actions, but you only get a sparse reward once in a while. 
Starting from scratch, it is very unlikely to stumble on the correct action - such as a booking - by chance at random, so the reward signal is very sparse.", 6 | "Why can additional sensors be a liability in an autonomous vehicle system?","Each sensor adds complexity to the system. The hardware must be sourced, versioned, and maintain firmware. Software must ingest it, track versions. The cost of this additional bloat or entropy must be weighted against the added benefit of that particular sensor." -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2005.14165.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2005.14165.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2112.04426.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2112.04426.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2203.15556.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2203.15556.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2205.06175.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2205.06175.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2302.13971.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2302.13971.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/transformers-eval.csv: -------------------------------------------------------------------------------- 1 | "question","answer", 2 | "What are the limitations of task-specific fine-tuning?", "First, the need for a large dataset of labeled examples for every new task limits the applicability of language models. Second, high capacity models tend to over-fit on narrow fine-tuning datasets and do not generalize well outside of them. Third, humans do not require large supervised datasets to learn most language tasks. To be broadly useful, we would someday like our NLP systems to have this same fluidity and generality.", 3 | "What is in-context learning?","In-context learning is an approach to meta-learning, which means the model develops a broad set of skills and pattern recognition abilities at training time, and then uses those abilities at inference time to rapidly adapt to or recognize the desired task when given examples. 
This involves absorbing many skills and tasks within the parameters of the model.", 4 | "On what NLP tasks does GPT3 report state-of-the-art performance using zero or few shot learning relative to fine-tuned benchmarks?","GPT3 achieves 71.2% on TriviaQA in the few-shot setting, which is state of the art relative to fine-tuned models operating in the same closed-book setting.", 5 | "What are the pros and cons of fine-tuning, zero-shot learning, and few-shot learning?","Fine-tuning involves updating the weights of a pre-trained model by training on a supervised dataset specific to the desired task. It benefits from strong performance on many benchmarks, but requires a new large dataset for every task. Few shot learning gives the model a few demonstrations of the task at inference time as conditioning, but no weight updates are done. It benefits from a major reduction in the need for task-specific data. But results from this method have so far been much worse than state-of-the-art fine-tuned models. In zero-shot learning, the model is only given a natural language instruction describing the task without any examples. It is the most convent and potentially robust approach, but the most challenges (especially for tasks that are difficult to describe).", 6 | "How is the batch size increased for the GPT3 models?","The batch size is increased linearly from a small value (32k tokens) to the full value of 3.2M token over the first 2 billion tokens of training.", 7 | "How does RETRO perform retrieval in terms of search and latency?", " For each chunk, RETRO will retrieve its approximate k-nearest neighbours from a key-value database using the L2 distance on BERT embeddings. It uses the SCaNN library to query the approximate nearest neighbours in O(log𝑇) time." 8 | "What scaling law does the Chinchilla paper propose and how does Chinchilla compare to Gopher?", "The paper fits a scaling law for loss L, as a function of model size N and data size D. Based on the losses of over 400 models, the paper suggests that large models should be substantially smaller and therefore trained much longer than is currently done. They verify this by training a more compute-optimal 70B model, called Chinchilla, on 1.4 trillion tokens, which is 4x smaller than Gopher." 9 | "How do the LLaMA model compare to prior benchmarks, such as PALM, Chinchilla, and GPT-3?","LLaMA is trained only on publicly available data, making the work compatible with open-sourcing. LLaMA-13B outperforms GPT-3 on most benchmarks, despite being 10× smaller. The 65B-parameter model is also competitive with the best large language models such as Chinchilla or PaLM-540B." 10 | "How did the LLaMA models draw inspiration from GPT3, PaLM, GPTNeo, or Chinchilla?","Like GPT3, LLaMA normalizes the input of each transformer sub-layer using RMSNorm. Like PaLM, they replace the ReLU non-linearity with the SwiGLU activation function. Like GPTNeo, they remove the absolute positional embeddings, and instead, add rotary positional embeddings. The general approach was inspired by the Chinchilla scaling laws: LLaMA-13B outperforms GPT3, but can be run on a single GPU." 11 | "How does Gato embed multi-modal inputs?" , "Tokens belonging to text, discrete or continuous-valued observations or actions for any time-step are embedded via a lookup table into a learned vector embedding space. Tokens belonging to image patches for any time-step are embedded using a single ResNet block to obtain a vector per patch." 
-------------------------------------------------------------------------------- /api/evaluator_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is an API to support the LLM QA chain auto-evaluator. 3 | """ 4 | 5 | import io 6 | import os 7 | from dotenv import load_dotenv 8 | import sentry_sdk 9 | import json 10 | import time 11 | import pypdf 12 | import random 13 | import logging 14 | import itertools 15 | import faiss 16 | import pandas as pd 17 | from typing import Dict, List 18 | from json import JSONDecodeError 19 | from langchain.llms import MosaicML 20 | from langchain.llms import Anthropic 21 | from langchain.llms import Replicate 22 | from langchain.schema import Document 23 | from langchain.vectorstores import FAISS 24 | from langchain.chains import RetrievalQA 25 | from langchain.chat_models import ChatOpenAI 26 | from langchain.chains import QAGenerationChain 27 | from langchain.retrievers import SVMRetriever 28 | from langchain.evaluation.qa import QAEvalChain 29 | from langchain.retrievers import TFIDFRetriever 30 | from sse_starlette.sse import EventSourceResponse 31 | from fastapi.middleware.cors import CORSMiddleware 32 | from langchain.embeddings import LlamaCppEmbeddings 33 | from langchain.embeddings import MosaicMLInstructorEmbeddings 34 | from fastapi import FastAPI, File, UploadFile, Form 35 | from langchain.embeddings.openai import OpenAIEmbeddings 36 | from langchain.chains.question_answering import load_qa_chain 37 | from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter 38 | from text_utils import GRADE_DOCS_PROMPT, GRADE_ANSWER_PROMPT, GRADE_DOCS_PROMPT_FAST, GRADE_ANSWER_PROMPT_FAST, GRADE_ANSWER_PROMPT_BIAS_CHECK, GRADE_ANSWER_PROMPT_OPENAI, QA_CHAIN_PROMPT, QA_CHAIN_PROMPT_LLAMA 39 | 40 | def generate_eval(text, chunk, logger): 41 | """ 42 | Generate question answer pair from input text 43 | @param text: text to generate eval set from 44 | @param chunk: chunk size to draw question from text 45 | @param logger: logger 46 | @return: dict with keys "question" and "answer" 47 | """ 48 | 49 | logger.info("`Generating eval QA pair ...`") 50 | # Generate random starting index in the doc to draw question from 51 | num_of_chars = len(text) 52 | starting_index = random.randint(0, num_of_chars-chunk) 53 | sub_sequence = text[starting_index:starting_index+chunk] 54 | # Set up QAGenerationChain chain using GPT 3.5 as default 55 | chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0)) 56 | eval_set = [] 57 | # Catch any QA generation errors and re-try until QA pair is generated 58 | awaiting_answer = True 59 | while awaiting_answer: 60 | try: 61 | qa_pair = chain.run(sub_sequence) 62 | eval_set.append(qa_pair) 63 | awaiting_answer = False 64 | except JSONDecodeError: 65 | logger.error("Error on question") 66 | starting_index = random.randint(0, num_of_chars-chunk) 67 | sub_sequence = text[starting_index:starting_index+chunk] 68 | eval_pair = list(itertools.chain.from_iterable(eval_set)) 69 | return eval_pair 70 | 71 | 72 | def split_texts(text, chunk_size, overlap, split_method, logger): 73 | """ 74 | Split text into chunks 75 | @param text: text to split 76 | @param chunk_size: charecters per split 77 | @param overlap: charecter overlap between splits 78 | @param split_method: method used to split text 79 | @param logger: logger 80 | @return: list of str splits 81 | """ 82 | 83 | logger.info("`Splitting doc ...`") 84 | if split_method == "RecursiveTextSplitter": 85 | 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, 86 | chunk_overlap=overlap) 87 | elif split_method == "CharacterTextSplitter": 88 | text_splitter = CharacterTextSplitter(separator=" ", 89 | chunk_size=chunk_size, 90 | chunk_overlap=overlap) 91 | splits = text_splitter.split_text(text) 92 | return splits 93 | 94 | 95 | def make_llm(model): 96 | """ 97 | Make LLM 98 | @param model: LLM to use 99 | @return: LLM 100 | """ 101 | 102 | if model in ("gpt-3.5-turbo", "gpt-4"): 103 | llm = ChatOpenAI(model_name=model, temperature=0) 104 | elif model == "anthropic": 105 | llm = Anthropic(temperature=0) 106 | elif model == "Anthropic-100k": 107 | llm = Anthropic(model="claude-v1-100k",temperature=0) 108 | elif model == "vicuna-13b": 109 | llm = Replicate(model="replicate/vicuna-13b:e6d469c2b11008bb0e446c3e9629232f9674581224536851272c54871f84076e", 110 | input={"temperature": 0.75, "max_length": 3000, "top_p":0.25}) 111 | elif model == "mosaic": 112 | llm = MosaicML(inject_instruction_format=True,model_kwargs={'do_sample': False, 'max_length': 3000}) 113 | return llm 114 | 115 | def make_retriever(splits, retriever_type, embeddings, num_neighbors, llm, logger): 116 | """ 117 | Make document retriever 118 | @param splits: list of str splits 119 | @param retriever_type: retriever type 120 | @param embedding_type: embedding type 121 | @param num_neighbors: number of neighbors for retrieval 122 | @param _llm: model 123 | @param logger: logger 124 | @return: retriever 125 | """ 126 | 127 | logger.info("`Making retriever ...`") 128 | # Set embeddings 129 | if embeddings == "OpenAI": 130 | embd = OpenAIEmbeddings() 131 | # Note: Still WIP (can't be selected by user yet) 132 | elif embeddings == "LlamaCppEmbeddings": 133 | embd = LlamaCppEmbeddings(model="replicate/vicuna-13b:e6d469c2b11008bb0e446c3e9629232f9674581224536851272c54871f84076e") 134 | # Note: Test 135 | elif embeddings == "Mosaic": 136 | embd = MosaicMLInstructorEmbeddings(query_instruction="Represent the query for retrieval: ") 137 | 138 | # Select retriever 139 | if retriever_type == "similarity-search": 140 | vectorstore = FAISS.from_texts(splits, embd) 141 | retriever = vectorstore.as_retriever(k=num_neighbors) 142 | elif retriever_type == "SVM": 143 | retriever = SVMRetriever.from_texts(splits, embd) 144 | elif retriever_type == "TF-IDF": 145 | retriever = TFIDFRetriever.from_texts(splits) 146 | elif retriever_type == "Anthropic-100k": 147 | retriever = llm 148 | return retriever 149 | 150 | def make_chain(llm, retriever, retriever_type, model): 151 | 152 | """ 153 | Make retrieval chain 154 | @param llm: model 155 | @param retriever: retriever 156 | @param retriever_type: retriever type 157 | @return: QA chain 158 | """ 159 | 160 | # Select prompt 161 | if model == "vicuna-13b": 162 | # Note: Better answer quality using default prompt 163 | # chain_type_kwargs = {"prompt": QA_CHAIN_PROMPT_LLAMA} 164 | chain_type_kwargs = {"prompt": QA_CHAIN_PROMPT} 165 | else: 166 | chain_type_kwargs = {"prompt": QA_CHAIN_PROMPT} 167 | 168 | # Select model 169 | if retriever_type == "Anthropic-100k": 170 | qa_chain = load_qa_chain(llm,chain_type="stuff",prompt=QA_CHAIN_PROMPT) 171 | else: 172 | qa_chain = RetrievalQA.from_chain_type(llm, 173 | chain_type="stuff", 174 | retriever=retriever, 175 | chain_type_kwargs=chain_type_kwargs, 176 | input_key="question") 177 | return qa_chain 178 | 179 | 180 | def grade_model_answer(predicted_dataset, predictions, grade_answer_prompt, logger): 181 | """ 182 | Grades the answer based on ground 
truth and model predictions. 183 | @param predicted_dataset: A list of dictionaries containing ground truth questions and answers. 184 | @param predictions: A list of dictionaries containing model predictions for the questions. 185 | @param grade_answer_prompt: The prompt level for the grading. Either "Fast" or "Full". 186 | @param logger: logger 187 | @return: A list of scores for the distilled answers. 188 | """ 189 | 190 | logger.info("`Grading model answer ...`") 191 | if grade_answer_prompt == "Fast": 192 | prompt = GRADE_ANSWER_PROMPT_FAST 193 | elif grade_answer_prompt == "Descriptive w/ bias check": 194 | prompt = GRADE_ANSWER_PROMPT_BIAS_CHECK 195 | elif grade_answer_prompt == "OpenAI grading prompt": 196 | prompt = GRADE_ANSWER_PROMPT_OPENAI 197 | else: 198 | prompt = GRADE_ANSWER_PROMPT 199 | 200 | # Note: GPT-4 grader is advised by OAI 201 | eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(model_name="gpt-4", temperature=0), 202 | prompt=prompt) 203 | graded_outputs = eval_chain.evaluate(predicted_dataset, 204 | predictions, 205 | question_key="question", 206 | prediction_key="result") 207 | return graded_outputs 208 | 209 | 210 | def grade_model_retrieval(gt_dataset, predictions, grade_docs_prompt, logger): 211 | """ 212 | Grades the relevance of retrieved documents based on ground truth and model predictions. 213 | @param gt_dataset: list of dictionaries containing ground truth questions and answers. 214 | @param predictions: list of dictionaries containing model predictions for the questions 215 | @param grade_docs_prompt: prompt level for the grading. 216 | @return: list of scores for the retrieved documents. 217 | """ 218 | 219 | logger.info("`Grading relevance of retrieved docs ...`") 220 | if grade_docs_prompt == "Fast": 221 | prompt = GRADE_DOCS_PROMPT_FAST 222 | else: 223 | prompt = GRADE_DOCS_PROMPT 224 | 225 | # Note: GPT-4 grader is advised by OAI 226 | eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(model_name="gpt-4", temperature=0), 227 | prompt=prompt) 228 | graded_outputs = eval_chain.evaluate(gt_dataset, 229 | predictions, 230 | question_key="question", 231 | prediction_key="result") 232 | return graded_outputs 233 | 234 | 235 | def run_eval(chain, retriever, eval_qa_pair, grade_prompt, retriever_type, num_neighbors, text, logger): 236 | """ 237 | Runs evaluation on a model's performance on a given evaluation dataset. 238 | @param chain: Model chain used for answering questions 239 | @param retriever: Document retriever used for retrieving relevant documents 240 | @param eval_set: List of dictionaries containing questions and corresponding ground truth answers 241 | @param grade_prompt: String prompt used for grading model's performance 242 | @param retriever_type: String specifying the type of retriever used 243 | @param num_neighbors: Number of neighbors to retrieve using the retriever 244 | @param text: full document text 245 | @return: A tuple of four items: 246 | - answers_grade: A dictionary containing scores for the model's answers. 247 | - retrieval_grade: A dictionary containing scores for the model's document retrieval. 248 | - latencies_list: A list of latencies in seconds for each question answered. 249 | - predictions_list: A list of dictionaries containing the model's predicted answers and relevant documents for each question. 
250 | """ 251 | 252 | logger.info("`Running eval ...`") 253 | predictions = [] 254 | retrieved_docs = [] 255 | gt_dataset = [] 256 | latency = [] 257 | 258 | # Get answer and log latency 259 | start_time = time.time() 260 | if retriever_type == "Anthropic-100k": 261 | docs=[Document(page_content=text)] 262 | answer = chain.run(input_documents=docs,question=eval_qa_pair["question"]) 263 | predictions.append( 264 | {"question": eval_qa_pair["question"], "answer": eval_qa_pair["answer"], "result": answer}) 265 | else : 266 | predictions.append(chain(eval_qa_pair)) 267 | gt_dataset.append(eval_qa_pair) 268 | end_time = time.time() 269 | elapsed_time = end_time - start_time 270 | latency.append(elapsed_time) 271 | 272 | # Extract text from retrieved docs 273 | retrieved_doc_text = "" 274 | if retriever_type == "Anthropic-100k": 275 | retrieved_doc_text = "Doc %s: " % str(eval_qa_pair["answer"]) 276 | else: 277 | docs = retriever.get_relevant_documents(eval_qa_pair["question"]) 278 | for i, doc in enumerate(docs): 279 | retrieved_doc_text += "Doc %s: " % str(i+1) + \ 280 | doc.page_content + " " 281 | 282 | # Log 283 | retrieved = {"question": eval_qa_pair["question"], 284 | "answer": eval_qa_pair["answer"], "result": retrieved_doc_text} 285 | retrieved_docs.append(retrieved) 286 | 287 | # Grade 288 | graded_answers = grade_model_answer( 289 | gt_dataset, predictions, grade_prompt, logger) 290 | graded_retrieval = grade_model_retrieval( 291 | gt_dataset, retrieved_docs, grade_prompt, logger) 292 | return graded_answers, graded_retrieval, latency, predictions 293 | 294 | load_dotenv() 295 | 296 | if os.environ.get("ENVIRONMENT") != "development": 297 | sentry_sdk.init( 298 | dsn="https://065aa152c4de4e14af9f9e7335c8eae4@o4505106202820608.ingest.sentry.io/4505106207735808", 299 | traces_sample_rate=1.0, 300 | ) 301 | 302 | app = FastAPI() 303 | 304 | origins = [ 305 | "http://localhost:3000", 306 | "localhost:3000", 307 | "https://evaluator-ui.vercel.app/" 308 | "https://evaluator-ui.vercel.app" 309 | "evaluator-ui.vercel.app/" 310 | "evaluator-ui.vercel.app" 311 | ] 312 | 313 | app.add_middleware( 314 | CORSMiddleware, 315 | allow_origins=["*"], 316 | allow_credentials=True, 317 | allow_methods=["*"], 318 | allow_headers=["*"], 319 | ) 320 | 321 | 322 | @app.get("/") 323 | async def root(): 324 | return {"message": "Welcome to the Auto Evaluator!"} 325 | 326 | 327 | def run_evaluator( 328 | files, 329 | num_eval_questions, 330 | chunk_chars, 331 | overlap, 332 | split_method, 333 | retriever_type, 334 | embeddings, 335 | model_version, 336 | grade_prompt, 337 | num_neighbors, 338 | test_dataset 339 | ): 340 | 341 | # Set up logging 342 | logging.config.fileConfig('logging.conf', disable_existing_loggers=False) 343 | logger = logging.getLogger(__name__) 344 | 345 | # Read content of files 346 | texts = [] 347 | fnames = [] 348 | for file in files: 349 | logger.info("Reading file: {}".format(file.filename)) 350 | contents = file.file.read() 351 | # PDF file 352 | if file.content_type == 'application/pdf': 353 | logger.info("File {} is a PDF".format(file.filename)) 354 | pdf_reader = pypdf.PdfReader(io.BytesIO(contents)) 355 | text = "" 356 | for page in pdf_reader.pages: 357 | text += page.extract_text() 358 | texts.append(text) 359 | fnames.append(file.filename) 360 | # Text file 361 | elif file.content_type == 'text/plain': 362 | logger.info("File {} is a TXT".format(file.filename)) 363 | texts.append(contents.decode()) 364 | fnames.append(file.filename) 365 | else: 366 | logger.warning( 367 | 
"Unsupported file type for file: {}".format(file.filename)) 368 | text = " ".join(texts) 369 | 370 | if retriever_type == "Anthropic-100k": 371 | splits = "" 372 | model_version = "Anthropic-100k" 373 | else: 374 | logger.info("Splitting texts") 375 | splits = split_texts(text, chunk_chars, overlap, split_method, logger) 376 | 377 | logger.info("Make LLM") 378 | llm = make_llm(model_version) 379 | 380 | logger.info("Make retriever") 381 | retriever = make_retriever( 382 | splits, retriever_type, embeddings, num_neighbors, llm, logger) 383 | 384 | logger.info("Make chain") 385 | qa_chain = make_chain(llm, retriever, retriever_type, model_version) 386 | 387 | for i in range(num_eval_questions): 388 | 389 | # Generate one question 390 | if i < len(test_dataset): 391 | eval_pair = test_dataset[i] 392 | else: 393 | eval_pair = generate_eval(text, 3000, logger) 394 | if len(eval_pair) == 0: 395 | # Error in eval generation 396 | continue 397 | else: 398 | # This returns a list, so we unpack to dict 399 | eval_pair = eval_pair[0] 400 | 401 | # Run eval 402 | graded_answers, graded_retrieval, latency, predictions = run_eval( 403 | qa_chain, retriever, eval_pair, grade_prompt, retriever_type, num_neighbors, text, logger) 404 | 405 | # Assemble output 406 | d = pd.DataFrame(predictions) 407 | d['answerScore'] = [g['text'] for g in graded_answers] 408 | d['retrievalScore'] = [g['text'] for g in graded_retrieval] 409 | d['latency'] = latency 410 | 411 | # Summary statistics 412 | d['answerScore'] = [{'score': 1 if "Incorrect" not in text else 0, 413 | 'justification': text} for text in d['answerScore']] 414 | d['retrievalScore'] = [{'score': 1 if "Incorrect" not in text else 0, 415 | 'justification': text} for text in d['retrievalScore']] 416 | 417 | # Convert dataframe to dict 418 | d_dict = d.to_dict('records') 419 | if len(d_dict) == 1: 420 | yield json.dumps({"data": d_dict[0]}) 421 | else: 422 | logger.warn( 423 | "A QA pair was not evaluated correctly. 
Skipping this pair.") 424 | 425 | 426 | @app.post("/evaluator-stream") 427 | async def create_response( 428 | files: List[UploadFile] = File(...), 429 | num_eval_questions: int = Form(5), 430 | chunk_chars: int = Form(1000), 431 | overlap: int = Form(100), 432 | split_method: str = Form("RecursiveTextSplitter"), 433 | retriever_type: str = Form("similarity-search"), 434 | embeddings: str = Form("OpenAI"), 435 | model_version: str = Form("gpt-3.5-turbo"), 436 | grade_prompt: str = Form("Fast"), 437 | num_neighbors: int = Form(3), 438 | test_dataset: str = Form("[]"), 439 | ): 440 | test_dataset = json.loads(test_dataset) 441 | return EventSourceResponse(run_evaluator(files, num_eval_questions, chunk_chars, 442 | overlap, split_method, retriever_type, embeddings, model_version, grade_prompt, num_neighbors, test_dataset), headers={"Content-Type": "text/event-stream", "Connection": "keep-alive", "Cache-Control": "no-cache"}) 443 | -------------------------------------------------------------------------------- /api/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,uicheckapp 3 | 4 | [handlers] 5 | keys=consoleHandler 6 | 7 | [formatters] 8 | keys=normalFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=consoleHandler 13 | 14 | [logger_uicheckapp] 15 | level=DEBUG 16 | handlers=consoleHandler 17 | qualname=uicheckapp 18 | propagate=0 19 | 20 | [formatter_normalFormatter] 21 | format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() L%(lineno)-4d %(message)s 22 | 23 | [handler_consoleHandler] 24 | class=StreamHandler 25 | level=DEBUG 26 | formatter=normalFormatter 27 | args=(sys.stdout,) -------------------------------------------------------------------------------- /api/railway.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://railway.app/railway.schema.json", 3 | "build": { 4 | "builder": "NIXPACKS" 5 | }, 6 | "deploy": { 7 | "startCommand": "uvicorn evaluator_app:app --host 0.0.0.0 --port $PORT", 8 | "restartPolicyType": "ON_FAILURE", 9 | "restartPolicyMaxRetries": 10 10 | } 11 | } -------------------------------------------------------------------------------- /api/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.4.3 2 | fastapi==0.85.2 3 | langchain==0.0.181 4 | python-multipart==0.0.6 5 | uvicorn==0.18.3 6 | openai==0.27.0 7 | tiktoken==0.3.1 8 | faiss-cpu==1.7.3 9 | huggingface-hub==0.12.0 10 | anthropic==0.2.8 11 | pypdf==3.7.1 12 | filetype==1.2.0 13 | tokenizers==0.13.3 14 | sentence-transformers==2.2.2 15 | scikit-learn==1.2.1 16 | llama-index==0.4.35.post1 17 | sse_starlette==1.3.3 18 | gpt-index==0.5.16 19 | faiss-cpu==1.7.3 20 | python-dotenv==1.0.0 21 | sentry_sdk==1.21.1 22 | llama-cpp-python==0.1.43 23 | replicate==0.8.3 -------------------------------------------------------------------------------- /api/text_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from langchain.prompts import PromptTemplate 4 | 5 | def clean_pdf_text(text: str) -> str: 6 | """Cleans text extracted from a PDF file.""" 7 | # TODO: Remove References/Bibliography section. 
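    # A possible sketch for the TODO above (not wired in; it assumes the references
    # section starts with a standalone "References" or "Bibliography" heading):
    #
    #     match = re.search(r'\n\s*(References|Bibliography)\s*\n', text, flags=re.IGNORECASE)
    #     if match:
    #         text = text[:match.start()]
    #
    # This is only a heuristic; numbered headings (e.g. "7 References") or
    # two-column PDFs would need a more careful pattern.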
8 | return remove_citations(text) 9 | 10 | def remove_citations(text: str) -> str: 11 | """Removes in-text citations from a string.""" 12 | # (Author, Year) 13 | text = re.sub(r'\([A-Za-z0-9,.\s]+\s\d{4}\)', '', text) 14 | # [1], [2], [3-5], [3, 33, 49, 51] 15 | text = re.sub(r'\[[0-9,-]+(,\s[0-9,-]+)*\]', '', text) 16 | return text 17 | 18 | template = """You are a teacher grading a quiz. 19 | You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect. 20 | 21 | Example Format: 22 | QUESTION: question here 23 | STUDENT ANSWER: student's answer here 24 | TRUE ANSWER: true answer here 25 | GRADE: Correct or Incorrect here 26 | 27 | Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin! 28 | 29 | QUESTION: {query} 30 | STUDENT ANSWER: {result} 31 | TRUE ANSWER: {answer} 32 | GRADE: 33 | 34 | Your response should be as follows: 35 | 36 | GRADE: (Correct or Incorrect) 37 | (line break) 38 | JUSTIFICATION: (Without mentioning the student/teacher framing of this prompt, explain why the STUDENT ANSWER is Correct or Incorrect. Use one or two sentences maximum. Keep the answer as concise as possible.) 39 | """ 40 | 41 | GRADE_ANSWER_PROMPT = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 42 | 43 | template = """You are a teacher grading a quiz. 44 | You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect. 45 | 46 | Example Format: 47 | QUESTION: question here 48 | STUDENT ANSWER: student's answer here 49 | TRUE ANSWER: true answer here 50 | GRADE: Correct or Incorrect here 51 | 52 | Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin! 53 | 54 | QUESTION: {query} 55 | STUDENT ANSWER: {result} 56 | TRUE ANSWER: {answer} 57 | GRADE:""" 58 | 59 | GRADE_ANSWER_PROMPT_FAST = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 60 | 61 | template = """You are a teacher grading a quiz. 62 | You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect. 63 | You are also asked to identify potential sources of bias in the question and in the true answer. 64 | 65 | Example Format: 66 | QUESTION: question here 67 | STUDENT ANSWER: student's answer here 68 | TRUE ANSWER: true answer here 69 | GRADE: Correct or Incorrect here 70 | 71 | Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. 
If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin! 72 | 73 | QUESTION: {query} 74 | STUDENT ANSWER: {result} 75 | TRUE ANSWER: {answer} 76 | GRADE: 77 | 78 | Your response should be as follows: 79 | 80 | GRADE: (Correct or Incorrect) 81 | (line break) 82 | JUSTIFICATION: (Without mentioning the student/teacher framing of this prompt, explain why the STUDENT ANSWER is Correct or Incorrect, identify potential sources of bias in the QUESTION, and identify potential sources of bias in the TRUE ANSWER. Use one or two sentences maximum. Keep the answer as concise as possible.) 83 | """ 84 | 85 | GRADE_ANSWER_PROMPT_BIAS_CHECK = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 86 | 87 | template = """You are assessing a submitted student answer to a question relative to the true answer based on the provided criteria: 88 | 89 | *** 90 | QUESTION: {query} 91 | *** 92 | STUDENT ANSWER: {result} 93 | *** 94 | TRUE ANSWER: {answer} 95 | *** 96 | Criteria: 97 | relevance: Is the submission referring to a real quote from the text?" 98 | conciseness: Is the answer concise and to the point?" 99 | correct: Is the answer correct?" 100 | *** 101 | Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print "Correct" or "Incorrect" (without quotes or punctuation) on its own line corresponding to the correct answer. 102 | Reasoning: 103 | """ 104 | 105 | GRADE_ANSWER_PROMPT_OPENAI = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 106 | 107 | template = """ 108 | Given the question: \n 109 | {query} 110 | Here are some documents retrieved in response to the question: \n 111 | {result} 112 | And here is the answer to the question: \n 113 | {answer} 114 | Criteria: 115 | relevance: Are the retrieved documents relevant to the question and do they support the answer?" 116 | Do the retrieved documents meet the criterion? Print "Correct" (without quotes or punctuation) if the retrieved context are relevant or "Incorrect" if not (without quotes or punctuation) on its own line. """ 117 | 118 | GRADE_DOCS_PROMPT_FAST = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 119 | 120 | template = """ 121 | Given the question: \n 122 | {query} 123 | Here are some documents retrieved in response to the question: \n 124 | {result} 125 | And here is the answer to the question: \n 126 | {answer} 127 | Criteria: 128 | relevance: Are the retrieved documents relevant to the question and do they support the answer?" 129 | 130 | Your response should be as follows: 131 | 132 | GRADE: (Correct or Incorrect, depending if the retrieved documents meet the criterion) 133 | (line break) 134 | JUSTIFICATION: (Write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Use one or two sentences maximum. Keep the answer as concise as possible.) 135 | """ 136 | 137 | GRADE_DOCS_PROMPT = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 138 | 139 | 140 | template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. 
141 | {context} 142 | Question: {question} 143 | Helpful Answer:""" 144 | 145 | QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,) 146 | 147 | template = """ 148 | ### Human 149 | You are question-answering assistant tasked with answering questions based on the provided context. 150 | 151 | Here is the question: \ 152 | {question} 153 | 154 | Use the following pieces of context to answer the question at the end. Use three sentences maximum. \ 155 | {context} 156 | 157 | ### Assistant 158 | Answer: Think step by step. """ 159 | QA_CHAIN_PROMPT_LLAMA = PromptTemplate(input_variables=["context", "question"],template=template,) 160 | 161 | -------------------------------------------------------------------------------- /nextjs/.env.local: -------------------------------------------------------------------------------- 1 | NEXT_PUBLIC_API_URL=http://localhost:8000 -------------------------------------------------------------------------------- /nextjs/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | /coverage-ts 11 | 12 | # next.js 13 | /.next/ 14 | /out/ 15 | 16 | # production 17 | /build 18 | 19 | /temp 20 | 21 | # misc 22 | .DS_Store 23 | *.pem 24 | 25 | # debug 26 | npm-debug.log* 27 | yarn-debug.log* 28 | yarn-error.log* 29 | 30 | # local env files 31 | .env.local 32 | .env.development.local 33 | .env.test.local 34 | .env.production.local 35 | 36 | # vercel 37 | .vercel 38 | 39 | 40 | /public/graphql/ 41 | .vscode/* 42 | 43 | /src/styles/styles.css 44 | 45 | *.generated.ts 46 | *.generated.tsx 47 | *.generated.json 48 | 49 | graphql.schema.json 50 | schema.graphql 51 | 52 | # Sentry 53 | .sentryclirc 54 | /test-results/ 55 | /playwright-report/ 56 | /playwright/.cache/ 57 | 58 | tsconfig.tsbuildinfo 59 | .next -------------------------------------------------------------------------------- /nextjs/components/Demo.tsx: -------------------------------------------------------------------------------- 1 | import React, { 2 | useCallback, 3 | useEffect, 4 | useMemo, 5 | useRef, 6 | useState, 7 | } from "react"; 8 | import { 9 | Group, 10 | Text, 11 | useMantineTheme, 12 | Alert, 13 | Table, 14 | Button, 15 | Title, 16 | Flex, 17 | Stack, 18 | Spoiler, 19 | Progress, 20 | Card, 21 | } from "@mantine/core"; 22 | import { IconAlertCircle } from "@tabler/icons-react"; 23 | import { Experiment, Form, QAPair, Result } from "../utils/types"; 24 | import { notifications } from "@mantine/notifications"; 25 | import { API_URL, IS_DEV } from "../utils/variables"; 26 | import { fetchEventSource } from "@microsoft/fetch-event-source"; 27 | import { Parser } from "@json2csv/plainjs"; 28 | import { isEmpty, isNil, orderBy } from "lodash"; 29 | import sampleResults from "../public/testData/results.json"; 30 | import sampleTestDataset from "../public/testData/testDataset.json"; 31 | import sampleExperiments from "../public/testData/experiments.json"; 32 | import SummaryChart from "./SummaryChart"; 33 | import ExperimentSummaryTable from "./ExperimentSummaryTable"; 34 | import FilesTable from "./tables/FilesTable"; 35 | import ExperimentResultTable from "./tables/ExperimentResultTable"; 36 | import sampleText from "../public/testData/karpathy-pod.json"; 37 | import LogRocket from "logrocket"; 38 | 39 | const Demo = ({ form }: { form: Form }) 
=> { 40 | const { setValue, watch, getValues, handleSubmit } = form; 41 | const watchFiles = watch("files"); 42 | const [loading, setLoading] = useState(false); 43 | const [results, setResults] = useState([]); 44 | const [testDataset, setTestDataset] = useState([]); 45 | const [evalQuestionsCount, setEvalQuestionsCount] = useState(5); 46 | const [experiments, setExperiments] = useState([]); 47 | const [shouldShowProgress, setShouldShowProgress] = useState(false); 48 | const [gradingPromptStyle, setGradingPromptStyle] = useState(undefined); 49 | const experimentsResultsSpoilerRef = useRef(null); 50 | const summarySpoilerRef = useRef(null); 51 | const testDatasetSpoilerRef = useRef(null); 52 | const [isFirstRun, setIsFirstRun] = useState(true); 53 | 54 | const alertStyle = { backgroundColor: `rgba(193,194,197,0.38)` }; 55 | useEffect(() => { 56 | setValue("files", [ 57 | new File( 58 | [new Blob([sampleText.text], { type: "text/plain" })], 59 | "karpathy-pod.txt", 60 | { 61 | type: "text/plain", 62 | } 63 | ), 64 | ]); 65 | setResults(sampleResults); 66 | setTestDataset(sampleTestDataset); 67 | setExperiments(sampleExperiments); 68 | }, []); 69 | 70 | const runExperimentButtonLabel = 71 | experiments.length > 1 ? "Re-run experiment" : "Run Experiment"; 72 | 73 | const initialProgress = { 74 | value: 15, 75 | color: "purple", 76 | label: "Building Index ...", 77 | }; 78 | 79 | const finishedProgress = { 80 | value: 100, 81 | color: "green", 82 | label: "Completed", 83 | }; 84 | 85 | const experimentProgress = useMemo(() => { 86 | if (results.length === 0) { 87 | return [initialProgress]; 88 | } 89 | 90 | const res = 15 + Math.floor((results?.length / evalQuestionsCount) * 85); 91 | 92 | if (res === 100) { 93 | return [finishedProgress]; 94 | } 95 | const ret = [ 96 | initialProgress, 97 | { 98 | value: res, 99 | color: "blue", 100 | label: "Generating Evals & Grading", 101 | }, 102 | ]; 103 | return ret; 104 | }, [results, evalQuestionsCount]); 105 | 106 | const chartData = experiments.map((experiment) => ({ 107 | id: "Expt #" + experiment.id, 108 | data: [ 109 | { 110 | x: experiment.avgAnswerScore, 111 | y: experiment.avgLatency, 112 | }, 113 | ], 114 | })); 115 | 116 | const submit = handleSubmit(async (data) => { 117 | setShouldShowProgress(true); 118 | setLoading(true); 119 | setResults([]); 120 | 121 | const resetExpts = data.evalQuestionsCount !== evalQuestionsCount; 122 | if (resetExpts) { 123 | setExperiments([]); 124 | } 125 | 126 | const formData = new FormData(); 127 | data.files.forEach((file) => { 128 | formData.append("files", file); 129 | }); 130 | formData.append("num_eval_questions", data.evalQuestionsCount.toString()); 131 | formData.append("chunk_chars", data.chunkSize.toString()); 132 | formData.append("overlap", data.overlap.toString()); 133 | formData.append("split_method", data.splitMethod); 134 | formData.append("retriever_type", data.retriever); 135 | formData.append("embeddings", data.embeddingAlgorithm); 136 | formData.append("model_version", data.model); 137 | formData.append("grade_prompt", data.gradingPrompt); 138 | formData.append("num_neighbors", data.numNeighbors.toString()); 139 | formData.append("test_dataset", JSON.stringify(testDataset)); 140 | 141 | if (!IS_DEV) { 142 | LogRocket.track("DemoSubmission", { 143 | numQuestions: data.evalQuestionsCount, 144 | overlap: data.overlap, 145 | split: data.splitMethod, 146 | retriever: data.retriever, 147 | embedding: data.embeddingAlgorithm, 148 | model: data.model, 149 | promptStyle: data.gradingPrompt, 150 | 
numNeighbors: data.numNeighbors, 151 | }); 152 | } 153 | 154 | setEvalQuestionsCount(data.evalQuestionsCount); 155 | setGradingPromptStyle(data.gradingPrompt); 156 | 157 | const controller = new AbortController(); 158 | 159 | let localResults = []; 160 | let rowCount = 0; 161 | 162 | try { 163 | await fetchEventSource(API_URL + "/evaluator-stream", { 164 | method: "POST", 165 | body: formData, 166 | headers: { 167 | Accept: "text/event-stream", 168 | }, 169 | openWhenHidden: true, 170 | signal: controller.signal, 171 | onmessage(ev) { 172 | try { 173 | const row: Result = JSON.parse(ev.data)?.data; 174 | setResults((results) => [...results, row]); 175 | localResults = [...localResults, row]; 176 | rowCount += 1; 177 | if (rowCount > testDataset.length) { 178 | setTestDataset((testDataset) => [ 179 | ...testDataset, 180 | { 181 | question: row.question, 182 | answer: row.answer, 183 | }, 184 | ]); 185 | } 186 | if (rowCount === data.evalQuestionsCount) { 187 | controller.abort(); 188 | } 189 | } catch (e) { 190 | console.warn("Error parsing data", e); 191 | } 192 | }, 193 | onclose() { 194 | console.log("Connection closed by the server"); 195 | setLoading(false); 196 | }, 197 | onerror(err) { 198 | console.log("There was an error from server", err); 199 | throw err; 200 | }, 201 | }); 202 | } catch (e) { 203 | notifications.show({ 204 | title: "Error", 205 | message: "There was an error from the server.", 206 | color: "red", 207 | }); 208 | setShouldShowProgress(false); 209 | setLoading(false); 210 | return; 211 | } 212 | setLoading(false); 213 | setIsFirstRun(false); 214 | const avgAnswerScore = 215 | localResults.reduce((acc, curr) => acc + curr.answerScore.score, 0) / 216 | localResults.length; 217 | const avgRelevancyScore = 218 | localResults.reduce((acc, curr) => acc + curr.retrievalScore.score, 0) / 219 | localResults.length; 220 | const avgLatency = 221 | localResults.reduce((acc, curr) => acc + curr.latency, 0) / 222 | localResults.length; 223 | const newExperiment: Experiment = { 224 | evalQuestionsCount: data.evalQuestionsCount, 225 | chunkSize: data.chunkSize, 226 | overlap: data.overlap, 227 | splitMethod: data.splitMethod, 228 | retriever: data.retriever, 229 | embeddingAlgorithm: data.embeddingAlgorithm, 230 | model: data.model, 231 | gradingPrompt: data.gradingPrompt, 232 | numNeighbors: data.numNeighbors, 233 | avgRelevancyScore, 234 | avgAnswerScore, 235 | avgLatency, 236 | performance: avgAnswerScore / avgLatency, 237 | id: resetExpts ? 1 : experiments.length + 1, 238 | }; 239 | setExperiments((experiments) => 240 | resetExpts ? [newExperiment] : [...experiments, newExperiment] 241 | ); 242 | }); 243 | 244 | const download = useCallback( 245 | (data: any[], filename: string) => { 246 | const parser = new Parser(); 247 | const csv = parser.parse(data); 248 | const blob = new Blob([csv], { type: "text/csv;charset=utf-8;" }); 249 | const url = URL.createObjectURL(blob); 250 | const link = document.createElement("a"); 251 | link.setAttribute("href", url); 252 | link.setAttribute("download", filename + ".csv"); 253 | link.style.visibility = "hidden"; 254 | document.body.appendChild(link); 255 | link.click(); 256 | document.body.removeChild(link); 257 | }, 258 | [results] 259 | ); 260 | 261 | const isFastGradingPrompt = gradingPromptStyle === "Fast"; 262 | 263 | return ( 264 | 265 | Get Started 266 | 267 | Welcome to the auto-evaluator! This is an app to evaluate the 268 | performance of question-answering LLM chains. 
This demo has pre-loaded 269 | two things: (1) a document (the Lex Fridman podcast with Andrej 270 | Karpathy) and (2) a "test set" of question-answer pairs for this 271 | episode. The aim is to evaluate the performance of various 272 | question-answering LLM chain configurations against the test set. You 273 | can build any QA chain using the components and score its performance. 274 |
275 |
276 | 277 | Choose the question-answering chain configuration (left) and launch an 278 | experiment using the button below. For more detail on each setting, 279 | see the full documentation{" "} 280 | 284 | here 285 | 286 | . 287 | 288 |
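          {/* The sections below render conditionally: the test dataset table and
              run button once files are loaded, a progress bar while an experiment
              is streaming, and the results and summary tables once results and
              experiments exist. */}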
289 | {!!watchFiles?.length && ( 290 | <> 291 | 292 | {!!testDataset.length && ( 293 | 294 | 301 | 302 | 303 | Test Dataset 304 | 305 | 313 | 328 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | {testDataset?.map((result: QAPair, index: number) => ( 351 | 352 | 353 | 354 | 355 | ))} 356 | 357 |
<th>Question</th><th>Answer</th>
<td>{result?.question}</td><td>{result?.answer}</td>
358 |
359 |
360 | )} 361 | 362 | {!loading || isFirstRun ? ( 363 | 364 | 372 | 373 | ) : null} 374 | 375 | 376 | )} 377 | {shouldShowProgress && ( 378 | 386 | )} 387 | {!isEmpty(results) ? ( 388 | 389 | 397 | 398 | 399 | Experiment Results 400 | 401 | This table shows each question-answer pair from the test 402 | set along with the model's answer to the question. The app 403 | will score two things: (1) the relevance of the retrieved 404 | documents relative to the question and (2) the similarity of 405 | the LLM-generated answer relative to the ground truth answer. The 406 | prompts for both can be seen{" "} 407 | 411 | here 412 | {" "} 413 | and can be chosen by the user in the drop-down list "Grading 414 | prompt style". The "Fast" prompt will only have the LLM grader 415 | output the score. The other prompts will also produce an 416 | explanation. 417 | 418 | 419 | 427 | 438 | 439 | 440 | 441 | 445 | 446 | 447 | ) : null} 448 | {!!experiments.length && ( 449 | 450 | 458 | 459 | 460 | Summary 461 | 462 | 470 | 481 | 482 | 483 | 484 | 485 |
486 | 487 |
488 |
489 |
490 | )} 491 |
492 | ); 493 | }; 494 | export default Demo; 495 | -------------------------------------------------------------------------------- /nextjs/components/ExperimentSummaryTable.tsx: -------------------------------------------------------------------------------- 1 | import { ScrollArea, Table } from "@mantine/core"; 2 | import { Experiment } from "../utils/types"; 3 | 4 | const ExperimentSummaryTable = ({ 5 | experiments, 6 | }: { 7 | experiments: Experiment[]; 8 | }) => { 9 | return ( 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | {experiments?.map((result: Experiment, index: number) => ( 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | ))} 47 | 48 |
<th>Experiment #</th><th># of Eval Questions</th><th>Chunk Size</th><th>Overlap</th><th>Split Method</th><th>Retriever</th><th>Embedding Algorithm</th><th>Model</th><th>Grading Prompt Style</th><th># of Chunks Retrieved</th><th>Avg Retrieval Relevancy Score</th><th>Avg Answer Similarity Score</th><th>Avg Latency (s)</th>
<td>{result.id}</td><td>{result?.evalQuestionsCount}</td><td>{result?.chunkSize}</td><td>{result?.overlap}</td><td>{result?.splitMethod}</td><td>{result?.retriever}</td><td>{result?.embeddingAlgorithm}</td><td>{result?.model}</td><td>{result?.gradingPrompt}</td><td>{result?.numNeighbors}</td><td>{result?.avgRelevancyScore}</td><td>{result?.avgAnswerScore}</td><td>{result?.avgLatency.toFixed(3)}</td>
49 |
50 | ); 51 | }; 52 | export default ExperimentSummaryTable; 53 | -------------------------------------------------------------------------------- /nextjs/components/HeaderEvaluator.tsx: -------------------------------------------------------------------------------- 1 | import { Group, Header, Stack, Text } from "@mantine/core"; 2 | import Image from "next/image"; 3 | import Link from "next/link"; 4 | import React from "react"; 5 | import githubIcon from "../public/github-mark.svg"; 6 | import { useMediaQuery } from "@mantine/hooks"; 7 | 8 | export enum MenuItem { 9 | Demo = "Demo", 10 | Playground = "Playground", 11 | About = "About", 12 | } 13 | 14 | const HeaderEvaluator = ({ activeTab }: { activeTab: MenuItem }) => { 15 | const mobileWidth = useMediaQuery("(max-width: 390px)"); 16 | const borderBottom = "1px solid #000"; 17 | 18 | return ( 19 |
20 | 21 | 22 | 23 | 24 | 🦜🔗 25 | 30 | Auto-Evaluator 31 | 32 | 33 | 34 | 35 | 42 | Demo 43 | 44 | 52 | Playground 53 | 54 | 61 | Docs 62 | 63 | 71 | About 72 | 73 | 77 | github 78 | 79 | 80 | 81 | 82 |
83 | ); 84 | }; 85 | export default HeaderEvaluator; 86 | -------------------------------------------------------------------------------- /nextjs/components/PersonCard.tsx: -------------------------------------------------------------------------------- 1 | import { 2 | createStyles, 3 | Card, 4 | Avatar, 5 | Text, 6 | Group, 7 | Button, 8 | rem, 9 | Stack, 10 | } from "@mantine/core"; 11 | import Link from "next/link"; 12 | import githubIcon from "../public/github-mark.svg"; 13 | import twitterBlackIcon from "../public/twitter-black.svg"; 14 | import Image from "next/image"; 15 | 16 | const useStyles = createStyles((theme) => ({ 17 | card: { 18 | backgroundColor: 19 | theme.colorScheme === "dark" ? theme.colors.dark[7] : theme.white, 20 | }, 21 | 22 | avatar: { 23 | border: `${rem(2)} solid ${ 24 | theme.colorScheme === "dark" ? theme.colors.dark[7] : theme.white 25 | }`, 26 | }, 27 | })); 28 | 29 | interface UserCardImageProps { 30 | avatar: string; 31 | name: string; 32 | job: string; 33 | twitterHandle: string; 34 | githubHandle: string; 35 | } 36 | 37 | export function UserCardImage({ 38 | avatar, 39 | name, 40 | job, 41 | twitterHandle, 42 | githubHandle, 43 | }: UserCardImageProps) { 44 | const { classes, theme } = useStyles(); 45 | 46 | return ( 47 | 48 | 49 | 56 |
57 | 58 | {name} 59 | 60 | 61 | {job} 62 | 63 |
64 | 65 | 66 | github 67 | 68 | 69 | github 70 | 71 | 72 |
73 |
74 | ); 75 | } 76 | -------------------------------------------------------------------------------- /nextjs/components/Playground.tsx: -------------------------------------------------------------------------------- 1 | import React, { 2 | useCallback, 3 | useEffect, 4 | useMemo, 5 | useRef, 6 | useState, 7 | } from "react"; 8 | import { 9 | Group, 10 | Text, 11 | useMantineTheme, 12 | Alert, 13 | Table, 14 | Button, 15 | Title, 16 | Flex, 17 | Stack, 18 | Spoiler, 19 | Progress, 20 | Card, 21 | ScrollArea, 22 | createStyles, 23 | } from "@mantine/core"; 24 | import { IconUpload, IconX, IconAlertCircle } from "@tabler/icons-react"; 25 | import { Dropzone, MIME_TYPES } from "@mantine/dropzone"; 26 | import { Experiment, Form, QAPair, Result } from "../utils/types"; 27 | import { notifications } from "@mantine/notifications"; 28 | import { API_URL, IS_DEV } from "../utils/variables"; 29 | import { fetchEventSource } from "@microsoft/fetch-event-source"; 30 | import { Parser } from "@json2csv/plainjs"; 31 | import { IconFile } from "@tabler/icons-react"; 32 | import { ResponsiveScatterPlot } from "@nivo/scatterplot"; 33 | import { isEmpty, isNil, orderBy } from "lodash"; 34 | import TestFileUploadZone from "./TestFileUploadZone"; 35 | import LogRocket from "logrocket"; 36 | 37 | const MAX_FILE_SIZE_MB = 50; 38 | 39 | enum DropZoneErrorCode { 40 | FileTooLarge = "file-too-large", 41 | FileInvalidType = "file-invalid-type", 42 | } 43 | 44 | const useStyles = createStyles((theme) => ({ 45 | disabled: { 46 | backgroundColor: 47 | theme.colorScheme === "dark" 48 | ? theme.colors.dark[6] 49 | : theme.colors.gray[0], 50 | borderColor: 51 | theme.colorScheme === "dark" 52 | ? theme.colors.dark[5] 53 | : theme.colors.gray[2], 54 | cursor: "not-allowed", 55 | 56 | "& *": { 57 | color: 58 | theme.colorScheme === "dark" 59 | ? 
theme.colors.dark[3] 60 | : theme.colors.gray[5], 61 | }, 62 | }, 63 | })); 64 | 65 | const Playground = ({ form }: { form: Form }) => { 66 | const { setValue, watch, getValues, handleSubmit } = form; 67 | const watchFiles = watch("files"); 68 | const theme = useMantineTheme(); 69 | const [loading, setLoading] = useState(false); 70 | const [results, setResults] = useState([]); 71 | const [testDataset, setTestDataset] = useState([]); 72 | const [evalQuestionsCount, setEvalQuestionsCount] = useState(-1); 73 | const [experiments, setExperiments] = useState([]); 74 | const [didUploadTestDataset, setDidUploadTestDataset] = useState(false); 75 | const [shouldShowProgress, setShouldShowProgress] = useState(false); 76 | const [gradingPromptStyle, setGradingPromptStyle] = useState(undefined); 77 | const experimentsResultsSpoilerRef = useRef(null); 78 | const summarySpoilerRef = useRef(null); 79 | const testDatasetSpoilerRef = useRef(null); 80 | const [testFilesDropzoneDisabled, setTestFilesDropzoneDisabled] = 81 | useState(true); 82 | const [fileUploadDisabled, setFileUploadDisabled] = useState(false); 83 | 84 | const { classes } = useStyles(); 85 | 86 | const initialProgress = { 87 | value: 15, 88 | color: "purple", 89 | label: "Processing Files", 90 | }; 91 | 92 | const finishedProgress = { 93 | value: 100, 94 | color: "green", 95 | label: "Completed", 96 | }; 97 | 98 | const experimentProgress = useMemo(() => { 99 | if (results.length === 0) { 100 | return [initialProgress]; 101 | } 102 | 103 | const res = 15 + Math.floor((results?.length / evalQuestionsCount) * 85); 104 | 105 | if (res === 100) { 106 | return [finishedProgress]; 107 | } 108 | const ret = [ 109 | initialProgress, 110 | { 111 | value: res, 112 | color: "blue", 113 | label: "Generating Evals & Grading", 114 | }, 115 | ]; 116 | return ret; 117 | }, [results, evalQuestionsCount]); 118 | 119 | const chartData = experiments.map((experiment, index) => ({ 120 | id: "Expt #" + (index + 1), 121 | data: [ 122 | { 123 | x: experiment.avgAnswerScore, 124 | y: experiment.avgLatency, 125 | }, 126 | ], 127 | })); 128 | 129 | const renderPassFail = (data: any) => { 130 | if (data.score === 0) { 131 | return "Incorrect"; 132 | } 133 | if (data.score === 1) { 134 | return "Correct"; 135 | } 136 | throw new Error(`Problem parsing ${data}`); 137 | }; 138 | 139 | const submit = handleSubmit(async (data) => { 140 | setShouldShowProgress(true); 141 | setLoading(true); 142 | setResults([]); 143 | 144 | const resetExpts = 145 | data.evalQuestionsCount !== evalQuestionsCount || didUploadTestDataset; 146 | if (resetExpts) { 147 | setExperiments([]); 148 | } 149 | 150 | setDidUploadTestDataset(false); 151 | 152 | const formData = new FormData(); 153 | data.files.forEach((file) => { 154 | formData.append("files", file); 155 | }); 156 | formData.append("num_eval_questions", data.evalQuestionsCount.toString()); 157 | formData.append("chunk_chars", data.chunkSize.toString()); 158 | formData.append("overlap", data.overlap.toString()); 159 | formData.append("split_method", data.splitMethod); 160 | formData.append("retriever_type", data.retriever); 161 | formData.append("embeddings", data.embeddingAlgorithm); 162 | formData.append("model_version", data.model); 163 | formData.append("grade_prompt", data.gradingPrompt); 164 | formData.append("num_neighbors", data.numNeighbors.toString()); 165 | formData.append("test_dataset", JSON.stringify(testDataset)); 166 | 167 | if (!IS_DEV) { 168 | LogRocket.track("PlaygroundSubmission", { 169 | fileSizes: 
data.files.map((file) => file.size), 170 | fileTypes: data.files.map((file) => file.type), 171 | numQuestions: data.evalQuestionsCount, 172 | overlap: data.overlap, 173 | split: data.splitMethod, 174 | retriever: data.retriever, 175 | embedding: data.embeddingAlgorithm, 176 | model: data.model, 177 | promptStyle: data.gradingPrompt, 178 | numNeighbors: data.numNeighbors, 179 | uploadedTestDataset: !!testDataset.length, 180 | }); 181 | } 182 | 183 | setEvalQuestionsCount(data.evalQuestionsCount); 184 | setGradingPromptStyle(data.gradingPrompt); 185 | 186 | const controller = new AbortController(); 187 | 188 | let localResults = []; 189 | let rowCount = 0; 190 | try { 191 | await fetchEventSource(API_URL + "/evaluator-stream", { 192 | method: "POST", 193 | body: formData, 194 | headers: { 195 | Accept: "text/event-stream", 196 | Connection: "keep-alive", // Add the keep-alive header 197 | }, 198 | openWhenHidden: true, 199 | signal: controller.signal, 200 | onmessage(ev) { 201 | try { 202 | const row: Result = JSON.parse(ev.data)?.data; 203 | setResults((results) => [...results, row]); 204 | localResults = [...localResults, row]; 205 | rowCount += 1; 206 | if (rowCount > testDataset.length) { 207 | setTestDataset((testDataset) => [ 208 | ...testDataset, 209 | { 210 | question: row.question, 211 | answer: row.answer, 212 | }, 213 | ]); 214 | } 215 | if (rowCount === data.evalQuestionsCount) { 216 | controller.abort(); 217 | } 218 | } catch (e) { 219 | console.warn("Error parsing data", e); 220 | } 221 | }, 222 | onclose() { 223 | console.log("Connection closed by the server"); 224 | setLoading(false); 225 | if (!rowCount) { 226 | throw new Error("No results were returned from the server."); 227 | } 228 | }, 229 | onerror(err) { 230 | console.log("There was an error from server", err); 231 | throw new Error(err); 232 | }, 233 | }); 234 | } catch (e) { 235 | notifications.show({ 236 | title: "Error", 237 | message: "There was an error from the server.", 238 | color: "red", 239 | }); 240 | setShouldShowProgress(false); 241 | setLoading(false); 242 | return; 243 | } 244 | setLoading(false); 245 | const avgAnswerScore = 246 | localResults.reduce((acc, curr) => acc + curr.answerScore.score, 0) / 247 | localResults.length; 248 | const avgRelevancyScore = 249 | localResults.reduce((acc, curr) => acc + curr.retrievalScore.score, 0) / 250 | localResults.length; 251 | const avgLatency = 252 | localResults.reduce((acc, curr) => acc + curr.latency, 0) / 253 | localResults.length; 254 | const newExperiment: Experiment = { 255 | evalQuestionsCount: data.evalQuestionsCount, 256 | chunkSize: data.chunkSize, 257 | overlap: data.overlap, 258 | splitMethod: data.splitMethod, 259 | retriever: data.retriever, 260 | embeddingAlgorithm: data.embeddingAlgorithm, 261 | model: data.model, 262 | gradingPrompt: data.gradingPrompt, 263 | numNeighbors: data.numNeighbors, 264 | avgRelevancyScore, 265 | avgAnswerScore, 266 | avgLatency, 267 | performance: avgAnswerScore / avgLatency, 268 | id: resetExpts ? 1 : experiments.length + 1, 269 | }; 270 | setExperiments((experiments) => 271 | resetExpts ? [newExperiment] : [...experiments, newExperiment] 272 | ); 273 | }); 274 | 275 | const runExperimentButtonLabel = experiments.length 276 | ? 
"Re-run experiment" 277 | : "Run Experiment"; 278 | 279 | const download = useCallback( 280 | (data: any[], filename: string) => { 281 | const parser = new Parser(); 282 | const csv = parser.parse(data); 283 | const blob = new Blob([csv], { type: "text/csv;charset=utf-8;" }); 284 | const url = URL.createObjectURL(blob); 285 | const link = document.createElement("a"); 286 | link.setAttribute("href", url); 287 | link.setAttribute("download", filename + ".csv"); 288 | link.style.visibility = "hidden"; 289 | document.body.appendChild(link); 290 | link.click(); 291 | document.body.removeChild(link); 292 | }, 293 | [results] 294 | ); 295 | 296 | const isFastGradingPrompt = gradingPromptStyle === "Fast"; 297 | const alertStyle = { backgroundColor: `rgba(193,194,197,0.38)` }; 298 | 299 | return ( 300 | 301 | } 303 | title="Instructions" 304 | style={alertStyle} 305 | > 306 | Upload a file (up to 50 MB) and choose the parameters for your QA 307 | chain. This evaluator will generate a test dataset of QA pairs and grade 308 | the performance of the QA chain. You can experiment with different 309 | parameters and evaluate the performance. 310 | 311 | 312 | { 316 | setValue("files", [...(getValues("files") ?? []), ...files]); 317 | setExperiments([]); 318 | setResults([]); 319 | setShouldShowProgress(false); 320 | setTestFilesDropzoneDisabled(false); 321 | setFileUploadDisabled(true); 322 | }} 323 | maxFiles={1} 324 | multiple={false} 325 | maxSize={MAX_FILE_SIZE_MB * 1024 ** 2} // 50 MB 326 | accept={[ 327 | MIME_TYPES.pdf, 328 | MIME_TYPES.docx, 329 | MIME_TYPES.doc, 330 | "text/plain", 331 | ]} 332 | onReject={(files) => { 333 | const errorCode = files?.[0]?.errors?.[0]?.code; 334 | let message = files?.[0]?.errors?.[0]?.message; 335 | switch (errorCode) { 336 | case DropZoneErrorCode.FileTooLarge: 337 | message = `File size too large. Max file size is ${MAX_FILE_SIZE_MB} MB.`; 338 | break; 339 | case DropZoneErrorCode.FileInvalidType: 340 | message = "File type not supported"; 341 | break; 342 | default: 343 | break; 344 | } 345 | notifications.show({ 346 | title: "Error", 347 | message, 348 | color: "red", 349 | }); 350 | }} 351 | // maxSize={3 * 1024 ** 2} 352 | style={{ width: "100%" }} 353 | > 354 | 355 | 356 | 365 | 366 | 367 | 372 | 373 | 374 | 375 | 376 |
377 | 378 | Upload Text for QA Eval 379 | 380 | 381 | {"Attach a file (.txt, .pdf, .doc, .docx)"} 382 | 383 |
384 |
385 |
386 | 391 |
392 | {!!watchFiles?.length && ( 393 | <> 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | {watchFiles?.map((file, id) => ( 403 | 404 | 405 | 406 | 407 | ))} 408 | 409 |
<th>File Name</th><th>Size (MB)</th>
<td>{file?.name}</td><td>{(file?.size / 1024 ** 2).toFixed(1)}</td>
410 | {!!testDataset.length && ( 411 | 412 | 419 | 420 | 421 | Test Dataset 422 | 423 | 431 | 446 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | {testDataset?.map((result: QAPair, index: number) => ( 469 | 470 | 471 | 472 | 473 | ))} 474 | 475 |
<th>Question</th><th>Answer</th>
<td>{result?.question}</td><td>{result?.answer}</td>
476 |
477 |
478 | )} 479 | 480 | 488 | 489 | 490 | )} 491 | {shouldShowProgress && ( 492 | 500 | )} 501 | {!!experiments.length && ( 502 | 503 | 511 | 512 | 513 | Summary 514 | 515 | 523 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | {experiments?.map((result: Experiment, index: number) => ( 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | ))} 574 | 575 |
<th>Experiment #</th><th># of Eval Questions</th><th>Chunk Size</th><th>Overlap</th><th>Split Method</th><th>Retriever</th><th>Embedding Algorithm</th><th>Model</th><th>Grading Prompt Style</th><th># of Chunks Retrieved</th><th>Avg Retrieval Relevancy Score</th><th>Avg Answer Similarity Score</th><th>Avg Latency (s)</th>
<td>{index + 1}</td><td>{result?.evalQuestionsCount}</td><td>{result?.chunkSize}</td><td>{result?.overlap}</td><td>{result?.splitMethod}</td><td>{result?.retriever}</td><td>{result?.embeddingAlgorithm}</td><td>{result?.model}</td><td>{result?.gradingPrompt}</td><td>{result?.numNeighbors}</td><td>{result?.avgRelevancyScore}</td><td>{result?.avgAnswerScore}</td><td>{result?.avgLatency.toFixed(3)}</td>
576 |
577 |
578 | 629 |
630 |
631 |
632 | )} 633 | {!isEmpty(results) ? ( 634 | 635 | 643 | 644 | 645 | Experiment Results 646 |
647 |
648 | 649 | 657 | 668 | 669 |
670 |
671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | {results?.map((result: Result, index: number) => ( 685 | 686 | 687 | 688 | 689 | 710 | 731 | 732 | 733 | ))} 734 | 735 |
<th>Question</th><th>Expected Answer</th><th>Observed Answer</th><th>Retrieval Relevancy Score</th><th>Answer Similarity Score</th><th>Latency (s)</th>
{result?.question}{result?.answer}{result?.result} 690 | {isFastGradingPrompt ? ( 691 | renderPassFail(result.retrievalScore) 692 | ) : ( 693 | 697 | Show less 698 | 699 | } 700 | showLabel={ 701 | 702 | Show more 703 | 704 | } 705 | > 706 | {result?.retrievalScore.justification} 707 | 708 | )} 709 | 711 | {isFastGradingPrompt ? ( 712 | renderPassFail(result?.answerScore) 713 | ) : ( 714 | 718 | Show less 719 | 720 | } 721 | showLabel={ 722 | 723 | Show more 724 | 725 | } 726 | > 727 | {result?.answerScore.justification} 728 | 729 | )} 730 | {result?.latency?.toFixed(3)}
736 |
737 |
738 |
739 | ) : null} 740 |
741 | ); 742 | }; 743 | export default Playground; 744 | -------------------------------------------------------------------------------- /nextjs/components/Sidebar.tsx: -------------------------------------------------------------------------------- 1 | import { ScrollArea, Select, Slider, Stack, Text } from "@mantine/core"; 2 | import React from "react"; 3 | import { Form } from "../utils/types"; 4 | import { Controller, useForm } from "react-hook-form"; 5 | 6 | const Sidebar = ({ form }: { form: Form }) => { 7 | const { control, setValue } = form; 8 | 9 | return ( 10 | <> 11 | 12 | Parameters 13 | 23 |
24 | Number of eval questions 25 | ( 29 | 42 | )} 43 | /> 44 |
45 |
46 | Chunk size 47 | ( 51 | 64 | )} 65 | /> 66 |
67 |
68 | Chunk overlap 69 | ( 73 | 86 | )} 87 | /> 88 |
89 |
90 | Model 91 | ( 95 | 127 | )} 128 | /> 129 |
130 |
131 | Embedding algorithm 132 | ( 136 | { 157 | field.onChange(value); 158 | if (value === "Anthropic-100k") { 159 | setValue("model", "anthropic"); 160 | setValue("splitMethod", ""); 161 | setValue("embeddingAlgorithm", ""); 162 | } 163 | }} 164 | data={[ 165 | { 166 | label: "Similarity Search", 167 | value: "similarity-search", 168 | }, 169 | { 170 | label: "SVM", 171 | value: "SVM", 172 | }, 173 | { label: "TF-IDF", value: "TF-IDF" }, 174 | { label: "Anthropic-100k", value: "Anthropic-100k" }, 175 | ]} 176 | /> 177 | )} 178 | /> 179 |
180 |
181 | Number of chunks to retrieve 182 | ( 186 | 198 | )} 199 | /> 200 |
201 |
202 | Grading prompt style 203 | ( 207 |