├── .gitignore ├── LICENSE ├── README.md ├── api ├── .env ├── README.md ├── Test_Inference.ipynb ├── __pycache__ │ ├── evaluator_app.cpython-38.pyc │ └── text_utils.cpython-38.pyc ├── docs │ ├── gpt3 │ │ ├── 2005.14165.pdf │ │ └── gpt3-eval.csv │ ├── karpathy-lex-pod │ │ ├── karpathy-pod-eval.csv │ │ └── karpathy-pod.txt │ └── transformers-challenge │ │ ├── 2005.14165.pdf │ │ ├── 2112.04426.pdf │ │ ├── 2203.15556.pdf │ │ ├── 2205.06175.pdf │ │ ├── 2302.13971.pdf │ │ └── transformers-eval.csv ├── evaluator_app.py ├── logging.conf ├── railway.json ├── requirements.txt └── text_utils.py ├── nextjs ├── .env.local ├── .gitignore ├── components │ ├── Demo.tsx │ ├── ExperimentSummaryTable.tsx │ ├── HeaderEvaluator.tsx │ ├── PersonCard.tsx │ ├── Playground.tsx │ ├── Sidebar.tsx │ ├── SummaryChart.tsx │ ├── TestFileUploadZone.tsx │ └── tables │ │ ├── ExperimentResultTable.tsx │ │ └── FilesTable.tsx ├── next-env.d.ts ├── next.config.js ├── package.json ├── pages │ ├── _app.tsx │ ├── about │ │ └── index.tsx │ ├── index.tsx │ └── playground │ │ └── index.tsx ├── public │ ├── favicon │ │ ├── about.txt │ │ ├── android-chrome-192x192.png │ │ ├── android-chrome-512x512.png │ │ ├── apple-touch-icon.png │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── favicon.ico │ │ └── site.webmanifest │ ├── github-mark.svg │ ├── slack-mark.svg │ ├── testData │ │ ├── experiments.json │ │ ├── karpathy-pod.json │ │ ├── results.json │ │ └── testDataset.json │ └── twitter-black.svg ├── styles │ ├── global.css │ └── utils.module.css ├── tsconfig.json ├── utils │ ├── renderPassFail.ts │ ├── types.ts │ └── variables.ts └── yarn.lock └── streamlit ├── README.md ├── auto-evaluator.py ├── eval_sets └── lex-pod-eval.json ├── img └── diagnostic.jpg ├── kor_retriever_lex.py ├── prompts.py ├── requirements.txt └── self_query_retriever_lex.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vercel 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Elastic License 2.0 (ELv2) 2 | 3 | **Acceptance** 4 | By using the software, you agree to all of the terms and conditions below. 5 | 6 | **Copyright License** 7 | The licensor grants you a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable license to use, copy, distribute, make available, and prepare derivative works of the software, in each case subject to the limitations and conditions below 8 | 9 | **Limitations** 10 | You may not provide the software to third parties as a hosted or managed service, where the service provides users with access to any substantial set of the features or functionality of the software. 11 | 12 | You may not move, change, disable, or circumvent the license key functionality in the software, and you may not remove or obscure any functionality in the software that is protected by the license key. 13 | 14 | You may not alter, remove, or obscure any licensing, copyright, or other notices of the licensor in the software. Any use of the licensor’s trademarks is subject to applicable law. 15 | 16 | **Patents** 17 | The licensor grants you a license, under any patent claims the licensor can license, or becomes able to license, to make, have made, use, sell, offer for sale, import and have imported the software, in each case subject to the limitations and conditions in this license. 
This license does not cover any patent claims that you cause to be infringed by modifications or additions to the software. If you or your company make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company. 18 | 19 | **Notices** 20 | You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms. 21 | 22 | If you modify the software, you must include in any modified copies of the software prominent notices stating that you have modified the software. 23 | 24 | **No Other Rights** 25 | These terms do not imply any licenses other than those expressly granted in these terms. 26 | 27 | **Termination** 28 | If you use the software in violation of these terms, such use is not licensed, and your licenses will automatically terminate. If the licensor provides you with a notice of your violation, and you cease all violation of this license no later than 30 days after you receive that notice, your licenses will be reinstated retroactively. However, if you violate these terms after such reinstatement, any additional violation of these terms will cause your licenses to terminate automatically and permanently. 29 | 30 | **No Liability** 31 | As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim. 32 | 33 | **Definitions** 34 | The *licensor* is the entity offering these terms, and the *software* is the software the licensor makes available under these terms, including any portion of it. 35 | 36 | *you* refers to the individual or entity agreeing to these terms. 37 | 38 | *your company* is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. *control* means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect. 39 | 40 | *your licenses* are all the licenses granted to you for the software under these terms. 41 | 42 | *use* means anything you do with the software requiring one of your licenses. 43 | 44 | *trademark* means trademarks, service marks, and similar rights. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `Auto-evaluator` :brain: :memo: 2 | 3 | `Context` 4 | 5 | Document [Question-Answering](https://python.langchain.com/en/latest/use_cases/question_answering.html) is a popular LLM use-case. LangChain makes it easy to assemble LLM components (e.g., models and retrievers) into chains that support question-answering: input documents are split into chunks and stored in a retriever, relevant chunks are retrieved given a user `question` and passed to an LLM for synthesis into an `answer`. 
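For orientation, below is a minimal sketch of that flow using the same LangChain components this repo's `api/evaluator_app.py` uses (`RecursiveCharacterTextSplitter`, OpenAI embeddings, a FAISS vector store, and a `RetrievalQA` chain). The document path, chunk settings, and question are illustrative placeholders; it assumes you run from the `api` folder with `OPENAI_API_KEY` set:

```
# Minimal sketch of the split -> store -> retrieve -> synthesize flow described above.
# Mirrors the components used in api/evaluator_app.py; path, settings, and question are placeholders.
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter

text = open("docs/karpathy-lex-pod/karpathy-pod.txt").read()

# 1. Split the input document into chunks
splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_text(text)

# 2. Embed the chunks and store them in a retriever (FAISS vector store)
retriever = FAISS.from_texts(splits, OpenAIEmbeddings()).as_retriever(k=3)

# 3. Retrieve chunks relevant to a question and synthesize an answer with an LLM
qa_chain = RetrievalQA.from_chain_type(
    ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=retriever,
    input_key="question",
)
print(qa_chain({"question": "What does Karpathy say about the Transformer?"})["result"])
```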
6 | 7 | `Challenge` 8 | 9 | The quality of QA systems can vary considerably; for example, [we have seen](https://lancemartin.notion.site/lancemartin/Lex-GPT-a3ad671766d34f4a9a078da7adf9d382) cases of hallucination and poor answer quality due to specific parameter settings. But it is not always obvious how to (1) evaluate the answer quality in a systematic way and (2) use this evaluation to guide improved QA chain settings (e.g., chunk size) or components (e.g., model or retriever choice). 10 | 11 | 12 | `App overview` 13 | 14 | This app aims to address the above limitations. Recent [work](https://arxiv.org/abs/2212.09251) from Anthropic has used model-written evaluation sets. OpenAI and others [have shown](https://github.com/openai/evals/blob/main/evals/registry/modelgraded/closedqa.yaml) that model-graded evaluation is an effective way to evaluate models. This app combines both of these ideas into a single workspace, auto-generating a QA test set and auto-grading the result of the specified QA chain. 15 | 16 | ![image](https://user-images.githubusercontent.com/122662504/235393525-be89ef39-2f72-4b61-b8ee-add6a14796b9.png) 17 | 18 | `Usage` 19 | 20 | The app can be used in two ways: 21 | 22 | - `Demo`: We pre-loaded a document (a [transcript](https://youtu.be/OYsYgzzsdT0) of the Lex Fridman podcast with Andrej Karpathy) and a set of 5 [question-answer pairs](https://github.com/langchain-ai/auto-evaluator/blob/main/api/docs/karpathy-lex-pod/karpathy-pod-eval.csv) from the podcast. You can configure QA chain(s) and run an experiment. 23 | 24 | ![image](https://user-images.githubusercontent.com/122662504/234627824-2304f741-9f7b-4252-bdb4-ef2bdfd8139a.png) 25 | 26 | - `Playground`: Input a set of documents that you want to ask questions about. Optionally, also include your own test set of question-answer pairs related to the documents; see an example [here](https://github.com/langchain-ai/auto-evaluator/tree/main/api/docs/karpathy-lex-pod). If you do not supply a test set, the app will auto-generate one. If the test set is smaller than the desired number of eval questions specified in the top left, the app will auto-generate the remainder. 27 | 28 | ![image](https://user-images.githubusercontent.com/122662504/234629201-4c17b411-f910-476b-9bf6-1246c7c5a307.png) 29 | 30 | `Building the document retriever`: 31 | 32 | - The app will build a [retriever](https://blog.langchain.dev/retrieval/) for the input documents. 33 | - A retriever is a LangChain abstraction that accepts a question and returns a set of relevant documents. 34 | - The retriever can be selected by the user in the drop-down list in the configurations (red panel above). 35 | 36 | `Test set generation`: 37 | 38 | - The app will auto-generate a test set of question-answer pairs from the doc(s). 39 | - To do this, it uses the LangChain `QAGenerationChain` with the default prompt [here](https://github.com/hwchase17/langchain/blob/master/langchain/chains/qa_generation/prompt.py). 40 | 41 | `LLM question-answering`: 42 | 43 | - For each question, we use a `RetrievalQA` chain to answer it. 44 | - This will fetch chunks that are relevant to the question from the `retriever` and pass them to the LLM. 45 | - We expose the `QA_CHAIN_PROMPT` used to pass this context to the LLM [here](https://github.com/langchain-ai/auto-evaluator/blob/main/api/text_utils.py). 
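A minimal sketch of these two steps (mirroring `generate_eval()` and `make_chain()` in `api/evaluator_app.py`) is shown below; it assumes the `text` and `retriever` variables from the earlier sketch, and the window size and model choice are illustrative. The generated pair and the chain's prediction are what the model-graded evaluation described next compares:

```
# Minimal sketch of test-set generation + question-answering, mirroring
# generate_eval() and make_chain() in api/evaluator_app.py. Assumes `text` and
# `retriever` from the earlier sketch; window size and model are illustrative.
import random
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain, RetrievalQA

# Auto-generate one question-answer pair from a random 3000-character window of the doc
start = random.randint(0, len(text) - 3000)
qa_gen = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
eval_pair = qa_gen.run(text[start:start + 3000])[0]  # {"question": ..., "answer": ...}

# Answer the generated question with a RetrievalQA chain over the retriever
qa_chain = RetrievalQA.from_chain_type(
    ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=retriever,
    input_key="question",
)
prediction = qa_chain(eval_pair)  # adds a "result" key holding the chain's answer
```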
46 | 47 | `Model-graded evaluation`: 48 | 49 | - We let the user select from a number of model-graded evaluation prompts: 50 | 51 | (1) The app will evaluate the `relevance of the retrieved documents` relative to the question. 52 | 53 | (2) The app will evaluate the `similarity of the LLM generated answer` relative to the ground truth answer. 54 | 55 | - The prompts for both can be seen [here](https://github.com/dankolesnikov/evaluator-app/blob/main/api/text_utils.py). 56 | - Users can select which grading prompt to use. [Here](https://rlancemartin.notion.site/Auto-Evaluator-Opportunities-7b3459dc2ae34440ae3481fe6f43ba40) are some notes on prompt selection from our experience. 57 | 58 | `Experimental results`: 59 | 60 | - The app will produce a table summarizing the results. 61 | - It shows the question and the ground truth (expected) answer. 62 | - It shows the chain-generated answer. 63 | - It shows the binary score (PASS / FAIL) for retrieval and the answer. 64 | - It shows the latency for retrieval and LLM answer summarization per question. 65 | - It shows the model grader output (the raw output of the grading prompt). 66 | 67 | ![image](https://user-images.githubusercontent.com/122662504/235396320-e392f912-977c-4871-b1d2-bd7a8be128a1.png) 68 | 69 | ## User inputs 70 | 71 | The left panel of the app (shown in red in the above image) has several user-configurable parameters. 72 | 73 | `Number of eval questions` - This is the number of question-answer pairs to auto-generate for the given input documents. As mentioned above, question-answer pair auto-generation will use LangChain's `QAGenerationChain` with the prompt specified [here](https://github.com/hwchase17/langchain/blob/master/langchain/chains/qa_generation/prompt.py). 74 | 75 | `Chunk size` - Number of characters per chunk when the input documents are split. This [can impact answer quality](https://lancemartin.notion.site/lancemartin/Q-A-assistant-limitations-f576bf55b61c44e0970330ac3883315e). Retrievers often use text embedding similarity to select chunks related to the question. If the chunks are too large, each chunk may contain more information unrelated to the question, which may degrade the summarized answer quality. If chunks are too small, important context may be left out of the retrieved chunks. 76 | 77 | `Overlap` - The overlap in characters between chunks. 78 | 79 | `Embedding` - The method used to embed chunks. 80 | 81 | `Retriever` - The method used to [retrieve chunks](https://blog.langchain.dev/retrieval/) that are relevant to the user question. The default vector database used for similarity search is [FAISS](https://github.com/dankolesnikov/evaluator-app/blob/235105642ff1d0ab15be87be7328df71b403268b/api/evaluator_app.py#L131), but support for others is a welcome addition. You can also try other methods, such as [SVM](https://twitter.com/karpathy/status/1647025230546886658) or [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). 82 | 83 | `Number of chunks to retrieve` - Number of chunks retrieved. More chunks can improve performance by giving the LLM more context for answer summarization. 84 | 85 | `Model` - LLM for summarization of retrieved chunks into the answer. 86 | 87 | `Grading prompt style` - The prompt choice for model-graded evaluation. As mentioned above, the prompts can be seen [here](https://github.com/dankolesnikov/evaluator-app/blob/main/api/text_utils.py). More prompts would be a welcome addition. 
For example, with the `Descriptive` prompt, you will see a more detailed output with model grade justification. 88 | 89 | ## Logging experiments 90 | 91 | A user can select the desired configuration and then choose `Re-Run Experiment`. 92 | 93 | This will run the new chain on the existing test set. 94 | 95 | The results from all experiments will be summarized in the table and chart. 96 | 97 | ![image](https://user-images.githubusercontent.com/122662504/235396398-5fbf2821-d47f-4496-b8c7-201c9b7e66bc.png) 98 | 99 | ## Contributing 100 | 101 | Run the backend from `api` folder: 102 | 103 | ``` 104 | pip install -r requirements.txt 105 | uvicorn evaluator_app:app 106 | ``` 107 | 108 | Test the `api` locally: 109 | 110 | ``` 111 | curl -X POST -F "files=@docs/karpathy-lex-pod/karpathy-pod.txt" -F "num_eval_questions=1" -F "chunk_chars=1000" -F "overlap=100" -F "split_method=RecursiveTextSplitter" -F "retriever_type=similarity-search" -F "embeddings=OpenAI" -F "model_version=gpt-3.5-turbo" -F "grade_prompt=Fast" -F "num_neighbors=3" http://localhost:8000/evaluator-stream 112 | ``` 113 | 114 | Run the frontend from `nextjs` folder and view web app at specified URL (e.g., `http://localhost:3000/`): 115 | 116 | ``` 117 | yarn install 118 | yarn dev 119 | ``` 120 | 121 | ### Environment Variables 122 | 123 | Front-end: 124 | 125 | `.env.local` contains the env variables needed to run the project. 126 | 127 | Back-end: 128 | 129 | Specify the API keys for any models that you want to use. 130 | 131 | ``` 132 | OPENAI_API_KEY= 133 | ANTHROPIC_API_KEY= 134 | ``` 135 | 136 | ## Deployment 137 | 138 | The front-end is deployed to [Vercel](https://vercel.com/). 139 | 140 | The back-end is deployed to [Railway](https://railway.app/). 141 | -------------------------------------------------------------------------------- /api/.env: -------------------------------------------------------------------------------- 1 | ENVIRONMENT=development -------------------------------------------------------------------------------- /api/README.md: -------------------------------------------------------------------------------- 1 | # `auto-evaluator-api` 2 | 3 | This API includes much of the functionality of the [auto-evaluator Streamlit app](https://github.com/PineappleExpress808/auto-evaluator). 4 | 5 | And it is the back-end for [the hosted app](https://autoevaluator.langchain.com/). 6 | 7 | ### `Test locally` - 8 | 9 | Set API keys: 10 | ``` 11 | export OPENAI_API_KEY= 12 | export ANTHROPIC_API_KEY= 13 | ``` 14 | 15 | Start local server: 16 | ``` 17 | uvicorn evaluator_app:app 18 | ``` 19 | 20 | `Disclaimer: You will not be able to use all the models unless you have the corresponding API key (e.g., Anthropic).` 21 | 22 | Test: 23 | ``` 24 | curl -X POST -F "files=@docs/karpathy-lex-pod/karpathy-pod.txt" -F "num_eval_questions=1" -F "chunk_chars=1000" -F "overlap=100" -F "split_method=RecursiveTextSplitter" -F "retriever_type=similarity-search" -F "embeddings=OpenAI" -F "model_version=gpt-3.5-turbo" -F "grade_prompt=Fast" -F "num_neighbors=3" http://localhost:8000/evaluator-stream 25 | ``` 26 | 27 | ### `Test deployed API -` 28 | 29 | This API is deployed to [Railway](https://railway.app/). 
30 | 31 | ``` 32 | curl -X POST -F "files=@docs/karpathy-lex-pod/karpathy-pod.txt" -F "num_eval_questions=1" -F "chunk_chars=1000" -F "overlap=100" -F "split_method=RecursiveTextSplitter" -F "retriever_type=similarity-search" -F "embeddings=OpenAI" -F "model_version=gpt-3.5-turbo" -F "grade_prompt=Fast" -F "num_neighbors=3" https://auto-evaluator-production.up.railway.app/evaluator-stream 33 | 34 | ``` -------------------------------------------------------------------------------- /api/Test_Inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2c926d26", 6 | "metadata": {}, 7 | "source": [ 8 | "### Test Vicuna\n", 9 | "\n", 10 | "* `Background`: https://python.langchain.com/en/latest/modules/models/llms/integrations/llamacpp.html\n", 11 | "* Reproduce the logic that happens in API of the `auto-evaluator`" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "b08e9089", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "!pip install llama-cpp-python" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "9d96ad1f", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import glob, os\n", 32 | "from langchain.llms import LlamaCpp\n", 33 | "from langchain.llms import Replicate\n", 34 | "from langchain.chains import RetrievalQA\n", 35 | "from langchain.vectorstores import FAISS\n", 36 | "from langchain import PromptTemplate, LLMChain\n", 37 | "from langchain.callbacks.base import BaseCallbackManager\n", 38 | "from langchain.embeddings.openai import OpenAIEmbeddings\n", 39 | "from langchain.document_loaders import UnstructuredFileLoader\n", 40 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 41 | "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "47f4e440", 47 | "metadata": {}, 48 | "source": [ 49 | "`Load`" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "8fb243c5", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "def load_docs(files):\n", 60 | "\n", 61 | " # Load docs\n", 62 | " # IN: List of upload files (from Streamlit)\n", 63 | " # OUT: str\n", 64 | " # TODO: Support multple docs, Use Langchain loader\n", 65 | "\n", 66 | " all_text = \"\"\n", 67 | " for file_path in files:\n", 68 | " file_extension = os.path.splitext(file_path)[1]\n", 69 | " if file_extension == \".pdf\":\n", 70 | " pdf_reader = pypdf.PdfReader(file_path)\n", 71 | " text = \"\"\n", 72 | " for page in pdf_reader.pages:\n", 73 | " text += page.extract_text()\n", 74 | " all_text += text\n", 75 | " elif file_extension == \".txt\":\n", 76 | " loader = UnstructuredFileLoader(file_path)\n", 77 | " docs = loader.load()\n", 78 | " all_text += docs[0].page_content\n", 79 | " else:\n", 80 | " print('Please provide txt or pdf.')\n", 81 | "\n", 82 | " return all_text\n", 83 | "\n", 84 | "fis = glob.glob(\"docs/karpathy-lex-pod/*txt\")\n", 85 | "text = load_docs(fis)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "d6e75a9c", 91 | "metadata": {}, 92 | "source": [ 93 | "`Split`" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "a3370cd8", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "def split_texts(text, chunk_size, overlap, split_method):\n", 104 | "\n", 105 | " # Split text\n", 106 | 
" # IN: text, chunk size, overlap\n", 107 | " # OUT: list of str splits\n", 108 | " # TODO: Add parameter for splitter type\n", 109 | "\n", 110 | " print(\"`Splitting doc ...`\")\n", 111 | " if split_method == \"RecursiveTextSplitter\":\n", 112 | " text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,\n", 113 | " chunk_overlap=overlap)\n", 114 | " elif split_method == \"CharacterTextSplitter\":\n", 115 | " text_splitter = CharacterTextSplitter(separator=\" \",\n", 116 | " chunk_size=chunk_size,\n", 117 | " chunk_overlap=overlap)\n", 118 | " splits = text_splitter.split_text(text)\n", 119 | " return splits\n", 120 | "\n", 121 | "split_method = \"RecursiveTextSplitter\" \n", 122 | "overlap = 20\n", 123 | "chunk_size = 500\n", 124 | "splits = split_texts(text, chunk_size, overlap, split_method)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "8b3c35fd", 130 | "metadata": {}, 131 | "source": [ 132 | "`Test model`" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "521ab75c", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "### *** update with your local path *** ###\n", 143 | "LLAMA_CPP_PATH = \"/Users/31treehaus/Desktop/AI/llama.cpp\"" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "6264c05d", 150 | "metadata": { 151 | "scrolled": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "# Pass the raw question into the prompt template.\n", 156 | "template = \"\"\"Question: {question}\n", 157 | "Answer: Let's think step by step.\"\"\"\n", 158 | "prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n", 159 | "\n", 160 | "callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])\n", 161 | "llm = LlamaCpp(\n", 162 | " \n", 163 | " model_path=LLAMA_CPP_PATH+\"models/vicuna_13B/ggml-vicuna-13b-4bit.bin\",\n", 164 | " callback_manager=callback_manager,\n", 165 | " verbose=True,\n", 166 | " n_threads=6,\n", 167 | " n_ctx=2048,\n", 168 | " use_mlock=True)\n", 169 | "\n", 170 | "llm_chain = LLMChain(prompt=prompt,llm=llm)\n", 171 | "question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n", 172 | "llm_chain.run(question)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "id": "68a09e14", 178 | "metadata": {}, 179 | "source": [ 180 | "`Make Retrieval Chain`" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "a5d174af", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "def make_retriever(splits, retriever_type, embeddings, num_neighbors):\n", 191 | "\n", 192 | " # Make document retriever\n", 193 | " # IN: list of str splits, retriever type, embedding type, number of neighbors for retrieval\n", 194 | " # OUT: retriever\n", 195 | "\n", 196 | " print(\"`Making retriever ...`\")\n", 197 | " # Set embeddings\n", 198 | " if embeddings == \"OpenAI\":\n", 199 | " embd = OpenAIEmbeddings()\n", 200 | " elif embeddings == \"HuggingFace\":\n", 201 | " embd = HuggingFaceEmbeddings()\n", 202 | "\n", 203 | " # Select retriever\n", 204 | " if retriever_type == \"similarity-search\":\n", 205 | " try:\n", 206 | " vectorstore = FAISS.from_texts(splits, embd)\n", 207 | " except ValueError:\n", 208 | " print(\"`Error using OpenAI embeddings (disallowed TikToken token in the text). 
Using HuggingFace.`\")\n", 209 | " vectorstore = FAISS.from_texts(splits, HuggingFaceEmbeddings())\n", 210 | " retriever = vectorstore.as_retriever(k=num_neighbors)\n", 211 | " elif retriever_type == \"SVM\":\n", 212 | " retriever = SVMRetriever.from_texts(splits,embd)\n", 213 | " elif retriever_type == \"TF-IDF\":\n", 214 | " retriever = TFIDFRetriever.from_texts(splits)\n", 215 | " return retriever\n", 216 | "\n", 217 | "retriever_type = \"similarity-search\"\n", 218 | "embeddings = \"OpenAI\"\n", 219 | "num_neighbors = 3\n", 220 | "retriever = make_retriever(splits, retriever_type, embeddings, num_neighbors)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "7205c92d", 226 | "metadata": {}, 227 | "source": [ 228 | "`Make Prompt`" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "26bed6cd", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "template = \"\"\"Use the following pieces of context to answer the question at the end. Use three sentences maximum. \n", 239 | "{context}\n", 240 | "Question: {question}\n", 241 | "Answer: Think step by step \"\"\"\n", 242 | "\n", 243 | "QA_CHAIN_PROMPT = PromptTemplate(input_variables=[\"context\", \"question\"],template=template,)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "5deb1522", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "def make_llm(model):\n", 254 | " \"\"\"\n", 255 | " Make LLM\n", 256 | " @param model: LLM to use\n", 257 | " @return: LLM\n", 258 | " \"\"\"\n", 259 | "\n", 260 | " if model in (\"gpt-3.5-turbo\", \"gpt-4\"):\n", 261 | " llm = ChatOpenAI(model_name=model, temperature=0)\n", 262 | " elif model == \"anthropic\":\n", 263 | " llm = ChatAnthropic(temperature=0)\n", 264 | " elif model in (\"vicuna-7b\",\"vicuna-13b\"):\n", 265 | " callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])\n", 266 | " if model == \"vicuna-7b\":\n", 267 | " llm = LlamaCpp(\n", 268 | " model_path=LLAMA_CPP_PATH+\"models/vicuna_7B/ggml-vicuna-7b-q4_0.bin\",\n", 269 | " callback_manager=callback_manager,\n", 270 | " verbose=True,\n", 271 | " n_threads=6,\n", 272 | " n_ctx=2048,\n", 273 | " use_mlock=True)\n", 274 | " else:\n", 275 | " llm = LlamaCpp(\n", 276 | " model_path=LLAMA_CPP_PATH+\"models/vicuna_13B/ggml-vicuna-13b-4bit.bin\",\n", 277 | " callback_manager=callback_manager,\n", 278 | " verbose=True,\n", 279 | " n_threads=6,\n", 280 | " n_ctx=2048,\n", 281 | " use_mlock=True)\n", 282 | " return llm\n", 283 | "\n", 284 | "llm = make_llm('vicuna-13b')" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "id": "6f48abfe", 290 | "metadata": {}, 291 | "source": [ 292 | "`Eval Set`" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "f861a780", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "import json, pandas as pd\n", 303 | "test_dataset = pd.read_csv(\"docs/karpathy-lex-pod/karpathy-pod-eval.csv\")\n", 304 | "qus = []\n", 305 | "for i in test_dataset.index:\n", 306 | " question = test_dataset.loc[i, \"question\"]\n", 307 | " answer = test_dataset.loc[i, \"answer\"]\n", 308 | " data = {\n", 309 | " \"question\": question,\n", 310 | " \"answer\": answer\n", 311 | " }\n", 312 | " qus.append(data)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "id": "72e60bb5", 319 | "metadata": { 320 | "scrolled": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "qus[0]" 
325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "id": "6003593a", 330 | "metadata": {}, 331 | "source": [ 332 | "`Run Inference`" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "id": "6f675d67", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "def make_chain(llm, retriever, retriever_type):\n", 343 | " \"\"\"\n", 344 | " Make retrieval chain\n", 345 | " @param llm: model\n", 346 | " @param retriever: retriever\n", 347 | " @param retriever_type: retriever type\n", 348 | " @return: QA chain or Llama-Index retriever, which enables QA\n", 349 | " \"\"\"\n", 350 | "\n", 351 | " chain_type_kwargs = {\"prompt\": QA_CHAIN_PROMPT}\n", 352 | " qa_chain = RetrievalQA.from_chain_type(llm,\n", 353 | " chain_type=\"stuff\",\n", 354 | " retriever=retriever,\n", 355 | " chain_type_kwargs=chain_type_kwargs,\n", 356 | " input_key=\"question\")\n", 357 | " return qa_chain\n", 358 | "\n", 359 | "qa_chain = make_chain(llm, retriever, retriever_type)\n", 360 | "result = qa_chain(qus[0])\n", 361 | "result" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "id": "469143b0", 367 | "metadata": {}, 368 | "source": [ 369 | "`Test Vicuna endpoint on Replicate`\n", 370 | "\n", 371 | "Deployed to `A100` on Replicate.\n", 372 | "\n", 373 | "* `max_length` maximum length of the prompt + the output for a given generation\n", 374 | "* `context window` 2048 tokens\n", 375 | "\n", 376 | "Useful reference:\n", 377 | "https://github.com/replicate/cog-vicuna-13b/issues/3\n", 378 | "\n" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "c7a38076", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "llm = Replicate(model=\"replicate/vicuna-13b:e6d469c2b11008bb0e446c3e9629232f9674581224536851272c54871f84076e\",\n", 389 | " input={\"temperature\": 0.75, \"max_length\": 3000, \"top_p\":0.25})" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "id": "f81af87e", 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "from text_utils import QA_CHAIN_PROMPT, QA_CHAIN_PROMPT_LLAMA\n", 400 | "chain_type_kwargs = {\"prompt\": QA_CHAIN_PROMPT_LLAMA}\n", 401 | "qa_chain = RetrievalQA.from_chain_type(llm,\n", 402 | " chain_type=\"stuff\",\n", 403 | " retriever=retriever,\n", 404 | " chain_type_kwargs=chain_type_kwargs,\n", 405 | " input_key=\"question\")" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "id": "81694537", 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "qa_chain(qus[0])" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "id": "5a8ada58", 421 | "metadata": {}, 422 | "source": [ 423 | "`Test Mosaic`\n", 424 | "\n", 425 | "LangChain docs: \n", 426 | "\n", 427 | "https://python.langchain.com/en/latest/modules/models/text_embedding/examples/mosaicml.html\n", 428 | "\n", 429 | "Args: \n", 430 | "\n", 431 | "https://docs.mosaicml.com/en/latest/inference.html" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "id": "d9371e1a", 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "from langchain.llms import MosaicML" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "402f9fea", 448 | "metadata": { 449 | "scrolled": false 450 | }, 451 | "outputs": [], 452 | "source": [ 453 | "llm = MosaicML(inject_instruction_format=True,model_kwargs={'do_sample': False,'max_length': 3000})" 
454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "id": "dfb1753c", 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "template = \"\"\"Use the following pieces of context to answer the question at the end. Use three sentences maximum. \n", 464 | "{context}\n", 465 | "Question: {question} \"\"\"\n", 466 | "QA_CHAIN_PROMPT = PromptTemplate(input_variables=[\"context\", \"question\"],template=template,)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "id": "07232f14", 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "chain_type_kwargs = {\"prompt\": QA_CHAIN_PROMPT}\n", 477 | "qa_chain = RetrievalQA.from_chain_type(llm,\n", 478 | " chain_type=\"stuff\",\n", 479 | " retriever=retriever,\n", 480 | " chain_type_kwargs=chain_type_kwargs,\n", 481 | " input_key=\"question\")" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "id": "9030598d", 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "qa_chain(qus[0])" 492 | ] 493 | } 494 | ], 495 | "metadata": { 496 | "kernelspec": { 497 | "display_name": "Python 3 (ipykernel)", 498 | "language": "python", 499 | "name": "python3" 500 | }, 501 | "language_info": { 502 | "codemirror_mode": { 503 | "name": "ipython", 504 | "version": 3 505 | }, 506 | "file_extension": ".py", 507 | "mimetype": "text/x-python", 508 | "name": "python", 509 | "nbconvert_exporter": "python", 510 | "pygments_lexer": "ipython3", 511 | "version": "3.9.16" 512 | } 513 | }, 514 | "nbformat": 4, 515 | "nbformat_minor": 5 516 | } 517 | -------------------------------------------------------------------------------- /api/__pycache__/evaluator_app.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/__pycache__/evaluator_app.cpython-38.pyc -------------------------------------------------------------------------------- /api/__pycache__/text_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/__pycache__/text_utils.cpython-38.pyc -------------------------------------------------------------------------------- /api/docs/gpt3/2005.14165.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/gpt3/2005.14165.pdf -------------------------------------------------------------------------------- /api/docs/gpt3/gpt3-eval.csv: -------------------------------------------------------------------------------- 1 | "question","answer", 2 | "What are the limitations of task-specific fine-tuning?", "First, the need for a large dataset of labeled examples for every new task limits the applicability of language models. Second, high capacity models tend to over-fit on narrow fine-tuning datasets and do not generalize well outside of them. Third, humans do not require large supervised datasets to learn most language tasks. 
To be broadly useful, we would someday like our NLP systems to have this same fluidity and generality.", 3 | "What is in-context learning?","In-context learning is an approach to meta-learning, which means the model develops a broad set of skills and pattern recognition abilities at training time, and then uses those abilities at inference time to rapidly adapt to or recognize the desired task when given examples. This involves absorbing many skills and tasks within the parameters of the model.", 4 | "On what NLP tasks does GPT3 report state-of-the-art performance using zero or few shot learning relative to fine-tuned benchmarks?","GPT3 achieves 71.2% on TriviaQA in the few-shot setting, which is state of the art relative to fine-tuned models operating in the same closed-book setting.", 5 | "What are the pros and cons of fine-tuning, zero-shot learning, and few-shot learning?","Fine-tuning involves updating the weights of a pre-trained model by training on a supervised dataset specific to the desired task. It benefits from strong performance on many benchmarks, but requires a new large dataset for every task. Few shot learning gives the model a few demonstrations of the task at inference time as conditioning, but no weight updates are done. It benefits from a major reduction in the need for task-specific data. But results from this method have so far been much worse than state-of-the-art fine-tuned models. In zero-shot learning, the model is only given a natural language instruction describing the task without any examples. It is the most convent and potentially robust approach, but the most challenges (especially for tasks that are difficult to describe).", 6 | "How is the batch size increased over the course of training?","The batch size is increased linearly from a small value (32k tokens) to the full value over the first 4-12 billion tokens of training, depending on the model size.", -------------------------------------------------------------------------------- /api/docs/karpathy-lex-pod/karpathy-pod-eval.csv: -------------------------------------------------------------------------------- 1 | "question","answer", 2 | "Why is the transformer architecture expressive in the forward pass?","The transformer architecture is expressive because it uses a general message passing scheme where nodes get to look at each other, decide what's interesting and then update each other.", 3 | "What design criteria does the Transformer meet?","The transformer is very expressive in a forward pass, optimizable in the backward pass using the techniques that we have such as gradient descent, and it can run efficiently on our hardware such as GPUs.", 4 | "Why is next word prediction an effective training objective?","On a sufficiently large dataset, the task of predicting the next word multi-tasks knowledge of a lot of things, including understanding of chemistry, physics, and human nature. You have to understand a lot about the world to make that prediction on an internet-scale dataset.", 5 | "What was the World Of Bits project and why did it fail?","World Of Bits was an effort to give AI access to tools, such as a keyboard and mouse, in order to complete tasks, such as complete bookings. It failed because it turned out that reinforcement learning is an extremely inefficient way of training neural networks. You take many actions, but you only get a sparse reward once in a while. 
Starting from scratch, it is very unlikely to stumble on the correct action - such as a booking - by chance at random, so the reward signal is very sparse.", 6 | "Why can additional sensors be a liability in an autonomous vehicle system?","Each sensor adds complexity to the system. The hardware must be sourced, versioned, and maintain firmware. Software must ingest it, track versions. The cost of this additional bloat or entropy must be weighted against the added benefit of that particular sensor." -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2005.14165.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2005.14165.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2112.04426.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2112.04426.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2203.15556.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2203.15556.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2205.06175.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2205.06175.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/2302.13971.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/auto-evaluator/a18e71fac95006238bfedb0f6aa9c6bd61244066/api/docs/transformers-challenge/2302.13971.pdf -------------------------------------------------------------------------------- /api/docs/transformers-challenge/transformers-eval.csv: -------------------------------------------------------------------------------- 1 | "question","answer", 2 | "What are the limitations of task-specific fine-tuning?", "First, the need for a large dataset of labeled examples for every new task limits the applicability of language models. Second, high capacity models tend to over-fit on narrow fine-tuning datasets and do not generalize well outside of them. Third, humans do not require large supervised datasets to learn most language tasks. To be broadly useful, we would someday like our NLP systems to have this same fluidity and generality.", 3 | "What is in-context learning?","In-context learning is an approach to meta-learning, which means the model develops a broad set of skills and pattern recognition abilities at training time, and then uses those abilities at inference time to rapidly adapt to or recognize the desired task when given examples. 
This involves absorbing many skills and tasks within the parameters of the model.", 4 | "On what NLP tasks does GPT3 report state-of-the-art performance using zero or few shot learning relative to fine-tuned benchmarks?","GPT3 achieves 71.2% on TriviaQA in the few-shot setting, which is state of the art relative to fine-tuned models operating in the same closed-book setting.", 5 | "What are the pros and cons of fine-tuning, zero-shot learning, and few-shot learning?","Fine-tuning involves updating the weights of a pre-trained model by training on a supervised dataset specific to the desired task. It benefits from strong performance on many benchmarks, but requires a new large dataset for every task. Few shot learning gives the model a few demonstrations of the task at inference time as conditioning, but no weight updates are done. It benefits from a major reduction in the need for task-specific data. But results from this method have so far been much worse than state-of-the-art fine-tuned models. In zero-shot learning, the model is only given a natural language instruction describing the task without any examples. It is the most convent and potentially robust approach, but the most challenges (especially for tasks that are difficult to describe).", 6 | "How is the batch size increased for the GPT3 models?","The batch size is increased linearly from a small value (32k tokens) to the full value of 3.2M token over the first 2 billion tokens of training.", 7 | "How does RETRO perform retrieval in terms of search and latency?", " For each chunk, RETRO will retrieve its approximate k-nearest neighbours from a key-value database using the L2 distance on BERT embeddings. It uses the SCaNN library to query the approximate nearest neighbours in O(log𝑇) time." 8 | "What scaling law does the Chinchilla paper propose and how does Chinchilla compare to Gopher?", "The paper fits a scaling law for loss L, as a function of model size N and data size D. Based on the losses of over 400 models, the paper suggests that large models should be substantially smaller and therefore trained much longer than is currently done. They verify this by training a more compute-optimal 70B model, called Chinchilla, on 1.4 trillion tokens, which is 4x smaller than Gopher." 9 | "How do the LLaMA model compare to prior benchmarks, such as PALM, Chinchilla, and GPT-3?","LLaMA is trained only on publicly available data, making the work compatible with open-sourcing. LLaMA-13B outperforms GPT-3 on most benchmarks, despite being 10× smaller. The 65B-parameter model is also competitive with the best large language models such as Chinchilla or PaLM-540B." 10 | "How did the LLaMA models draw inspiration from GPT3, PaLM, GPTNeo, or Chinchilla?","Like GPT3, LLaMA normalizes the input of each transformer sub-layer using RMSNorm. Like PaLM, they replace the ReLU non-linearity with the SwiGLU activation function. Like GPTNeo, they remove the absolute positional embeddings, and instead, add rotary positional embeddings. The general approach was inspired by the Chinchilla scaling laws: LLaMA-13B outperforms GPT3, but can be run on a single GPU." 11 | "How does Gato embed multi-modal inputs?" , "Tokens belonging to text, discrete or continuous-valued observations or actions for any time-step are embedded via a lookup table into a learned vector embedding space. Tokens belonging to image patches for any time-step are embedded using a single ResNet block to obtain a vector per patch." 
-------------------------------------------------------------------------------- /api/evaluator_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is an API to support the LLM QA chain auto-evaluator. 3 | """ 4 | 5 | import io 6 | import os 7 | from dotenv import load_dotenv 8 | import sentry_sdk 9 | import json 10 | import time 11 | import pypdf 12 | import random 13 | import logging 14 | import itertools 15 | import faiss 16 | import pandas as pd 17 | from typing import Dict, List 18 | from json import JSONDecodeError 19 | from langchain.llms import MosaicML 20 | from langchain.llms import Anthropic 21 | from langchain.llms import Replicate 22 | from langchain.schema import Document 23 | from langchain.vectorstores import FAISS 24 | from langchain.chains import RetrievalQA 25 | from langchain.chat_models import ChatOpenAI 26 | from langchain.chains import QAGenerationChain 27 | from langchain.retrievers import SVMRetriever 28 | from langchain.evaluation.qa import QAEvalChain 29 | from langchain.retrievers import TFIDFRetriever 30 | from sse_starlette.sse import EventSourceResponse 31 | from fastapi.middleware.cors import CORSMiddleware 32 | from langchain.embeddings import LlamaCppEmbeddings 33 | from langchain.embeddings import MosaicMLInstructorEmbeddings 34 | from fastapi import FastAPI, File, UploadFile, Form 35 | from langchain.embeddings.openai import OpenAIEmbeddings 36 | from langchain.chains.question_answering import load_qa_chain 37 | from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter 38 | from text_utils import GRADE_DOCS_PROMPT, GRADE_ANSWER_PROMPT, GRADE_DOCS_PROMPT_FAST, GRADE_ANSWER_PROMPT_FAST, GRADE_ANSWER_PROMPT_BIAS_CHECK, GRADE_ANSWER_PROMPT_OPENAI, QA_CHAIN_PROMPT, QA_CHAIN_PROMPT_LLAMA 39 | 40 | def generate_eval(text, chunk, logger): 41 | """ 42 | Generate question answer pair from input text 43 | @param text: text to generate eval set from 44 | @param chunk: chunk size to draw question from text 45 | @param logger: logger 46 | @return: dict with keys "question" and "answer" 47 | """ 48 | 49 | logger.info("`Generating eval QA pair ...`") 50 | # Generate random starting index in the doc to draw question from 51 | num_of_chars = len(text) 52 | starting_index = random.randint(0, num_of_chars-chunk) 53 | sub_sequence = text[starting_index:starting_index+chunk] 54 | # Set up QAGenerationChain chain using GPT 3.5 as default 55 | chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0)) 56 | eval_set = [] 57 | # Catch any QA generation errors and re-try until QA pair is generated 58 | awaiting_answer = True 59 | while awaiting_answer: 60 | try: 61 | qa_pair = chain.run(sub_sequence) 62 | eval_set.append(qa_pair) 63 | awaiting_answer = False 64 | except JSONDecodeError: 65 | logger.error("Error on question") 66 | starting_index = random.randint(0, num_of_chars-chunk) 67 | sub_sequence = text[starting_index:starting_index+chunk] 68 | eval_pair = list(itertools.chain.from_iterable(eval_set)) 69 | return eval_pair 70 | 71 | 72 | def split_texts(text, chunk_size, overlap, split_method, logger): 73 | """ 74 | Split text into chunks 75 | @param text: text to split 76 | @param chunk_size: charecters per split 77 | @param overlap: charecter overlap between splits 78 | @param split_method: method used to split text 79 | @param logger: logger 80 | @return: list of str splits 81 | """ 82 | 83 | logger.info("`Splitting doc ...`") 84 | if split_method == "RecursiveTextSplitter": 85 | 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, 86 | chunk_overlap=overlap) 87 | elif split_method == "CharacterTextSplitter": 88 | text_splitter = CharacterTextSplitter(separator=" ", 89 | chunk_size=chunk_size, 90 | chunk_overlap=overlap) 91 | splits = text_splitter.split_text(text) 92 | return splits 93 | 94 | 95 | def make_llm(model): 96 | """ 97 | Make LLM 98 | @param model: LLM to use 99 | @return: LLM 100 | """ 101 | 102 | if model in ("gpt-3.5-turbo", "gpt-4"): 103 | llm = ChatOpenAI(model_name=model, temperature=0) 104 | elif model == "anthropic": 105 | llm = Anthropic(temperature=0) 106 | elif model == "Anthropic-100k": 107 | llm = Anthropic(model="claude-v1-100k",temperature=0) 108 | elif model == "vicuna-13b": 109 | llm = Replicate(model="replicate/vicuna-13b:e6d469c2b11008bb0e446c3e9629232f9674581224536851272c54871f84076e", 110 | input={"temperature": 0.75, "max_length": 3000, "top_p":0.25}) 111 | elif model == "mosaic": 112 | llm = MosaicML(inject_instruction_format=True,model_kwargs={'do_sample': False, 'max_length': 3000}) 113 | return llm 114 | 115 | def make_retriever(splits, retriever_type, embeddings, num_neighbors, llm, logger): 116 | """ 117 | Make document retriever 118 | @param splits: list of str splits 119 | @param retriever_type: retriever type 120 | @param embedding_type: embedding type 121 | @param num_neighbors: number of neighbors for retrieval 122 | @param _llm: model 123 | @param logger: logger 124 | @return: retriever 125 | """ 126 | 127 | logger.info("`Making retriever ...`") 128 | # Set embeddings 129 | if embeddings == "OpenAI": 130 | embd = OpenAIEmbeddings() 131 | # Note: Still WIP (can't be selected by user yet) 132 | elif embeddings == "LlamaCppEmbeddings": 133 | embd = LlamaCppEmbeddings(model="replicate/vicuna-13b:e6d469c2b11008bb0e446c3e9629232f9674581224536851272c54871f84076e") 134 | # Note: Test 135 | elif embeddings == "Mosaic": 136 | embd = MosaicMLInstructorEmbeddings(query_instruction="Represent the query for retrieval: ") 137 | 138 | # Select retriever 139 | if retriever_type == "similarity-search": 140 | vectorstore = FAISS.from_texts(splits, embd) 141 | retriever = vectorstore.as_retriever(k=num_neighbors) 142 | elif retriever_type == "SVM": 143 | retriever = SVMRetriever.from_texts(splits, embd) 144 | elif retriever_type == "TF-IDF": 145 | retriever = TFIDFRetriever.from_texts(splits) 146 | elif retriever_type == "Anthropic-100k": 147 | retriever = llm 148 | return retriever 149 | 150 | def make_chain(llm, retriever, retriever_type, model): 151 | 152 | """ 153 | Make retrieval chain 154 | @param llm: model 155 | @param retriever: retriever 156 | @param retriever_type: retriever type 157 | @return: QA chain 158 | """ 159 | 160 | # Select prompt 161 | if model == "vicuna-13b": 162 | # Note: Better answer quality using default prompt 163 | # chain_type_kwargs = {"prompt": QA_CHAIN_PROMPT_LLAMA} 164 | chain_type_kwargs = {"prompt": QA_CHAIN_PROMPT} 165 | else: 166 | chain_type_kwargs = {"prompt": QA_CHAIN_PROMPT} 167 | 168 | # Select model 169 | if retriever_type == "Anthropic-100k": 170 | qa_chain = load_qa_chain(llm,chain_type="stuff",prompt=QA_CHAIN_PROMPT) 171 | else: 172 | qa_chain = RetrievalQA.from_chain_type(llm, 173 | chain_type="stuff", 174 | retriever=retriever, 175 | chain_type_kwargs=chain_type_kwargs, 176 | input_key="question") 177 | return qa_chain 178 | 179 | 180 | def grade_model_answer(predicted_dataset, predictions, grade_answer_prompt, logger): 181 | """ 182 | Grades the answer based on ground 
truth and model predictions. 183 | @param predicted_dataset: A list of dictionaries containing ground truth questions and answers. 184 | @param predictions: A list of dictionaries containing model predictions for the questions. 185 | @param grade_answer_prompt: The prompt level for the grading. Either "Fast" or "Full". 186 | @param logger: logger 187 | @return: A list of scores for the distilled answers. 188 | """ 189 | 190 | logger.info("`Grading model answer ...`") 191 | if grade_answer_prompt == "Fast": 192 | prompt = GRADE_ANSWER_PROMPT_FAST 193 | elif grade_answer_prompt == "Descriptive w/ bias check": 194 | prompt = GRADE_ANSWER_PROMPT_BIAS_CHECK 195 | elif grade_answer_prompt == "OpenAI grading prompt": 196 | prompt = GRADE_ANSWER_PROMPT_OPENAI 197 | else: 198 | prompt = GRADE_ANSWER_PROMPT 199 | 200 | # Note: GPT-4 grader is advised by OAI 201 | eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(model_name="gpt-4", temperature=0), 202 | prompt=prompt) 203 | graded_outputs = eval_chain.evaluate(predicted_dataset, 204 | predictions, 205 | question_key="question", 206 | prediction_key="result") 207 | return graded_outputs 208 | 209 | 210 | def grade_model_retrieval(gt_dataset, predictions, grade_docs_prompt, logger): 211 | """ 212 | Grades the relevance of retrieved documents based on ground truth and model predictions. 213 | @param gt_dataset: list of dictionaries containing ground truth questions and answers. 214 | @param predictions: list of dictionaries containing model predictions for the questions 215 | @param grade_docs_prompt: prompt level for the grading. 216 | @return: list of scores for the retrieved documents. 217 | """ 218 | 219 | logger.info("`Grading relevance of retrieved docs ...`") 220 | if grade_docs_prompt == "Fast": 221 | prompt = GRADE_DOCS_PROMPT_FAST 222 | else: 223 | prompt = GRADE_DOCS_PROMPT 224 | 225 | # Note: GPT-4 grader is advised by OAI 226 | eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(model_name="gpt-4", temperature=0), 227 | prompt=prompt) 228 | graded_outputs = eval_chain.evaluate(gt_dataset, 229 | predictions, 230 | question_key="question", 231 | prediction_key="result") 232 | return graded_outputs 233 | 234 | 235 | def run_eval(chain, retriever, eval_qa_pair, grade_prompt, retriever_type, num_neighbors, text, logger): 236 | """ 237 | Runs evaluation on a model's performance on a given evaluation dataset. 238 | @param chain: Model chain used for answering questions 239 | @param retriever: Document retriever used for retrieving relevant documents 240 | @param eval_set: List of dictionaries containing questions and corresponding ground truth answers 241 | @param grade_prompt: String prompt used for grading model's performance 242 | @param retriever_type: String specifying the type of retriever used 243 | @param num_neighbors: Number of neighbors to retrieve using the retriever 244 | @param text: full document text 245 | @return: A tuple of four items: 246 | - answers_grade: A dictionary containing scores for the model's answers. 247 | - retrieval_grade: A dictionary containing scores for the model's document retrieval. 248 | - latencies_list: A list of latencies in seconds for each question answered. 249 | - predictions_list: A list of dictionaries containing the model's predicted answers and relevant documents for each question. 
250 | """ 251 | 252 | logger.info("`Running eval ...`") 253 | predictions = [] 254 | retrieved_docs = [] 255 | gt_dataset = [] 256 | latency = [] 257 | 258 | # Get answer and log latency 259 | start_time = time.time() 260 | if retriever_type == "Anthropic-100k": 261 | docs=[Document(page_content=text)] 262 | answer = chain.run(input_documents=docs,question=eval_qa_pair["question"]) 263 | predictions.append( 264 | {"question": eval_qa_pair["question"], "answer": eval_qa_pair["answer"], "result": answer}) 265 | else : 266 | predictions.append(chain(eval_qa_pair)) 267 | gt_dataset.append(eval_qa_pair) 268 | end_time = time.time() 269 | elapsed_time = end_time - start_time 270 | latency.append(elapsed_time) 271 | 272 | # Extract text from retrieved docs 273 | retrieved_doc_text = "" 274 | if retriever_type == "Anthropic-100k": 275 | retrieved_doc_text = "Doc %s: " % str(eval_qa_pair["answer"]) 276 | else: 277 | docs = retriever.get_relevant_documents(eval_qa_pair["question"]) 278 | for i, doc in enumerate(docs): 279 | retrieved_doc_text += "Doc %s: " % str(i+1) + \ 280 | doc.page_content + " " 281 | 282 | # Log 283 | retrieved = {"question": eval_qa_pair["question"], 284 | "answer": eval_qa_pair["answer"], "result": retrieved_doc_text} 285 | retrieved_docs.append(retrieved) 286 | 287 | # Grade 288 | graded_answers = grade_model_answer( 289 | gt_dataset, predictions, grade_prompt, logger) 290 | graded_retrieval = grade_model_retrieval( 291 | gt_dataset, retrieved_docs, grade_prompt, logger) 292 | return graded_answers, graded_retrieval, latency, predictions 293 | 294 | load_dotenv() 295 | 296 | if os.environ.get("ENVIRONMENT") != "development": 297 | sentry_sdk.init( 298 | dsn="https://065aa152c4de4e14af9f9e7335c8eae4@o4505106202820608.ingest.sentry.io/4505106207735808", 299 | traces_sample_rate=1.0, 300 | ) 301 | 302 | app = FastAPI() 303 | 304 | origins = [ 305 | "http://localhost:3000", 306 | "localhost:3000", 307 | "https://evaluator-ui.vercel.app/" 308 | "https://evaluator-ui.vercel.app" 309 | "evaluator-ui.vercel.app/" 310 | "evaluator-ui.vercel.app" 311 | ] 312 | 313 | app.add_middleware( 314 | CORSMiddleware, 315 | allow_origins=["*"], 316 | allow_credentials=True, 317 | allow_methods=["*"], 318 | allow_headers=["*"], 319 | ) 320 | 321 | 322 | @app.get("/") 323 | async def root(): 324 | return {"message": "Welcome to the Auto Evaluator!"} 325 | 326 | 327 | def run_evaluator( 328 | files, 329 | num_eval_questions, 330 | chunk_chars, 331 | overlap, 332 | split_method, 333 | retriever_type, 334 | embeddings, 335 | model_version, 336 | grade_prompt, 337 | num_neighbors, 338 | test_dataset 339 | ): 340 | 341 | # Set up logging 342 | logging.config.fileConfig('logging.conf', disable_existing_loggers=False) 343 | logger = logging.getLogger(__name__) 344 | 345 | # Read content of files 346 | texts = [] 347 | fnames = [] 348 | for file in files: 349 | logger.info("Reading file: {}".format(file.filename)) 350 | contents = file.file.read() 351 | # PDF file 352 | if file.content_type == 'application/pdf': 353 | logger.info("File {} is a PDF".format(file.filename)) 354 | pdf_reader = pypdf.PdfReader(io.BytesIO(contents)) 355 | text = "" 356 | for page in pdf_reader.pages: 357 | text += page.extract_text() 358 | texts.append(text) 359 | fnames.append(file.filename) 360 | # Text file 361 | elif file.content_type == 'text/plain': 362 | logger.info("File {} is a TXT".format(file.filename)) 363 | texts.append(contents.decode()) 364 | fnames.append(file.filename) 365 | else: 366 | logger.warning( 367 | 
"Unsupported file type for file: {}".format(file.filename)) 368 | text = " ".join(texts) 369 | 370 | if retriever_type == "Anthropic-100k": 371 | splits = "" 372 | model_version = "Anthropic-100k" 373 | else: 374 | logger.info("Splitting texts") 375 | splits = split_texts(text, chunk_chars, overlap, split_method, logger) 376 | 377 | logger.info("Make LLM") 378 | llm = make_llm(model_version) 379 | 380 | logger.info("Make retriever") 381 | retriever = make_retriever( 382 | splits, retriever_type, embeddings, num_neighbors, llm, logger) 383 | 384 | logger.info("Make chain") 385 | qa_chain = make_chain(llm, retriever, retriever_type, model_version) 386 | 387 | for i in range(num_eval_questions): 388 | 389 | # Generate one question 390 | if i < len(test_dataset): 391 | eval_pair = test_dataset[i] 392 | else: 393 | eval_pair = generate_eval(text, 3000, logger) 394 | if len(eval_pair) == 0: 395 | # Error in eval generation 396 | continue 397 | else: 398 | # This returns a list, so we unpack to dict 399 | eval_pair = eval_pair[0] 400 | 401 | # Run eval 402 | graded_answers, graded_retrieval, latency, predictions = run_eval( 403 | qa_chain, retriever, eval_pair, grade_prompt, retriever_type, num_neighbors, text, logger) 404 | 405 | # Assemble output 406 | d = pd.DataFrame(predictions) 407 | d['answerScore'] = [g['text'] for g in graded_answers] 408 | d['retrievalScore'] = [g['text'] for g in graded_retrieval] 409 | d['latency'] = latency 410 | 411 | # Summary statistics 412 | d['answerScore'] = [{'score': 1 if "Incorrect" not in text else 0, 413 | 'justification': text} for text in d['answerScore']] 414 | d['retrievalScore'] = [{'score': 1 if "Incorrect" not in text else 0, 415 | 'justification': text} for text in d['retrievalScore']] 416 | 417 | # Convert dataframe to dict 418 | d_dict = d.to_dict('records') 419 | if len(d_dict) == 1: 420 | yield json.dumps({"data": d_dict[0]}) 421 | else: 422 | logger.warn( 423 | "A QA pair was not evaluated correctly. 
Skipping this pair.") 424 | 425 | 426 | @app.post("/evaluator-stream") 427 | async def create_response( 428 | files: List[UploadFile] = File(...), 429 | num_eval_questions: int = Form(5), 430 | chunk_chars: int = Form(1000), 431 | overlap: int = Form(100), 432 | split_method: str = Form("RecursiveTextSplitter"), 433 | retriever_type: str = Form("similarity-search"), 434 | embeddings: str = Form("OpenAI"), 435 | model_version: str = Form("gpt-3.5-turbo"), 436 | grade_prompt: str = Form("Fast"), 437 | num_neighbors: int = Form(3), 438 | test_dataset: str = Form("[]"), 439 | ): 440 | test_dataset = json.loads(test_dataset) 441 | return EventSourceResponse(run_evaluator(files, num_eval_questions, chunk_chars, 442 | overlap, split_method, retriever_type, embeddings, model_version, grade_prompt, num_neighbors, test_dataset), headers={"Content-Type": "text/event-stream", "Connection": "keep-alive", "Cache-Control": "no-cache"}) 443 | -------------------------------------------------------------------------------- /api/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,uicheckapp 3 | 4 | [handlers] 5 | keys=consoleHandler 6 | 7 | [formatters] 8 | keys=normalFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=consoleHandler 13 | 14 | [logger_uicheckapp] 15 | level=DEBUG 16 | handlers=consoleHandler 17 | qualname=uicheckapp 18 | propagate=0 19 | 20 | [formatter_normalFormatter] 21 | format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() L%(lineno)-4d %(message)s 22 | 23 | [handler_consoleHandler] 24 | class=StreamHandler 25 | level=DEBUG 26 | formatter=normalFormatter 27 | args=(sys.stdout,) -------------------------------------------------------------------------------- /api/railway.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://railway.app/railway.schema.json", 3 | "build": { 4 | "builder": "NIXPACKS" 5 | }, 6 | "deploy": { 7 | "startCommand": "uvicorn evaluator_app:app --host 0.0.0.0 --port $PORT", 8 | "restartPolicyType": "ON_FAILURE", 9 | "restartPolicyMaxRetries": 10 10 | } 11 | } -------------------------------------------------------------------------------- /api/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.4.3 2 | fastapi==0.85.2 3 | langchain==0.0.181 4 | python-multipart==0.0.6 5 | uvicorn==0.18.3 6 | openai==0.27.0 7 | tiktoken==0.3.1 8 | faiss-cpu==1.7.3 9 | huggingface-hub==0.12.0 10 | anthropic==0.2.8 11 | pypdf==3.7.1 12 | filetype==1.2.0 13 | tokenizers==0.13.3 14 | sentence-transformers==2.2.2 15 | scikit-learn==1.2.1 16 | llama-index==0.4.35.post1 17 | sse_starlette==1.3.3 18 | gpt-index==0.5.16 19 | faiss-cpu==1.7.3 20 | python-dotenv==1.0.0 21 | sentry_sdk==1.21.1 22 | llama-cpp-python==0.1.43 23 | replicate==0.8.3 -------------------------------------------------------------------------------- /api/text_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from langchain.prompts import PromptTemplate 4 | 5 | def clean_pdf_text(text: str) -> str: 6 | """Cleans text extracted from a PDF file.""" 7 | # TODO: Remove References/Bibliography section. 
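    # A possible sketch for the TODO above (not wired in; it assumes the references
    # section starts with a standalone "References" or "Bibliography" heading):
    #
    #     match = re.search(r'\n\s*(References|Bibliography)\s*\n', text, flags=re.IGNORECASE)
    #     if match:
    #         text = text[:match.start()]
    #
    # This is only a heuristic; numbered headings (e.g. "7 References") or
    # two-column PDFs would need a more careful pattern.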
8 | return remove_citations(text) 9 | 10 | def remove_citations(text: str) -> str: 11 | """Removes in-text citations from a string.""" 12 | # (Author, Year) 13 | text = re.sub(r'\([A-Za-z0-9,.\s]+\s\d{4}\)', '', text) 14 | # [1], [2], [3-5], [3, 33, 49, 51] 15 | text = re.sub(r'\[[0-9,-]+(,\s[0-9,-]+)*\]', '', text) 16 | return text 17 | 18 | template = """You are a teacher grading a quiz. 19 | You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect. 20 | 21 | Example Format: 22 | QUESTION: question here 23 | STUDENT ANSWER: student's answer here 24 | TRUE ANSWER: true answer here 25 | GRADE: Correct or Incorrect here 26 | 27 | Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin! 28 | 29 | QUESTION: {query} 30 | STUDENT ANSWER: {result} 31 | TRUE ANSWER: {answer} 32 | GRADE: 33 | 34 | Your response should be as follows: 35 | 36 | GRADE: (Correct or Incorrect) 37 | (line break) 38 | JUSTIFICATION: (Without mentioning the student/teacher framing of this prompt, explain why the STUDENT ANSWER is Correct or Incorrect. Use one or two sentences maximum. Keep the answer as concise as possible.) 39 | """ 40 | 41 | GRADE_ANSWER_PROMPT = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 42 | 43 | template = """You are a teacher grading a quiz. 44 | You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect. 45 | 46 | Example Format: 47 | QUESTION: question here 48 | STUDENT ANSWER: student's answer here 49 | TRUE ANSWER: true answer here 50 | GRADE: Correct or Incorrect here 51 | 52 | Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin! 53 | 54 | QUESTION: {query} 55 | STUDENT ANSWER: {result} 56 | TRUE ANSWER: {answer} 57 | GRADE:""" 58 | 59 | GRADE_ANSWER_PROMPT_FAST = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 60 | 61 | template = """You are a teacher grading a quiz. 62 | You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect. 63 | You are also asked to identify potential sources of bias in the question and in the true answer. 64 | 65 | Example Format: 66 | QUESTION: question here 67 | STUDENT ANSWER: student's answer here 68 | TRUE ANSWER: true answer here 69 | GRADE: Correct or Incorrect here 70 | 71 | Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. 
If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin! 72 | 73 | QUESTION: {query} 74 | STUDENT ANSWER: {result} 75 | TRUE ANSWER: {answer} 76 | GRADE: 77 | 78 | Your response should be as follows: 79 | 80 | GRADE: (Correct or Incorrect) 81 | (line break) 82 | JUSTIFICATION: (Without mentioning the student/teacher framing of this prompt, explain why the STUDENT ANSWER is Correct or Incorrect, identify potential sources of bias in the QUESTION, and identify potential sources of bias in the TRUE ANSWER. Use one or two sentences maximum. Keep the answer as concise as possible.) 83 | """ 84 | 85 | GRADE_ANSWER_PROMPT_BIAS_CHECK = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 86 | 87 | template = """You are assessing a submitted student answer to a question relative to the true answer based on the provided criteria: 88 | 89 | *** 90 | QUESTION: {query} 91 | *** 92 | STUDENT ANSWER: {result} 93 | *** 94 | TRUE ANSWER: {answer} 95 | *** 96 | Criteria: 97 | relevance: Is the submission referring to a real quote from the text?" 98 | conciseness: Is the answer concise and to the point?" 99 | correct: Is the answer correct?" 100 | *** 101 | Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print "Correct" or "Incorrect" (without quotes or punctuation) on its own line corresponding to the correct answer. 102 | Reasoning: 103 | """ 104 | 105 | GRADE_ANSWER_PROMPT_OPENAI = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 106 | 107 | template = """ 108 | Given the question: \n 109 | {query} 110 | Here are some documents retrieved in response to the question: \n 111 | {result} 112 | And here is the answer to the question: \n 113 | {answer} 114 | Criteria: 115 | relevance: Are the retrieved documents relevant to the question and do they support the answer?" 116 | Do the retrieved documents meet the criterion? Print "Correct" (without quotes or punctuation) if the retrieved context are relevant or "Incorrect" if not (without quotes or punctuation) on its own line. """ 117 | 118 | GRADE_DOCS_PROMPT_FAST = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 119 | 120 | template = """ 121 | Given the question: \n 122 | {query} 123 | Here are some documents retrieved in response to the question: \n 124 | {result} 125 | And here is the answer to the question: \n 126 | {answer} 127 | Criteria: 128 | relevance: Are the retrieved documents relevant to the question and do they support the answer?" 129 | 130 | Your response should be as follows: 131 | 132 | GRADE: (Correct or Incorrect, depending if the retrieved documents meet the criterion) 133 | (line break) 134 | JUSTIFICATION: (Write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Use one or two sentences maximum. Keep the answer as concise as possible.) 135 | """ 136 | 137 | GRADE_DOCS_PROMPT = PromptTemplate(input_variables=["query", "result", "answer"], template=template) 138 | 139 | 140 | template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. 
141 | {context} 142 | Question: {question} 143 | Helpful Answer:""" 144 | 145 | QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,) 146 | 147 | template = """ 148 | ### Human 149 | You are question-answering assistant tasked with answering questions based on the provided context. 150 | 151 | Here is the question: \ 152 | {question} 153 | 154 | Use the following pieces of context to answer the question at the end. Use three sentences maximum. \ 155 | {context} 156 | 157 | ### Assistant 158 | Answer: Think step by step. """ 159 | QA_CHAIN_PROMPT_LLAMA = PromptTemplate(input_variables=["context", "question"],template=template,) 160 | 161 | -------------------------------------------------------------------------------- /nextjs/.env.local: -------------------------------------------------------------------------------- 1 | NEXT_PUBLIC_API_URL=http://localhost:8000 -------------------------------------------------------------------------------- /nextjs/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | /coverage-ts 11 | 12 | # next.js 13 | /.next/ 14 | /out/ 15 | 16 | # production 17 | /build 18 | 19 | /temp 20 | 21 | # misc 22 | .DS_Store 23 | *.pem 24 | 25 | # debug 26 | npm-debug.log* 27 | yarn-debug.log* 28 | yarn-error.log* 29 | 30 | # local env files 31 | .env.local 32 | .env.development.local 33 | .env.test.local 34 | .env.production.local 35 | 36 | # vercel 37 | .vercel 38 | 39 | 40 | /public/graphql/ 41 | .vscode/* 42 | 43 | /src/styles/styles.css 44 | 45 | *.generated.ts 46 | *.generated.tsx 47 | *.generated.json 48 | 49 | graphql.schema.json 50 | schema.graphql 51 | 52 | # Sentry 53 | .sentryclirc 54 | /test-results/ 55 | /playwright-report/ 56 | /playwright/.cache/ 57 | 58 | tsconfig.tsbuildinfo 59 | .next -------------------------------------------------------------------------------- /nextjs/components/Demo.tsx: -------------------------------------------------------------------------------- 1 | import React, { 2 | useCallback, 3 | useEffect, 4 | useMemo, 5 | useRef, 6 | useState, 7 | } from "react"; 8 | import { 9 | Group, 10 | Text, 11 | useMantineTheme, 12 | Alert, 13 | Table, 14 | Button, 15 | Title, 16 | Flex, 17 | Stack, 18 | Spoiler, 19 | Progress, 20 | Card, 21 | } from "@mantine/core"; 22 | import { IconAlertCircle } from "@tabler/icons-react"; 23 | import { Experiment, Form, QAPair, Result } from "../utils/types"; 24 | import { notifications } from "@mantine/notifications"; 25 | import { API_URL, IS_DEV } from "../utils/variables"; 26 | import { fetchEventSource } from "@microsoft/fetch-event-source"; 27 | import { Parser } from "@json2csv/plainjs"; 28 | import { isEmpty, isNil, orderBy } from "lodash"; 29 | import sampleResults from "../public/testData/results.json"; 30 | import sampleTestDataset from "../public/testData/testDataset.json"; 31 | import sampleExperiments from "../public/testData/experiments.json"; 32 | import SummaryChart from "./SummaryChart"; 33 | import ExperimentSummaryTable from "./ExperimentSummaryTable"; 34 | import FilesTable from "./tables/FilesTable"; 35 | import ExperimentResultTable from "./tables/ExperimentResultTable"; 36 | import sampleText from "../public/testData/karpathy-pod.json"; 37 | import LogRocket from "logrocket"; 38 | 39 | const Demo = ({ form }: { form: Form }) 
=> { 40 | const { setValue, watch, getValues, handleSubmit } = form; 41 | const watchFiles = watch("files"); 42 | const [loading, setLoading] = useState(false); 43 | const [results, setResults] = useState([]); 44 | const [testDataset, setTestDataset] = useState([]); 45 | const [evalQuestionsCount, setEvalQuestionsCount] = useState(5); 46 | const [experiments, setExperiments] = useState([]); 47 | const [shouldShowProgress, setShouldShowProgress] = useState(false); 48 | const [gradingPromptStyle, setGradingPromptStyle] = useState(undefined); 49 | const experimentsResultsSpoilerRef = useRef(null); 50 | const summarySpoilerRef = useRef(null); 51 | const testDatasetSpoilerRef = useRef(null); 52 | const [isFirstRun, setIsFirstRun] = useState(true); 53 | 54 | const alertStyle = { backgroundColor: `rgba(193,194,197,0.38)` }; 55 | useEffect(() => { 56 | setValue("files", [ 57 | new File( 58 | [new Blob([sampleText.text], { type: "text/plain" })], 59 | "karpathy-pod.txt", 60 | { 61 | type: "text/plain", 62 | } 63 | ), 64 | ]); 65 | setResults(sampleResults); 66 | setTestDataset(sampleTestDataset); 67 | setExperiments(sampleExperiments); 68 | }, []); 69 | 70 | const runExperimentButtonLabel = 71 | experiments.length > 1 ? "Re-run experiment" : "Run Experiment"; 72 | 73 | const initialProgress = { 74 | value: 15, 75 | color: "purple", 76 | label: "Building Index ...", 77 | }; 78 | 79 | const finishedProgress = { 80 | value: 100, 81 | color: "green", 82 | label: "Completed", 83 | }; 84 | 85 | const experimentProgress = useMemo(() => { 86 | if (results.length === 0) { 87 | return [initialProgress]; 88 | } 89 | 90 | const res = 15 + Math.floor((results?.length / evalQuestionsCount) * 85); 91 | 92 | if (res === 100) { 93 | return [finishedProgress]; 94 | } 95 | const ret = [ 96 | initialProgress, 97 | { 98 | value: res, 99 | color: "blue", 100 | label: "Generating Evals & Grading", 101 | }, 102 | ]; 103 | return ret; 104 | }, [results, evalQuestionsCount]); 105 | 106 | const chartData = experiments.map((experiment) => ({ 107 | id: "Expt #" + experiment.id, 108 | data: [ 109 | { 110 | x: experiment.avgAnswerScore, 111 | y: experiment.avgLatency, 112 | }, 113 | ], 114 | })); 115 | 116 | const submit = handleSubmit(async (data) => { 117 | setShouldShowProgress(true); 118 | setLoading(true); 119 | setResults([]); 120 | 121 | const resetExpts = data.evalQuestionsCount !== evalQuestionsCount; 122 | if (resetExpts) { 123 | setExperiments([]); 124 | } 125 | 126 | const formData = new FormData(); 127 | data.files.forEach((file) => { 128 | formData.append("files", file); 129 | }); 130 | formData.append("num_eval_questions", data.evalQuestionsCount.toString()); 131 | formData.append("chunk_chars", data.chunkSize.toString()); 132 | formData.append("overlap", data.overlap.toString()); 133 | formData.append("split_method", data.splitMethod); 134 | formData.append("retriever_type", data.retriever); 135 | formData.append("embeddings", data.embeddingAlgorithm); 136 | formData.append("model_version", data.model); 137 | formData.append("grade_prompt", data.gradingPrompt); 138 | formData.append("num_neighbors", data.numNeighbors.toString()); 139 | formData.append("test_dataset", JSON.stringify(testDataset)); 140 | 141 | if (!IS_DEV) { 142 | LogRocket.track("DemoSubmission", { 143 | numQuestions: data.evalQuestionsCount, 144 | overlap: data.overlap, 145 | split: data.splitMethod, 146 | retriever: data.retriever, 147 | embedding: data.embeddingAlgorithm, 148 | model: data.model, 149 | promptStyle: data.gradingPrompt, 150 | 
numNeighbors: data.numNeighbors, 151 | }); 152 | } 153 | 154 | setEvalQuestionsCount(data.evalQuestionsCount); 155 | setGradingPromptStyle(data.gradingPrompt); 156 | 157 | const controller = new AbortController(); 158 | 159 | let localResults = []; 160 | let rowCount = 0; 161 | 162 | try { 163 | await fetchEventSource(API_URL + "/evaluator-stream", { 164 | method: "POST", 165 | body: formData, 166 | headers: { 167 | Accept: "text/event-stream", 168 | }, 169 | openWhenHidden: true, 170 | signal: controller.signal, 171 | onmessage(ev) { 172 | try { 173 | const row: Result = JSON.parse(ev.data)?.data; 174 | setResults((results) => [...results, row]); 175 | localResults = [...localResults, row]; 176 | rowCount += 1; 177 | if (rowCount > testDataset.length) { 178 | setTestDataset((testDataset) => [ 179 | ...testDataset, 180 | { 181 | question: row.question, 182 | answer: row.answer, 183 | }, 184 | ]); 185 | } 186 | if (rowCount === data.evalQuestionsCount) { 187 | controller.abort(); 188 | } 189 | } catch (e) { 190 | console.warn("Error parsing data", e); 191 | } 192 | }, 193 | onclose() { 194 | console.log("Connection closed by the server"); 195 | setLoading(false); 196 | }, 197 | onerror(err) { 198 | console.log("There was an error from server", err); 199 | throw err; 200 | }, 201 | }); 202 | } catch (e) { 203 | notifications.show({ 204 | title: "Error", 205 | message: "There was an error from the server.", 206 | color: "red", 207 | }); 208 | setShouldShowProgress(false); 209 | setLoading(false); 210 | return; 211 | } 212 | setLoading(false); 213 | setIsFirstRun(false); 214 | const avgAnswerScore = 215 | localResults.reduce((acc, curr) => acc + curr.answerScore.score, 0) / 216 | localResults.length; 217 | const avgRelevancyScore = 218 | localResults.reduce((acc, curr) => acc + curr.retrievalScore.score, 0) / 219 | localResults.length; 220 | const avgLatency = 221 | localResults.reduce((acc, curr) => acc + curr.latency, 0) / 222 | localResults.length; 223 | const newExperiment: Experiment = { 224 | evalQuestionsCount: data.evalQuestionsCount, 225 | chunkSize: data.chunkSize, 226 | overlap: data.overlap, 227 | splitMethod: data.splitMethod, 228 | retriever: data.retriever, 229 | embeddingAlgorithm: data.embeddingAlgorithm, 230 | model: data.model, 231 | gradingPrompt: data.gradingPrompt, 232 | numNeighbors: data.numNeighbors, 233 | avgRelevancyScore, 234 | avgAnswerScore, 235 | avgLatency, 236 | performance: avgAnswerScore / avgLatency, 237 | id: resetExpts ? 1 : experiments.length + 1, 238 | }; 239 | setExperiments((experiments) => 240 | resetExpts ? [newExperiment] : [...experiments, newExperiment] 241 | ); 242 | }); 243 | 244 | const download = useCallback( 245 | (data: any[], filename: string) => { 246 | const parser = new Parser(); 247 | const csv = parser.parse(data); 248 | const blob = new Blob([csv], { type: "text/csv;charset=utf-8;" }); 249 | const url = URL.createObjectURL(blob); 250 | const link = document.createElement("a"); 251 | link.setAttribute("href", url); 252 | link.setAttribute("download", filename + ".csv"); 253 | link.style.visibility = "hidden"; 254 | document.body.appendChild(link); 255 | link.click(); 256 | document.body.removeChild(link); 257 | }, 258 | [results] 259 | ); 260 | 261 | const isFastGradingPrompt = gradingPromptStyle === "Fast"; 262 | 263 | return ( 264 | 265 | Get Started 266 | 267 | Welcome to the auto-evaluator! This is an app to evaluate the 268 | performance of question-answering LLM chains. 
This demo has pre-loaded 269 | two things: (1) a document (the Lex Fridman podcast with Andrej 270 | Karpathy) and (2) a "test set" of question-answer pairs for this 271 | episode. The aim is to evaluate the performance of various 272 | question-answering LLM chain configurations against the test set. You 273 | can build any QA chain using the components and score its performance. 274 |
275 |
276 | 277 | Choose the question-answering chain configuration (left) and launch an 278 | experiment using the button below. For more detail on each setting, 279 | see the full documentation{" "} 280 | 284 | here 285 | 286 | . 287 | 288 |
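          {/* The sections below render conditionally: the test dataset table and
              run button once files are loaded, a progress bar while an experiment
              is streaming, and the results and summary tables once results and
              experiments exist. */}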
289 | {!!watchFiles?.length && ( 290 | <> 291 | 292 | {!!testDataset.length && ( 293 | 294 | 301 | 302 | 303 | Test Dataset 304 | 305 | 313 | 328 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | {testDataset?.map((result: QAPair, index: number) => ( 351 | 352 | 353 | 354 | 355 | ))} 356 | 357 |
<th>Question</th><th>Answer</th>
<td>{result?.question}</td><td>{result?.answer}</td>
358 |
359 |
360 | )} 361 | 362 | {!loading || isFirstRun ? ( 363 | 364 | 372 | 373 | ) : null} 374 | 375 | 376 | )} 377 | {shouldShowProgress && ( 378 | 386 | )} 387 | {!isEmpty(results) ? ( 388 | 389 | 397 | 398 | 399 | Experiment Results 400 | 401 | This table shows each question-answer pair from the test 402 | set along with the model's answer to the question. The app 403 | will score two things: (1) the relevance of the retrieved 404 | documents relative to the question and (2) the similarity of 405 | the LLM-generated answer relative to the ground truth answer. The 406 | prompts for both can be seen{" "} 407 | 411 | here 412 | {" "} 413 | and can be chosen by the user in the drop-down list "Grading 414 | prompt style". The "Fast" prompt will only have the LLM grader 415 | output the score. The other prompts will also produce an 416 | explanation. 417 | 418 | 419 | 427 | 438 | 439 | 440 | 441 | 445 | 446 | 447 | ) : null} 448 | {!!experiments.length && ( 449 | 450 | 458 | 459 | 460 | Summary 461 | 462 | 470 | 481 | 482 | 483 | 484 | 485 |
486 | 487 |
488 |
489 |
490 | )} 491 |
492 | ); 493 | }; 494 | export default Demo; 495 | -------------------------------------------------------------------------------- /nextjs/components/ExperimentSummaryTable.tsx: -------------------------------------------------------------------------------- 1 | import { ScrollArea, Table } from "@mantine/core"; 2 | import { Experiment } from "../utils/types"; 3 | 4 | const ExperimentSummaryTable = ({ 5 | experiments, 6 | }: { 7 | experiments: Experiment[]; 8 | }) => { 9 | return ( 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | {experiments?.map((result: Experiment, index: number) => ( 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | ))} 47 | 48 |
<th>Experiment #</th><th># of Eval Questions</th><th>Chunk Size</th><th>Overlap</th><th>Split Method</th><th>Retriever</th><th>Embedding Algorithm</th><th>Model</th><th>Grading Prompt Style</th><th># of Chunks Retrieved</th><th>Avg Retrieval Relevancy Score</th><th>Avg Answer Similarity Score</th><th>Avg Latency (s)</th>
<td>{result.id}</td><td>{result?.evalQuestionsCount}</td><td>{result?.chunkSize}</td><td>{result?.overlap}</td><td>{result?.splitMethod}</td><td>{result?.retriever}</td><td>{result?.embeddingAlgorithm}</td><td>{result?.model}</td><td>{result?.gradingPrompt}</td><td>{result?.numNeighbors}</td><td>{result?.avgRelevancyScore}</td><td>{result?.avgAnswerScore}</td><td>{result?.avgLatency.toFixed(3)}</td>
49 |
50 | ); 51 | }; 52 | export default ExperimentSummaryTable; 53 | -------------------------------------------------------------------------------- /nextjs/components/HeaderEvaluator.tsx: -------------------------------------------------------------------------------- 1 | import { Group, Header, Stack, Text } from "@mantine/core"; 2 | import Image from "next/image"; 3 | import Link from "next/link"; 4 | import React from "react"; 5 | import githubIcon from "../public/github-mark.svg"; 6 | import { useMediaQuery } from "@mantine/hooks"; 7 | 8 | export enum MenuItem { 9 | Demo = "Demo", 10 | Playground = "Playground", 11 | About = "About", 12 | } 13 | 14 | const HeaderEvaluator = ({ activeTab }: { activeTab: MenuItem }) => { 15 | const mobileWidth = useMediaQuery("(max-width: 390px)"); 16 | const borderBottom = "1px solid #000"; 17 | 18 | return ( 19 |
20 | 21 | 22 | 23 | 24 | 🦜🔗 25 | 30 | Auto-Evaluator 31 | 32 | 33 | 34 | 35 | 42 | Demo 43 | 44 | 52 | Playground 53 | 54 | 61 | Docs 62 | 63 | 71 | About 72 | 73 | 77 | github 78 | 79 | 80 | 81 | 82 |
83 | ); 84 | }; 85 | export default HeaderEvaluator; 86 | -------------------------------------------------------------------------------- /nextjs/components/PersonCard.tsx: -------------------------------------------------------------------------------- 1 | import { 2 | createStyles, 3 | Card, 4 | Avatar, 5 | Text, 6 | Group, 7 | Button, 8 | rem, 9 | Stack, 10 | } from "@mantine/core"; 11 | import Link from "next/link"; 12 | import githubIcon from "../public/github-mark.svg"; 13 | import twitterBlackIcon from "../public/twitter-black.svg"; 14 | import Image from "next/image"; 15 | 16 | const useStyles = createStyles((theme) => ({ 17 | card: { 18 | backgroundColor: 19 | theme.colorScheme === "dark" ? theme.colors.dark[7] : theme.white, 20 | }, 21 | 22 | avatar: { 23 | border: `${rem(2)} solid ${ 24 | theme.colorScheme === "dark" ? theme.colors.dark[7] : theme.white 25 | }`, 26 | }, 27 | })); 28 | 29 | interface UserCardImageProps { 30 | avatar: string; 31 | name: string; 32 | job: string; 33 | twitterHandle: string; 34 | githubHandle: string; 35 | } 36 | 37 | export function UserCardImage({ 38 | avatar, 39 | name, 40 | job, 41 | twitterHandle, 42 | githubHandle, 43 | }: UserCardImageProps) { 44 | const { classes, theme } = useStyles(); 45 | 46 | return ( 47 | 48 | 49 | 56 |
57 | 58 | {name} 59 | 60 | 61 | {job} 62 | 63 |
64 | 65 | 66 | github 67 | 68 | 69 | github 70 | 71 | 72 |
73 |
74 | ); 75 | } 76 | -------------------------------------------------------------------------------- /nextjs/components/Playground.tsx: -------------------------------------------------------------------------------- 1 | import React, { 2 | useCallback, 3 | useEffect, 4 | useMemo, 5 | useRef, 6 | useState, 7 | } from "react"; 8 | import { 9 | Group, 10 | Text, 11 | useMantineTheme, 12 | Alert, 13 | Table, 14 | Button, 15 | Title, 16 | Flex, 17 | Stack, 18 | Spoiler, 19 | Progress, 20 | Card, 21 | ScrollArea, 22 | createStyles, 23 | } from "@mantine/core"; 24 | import { IconUpload, IconX, IconAlertCircle } from "@tabler/icons-react"; 25 | import { Dropzone, MIME_TYPES } from "@mantine/dropzone"; 26 | import { Experiment, Form, QAPair, Result } from "../utils/types"; 27 | import { notifications } from "@mantine/notifications"; 28 | import { API_URL, IS_DEV } from "../utils/variables"; 29 | import { fetchEventSource } from "@microsoft/fetch-event-source"; 30 | import { Parser } from "@json2csv/plainjs"; 31 | import { IconFile } from "@tabler/icons-react"; 32 | import { ResponsiveScatterPlot } from "@nivo/scatterplot"; 33 | import { isEmpty, isNil, orderBy } from "lodash"; 34 | import TestFileUploadZone from "./TestFileUploadZone"; 35 | import LogRocket from "logrocket"; 36 | 37 | const MAX_FILE_SIZE_MB = 50; 38 | 39 | enum DropZoneErrorCode { 40 | FileTooLarge = "file-too-large", 41 | FileInvalidType = "file-invalid-type", 42 | } 43 | 44 | const useStyles = createStyles((theme) => ({ 45 | disabled: { 46 | backgroundColor: 47 | theme.colorScheme === "dark" 48 | ? theme.colors.dark[6] 49 | : theme.colors.gray[0], 50 | borderColor: 51 | theme.colorScheme === "dark" 52 | ? theme.colors.dark[5] 53 | : theme.colors.gray[2], 54 | cursor: "not-allowed", 55 | 56 | "& *": { 57 | color: 58 | theme.colorScheme === "dark" 59 | ? 
theme.colors.dark[3] 60 | : theme.colors.gray[5], 61 | }, 62 | }, 63 | })); 64 | 65 | const Playground = ({ form }: { form: Form }) => { 66 | const { setValue, watch, getValues, handleSubmit } = form; 67 | const watchFiles = watch("files"); 68 | const theme = useMantineTheme(); 69 | const [loading, setLoading] = useState(false); 70 | const [results, setResults] = useState([]); 71 | const [testDataset, setTestDataset] = useState([]); 72 | const [evalQuestionsCount, setEvalQuestionsCount] = useState(-1); 73 | const [experiments, setExperiments] = useState([]); 74 | const [didUploadTestDataset, setDidUploadTestDataset] = useState(false); 75 | const [shouldShowProgress, setShouldShowProgress] = useState(false); 76 | const [gradingPromptStyle, setGradingPromptStyle] = useState(undefined); 77 | const experimentsResultsSpoilerRef = useRef(null); 78 | const summarySpoilerRef = useRef(null); 79 | const testDatasetSpoilerRef = useRef(null); 80 | const [testFilesDropzoneDisabled, setTestFilesDropzoneDisabled] = 81 | useState(true); 82 | const [fileUploadDisabled, setFileUploadDisabled] = useState(false); 83 | 84 | const { classes } = useStyles(); 85 | 86 | const initialProgress = { 87 | value: 15, 88 | color: "purple", 89 | label: "Processing Files", 90 | }; 91 | 92 | const finishedProgress = { 93 | value: 100, 94 | color: "green", 95 | label: "Completed", 96 | }; 97 | 98 | const experimentProgress = useMemo(() => { 99 | if (results.length === 0) { 100 | return [initialProgress]; 101 | } 102 | 103 | const res = 15 + Math.floor((results?.length / evalQuestionsCount) * 85); 104 | 105 | if (res === 100) { 106 | return [finishedProgress]; 107 | } 108 | const ret = [ 109 | initialProgress, 110 | { 111 | value: res, 112 | color: "blue", 113 | label: "Generating Evals & Grading", 114 | }, 115 | ]; 116 | return ret; 117 | }, [results, evalQuestionsCount]); 118 | 119 | const chartData = experiments.map((experiment, index) => ({ 120 | id: "Expt #" + (index + 1), 121 | data: [ 122 | { 123 | x: experiment.avgAnswerScore, 124 | y: experiment.avgLatency, 125 | }, 126 | ], 127 | })); 128 | 129 | const renderPassFail = (data: any) => { 130 | if (data.score === 0) { 131 | return "Incorrect"; 132 | } 133 | if (data.score === 1) { 134 | return "Correct"; 135 | } 136 | throw new Error(`Problem parsing ${data}`); 137 | }; 138 | 139 | const submit = handleSubmit(async (data) => { 140 | setShouldShowProgress(true); 141 | setLoading(true); 142 | setResults([]); 143 | 144 | const resetExpts = 145 | data.evalQuestionsCount !== evalQuestionsCount || didUploadTestDataset; 146 | if (resetExpts) { 147 | setExperiments([]); 148 | } 149 | 150 | setDidUploadTestDataset(false); 151 | 152 | const formData = new FormData(); 153 | data.files.forEach((file) => { 154 | formData.append("files", file); 155 | }); 156 | formData.append("num_eval_questions", data.evalQuestionsCount.toString()); 157 | formData.append("chunk_chars", data.chunkSize.toString()); 158 | formData.append("overlap", data.overlap.toString()); 159 | formData.append("split_method", data.splitMethod); 160 | formData.append("retriever_type", data.retriever); 161 | formData.append("embeddings", data.embeddingAlgorithm); 162 | formData.append("model_version", data.model); 163 | formData.append("grade_prompt", data.gradingPrompt); 164 | formData.append("num_neighbors", data.numNeighbors.toString()); 165 | formData.append("test_dataset", JSON.stringify(testDataset)); 166 | 167 | if (!IS_DEV) { 168 | LogRocket.track("PlaygroundSubmission", { 169 | fileSizes: 
data.files.map((file) => file.size), 170 | fileTypes: data.files.map((file) => file.type), 171 | numQuestions: data.evalQuestionsCount, 172 | overlap: data.overlap, 173 | split: data.splitMethod, 174 | retriever: data.retriever, 175 | embedding: data.embeddingAlgorithm, 176 | model: data.model, 177 | promptStyle: data.gradingPrompt, 178 | numNeighbors: data.numNeighbors, 179 | uploadedTestDataset: !!testDataset.length, 180 | }); 181 | } 182 | 183 | setEvalQuestionsCount(data.evalQuestionsCount); 184 | setGradingPromptStyle(data.gradingPrompt); 185 | 186 | const controller = new AbortController(); 187 | 188 | let localResults = []; 189 | let rowCount = 0; 190 | try { 191 | await fetchEventSource(API_URL + "/evaluator-stream", { 192 | method: "POST", 193 | body: formData, 194 | headers: { 195 | Accept: "text/event-stream", 196 | Connection: "keep-alive", // Add the keep-alive header 197 | }, 198 | openWhenHidden: true, 199 | signal: controller.signal, 200 | onmessage(ev) { 201 | try { 202 | const row: Result = JSON.parse(ev.data)?.data; 203 | setResults((results) => [...results, row]); 204 | localResults = [...localResults, row]; 205 | rowCount += 1; 206 | if (rowCount > testDataset.length) { 207 | setTestDataset((testDataset) => [ 208 | ...testDataset, 209 | { 210 | question: row.question, 211 | answer: row.answer, 212 | }, 213 | ]); 214 | } 215 | if (rowCount === data.evalQuestionsCount) { 216 | controller.abort(); 217 | } 218 | } catch (e) { 219 | console.warn("Error parsing data", e); 220 | } 221 | }, 222 | onclose() { 223 | console.log("Connection closed by the server"); 224 | setLoading(false); 225 | if (!rowCount) { 226 | throw new Error("No results were returned from the server."); 227 | } 228 | }, 229 | onerror(err) { 230 | console.log("There was an error from server", err); 231 | throw new Error(err); 232 | }, 233 | }); 234 | } catch (e) { 235 | notifications.show({ 236 | title: "Error", 237 | message: "There was an error from the server.", 238 | color: "red", 239 | }); 240 | setShouldShowProgress(false); 241 | setLoading(false); 242 | return; 243 | } 244 | setLoading(false); 245 | const avgAnswerScore = 246 | localResults.reduce((acc, curr) => acc + curr.answerScore.score, 0) / 247 | localResults.length; 248 | const avgRelevancyScore = 249 | localResults.reduce((acc, curr) => acc + curr.retrievalScore.score, 0) / 250 | localResults.length; 251 | const avgLatency = 252 | localResults.reduce((acc, curr) => acc + curr.latency, 0) / 253 | localResults.length; 254 | const newExperiment: Experiment = { 255 | evalQuestionsCount: data.evalQuestionsCount, 256 | chunkSize: data.chunkSize, 257 | overlap: data.overlap, 258 | splitMethod: data.splitMethod, 259 | retriever: data.retriever, 260 | embeddingAlgorithm: data.embeddingAlgorithm, 261 | model: data.model, 262 | gradingPrompt: data.gradingPrompt, 263 | numNeighbors: data.numNeighbors, 264 | avgRelevancyScore, 265 | avgAnswerScore, 266 | avgLatency, 267 | performance: avgAnswerScore / avgLatency, 268 | id: resetExpts ? 1 : experiments.length + 1, 269 | }; 270 | setExperiments((experiments) => 271 | resetExpts ? [newExperiment] : [...experiments, newExperiment] 272 | ); 273 | }); 274 | 275 | const runExperimentButtonLabel = experiments.length 276 | ? 
"Re-run experiment" 277 | : "Run Experiment"; 278 | 279 | const download = useCallback( 280 | (data: any[], filename: string) => { 281 | const parser = new Parser(); 282 | const csv = parser.parse(data); 283 | const blob = new Blob([csv], { type: "text/csv;charset=utf-8;" }); 284 | const url = URL.createObjectURL(blob); 285 | const link = document.createElement("a"); 286 | link.setAttribute("href", url); 287 | link.setAttribute("download", filename + ".csv"); 288 | link.style.visibility = "hidden"; 289 | document.body.appendChild(link); 290 | link.click(); 291 | document.body.removeChild(link); 292 | }, 293 | [results] 294 | ); 295 | 296 | const isFastGradingPrompt = gradingPromptStyle === "Fast"; 297 | const alertStyle = { backgroundColor: `rgba(193,194,197,0.38)` }; 298 | 299 | return ( 300 | 301 | } 303 | title="Instructions" 304 | style={alertStyle} 305 | > 306 | Upload a file (up to 50 MB) and choose the parameters for your QA 307 | chain. This evaluator will generate a test dataset of QA pairs and grade 308 | the performance of the QA chain. You can experiment with different 309 | parameters and evaluate the performance. 310 | 311 | 312 | { 316 | setValue("files", [...(getValues("files") ?? []), ...files]); 317 | setExperiments([]); 318 | setResults([]); 319 | setShouldShowProgress(false); 320 | setTestFilesDropzoneDisabled(false); 321 | setFileUploadDisabled(true); 322 | }} 323 | maxFiles={1} 324 | multiple={false} 325 | maxSize={MAX_FILE_SIZE_MB * 1024 ** 2} // 50 MB 326 | accept={[ 327 | MIME_TYPES.pdf, 328 | MIME_TYPES.docx, 329 | MIME_TYPES.doc, 330 | "text/plain", 331 | ]} 332 | onReject={(files) => { 333 | const errorCode = files?.[0]?.errors?.[0]?.code; 334 | let message = files?.[0]?.errors?.[0]?.message; 335 | switch (errorCode) { 336 | case DropZoneErrorCode.FileTooLarge: 337 | message = `File size too large. Max file size is ${MAX_FILE_SIZE_MB} MB.`; 338 | break; 339 | case DropZoneErrorCode.FileInvalidType: 340 | message = "File type not supported"; 341 | break; 342 | default: 343 | break; 344 | } 345 | notifications.show({ 346 | title: "Error", 347 | message, 348 | color: "red", 349 | }); 350 | }} 351 | // maxSize={3 * 1024 ** 2} 352 | style={{ width: "100%" }} 353 | > 354 | 355 | 356 | 365 | 366 | 367 | 372 | 373 | 374 | 375 | 376 |
377 | 378 | Upload Text for QA Eval 379 | 380 | 381 | {"Attach a file (.txt, .pdf, .doc, .docx)"} 382 | 383 |
384 |
385 |
386 | 391 |
392 | {!!watchFiles?.length && ( 393 | <> 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | {watchFiles?.map((file, id) => ( 403 | 404 | 405 | 406 | 407 | ))} 408 | 409 |
<th>File Name</th><th>Size (MB)</th>
<td>{file?.name}</td><td>{(file?.size / 1024 ** 2).toFixed(1)}</td>
410 | {!!testDataset.length && ( 411 | 412 | 419 | 420 | 421 | Test Dataset 422 | 423 | 431 | 446 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | {testDataset?.map((result: QAPair, index: number) => ( 469 | 470 | 471 | 472 | 473 | ))} 474 | 475 |
<th>Question</th><th>Answer</th>
<td>{result?.question}</td><td>{result?.answer}</td>
476 |
477 |
478 | )} 479 | 480 | 488 | 489 | 490 | )} 491 | {shouldShowProgress && ( 492 | 500 | )} 501 | {!!experiments.length && ( 502 | 503 | 511 | 512 | 513 | Summary 514 | 515 | 523 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | {experiments?.map((result: Experiment, index: number) => ( 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | ))} 574 | 575 |
<th>Experiment #</th><th># of Eval Questions</th><th>Chunk Size</th><th>Overlap</th><th>Split Method</th><th>Retriever</th><th>Embedding Algorithm</th><th>Model</th><th>Grading Prompt Style</th><th># of Chunks Retrieved</th><th>Avg Retrieval Relevancy Score</th><th>Avg Answer Similarity Score</th><th>Avg Latency (s)</th>
<td>{index + 1}</td><td>{result?.evalQuestionsCount}</td><td>{result?.chunkSize}</td><td>{result?.overlap}</td><td>{result?.splitMethod}</td><td>{result?.retriever}</td><td>{result?.embeddingAlgorithm}</td><td>{result?.model}</td><td>{result?.gradingPrompt}</td><td>{result?.numNeighbors}</td><td>{result?.avgRelevancyScore}</td><td>{result?.avgAnswerScore}</td><td>{result?.avgLatency.toFixed(3)}</td>
576 |
577 |
578 | 629 |
630 |
631 |
632 | )} 633 | {!isEmpty(results) ? ( 634 | 635 | 643 | 644 | 645 | Experiment Results 646 |
647 |
648 | 649 | 657 | 668 | 669 |
670 |
671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | {results?.map((result: Result, index: number) => ( 685 | 686 | 687 | 688 | 689 | 710 | 731 | 732 | 733 | ))} 734 | 735 |
<th>Question</th><th>Expected Answer</th><th>Observed Answer</th><th>Retrieval Relevancy Score</th><th>Answer Similarity Score</th><th>Latency (s)</th>
{result?.question}{result?.answer}{result?.result} 690 | {isFastGradingPrompt ? ( 691 | renderPassFail(result.retrievalScore) 692 | ) : ( 693 | 697 | Show less 698 | 699 | } 700 | showLabel={ 701 | 702 | Show more 703 | 704 | } 705 | > 706 | {result?.retrievalScore.justification} 707 | 708 | )} 709 | 711 | {isFastGradingPrompt ? ( 712 | renderPassFail(result?.answerScore) 713 | ) : ( 714 | 718 | Show less 719 | 720 | } 721 | showLabel={ 722 | 723 | Show more 724 | 725 | } 726 | > 727 | {result?.answerScore.justification} 728 | 729 | )} 730 | {result?.latency?.toFixed(3)}
736 |
737 |
738 |
739 | ) : null} 740 |
741 | ); 742 | }; 743 | export default Playground; 744 | -------------------------------------------------------------------------------- /nextjs/components/Sidebar.tsx: -------------------------------------------------------------------------------- 1 | import { ScrollArea, Select, Slider, Stack, Text } from "@mantine/core"; 2 | import React from "react"; 3 | import { Form } from "../utils/types"; 4 | import { Controller, useForm } from "react-hook-form"; 5 | 6 | const Sidebar = ({ form }: { form: Form }) => { 7 | const { control, setValue } = form; 8 | 9 | return ( 10 | <> 11 | 12 | Parameters 13 | 23 |
24 | Number of eval questions 25 | ( 29 | 42 | )} 43 | /> 44 |
45 |
46 | Chunk size 47 | ( 51 | 64 | )} 65 | /> 66 |
67 |
68 | Chunk overlap 69 | ( 73 | 86 | )} 87 | /> 88 |
89 |
90 | Model 91 | ( 95 | 127 | )} 128 | /> 129 |
130 |
131 | Embedding algorithm 132 | ( 136 | { 157 | field.onChange(value); 158 | if (value === "Anthropic-100k") { 159 | setValue("model", "anthropic"); 160 | setValue("splitMethod", ""); 161 | setValue("embeddingAlgorithm", ""); 162 | } 163 | }} 164 | data={[ 165 | { 166 | label: "Similarity Search", 167 | value: "similarity-search", 168 | }, 169 | { 170 | label: "SVM", 171 | value: "SVM", 172 | }, 173 | { label: "TF-IDF", value: "TF-IDF" }, 174 | { label: "Anthropic-100k", value: "Anthropic-100k" }, 175 | ]} 176 | /> 177 | )} 178 | /> 179 |
180 |
181 | Number of chunks to retrieve 182 | ( 186 | 198 | )} 199 | /> 200 |
201 |
202 | Grading prompt style 203 | ( 207 |