├── img ├── llmautoeval.png └── colab.svg ├── llm_autoeval ├── upload.py └── table.py ├── LICENSE ├── main.py ├── README.md └── runpod.sh /img/llmautoeval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlabonne/llm-autoeval/HEAD/img/llmautoeval.png -------------------------------------------------------------------------------- /llm_autoeval/upload.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | 5 | 6 | def upload_to_github_gist(text, gist_name, gh_token): 7 | # Create the gist content 8 | gist_content = { 9 | "public": str(os.getenv("PRIVATE_GIST", False)).lower(), 10 | "files": { 11 | f"{gist_name}": { # Change the file extension to .txt for plain text 12 | "content": text 13 | } 14 | }, 15 | } 16 | 17 | # Headers for the request 18 | headers = { 19 | "Authorization": f"token {gh_token}", 20 | "Accept": "application/vnd.github.v3+json", 21 | } 22 | 23 | # Make the request 24 | response = requests.post( 25 | "https://api.github.com/gists", headers=headers, json=gist_content 26 | ) 27 | 28 | if response.status_code == 201: 29 | print(f"Uploaded gist successfully! URL: {response.json()['html_url']}") 30 | else: 31 | print( 32 | f"Failed to upload gist. Status code: {response.status_code}. Response: {response.text}" 33 | ) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Teknium 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
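For reference, the gist upload implemented in `llm_autoeval/upload.py` above can also be called on its own — a minimal usage sketch, assuming a GitHub personal access token with the `gist` scope (the summary text, file name, and token value below are placeholders):

```python
from llm_autoeval.upload import upload_to_github_gist

# Any Markdown string works as the gist body; main.py passes the benchmark summary here.
summary = "### ARC\nAverage: 64.25%\n"

upload_to_github_gist(
    text=summary,
    gist_name="MyModel-Nous.md",  # hypothetical gist file name
    gh_token="ghp_xxxxxxxxxxxx",  # hypothetical token with the gist scope
)
```

Whether the gist ends up public or secret is governed by the `PRIVATE_GIST` environment variable, which the README still marks as W.I.P.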
-------------------------------------------------------------------------------- /img/colab.svg: -------------------------------------------------------------------------------- 1 | Open in ColabOpen in Colab 2 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import time 6 | 7 | from llm_autoeval.table import make_final_table, make_table 8 | from llm_autoeval.upload import upload_to_github_gist 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | MODEL_ID = os.getenv("MODEL_ID") 14 | BENCHMARK = os.getenv("BENCHMARK") 15 | GITHUB_API_TOKEN = os.getenv("GITHUB_API_TOKEN") 16 | 17 | 18 | def _make_autoeval_summary(directory: str, elapsed_time: float) -> str: 19 | # Variables 20 | tables = [] 21 | averages = [] 22 | 23 | # Tasks 24 | if BENCHMARK == "openllm": 25 | tasks = ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K"] 26 | elif BENCHMARK == "nous": 27 | tasks = ["AGIEval", "GPT4All", "TruthfulQA", "Bigbench"] 28 | elif BENCHMARK == "eq-bench": 29 | tasks = ["EQ-Bench"] 30 | else: 31 | raise NotImplementedError( 32 | f"The benchmark {BENCHMARK} could not be found." 33 | ) 34 | 35 | # Load results 36 | for task in tasks: 37 | file_path = f"{directory}/{task.lower()}.json" 38 | if os.path.exists(file_path): 39 | json_data = open(file_path, "r").read() 40 | data = json.loads(json_data, strict=False) 41 | table, average = make_table(data, task) 42 | else: 43 | table = "" 44 | average = "Error: File does not exist" 45 | 46 | tables.append(table) 47 | averages.append(average) 48 | 49 | # Generate tables 50 | summary = "" 51 | for index, task in enumerate(tasks): 52 | summary += f"### {task}\n{tables[index]}\nAverage: {averages[index]}%\n\n" 53 | result_dict = {k: v for k, v in zip(tasks, averages)} 54 | 55 | # Calculate the final average, excluding strings 56 | if all(isinstance(e, float) for e in averages): 57 | final_average = round(sum(averages) / len(averages), 2) 58 | summary += f"Average score: {final_average}%" 59 | result_dict.update({"Average": final_average}) 60 | else: 61 | summary += "Average score: Not available due to errors" 62 | 63 | # Generate final table 64 | final_table = make_final_table(result_dict, MODEL_ID) 65 | summary = final_table + "\n" + summary 66 | return summary 67 | 68 | 69 | def _get_result_dict(directory: str) -> dict: 70 | """Walk down directories to get JSON path""" 71 | 72 | for root, dirs, files in os.walk(directory): 73 | for file in files: 74 | if file.endswith(".json"): 75 | return json.load(open(os.path.join(root, file))) 76 | raise FileNotFoundError(f"No JSON file found in {directory}") 77 | 78 | 79 | def _make_lighteval_summary(directory: str, elapsed_time: float) -> str: 80 | from lighteval.evaluator import make_results_table 81 | 82 | result_dict = _get_result_dict(directory) 83 | final_table = make_results_table(result_dict) 84 | summary = f"## {MODEL_ID.split('/')[-1]} - {BENCHMARK.capitalize()}\n\n" 85 | summary += final_table 86 | return summary 87 | 88 | 89 | def main(directory: str, elapsed_time: float) -> None: 90 | # Tasks 91 | if BENCHMARK == "openllm" or BENCHMARK == "nous" or BENCHMARK == "eq-bench": 92 | summary = _make_autoeval_summary(directory, elapsed_time) 93 | elif BENCHMARK == "lighteval": 94 | summary = _make_lighteval_summary(directory, elapsed_time) 95 | else: 96 | raise 
NotImplementedError( 97 | f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})" 98 | ) 99 | 100 | # Add elapsed time 101 | convert = time.strftime("%H:%M:%S", time.gmtime(elapsed_time)) 102 | summary += f"\n\nElapsed time: {convert}" 103 | 104 | # Upload to GitHub Gist 105 | upload_to_github_gist( 106 | summary, 107 | f"{MODEL_ID.split('/')[-1]}-{BENCHMARK.capitalize()}.md", 108 | GITHUB_API_TOKEN, 109 | ) 110 | 111 | 112 | if __name__ == "__main__": 113 | # Create the parser 114 | parser = argparse.ArgumentParser(description="Summarize results and upload them.") 115 | parser.add_argument( 116 | "directory", type=str, help="The path to the directory with the JSON results" 117 | ) 118 | parser.add_argument( 119 | "elapsed_time", 120 | type=float, 121 | help="Elapsed time since the start of the evaluation", 122 | ) 123 | 124 | # Parse the arguments 125 | args = parser.parse_args() 126 | 127 | # Check if the directory exists 128 | if not os.path.isdir(args.directory): 129 | raise ValueError(f"The directory {args.directory} does not exist.") 130 | 131 | # Call the main function with the directory argument 132 | main(args.directory, args.elapsed_time) 133 | -------------------------------------------------------------------------------- /llm_autoeval/table.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | 4 | from pytablewriter import MarkdownTableWriter 5 | 6 | BENCHMARK = os.getenv("BENCHMARK") 7 | 8 | 9 | def get_acc_norm(data): 10 | accs = [ 11 | data["results"][k]["acc_norm"] 12 | if "acc_norm" in data["results"][k] 13 | else data["results"][k]["acc"] 14 | for k in data["results"] 15 | ] 16 | acc = sum(accs) / len(accs) * 100 17 | return acc 18 | 19 | 20 | def get_mcg(data): 21 | accs = [data["results"][k]["multiple_choice_grade"] for k in data["results"]] 22 | acc = sum(accs) / len(accs) * 100 23 | return acc 24 | 25 | 26 | def calculate_average(data, task): 27 | task = task.lower() 28 | print(data) 29 | 30 | if BENCHMARK == "openllm": 31 | if task == "arc": 32 | return data["results"]["arc_challenge"]["acc_norm,none"] * 100 33 | elif task == "hellaswag": 34 | return data["results"]["hellaswag"]["acc_norm,none"] * 100 35 | elif task == "mmlu": 36 | return data["results"]["mmlu"]["acc,none"] * 100 37 | elif task == "truthfulqa": 38 | value = data["results"]["truthfulqa_mc2"]["acc,none"] 39 | return 0.0 if math.isnan(value) else value * 100 40 | elif task == "winogrande": 41 | return data["results"]["winogrande"]["acc,none"] * 100 42 | elif task == "gsm8k": 43 | return data["results"]["gsm8k"]["exact_match,strict-match"] * 100 44 | 45 | elif BENCHMARK == "nous": 46 | if task == "agieval": 47 | return get_acc_norm(data) 48 | elif task == "gpt4all": 49 | return get_acc_norm(data) 50 | elif task == "bigbench": 51 | return get_mcg(data) 52 | elif task == "truthfulqa": 53 | value = data["results"]["truthfulqa_mc"]["mc2"] 54 | return 0.0 if math.isnan(value) else value * 100 55 | 56 | elif BENCHMARK == "eq-bench": 57 | if task == "eq-bench": 58 | return data["results"]["eq_bench"]["eqbench,none"] 59 | 60 | raise NotImplementedError(f"Could not find task {task} for benchmark {BENCHMARK}") 61 | 62 | 63 | def make_table(result_dict, task): 64 | """Generate table of results.""" 65 | # TODO: properly format values in table for openllm 66 | 67 | md_writer = MarkdownTableWriter() 68 | md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] 69 | 70 | values = [] 71 | 72 | for k, dic in 
sorted(result_dict["results"].items()): 73 | version = result_dict["versions"].get(k, "N/A") 74 | 75 | percent = k == "squad2" 76 | for m, v in dic.items(): 77 | if m.endswith("_stderr"): 78 | continue 79 | 80 | if m + "_stderr" in dic: 81 | se = dic[m + "_stderr"] 82 | if percent or m == "ppl": 83 | values.append([k, version, m, "%.2f" % v, "±", "%.2f" % se]) 84 | else: 85 | values.append( 86 | [k, version, m, "%.2f" % (v * 100), "±", "%.2f" % (se * 100)] 87 | ) 88 | else: 89 | if percent or m == "ppl": 90 | values.append([k, version, m, "%.2f" % v, "", ""]) 91 | else: 92 | try: 93 | # Attempt to convert v to a float 94 | v_converted = float(v) 95 | v_formatted = "%.2f" % v_converted 96 | except ValueError: 97 | # If conversion fails, use the original string value 98 | v_formatted = v 99 | 100 | values.append([k, version, m, v_formatted, "", ""]) 101 | 102 | k = "" 103 | version = "" 104 | 105 | md_writer.value_matrix = values 106 | 107 | # Get average score 108 | average = round(calculate_average(result_dict, task), 2) 109 | 110 | return md_writer.dumps(), average 111 | 112 | 113 | def make_final_table(result_dict, model_name): 114 | """Generate table of results with model name. 115 | 116 | Args: 117 | result_dict (dict): A dictionary where keys are headers and values are the values in the table. 118 | model_name (str): The name of the model to be included in the table. 119 | 120 | Returns: 121 | str: A string representing the markdown table. 122 | """ 123 | md_writer = MarkdownTableWriter() 124 | # Add 'Model' as the first header and then the rest from the dictionary keys 125 | md_writer.headers = ["Model"] + list(result_dict.keys()) 126 | 127 | # The values in the table will be the model name and then the values from the dictionary 128 | values = [ 129 | f"[{model_name.split('/')[-1]}](https://huggingface.co/{model_name})" 130 | ] + list(result_dict.values()) 131 | 132 | # The table only has one row of values 133 | md_writer.value_matrix = [values] 134 | 135 | # Return the table as a markdown formatted string 136 | return md_writer.dumps() 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | # 🧐 LLM AutoEval
3 | 
4 | 🐦 Follow me on X • 🤗 Hugging Face • 💻 Blog • 📙 Hands-on GNN
5 | 
6 | Simplify LLM evaluation using a convenient Colab notebook.
7 | 
8 | [Open In Colab](https://colab.research.google.com/drive/1Igs3WZuXAIv9X0vwqiE90QlEPys8e8Oa?usp=sharing)
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 
17 | 18 | ## 🔍 Overview 19 | 20 | LLM AutoEval **simplifies the process of evaluating LLMs** using a convenient [Colab notebook](https://colab.research.google.com/drive/1Igs3WZuXAIv9X0vwqiE90QlEPys8e8Oa?usp=sharing). You just need to specify the name of your model, a benchmark, a GPU, and press run! 21 | 22 | ### Key Features 23 | 24 | * Automated setup and execution using [RunPod](https://runpod.io?ref=9nvk2srl). 25 | * Customizable evaluation parameters for tailored benchmarking. 26 | * Summary generation and upload to [GitHub Gist](https://gist.github.com/) for easy sharing and reference. 27 | 28 | [View a sample summary here.](https://gist.github.com/mlabonne/1d33c86824b3a11d2308e36db1ba41c1) 29 | 30 | *Note: This project is in the early stages and primarily designed for personal use. Use it carefully and feel free to contribute.* 31 | 32 | ## ⚡ Quick Start 33 | 34 | ### Evaluation 35 | 36 | * **`MODEL_ID`**: Enter the model id from Hugging Face. 37 | * **`BENCHMARK`**: 38 | * `nous`: List of tasks: AGIEval, GPT4ALL, TruthfulQA, and Bigbench (popularized by [Teknium](https://github.com/teknium1) and [NousResearch](https://github.com/NousResearch)). This is recommended. 39 | * `lighteval`: This is a [new library](https://github.com/huggingface/lighteval) from Hugging Face. It allows you to specify your tasks as shown in the readme. Check the list of [recommended tasks](https://github.com/huggingface/lighteval/blob/main/examples/tasks/recommended_set.txt) to see what you can use (e.g., HELM, PIQA, GSM8K, MATH, etc.) 40 | * `openllm`: List of tasks: ARC, HellaSwag, MMLU, Winogrande, GSM8K, and TruthfulQA (like the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). It uses the [vllm](https://docs.vllm.ai/) implementation to enhance speed (note that the results will not be identical to those obtained without using vllm). "mmlu" is currently missing because of a problem with vllm. 41 | * **`LIGHTEVAL_TASK`**: You can select one or several tasks as specified in the [readme](https://github.com/huggingface/lighteval?tab=readme-ov-file#usage) or in the list of [recommended tasks](https://github.com/huggingface/lighteval/blob/main/examples/tasks/recommended_set.txt). 42 | 43 | ### Cloud GPU 44 | 45 | * **`GPU`**: Select the GPU you want for evaluation (see prices [here](https://www.runpod.io/console/gpu-cloud)). I recommend using beefy GPUs (RTX 3090 or higher), especially for the Open LLM benchmark suite. 46 | * **`Number of GPUs`**: Self-explanatory (more cost-efficient than bigger GPUs if you need more VRAM). 47 | * **`CONTAINER_DISK`**: Size of the disk in GB. 48 | * **`CLOUD_TYPE`**: RunPod offers a community cloud (cheaper) and a secure cloud (more reliable). 49 | * **`REPO`**: If you made a fork of this repo, you can specify its URL here (the image only runs `runpod.sh`). 50 | * **`TRUST_REMOTE_CODE`**: Models like Phi require this flag to run them. 51 | * **`PRIVATE_GIST`**: (W.I.P.) Make the Gist with the results private (true) or public (false). 52 | * **`DEBUG`**: The pod will not be destroyed at the end of the run (not recommended). 53 | 54 | ### Tokens 55 | 56 | Tokens use Colab's Secrets tab. Create two secrets called "runpod" and "github" and add the corresponding tokens you can find as follows: 57 | 58 | * **`RUNPOD_TOKEN`**: Please consider using my [referral link](https://runpod.io?ref=9nvk2srl) if you don't have an account yet. 
You can create your token [here](https://www.runpod.io/console/user/settings) under "API keys" (read & write permission). You'll also need to transfer some money there to start a pod.
59 | * **`GITHUB_TOKEN`**: You can create your token [here](https://github.com/settings/tokens) (read & write, can be restricted to "gist" only).
60 | * **`HF_TOKEN`**: Optional. You can find your Hugging Face token [here](https://huggingface.co/settings/tokens) if you have an account.
61 | 
62 | ## 📊 Benchmark suites
63 | 
64 | ### Nous
65 | 
66 | You can compare your results with:
67 | * [YALL - Yet Another LLM Leaderboard](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard), my leaderboard made with the gists produced by LLM AutoEval.
68 | * Models like [OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B#benchmark-results), [Nous-Hermes-2-SOLAR-10.7B](https://huggingface.co/NousResearch/Nous-Hermes-2-SOLAR-10.7B), or [Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B).
69 | * Teknium stores his evaluations in his [LLM-Benchmark-Logs](https://github.com/teknium1/LLM-Benchmark-Logs).
70 | 
71 | ### Lighteval
72 | 
73 | You can compare your results on a case-by-case basis, depending on the tasks you have selected.
74 | 
75 | ### Open LLM
76 | 
77 | You can compare your results with those listed on the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
78 | 
79 | ## 🏆 Leaderboard
80 | 
81 | I use the summaries produced by LLM AutoEval to create [YALL - Yet Another LLM Leaderboard](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard) with plots as follows:
82 | 
83 | ![image](https://github.com/mlabonne/llm-autoeval/assets/81252890/a9a7e24a-ee29-4c8f-b587-4549d16bf142)
84 | 
85 | Let me know if you're interested in creating your own leaderboard from your gists in one click: this space can easily be converted into a small notebook for that purpose.
86 | 
87 | ## 🛠️ Troubleshooting
88 | 
89 | * **"Error: File does not exist"**: This task didn't produce the JSON file that is parsed for the summary. Activate debug mode and rerun the evaluation to inspect the issue in the logs.
90 | * **"700 Killed" Error**: The hardware is not powerful enough for the evaluation. This happens, for example, when you try to run the Open LLM benchmark suite on an RTX 3070.
91 | * **Outdated CUDA Drivers**: That's unlucky. You'll need to start a new pod in this case.
92 | * **"triu_tril_cuda_template" not implemented for 'BFloat16'**: Switch the image as explained in [this issue](https://github.com/mlabonne/llm-autoeval/issues/22).
93 | 
94 | ## Acknowledgements
95 | 
96 | 
97 | 
98 | Special thanks to [burtenshaw](https://github.com/burtenshaw) for integrating lighteval, EleutherAI for the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness), [dmahan93](https://github.com/dmahan93) for his fork that adds agieval to the lm-evaluation-harness, Hugging Face for the [lighteval](https://github.com/huggingface/lighteval) library, [NousResearch](https://github.com/NousResearch) and [Teknium](https://github.com/teknium1) for the Nous benchmark suite, and
99 | [vllm](https://docs.vllm.ai/) for the additional inference speed.
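If you need to regenerate a summary outside the notebook (for example while debugging a failed run), the summary-and-upload step in `main.py` can be invoked directly — a minimal sketch, assuming the benchmark JSON files already sit in a local `./results` directory; the model id and token values below are placeholders:

```python
import os
import subprocess

# main.py reads MODEL_ID, BENCHMARK and GITHUB_API_TOKEN from the environment;
# its two positional arguments are the results directory and the elapsed time in seconds.
env = dict(
    os.environ,
    MODEL_ID="your-org/your-model",       # placeholder Hugging Face model id
    BENCHMARK="nous",                     # openllm, nous, lighteval or eq-bench
    GITHUB_API_TOKEN="ghp_xxxxxxxxxxxx",  # placeholder token with the gist scope
)
subprocess.run(["python", "main.py", "./results", "3600"], env=env, check=True)
```

The tables are assembled by `llm_autoeval/table.py` and uploaded as a Markdown gist named after the model and the benchmark.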
100 | 
--------------------------------------------------------------------------------
/runpod.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | start=$(date +%s)
4 | 
5 | # Detect the number of NVIDIA GPUs and create a device string
6 | gpu_count=$(nvidia-smi -L | wc -l)
7 | if [ $gpu_count -eq 0 ]; then
8 |   echo "No NVIDIA GPUs detected. Exiting."
9 |   exit 1
10 | fi
11 | # Construct the CUDA device string
12 | cuda_devices=""
13 | for ((i=0; i