├── img
│   ├── llmautoeval.png
│   └── colab.svg
├── llm_autoeval
│   ├── upload.py
│   └── table.py
├── LICENSE
├── main.py
├── README.md
└── runpod.sh

/img/llmautoeval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlabonne/llm-autoeval/HEAD/img/llmautoeval.png

--------------------------------------------------------------------------------
/llm_autoeval/upload.py:
--------------------------------------------------------------------------------
import os

import requests


def upload_to_github_gist(text, gist_name, gh_token):
    # Create the gist content (set PRIVATE_GIST=true to create a private gist)
    is_private = str(os.getenv("PRIVATE_GIST", "false")).lower() in ("true", "1")
    gist_content = {
        "public": not is_private,
        "files": {
            gist_name: {  # The file name controls rendering (e.g. .md renders as Markdown)
                "content": text
            }
        },
    }

    # Headers for the request
    headers = {
        "Authorization": f"token {gh_token}",
        "Accept": "application/vnd.github.v3+json",
    }

    # Make the request
    response = requests.post(
        "https://api.github.com/gists", headers=headers, json=gist_content
    )

    if response.status_code == 201:
        print(f"Uploaded gist successfully! URL: {response.json()['html_url']}")
    else:
        print(
            f"Failed to upload gist. Status code: {response.status_code}. Response: {response.text}"
        )

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Teknium

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
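For reference, a minimal usage sketch for `upload_to_github_gist` (defined in `/llm_autoeval/upload.py` above). It assumes the `llm_autoeval` package is on the Python path and that a GitHub token with the `gist` scope is available in `GITHUB_API_TOKEN`; the text and file name are placeholders.

```python
import os

from llm_autoeval.upload import upload_to_github_gist

# Hypothetical standalone call: requires a valid token and network access,
# otherwise the function simply prints the failure response.
gh_token = os.getenv("GITHUB_API_TOKEN", "")
upload_to_github_gist(
    text="### Example\nHello from llm-autoeval!",
    gist_name="example-summary.md",  # .md so the gist renders as Markdown
    gh_token=gh_token,
)
```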
--------------------------------------------------------------------------------
/img/colab.svg:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import argparse
import json
import logging
import os
import time

from llm_autoeval.table import make_final_table, make_table
from llm_autoeval.upload import upload_to_github_gist

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_ID = os.getenv("MODEL_ID")
BENCHMARK = os.getenv("BENCHMARK")
GITHUB_API_TOKEN = os.getenv("GITHUB_API_TOKEN")


def _make_autoeval_summary(directory: str, elapsed_time: float) -> str:
    # Variables
    tables = []
    averages = []

    # Tasks
    if BENCHMARK == "openllm":
        tasks = ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K"]
    elif BENCHMARK == "nous":
        tasks = ["AGIEval", "GPT4All", "TruthfulQA", "Bigbench"]
    elif BENCHMARK == "eq-bench":
        tasks = ["EQ-Bench"]
    else:
        raise NotImplementedError(
            f"The benchmark {BENCHMARK} could not be found."
        )

    # Load results
    for task in tasks:
        file_path = f"{directory}/{task.lower()}.json"
        if os.path.exists(file_path):
            with open(file_path, "r") as f:
                data = json.loads(f.read(), strict=False)
            table, average = make_table(data, task)
        else:
            table = ""
            average = "Error: File does not exist"

        tables.append(table)
        averages.append(average)

    # Generate tables
    summary = ""
    for index, task in enumerate(tasks):
        summary += f"### {task}\n{tables[index]}\nAverage: {averages[index]}%\n\n"
    result_dict = {k: v for k, v in zip(tasks, averages)}

    # Calculate the final average, excluding strings
    if all(isinstance(e, float) for e in averages):
        final_average = round(sum(averages) / len(averages), 2)
        summary += f"Average score: {final_average}%"
        result_dict.update({"Average": final_average})
    else:
        summary += "Average score: Not available due to errors"

    # Generate final table
    final_table = make_final_table(result_dict, MODEL_ID)
    summary = final_table + "\n" + summary
    return summary


def _get_result_dict(directory: str) -> dict:
    """Walk the directory tree and load the first JSON results file found."""
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file)) as f:
                    return json.load(f)
    raise FileNotFoundError(f"No JSON file found in {directory}")


def _make_lighteval_summary(directory: str, elapsed_time: float) -> str:
    from lighteval.evaluator import make_results_table

    result_dict = _get_result_dict(directory)
    final_table = make_results_table(result_dict)
    summary = f"## {MODEL_ID.split('/')[-1]} - {BENCHMARK.capitalize()}\n\n"
    summary += final_table
    return summary


def main(directory: str, elapsed_time: float) -> None:
    # Tasks
    if BENCHMARK in ("openllm", "nous", "eq-bench"):
        summary = _make_autoeval_summary(directory, elapsed_time)
    elif BENCHMARK == "lighteval":
        summary = _make_lighteval_summary(directory, elapsed_time)
    else:
        raise NotImplementedError(
            f"BENCHMARK should be 'openllm', 'nous', 'eq-bench', or 'lighteval' "
            f"(current value = {BENCHMARK})"
        )

    # Add elapsed time
    convert = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
    summary += f"\n\nElapsed time: {convert}"

    # Upload to GitHub Gist
    upload_to_github_gist(
        summary,
        f"{MODEL_ID.split('/')[-1]}-{BENCHMARK.capitalize()}.md",
        GITHUB_API_TOKEN,
    )


if __name__ == "__main__":
    # Create the parser
    parser = argparse.ArgumentParser(description="Summarize results and upload them.")
    parser.add_argument(
        "directory", type=str, help="The path to the directory with the JSON results"
    )
    parser.add_argument(
        "elapsed_time",
        type=float,
        help="Elapsed time since the start of the evaluation",
    )

    # Parse the arguments
    args = parser.parse_args()

    # Check if the directory exists
    if not os.path.isdir(args.directory):
        raise ValueError(f"The directory {args.directory} does not exist.")

    # Call the main function with the directory argument
    main(args.directory, args.elapsed_time)

--------------------------------------------------------------------------------
/llm_autoeval/table.py:
--------------------------------------------------------------------------------
import math
import os

from pytablewriter import MarkdownTableWriter

BENCHMARK = os.getenv("BENCHMARK")


def get_acc_norm(data):
    accs = [
        data["results"][k]["acc_norm"]
        if "acc_norm" in data["results"][k]
        else data["results"][k]["acc"]
        for k in data["results"]
    ]
    acc = sum(accs) / len(accs) * 100
    return acc


def get_mcg(data):
    accs = [data["results"][k]["multiple_choice_grade"] for k in data["results"]]
    acc = sum(accs) / len(accs) * 100
    return acc


def calculate_average(data, task):
    task = task.lower()

    if BENCHMARK == "openllm":
        if task == "arc":
            return data["results"]["arc_challenge"]["acc_norm,none"] * 100
        elif task == "hellaswag":
            return data["results"]["hellaswag"]["acc_norm,none"] * 100
        elif task == "mmlu":
            return data["results"]["mmlu"]["acc,none"] * 100
        elif task == "truthfulqa":
            value = data["results"]["truthfulqa_mc2"]["acc,none"]
            return 0.0 if math.isnan(value) else value * 100
        elif task == "winogrande":
            return data["results"]["winogrande"]["acc,none"] * 100
        elif task == "gsm8k":
            return data["results"]["gsm8k"]["exact_match,strict-match"] * 100

    elif BENCHMARK == "nous":
        if task == "agieval":
            return get_acc_norm(data)
        elif task == "gpt4all":
            return get_acc_norm(data)
        elif task == "bigbench":
            return get_mcg(data)
        elif task == "truthfulqa":
            value = data["results"]["truthfulqa_mc"]["mc2"]
            return 0.0 if math.isnan(value) else value * 100

    elif BENCHMARK == "eq-bench":
        if task == "eq-bench":
            return data["results"]["eq_bench"]["eqbench,none"]

    raise NotImplementedError(f"Could not find task {task} for benchmark {BENCHMARK}")


def make_table(result_dict, task):
    """Generate table of results."""
    # TODO: properly format values in table for openllm

    md_writer = MarkdownTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    values = []

    for k, dic in sorted(result_dict["results"].items()):
        version = result_dict["versions"].get(k, "N/A")

        percent = k == "squad2"
        for m, v in dic.items():
            if m.endswith("_stderr"):
                continue

            if m + "_stderr" in dic:
                se = dic[m + "_stderr"]
                if percent or m == "ppl":
                    values.append([k, version, m, "%.2f" % v, "±", "%.2f" % se])
                else:
                    values.append(
                        [k, version, m, "%.2f" % (v * 100), "±", "%.2f" % (se * 100)]
                    )
            else:
                if percent or m == "ppl":
                    values.append([k, version, m, "%.2f" % v, "", ""])
                else:
                    try:
                        # Attempt to convert v to a float
                        v_converted = float(v)
                        v_formatted = "%.2f" % v_converted
                    except ValueError:
                        # If conversion fails, use the original string value
                        v_formatted = v

                    values.append([k, version, m, v_formatted, "", ""])

            k = ""
            version = ""

    md_writer.value_matrix = values

    # Get average score
    average = round(calculate_average(result_dict, task), 2)

    return md_writer.dumps(), average


def make_final_table(result_dict, model_name):
    """Generate table of results with model name.

    Args:
        result_dict (dict): A dictionary where keys are headers and values are the values in the table.
        model_name (str): The name of the model to be included in the table.

    Returns:
        str: A string representing the markdown table.
    """
    md_writer = MarkdownTableWriter()
    # Add 'Model' as the first header and then the rest from the dictionary keys
    md_writer.headers = ["Model"] + list(result_dict.keys())

    # The values in the table will be the model name and then the values from the dictionary
    values = [
        f"[{model_name.split('/')[-1]}](https://huggingface.co/{model_name})"
    ] + list(result_dict.values())

    # The table only has one row of values
    md_writer.value_matrix = [values]

    # Return the table as a markdown formatted string
    return md_writer.dumps()
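As a quick illustration of the table helpers above, the sketch below builds the one-row summary table that `make_final_table` produces. It assumes `pytablewriter` is installed and the `llm_autoeval` package is importable; the model name and scores are made-up placeholders, keyed by task name the way `_make_autoeval_summary` in `main.py` builds its result dictionary.

```python
from llm_autoeval.table import make_final_table

# Illustrative, made-up scores keyed by task name (plus the computed "Average").
scores = {
    "AGIEval": 44.12,
    "GPT4All": 73.45,
    "TruthfulQA": 58.33,
    "Bigbench": 42.97,
    "Average": 54.72,
}

# Prints a Markdown table whose first column links to the model card on the Hugging Face Hub.
print(make_final_table(scores, "example-org/example-model"))
```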
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
🐦 Follow me on X • 🤗 Hugging Face • 💻 Blog • 📙 Hands-on GNN

Simplify LLM evaluation using a convenient Colab notebook.