├── .DS_Store
├── price-graph.png
├── lmsys-compairson.png
├── aidan-bench-scores.png
├── models.py
├── prompts.py
├── README.md
└── main.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/.DS_Store
--------------------------------------------------------------------------------
/price-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/price-graph.png
--------------------------------------------------------------------------------
/lmsys-compairson.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/lmsys-compairson.png
--------------------------------------------------------------------------------
/aidan-bench-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/aidan-bench-scores.png
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
from openai import OpenAI
import os
from functools import lru_cache
from retry import retry


@retry(tries=3)
def chat_with_model(prompt, model, max_tokens=4000, temperature=0):
    # Chat completions are routed through OpenRouter so any supported model can be benchmarked.
    client = OpenAI(
        api_key=os.getenv("OPEN_ROUTER_KEY"),
        base_url="https://openrouter.ai/api/v1"
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )
    return response.choices[0].message.content


@lru_cache(maxsize=10000)
@retry(tries=3)
def embed(text):
    # Embeddings go directly to OpenAI; lru_cache avoids re-embedding the same answer twice.
    client = OpenAI()

    response = client.embeddings.create(
        model="text-embedding-3-large", input=[text])
    return response.data[0].embedding
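
# Illustrative usage of the helpers above (assumes OPEN_ROUTER_KEY and OPENAI_API_KEY are set):
#   answer = chat_with_model("Why did Rome fall?", model="openai/gpt-4o-mini")
#   vector = embed(answer)  # a list of floats; cached, so repeated texts are only embedded once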

--------------------------------------------------------------------------------
/prompts.py:
--------------------------------------------------------------------------------
# Questions should be open-ended but demand concrete answers.
questions = [
    "Provide an explanation for Japan's Lost Decades.",
    "What is a cause of World War 1?",
    "Why might the United States government nationalize ASI development?",
    "How might you use a brick and a blanket?",
    "What architectural features might you include in a tasteful house?",
    "Provide coordinates for a point inside the unit circle (x^2 + y^2 < 1).",
    "What's one way to use oregano?",
    "How might we enable LLMs to spend more output tokens to get predictably better results?",
    "Propose a solution to Los Angeles traffic.",
    "What activities might I include at a party for firefighters?",
    "Why did Rome fall?",
    "How could we redesign schools to better prepare students for the 22nd century?",
    "Find a solution to the inequality 2x + 3y < 10 where x and y are positive real numbers.",
    "What might be an unexpected consequence of achieving nuclear fusion?",
    "Describe a plausible alien life form that doesn't rely on carbon-based biology.",
    "How could we modify the rules of chess to make it more exciting for spectators?",
    "What would be the implications of a universal basic income on society?",
    "Propose an alternative to democracy for governing a country.",
    "Provide a real number greater than π but less than 4.",
    "How might we terraform Venus instead of Mars, and why?",
    "Design a new sport that combines elements of three existing sports.",
    "What could be a novel use for blockchain technology outside of cryptocurrency?",
    "How might human evolution be affected by long-term space colonization?",
    "Invent a new musical instrument and describe how it would be played.",
    "What might be an unexpected solution to reducing plastic waste in oceans?",
]


def create_gen_prompt(question: str, previous_answers: list) -> str:
    prompt = (
        "Answer the following question:\n"
        f"{question}\n"
        "Your response should be a single brief sentence.\n"
    )

    if len(previous_answers) > 0:
        previous_answers_str = "\n".join(
            [f"{i+1}. {answer}" for i, answer in enumerate(previous_answers)]
        )

        # Wrap prior answers in XML tags so the model can tell them apart from the question.
        prompt += (
            "IMPORTANT: Provide an answer you *HAVE NOT* given previously.\n"
            "Your previous answers are inside of <previous_answers></previous_answers> XML tags.\n"
            f"<previous_answers>\n{previous_answers_str}\n</previous_answers>"
        )

    return prompt


def create_judge_prompt(question: str, answer: str):
    prompt = f"""Your task is to evaluate the coherence and plausibility of an answer to a given question. This involves assessing whether the answer makes sense and isn't nonsensical or implausible.

Question: {question}
Answer: {answer}

Evaluation process:
1. Understand the question: Analyze what the question is asking.
2. Assess the answer: Determine if the answer is coherent and plausible.
3. Check for nonsensical elements: Identify any aspects that are completely unrelated or absurd.

Please think through each step carefully and show your reasoning:

1. Question analysis:
[Your brief analysis of the question here]

2. Answer assessment:
[Evaluate if the answer is coherent and plausible]

3. Nonsensical check:
[Identify any completely unrelated or absurd elements]

Based on your analysis, provide a final Coherence and Plausibility Score on a scale of 1 - 10, where:
1-3: Incoherent, implausible, or nonsensical
4-6: Partially coherent and plausible, but with some issues
7-8: Mostly coherent and plausible with minor issues
9-10: Highly coherent and plausible

Ensure that nonsensical or completely implausible answers receive very low scores (1-3).

IMPORTANT: After your reasoning, you must provide your final Coherence and Plausibility Score as a single integer between 1 and 10, enclosed in <coherence_score></coherence_score> XML tags. For example:
<coherence_score>7</coherence_score>

Your response must end with this score in the specified format.
"""
    return prompt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Aidan Bench
Some models feel competent despite underperforming on benchmarks like MMLU, GPQA, MATH, or NIAH.

*Aidan Bench* rewards:

1. Creativity
2. Reliability
3. Contextual attention
4. Instruction following

**Aidan Bench correlates only weakly with Lmsys, reveals surprisingly poor GPT-4o performance, and shows surprisingly strong Mistral Large 2 performance.**

# Methodology

We give LLMs a set of open-ended questions like the following:

```python
"Provide an explanation for Japan's Lost Decades.",
"How might you use a brick and a blanket?",
"What architectural features might you include in a tasteful house?",
"Provide coordinates for a point inside the unit circle (x^2 + y^2 < 1).",
"Propose a solution to Los Angeles traffic.",
"What activities might I include at a party for firefighters?",
"How could we redesign schools to better prepare students for the 22nd century?",
```

We then ask the model to answer each question repeatedly while **avoiding its previous answers**, which are provided in-context.

For each question, we keep generating answers until either:

1. An answer is clearly incoherent (as judged by another LLM), or
2. An answer is too similar to one of its previous answers (as judged by an embedding model).

We sum each model's novelty scores across all questions. An answer's novelty is one minus its maximum cosine similarity to any previous answer:

$$
\text{max}\text{-}\text{dissimilarity} = 1 - \max_{e_i \in E_\text{prev}} \frac{e_\text{new} \cdot e_i}{\|e_\text{new}\| \|e_i\|}
$$

where:

- $e_\text{new}$: embedding vector of the new answer
- $E_\text{prev}$: set of embedding vectors for previous answers, $\{e_1, e_2, ..., e_n\}$
- $e_i$: an individual embedding vector from $E_\text{prev}$
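
For concreteness, here is a minimal sketch of that per-answer novelty computation (the function name and the embedding-vector interface are illustrative; `get_novelty_score` in `main.py` applies the same cosine-similarity logic to raw answer strings):

```python
import numpy as np

def max_dissimilarity(new_embedding, previous_embeddings):
    # Novelty = 1 minus the highest cosine similarity to any previous answer.
    if not previous_embeddings:
        return 1.0  # the first answer to a question is maximally novel
    similarities = [
        np.dot(new_embedding, prev) / (np.linalg.norm(new_embedding) * np.linalg.norm(prev))
        for prev in previous_embeddings
    ]
    return 1 - max(similarities)
```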

# Findings

Here are the final novelty scores across models:

![Novelty scores across models](aidan-bench-scores.png)

Notable results:

1. `Mistral Large 2` wins this benchmark, scoring 25% higher than `Claude 3.5 Sonnet`, the runner-up.
2. OpenAI's `GPT-4o` substantially underperforms similarly priced models, and even its cheaper sibling, `GPT-4o-mini`.
3. OpenAI's `GPT-4o-mini` punches well above its price class, rivaling much more expensive models like `Llama 3.1 405b`.

We also include a comparison between Aidan Bench scores and Lmsys scores. Notably, the correlation between the two benchmarks is weak (r = 0.188).

![Comparison of Aidan Bench and Lmsys scores](lmsys-compairson.png)

We also compare each model's Aidan Bench score to its (input) token pricing:

![Comparison of Aidan Bench scores and token pricing](price-graph.png)

OpenAI's `GPT-4o-mini` and `Mistral Large 2` have outlier efficiency.

## Setup

### Prerequisites

Ensure you have Python installed on your system. This project requires the following libraries:

- numpy
- openai
- colorama
- retry

### Installation

1. Clone the repository:
   ```
   git clone https://github.com/aidanmclaughlin/Aidan-Bench.git
   cd Aidan-Bench
   ```

2. Install the required libraries:
   ```
   pip install numpy openai colorama retry
   ```

3. Set up your API keys:
   - Create an environment variable named `OPEN_ROUTER_KEY` with your OpenRouter API key.
   - Create an environment variable named `OPENAI_API_KEY` with your OpenAI API key.

### Running the Project

To run the benchmark:

```
python main.py <model_name> [--single-threaded]
```

Arguments:
- `<model_name>`: (Required) Name of the model to benchmark
- `--single-threaded`: (Optional) Run in single-threaded mode

Examples:

1. To run the benchmark for GPT-4 Turbo in multithreaded mode (default):
   ```
   python main.py openai/gpt-4-turbo
   ```

2. To run the benchmark for Claude 3 Sonnet in single-threaded mode:
   ```
   python main.py anthropic/claude-3-sonnet --single-threaded
   ```

The script runs the benchmark with the specified model; by default it uses multithreaded mode, unless the `--single-threaded` flag is provided.

### API Keys

This project requires two different API keys:

1. OpenRouter API key: Used for chat completions with various models.
2. OpenAI API key: Used for embedding text.

Make sure both keys are set up as environment variables before running the project.
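
As a quick, illustrative sanity check (not part of the repository), something like the following confirms both variables are visible to Python before starting a run:

```python
import os

# Key names as read in models.py: OpenRouter for chat completions, OpenAI for embeddings.
for key in ("OPEN_ROUTER_KEY", "OPENAI_API_KEY"):
    if not os.getenv(key):
        raise SystemExit(f"Missing environment variable: {key}")
```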

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import numpy as np
from models import chat_with_model, embed
from prompts import questions, create_gen_prompt, create_judge_prompt
from colorama import Fore, Style
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser(description="Benchmark a language model.")
    parser.add_argument("model_name", type=str, help="Name of the model to benchmark")
    parser.add_argument("--single-threaded", action="store_true", help="Run in single-threaded mode")
    return parser.parse_args()


def benchmark_model(model_name, multithreaded=False):
    if multithreaded:
        return benchmark_model_multithreaded(model_name)
    else:
        return benchmark_model_sequential(model_name)


def process_question(question, model_name):
    start_time = time.time()
    print(Fore.RED + question + Style.RESET_ALL)
    previous_answers = []
    question_novelty = 0

    try:
        while True:
            gen_prompt = create_gen_prompt(question, previous_answers)
            try:
                new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
            except Exception as e:
                print(Fore.RED + f"Error generating answer: {str(e)}" + Style.RESET_ALL)
                break

            judge_prompt = create_judge_prompt(question, new_answer)
            judge = "openai/gpt-4o-mini"  # coherence judge, called via OpenRouter
            try:
                judge_response = chat_with_model(prompt=judge_prompt, model=judge)
            except Exception as e:
                print(Fore.RED + f"Error getting judge response: {str(e)}" + Style.RESET_ALL)
                break

            # The judge wraps its score in <coherence_score> tags (see create_judge_prompt).
            coherence_score = int(judge_response.split(
                "<coherence_score>")[1].split("</coherence_score>")[0])

            if coherence_score <= 3:
                print(
                    Fore.YELLOW + "Output is incoherent. Moving to next question." + Style.RESET_ALL)
                break

            novelty_score = get_novelty_score(new_answer, previous_answers)

            if novelty_score < 0.1:
                print(
                    Fore.YELLOW + "Output is redundant. Moving to next question." + Style.RESET_ALL)
                break

            print(f"New Answer:\n{new_answer}")
            print(Fore.GREEN + f"Coherence Score: {coherence_score}")
            print(f"Novelty Score: {novelty_score}" + Style.RESET_ALL)

            previous_answers.append(new_answer)
            question_novelty += novelty_score

    except Exception as e:
        print(Fore.RED + f"Unexpected error processing question: {str(e)}" + Style.RESET_ALL)

    time_taken = time.time() - start_time
    print(Fore.BLUE)
    print(f"Total novelty score for this question: {question_novelty}")
    print(f"Time taken: {time_taken} seconds")
    print(Style.RESET_ALL)

    return question_novelty


def get_novelty_score(new_answer: str, previous_answers: list):
    new_embedding = embed(new_answer)

    # If there are no previous answers, return maximum novelty
    if not previous_answers:
        return 1.0

    previous_embeddings = [embed(answer) for answer in previous_answers]

    similarities = [
        np.dot(new_embedding, prev_embedding) /
        (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
        for prev_embedding in previous_embeddings
    ]

    max_similarity = max(similarities)
    novelty = 1 - max_similarity

    return novelty


def benchmark_model_multithreaded(model_name):
    novelty_score = 0
    print_lock = threading.Lock()

    with ThreadPoolExecutor(max_workers=len(questions)) as executor:
        future_to_question = {executor.submit(
            process_question, question, model_name): question for question in questions}

        for future in as_completed(future_to_question):
            question = future_to_question[future]

            question_novelty = future.result()
            with print_lock:
                novelty_score += question_novelty

    print(Fore.YELLOW)
    print(f"Total novelty score across all questions: {novelty_score}")
    print(Style.RESET_ALL)

    return novelty_score


def benchmark_model_sequential(model_name):
    novelty_score = 0

    for question in questions:
        question_novelty = process_question(question, model_name)
        novelty_score += question_novelty

    print(Fore.YELLOW)
    print(f"Total novelty score across all questions: {novelty_score}")
    print(Style.RESET_ALL)

    return novelty_score


if __name__ == "__main__":
    args = parse_arguments()
    benchmark_model(args.model_name, multithreaded=not args.single_threaded)

--------------------------------------------------------------------------------