├── .DS_Store
├── price-graph.png
├── lmsys-compairson.png
├── aidan-bench-scores.png
├── models.py
├── prompts.py
├── README.md
└── main.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/.DS_Store
--------------------------------------------------------------------------------
/price-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/price-graph.png
--------------------------------------------------------------------------------
/lmsys-compairson.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/lmsys-compairson.png
--------------------------------------------------------------------------------
/aidan-bench-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/aidan-bench-scores.png
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
from openai import OpenAI
import os
from functools import lru_cache
from retry import retry


@retry(tries=3)
def chat_with_model(prompt, model, max_tokens=4000, temperature=0):
    # Chat completions are routed through OpenRouter so any supported model can be benchmarked.
    client = OpenAI(
        api_key=os.getenv("OPEN_ROUTER_KEY"),
        base_url="https://openrouter.ai/api/v1"
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )
    return response.choices[0].message.content


@lru_cache(maxsize=10000)
@retry(tries=3)
def embed(text):
    # Embeddings go directly to OpenAI; lru_cache avoids re-embedding the same answer twice.
    client = OpenAI()

    response = client.embeddings.create(
        model="text-embedding-3-large", input=[text])
    return response.data[0].embedding
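
# Illustrative usage of the helpers above (assumes OPEN_ROUTER_KEY and OPENAI_API_KEY are set):
#   answer = chat_with_model("Why did Rome fall?", model="openai/gpt-4o-mini")
#   vector = embed(answer)  # a list of floats; cached, so repeated texts are only embedded once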

--------------------------------------------------------------------------------
/prompts.py:
--------------------------------------------------------------------------------
# Questions should be open-ended but demand concrete answers.
questions = [
    "Provide an explanation for Japan's Lost Decades.",
    "What is a cause of World War 1?",
    "Why might the United States government nationalize ASI development?",
    "How might you use a brick and a blanket?",
    "What architectural features might you include in a tasteful house?",
    "Provide coordinates for a point inside the unit circle (x^2 + y^2 < 1).",
    "What's one way to use oregano?",
    "How might we enable LLMs to spend more output tokens to get predictably better results?",
    "Propose a solution to Los Angeles traffic.",
    "What activities might I include at a party for firefighters?",
    "Why did Rome fall?",
    "How could we redesign schools to better prepare students for the 22nd century?",
    "Find a solution to the inequality 2x + 3y < 10 where x and y are positive real numbers.",
    "What might be an unexpected consequence of achieving nuclear fusion?",
    "Describe a plausible alien life form that doesn't rely on carbon-based biology.",
    "How could we modify the rules of chess to make it more exciting for spectators?",
    "What would be the implications of a universal basic income on society?",
    "Propose an alternative to democracy for governing a country.",
    "Provide a real number greater than π but less than 4.",
    "How might we terraform Venus instead of Mars, and why?",
    "Design a new sport that combines elements of three existing sports.",
    "What could be a novel use for blockchain technology outside of cryptocurrency?",
    "How might human evolution be affected by long-term space colonization?",
    "Invent a new musical instrument and describe how it would be played.",
    "What might be an unexpected solution to reducing plastic waste in oceans?",
]


def create_gen_prompt(question: str, previous_answers: list) -> str:
    prompt = (
        "Answer the following question:\n"
        f"{question}\n"
        "Your response should be a single brief sentence.\n"
    )

    if len(previous_answers) > 0:
        previous_answers_str = "\n".join(
            [f"{i+1}. {answer}" for i, answer in enumerate(previous_answers)]
        )

        # Wrap prior answers in XML tags so the model can tell them apart from the question.
        prompt += (
            "IMPORTANT: Provide an answer you *HAVE NOT* given previously.\n"
            "Your previous answers are inside of <previous_answers></previous_answers> XML tags.\n"
            f"<previous_answers>\n{previous_answers_str}\n</previous_answers>"
        )

    return prompt


def create_judge_prompt(question: str, answer: str):
    prompt = f"""Your task is to evaluate the coherence and plausibility of an answer to a given question. This involves assessing whether the answer makes sense and isn't nonsensical or implausible.

Question: {question}
Answer: {answer}

Evaluation process:
1. Understand the question: Analyze what the question is asking.
2. Assess the answer: Determine if the answer is coherent and plausible.
3. Check for nonsensical elements: Identify any aspects that are completely unrelated or absurd.

Please think through each step carefully and show your reasoning:

1. Question analysis:
[Your brief analysis of the question here]

2. Answer assessment:
[Evaluate if the answer is coherent and plausible]

3. Nonsensical check:
[Identify any completely unrelated or absurd elements]

Based on your analysis, provide a final Coherence and Plausibility Score on a scale of 1 - 10, where:
1-3: Incoherent, implausible, or nonsensical
4-6: Partially coherent and plausible, but with some issues
7-8: Mostly coherent and plausible with minor issues
9-10: Highly coherent and plausible

Ensure that nonsensical or completely implausible answers receive very low scores (1-3).

IMPORTANT: After your reasoning, you must provide your final Coherence and Plausibility Score as a single integer between 1 and 10, enclosed in <coherence_score></coherence_score> XML tags. For example:
<coherence_score>7</coherence_score>

Your response must end with this score in the specified format.
"""
    return prompt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Aidan Bench
Some models feel competent despite underperforming on benchmarks like MMLU, GPQA, MATH, or NIAH.

*Aidan Bench* rewards:

1. Creativity
2. Reliability
3. Contextual attention
4. Instruction following

**Aidan Bench correlates only weakly with Lmsys, reveals surprisingly poor GPT-4o performance, and shows surprisingly strong Mistral Large 2 performance.**

# Methodology

We give LLMs a set of open-ended questions like the following:

```python
"Provide an explanation for Japan's Lost Decades.",
"How might you use a brick and a blanket?",
"What architectural features might you include in a tasteful house?",
"Provide coordinates for a point inside the unit circle (x^2 + y^2 < 1).",
"Propose a solution to Los Angeles traffic.",
"What activities might I include at a party for firefighters?",
"How could we redesign schools to better prepare students for the 22nd century?",
```

We then ask the model to answer each question repeatedly while **avoiding its previous answers**, which are provided in-context.

For each question, we keep generating answers until either:

1. An answer is clearly incoherent (as judged by another LLM), or
2. An answer is too similar to one of its previous answers (as judged by an embedding model).

We sum each model's novelty scores across all questions. An answer's novelty is one minus its maximum cosine similarity to any previous answer:

$$
\text{max}\text{-}\text{dissimilarity} = 1 - \max_{e_i \in E_\text{prev}} \frac{e_\text{new} \cdot e_i}{\|e_\text{new}\| \|e_i\|}
$$

where:

- $e_\text{new}$: embedding vector of the new answer
- $E_\text{prev}$: set of embedding vectors for previous answers, $\{e_1, e_2, ..., e_n\}$
- $e_i$: an individual embedding vector from $E_\text{prev}$
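
For concreteness, here is a minimal sketch of that per-answer novelty computation (the function name and the embedding-vector interface are illustrative; `get_novelty_score` in `main.py` applies the same cosine-similarity logic to raw answer strings):

```python
import numpy as np

def max_dissimilarity(new_embedding, previous_embeddings):
    # Novelty = 1 minus the highest cosine similarity to any previous answer.
    if not previous_embeddings:
        return 1.0  # the first answer to a question is maximally novel
    similarities = [
        np.dot(new_embedding, prev) / (np.linalg.norm(new_embedding) * np.linalg.norm(prev))
        for prev in previous_embeddings
    ]
    return 1 - max(similarities)
```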

# Findings

Here are the final novelty scores across models:

![Novelty scores across models](aidan-bench-scores.png)

Notable results:

1. `Mistral Large 2` wins this benchmark, scoring 25% higher than `Claude 3.5 Sonnet`, the runner-up.
2. OpenAI's `GPT-4o` substantially underperforms similarly priced models, and even its cheaper sibling, `GPT-4o-mini`.
3. OpenAI's `GPT-4o-mini` punches well above its price class, rivaling much more expensive models like `Llama 3.1 405b`.

We also include a comparison between Aidan Bench scores and Lmsys scores. Notably, the correlation between the two benchmarks is weak (r = 0.188).

![Comparison of Aidan Bench and Lmsys scores](lmsys-compairson.png)

We also compare each model's Aidan Bench score to its (input) token pricing:

![Comparison of Aidan Bench scores and token pricing](price-graph.png)

OpenAI's `GPT-4o-mini` and `Mistral Large 2` have outlier efficiency.

## Setup

### Prerequisites

Ensure you have Python installed on your system. This project requires the following libraries:

- numpy
- openai
- colorama
- retry

### Installation

1. Clone the repository:
   ```
   git clone https://github.com/aidanmclaughlin/Aidan-Bench.git
   cd Aidan-Bench
   ```

2. Install the required libraries:
   ```
   pip install numpy openai colorama retry
   ```

3. Set up your API keys:
   - Create an environment variable named `OPEN_ROUTER_KEY` with your OpenRouter API key.
   - Create an environment variable named `OPENAI_API_KEY` with your OpenAI API key.

### Running the Project

To run the benchmark:

```
python main.py <model_name> [--single-threaded]
```

Arguments:
- `<model_name>`: (Required) Name of the model to benchmark
- `--single-threaded`: (Optional) Run in single-threaded mode

Examples:

1. To run the benchmark for GPT-4 Turbo in multithreaded mode (default):
   ```
   python main.py openai/gpt-4-turbo
   ```

2. To run the benchmark for Claude 3 Sonnet in single-threaded mode:
   ```
   python main.py anthropic/claude-3-sonnet --single-threaded
   ```

The script runs the benchmark with the specified model; by default it uses multithreaded mode, unless the `--single-threaded` flag is provided.

### API Keys

This project requires two different API keys:

1. OpenRouter API key: Used for chat completions with various models.
2. OpenAI API key: Used for embedding text.

Make sure both keys are set up as environment variables before running the project.
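
As a quick, illustrative sanity check (not part of the repository), something like the following confirms both variables are visible to Python before starting a run:

```python
import os

# Key names as read in models.py: OpenRouter for chat completions, OpenAI for embeddings.
for key in ("OPEN_ROUTER_KEY", "OPENAI_API_KEY"):
    if not os.getenv(key):
        raise SystemExit(f"Missing environment variable: {key}")
```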

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import numpy as np
from models import chat_with_model, embed
from prompts import questions, create_gen_prompt, create_judge_prompt
from colorama import Fore, Style
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser(description="Benchmark a language model.")
    parser.add_argument("model_name", type=str, help="Name of the model to benchmark")
    parser.add_argument("--single-threaded", action="store_true", help="Run in single-threaded mode")
    return parser.parse_args()


def benchmark_model(model_name, multithreaded=False):
    if multithreaded:
        return benchmark_model_multithreaded(model_name)
    else:
        return benchmark_model_sequential(model_name)


def process_question(question, model_name):
    start_time = time.time()
    print(Fore.RED + question + Style.RESET_ALL)
    previous_answers = []
    question_novelty = 0

    try:
        while True:
            gen_prompt = create_gen_prompt(question, previous_answers)
            try:
                new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
            except Exception as e:
                print(Fore.RED + f"Error generating answer: {str(e)}" + Style.RESET_ALL)
                break

            judge_prompt = create_judge_prompt(question, new_answer)
            judge = "openai/gpt-4o-mini"  # coherence judge, called via OpenRouter
            try:
                judge_response = chat_with_model(prompt=judge_prompt, model=judge)
            except Exception as e:
                print(Fore.RED + f"Error getting judge response: {str(e)}" + Style.RESET_ALL)
                break

            # The judge wraps its score in <coherence_score> tags (see create_judge_prompt).
            coherence_score = int(judge_response.split(
                "<coherence_score>")[1].split("</coherence_score>")[0])

            if coherence_score <= 3:
                print(
                    Fore.YELLOW + "Output is incoherent. Moving to next question." + Style.RESET_ALL)
                break

            novelty_score = get_novelty_score(new_answer, previous_answers)

            if novelty_score < 0.1:
                print(
                    Fore.YELLOW + "Output is redundant. Moving to next question." + Style.RESET_ALL)
                break

            print(f"New Answer:\n{new_answer}")
            print(Fore.GREEN + f"Coherence Score: {coherence_score}")
            print(f"Novelty Score: {novelty_score}" + Style.RESET_ALL)

            previous_answers.append(new_answer)
            question_novelty += novelty_score

    except Exception as e:
        print(Fore.RED + f"Unexpected error processing question: {str(e)}" + Style.RESET_ALL)

    time_taken = time.time() - start_time
    print(Fore.BLUE)
    print(f"Total novelty score for this question: {question_novelty}")
    print(f"Time taken: {time_taken} seconds")
    print(Style.RESET_ALL)

    return question_novelty


def get_novelty_score(new_answer: str, previous_answers: list):
    new_embedding = embed(new_answer)

    # If there are no previous answers, return maximum novelty
    if not previous_answers:
        return 1.0

    previous_embeddings = [embed(answer) for answer in previous_answers]

    similarities = [
        np.dot(new_embedding, prev_embedding) /
        (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
        for prev_embedding in previous_embeddings
    ]

    max_similarity = max(similarities)
    novelty = 1 - max_similarity

    return novelty


def benchmark_model_multithreaded(model_name):
    novelty_score = 0
    print_lock = threading.Lock()

    with ThreadPoolExecutor(max_workers=len(questions)) as executor:
        future_to_question = {executor.submit(
            process_question, question, model_name): question for question in questions}

        for future in as_completed(future_to_question):
            question = future_to_question[future]

            question_novelty = future.result()
            with print_lock:
                novelty_score += question_novelty

    print(Fore.YELLOW)
    print(f"Total novelty score across all questions: {novelty_score}")
    print(Style.RESET_ALL)

    return novelty_score


def benchmark_model_sequential(model_name):
    novelty_score = 0

    for question in questions:
        question_novelty = process_question(question, model_name)
        novelty_score += question_novelty

    print(Fore.YELLOW)
    print(f"Total novelty score across all questions: {novelty_score}")
    print(Style.RESET_ALL)

    return novelty_score


if __name__ == "__main__":
    args = parse_arguments()
    benchmark_model(args.model_name, multithreaded=not args.single_threaded)

--------------------------------------------------------------------------------