├── price-graph.png
├── lmsys-compairson.png
├── aidan-bench-scores.png
├── models.py
├── prompts.py
├── README.md
└── main.py
/price-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/price-graph.png
--------------------------------------------------------------------------------
/lmsys-compairson.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/lmsys-compairson.png
--------------------------------------------------------------------------------
/aidan-bench-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/altryne/Aidan-Bench/main/aidan-bench-scores.png
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | from openai import OpenAI
2 | import os
3 | from functools import lru_cache
4 | from retry import retry
5 |
6 |
7 | @retry(tries=3)
8 | def chat_with_model(prompt, model, max_tokens=4000, temperature=0):
9 | client = OpenAI(
10 | api_key=os.getenv("OPEN_ROUTER_KEY"),
11 | base_url="https://openrouter.ai/api/v1"
12 | )
13 | response = client.chat.completions.create(
14 | model=model,
15 | messages=[
16 | {
17 | "role": "user",
18 | "content": prompt
19 | }
20 | ],
21 | max_tokens=max_tokens,
22 | temperature=temperature
23 | )
24 | return response.choices[0].message.content
25 |
26 |
27 | @lru_cache(maxsize=10000)
28 | @retry(tries=3)
29 | def embed(text):
30 | client = OpenAI()  # uses OPENAI_API_KEY from the environment
31 |
32 | response = client.embeddings.create(
33 | model="text-embedding-3-large", input=[text])
34 | return response.data[0].embedding
35 |
--------------------------------------------------------------------------------
/prompts.py:
--------------------------------------------------------------------------------
1 | # Questions should be open-ended but demand concrete answers.
2 | questions = [
3 | "Provide an explanation for Japan's Lost Decades.",
4 | "What is a cause of World War 1?",
5 | "Why might the United States government nationalize ASI development?",
6 | "How might you use a brick and a blanket?",
7 | "What architectural features might you include in a tasteful house?",
8 | "Provide coordinates for a point inside the unit circle (x^2 + y^2 < 1).",
9 | "What's one way to use oregano?",
10 | "How might we enable LLMs to spend more output tokens to get predictably better results?",
11 | "Propose a solution to Los Angeles traffic.",
12 | "What activities might I include at a party for firefighters?",
13 | "Why did Rome fall?",
14 | "How could we redesign schools to better prepare students for the 22nd century?",
15 | "Find a solution to the inequality 2x + 3y < 10 where x and y are positive real numbers.",
16 | "What might be an unexpected consequence of achieving nuclear fusion?",
17 | "Describe a plausible alien life form that doesn't rely on carbon-based biology.",
18 | "How could we modify the rules of chess to make it more exciting for spectators?",
19 | "What would be the implications of a universal basic income on society?",
20 | "Propose an alternative to democracy for governing a country.",
21 | "Provide a real number greater than π but less than 4.",
22 | "How might we terraform Venus instead of Mars, and why?",
23 | "Design a new sport that combines elements of three existing sports.",
24 | "What could be a novel use for blockchain technology outside of cryptocurrency?",
25 | "How might human evolution be affected by long-term space colonization?",
26 | "Invent a new musical instrument and describe how it would be played.",
27 | "What might be an unexpected solution to reducing plastic waste in oceans?",
28 | ]
29 |
30 |
31 | def create_gen_prompt(question: str, previous_answers: list) -> str:
32 | prompt = (
33 | "Answer the following question:.\n"
34 | f"{question}\n"
35 | "Your response should be a single brief sentence.\n"
36 | )
37 |
38 | if len(previous_answers) > 0:
39 |
40 | previous_answers_str = "\n".join(
41 | [f"{i+1}. {answer}" for i, answer in enumerate(previous_answers)]
42 | )
43 |
44 | prompt += (
45 | "IMPORTANT: Provide an answer you *HAVE NOT* given previously.\n"
46 | "Your previous answers are inside of XML tags.\n"
47 | f"\n{previous_answers_str}\n"
48 | )
49 |
50 | return prompt
51 |
52 |
53 | def create_judge_prompt(question: str, answer: str):
54 | prompt = f""" Your task is to evaluate the coherence and plausibility of an answer to a given question. This involves assessing whether the answer makes sense and isn't nonsensical or implausible.
55 |
56 | Question: {question}
57 | Answer: {answer}
58 |
59 | Evaluation process:
60 | 1. Understand the question: Analyze what the question is asking.
61 | 2. Assess the answer: Determine if the answer is coherent and plausible.
62 | 3. Check for nonsensical elements: Identify any aspects that are completely unrelated or absurd.
63 |
64 | Please think through each step carefully and show your reasoning:
65 |
66 | 1. Question analysis:
67 | [Your brief analysis of the question here]
68 |
69 | 2. Answer assessment:
70 | [Evaluate if the answer is coherent and plausible]
71 |
72 | 3. Nonsensical check:
73 | [Identify any completely unrelated or absurd elements]
74 |
75 | Based on your analysis, provide a final Coherence and Plausibility Score on a scale of 1 - 10, where:
76 | 1-3: Incoherent, implausible, or nonsensical
77 | 4-6: Partially coherent and plausible, but with some issues
78 | 7-8: Mostly coherent and plausible with minor issues
79 | 9-10: Highly coherent and plausible
80 |
81 | Ensure that nonsensical or completely implausible answers receive very low scores (1-3).
82 |
83 | IMPORTANT: After your reasoning, you must provide your final Coherence and Plausibility Score as a single integer between 1 and 10, enclosed in <coherence_score></coherence_score> XML tags. For example:
84 | <coherence_score>7</coherence_score>
85 |
86 | Your response must end with this score in the specified format.
87 | """
88 | return prompt
89 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Aidan Bench
2 | Some models feel competent despite scoring poorly on benchmarks like MMLU, GPQA, MATH, or NIAH.
3 |
4 | *Aidan Bench* rewards:
5 |
6 | 1. Creativity
7 | 2. Reliability
8 | 3. Contextual attention
9 | 4. Instruction following
10 |
11 | **Aidan Bench is only weakly correlated with Lmsys, reveals poor GPT-4o performance, and shows surprisingly impressive Mistral Large 2 performance.**
12 |
13 | # Methodology
14 |
15 | We give LLMs a set of open-ended questions like the following:
16 |
17 | ```python
18 | "Provide an explanation for Japan's Lost Decades.",
19 | "How might you use a brick and a blanket?",
20 | "What architectural features might you include in a tasteful house?",
21 | "Provide coordinates for a point inside the unit circle (x^2 + y^2 < 1).",
22 | "Propose a solution to Los Angeles traffic.",
23 | "What activities might I include at a party for firefighters?",
24 | "How could we redesign schools to better prepare students for the 22nd century?",
25 | ```
26 |
27 | We then ask the model to answer each question repeatedly while **avoiding its previous answers**, which are provided in-context.
28 |
29 | For each question, we keep generating answers until one of two stopping conditions is met (a condensed sketch of this loop follows the list):
30 |
31 | 1. An answer is clearly incoherent (a coherence score of 3 or below from an LLM judge), or
32 | 2. An answer is too similar to one of the model's previous answers (a novelty score below 0.1, as measured with an embedding model).
33 |
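A condensed sketch of this per-question loop, adapted from `process_question` in `main.py` (illustrative only: error handling and logging are omitted, the `run_question` name is ours, and `get_novelty_score` is defined in `main.py`; a NumPy version of it appears after the formula below):

```python
from models import chat_with_model
from prompts import create_gen_prompt, create_judge_prompt

def run_question(question: str, model_name: str) -> float:
    previous_answers, question_novelty = [], 0.0
    while True:
        # 1. Ask for a new answer that avoids everything said so far.
        answer = chat_with_model(create_gen_prompt(question, previous_answers), model=model_name)

        # 2. An LLM judge scores coherence; a score of 3 or below ends the question.
        judge = chat_with_model(create_judge_prompt(question, answer), model="openai/gpt-4o-mini")
        coherence = int(judge.split("<coherence_score>")[1].split("</coherence_score>")[0])
        if coherence <= 3:
            break

        # 3. Embedding-based novelty; below 0.1 means the answer is redundant.
        novelty = get_novelty_score(answer, previous_answers)
        if novelty < 0.1:
            break

        previous_answers.append(answer)
        question_novelty += novelty
    return question_novelty
```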
34 | A model's final score is the sum of its per-answer novelty scores across all questions, where each answer's novelty is one minus its maximum cosine similarity to the previous answers:
35 |
36 | $$
37 | \text{novelty} = 1 - \max_{e_i \in E_\text{prev}} \frac{e_\text{new} \cdot e_i}{\|e_\text{new}\| \|e_i\|}
38 | $$
39 |
40 | where:
41 |
42 | - $e_\text{new}$: embedding vector of the new answer
43 | - $E_\text{prev}$: set of embedding vectors for previous answers, $\{e_1, e_2, ..., e_n\}$
44 | - $e_i$: an individual embedding vector from $E_\text{prev}$
45 |
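In code, this is what `get_novelty_score` in `main.py` computes; a minimal NumPy sketch:

```python
import numpy as np

def novelty(new_embedding: np.ndarray, previous_embeddings: list[np.ndarray]) -> float:
    """One minus the highest cosine similarity to any previous answer (1.0 if there are none)."""
    if not previous_embeddings:
        return 1.0
    cosines = [
        float(np.dot(new_embedding, prev) /
              (np.linalg.norm(new_embedding) * np.linalg.norm(prev)))
        for prev in previous_embeddings
    ]
    return 1.0 - max(cosines)
```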
46 | # Findings
47 |
48 | Here are the final novelty scores across models:
49 |
50 | 
51 |
52 | Notable results:
53 |
54 | 1. `Mistral Large 2` wins this benchmark, scoring 25% higher than `Claude 3.5 Sonnet`, the runner-up.
55 | 2. OpenAI's `GPT-4o` substantially underperforms similarly priced models, and even its cheaper sibling, `GPT-4o-mini`.
56 | 3. OpenAI's `GPT-4o-mini` punches well above its price class, rivaling much more expensive models like `Llama 3.1 405b`.
57 |
58 | We also include a comparison between Aidan Bench scores and Lmsys scores. Notably, there's a weak correlation between these benchmarks (r=0.188).
59 |
60 | 
61 |
62 | We also compare each model's Aidan Bench scores to its (input) token pricing:
63 |
64 | 
65 |
66 | OpenAI's `GPT-4o-mini` and `Mistral Large 2` have outlier efficiency.
67 |
68 | ## Setup
69 |
70 | ### Prerequisites
71 |
72 | Ensure you have Python installed on your system. This project requires the following libraries:
73 |
74 | - numpy
75 | - openai
76 | - colorama
77 | - retry
78 |
79 | ### Installation
80 |
81 | 1. Clone the repository:
82 | ```
83 | git clone https://github.com/aidanmclaughlin/Aidan-Bench.git
84 | cd Aidan-Bench
85 | ```
86 |
87 | 2. Install the required libraries:
88 | ```
89 | pip install numpy openai colorama retry
90 | ```
91 |
92 | 3. Set up your API keys (a quick sanity check is sketched after this list):
93 | - Create an environment variable named `OPEN_ROUTER_KEY` with your OpenRouter API key.
94 | - Create an environment variable named `OPENAI_API_KEY` with your OpenAI API key.
95 |
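Before running, you can confirm both variables are visible to Python with a quick check like this (a hypothetical snippet, not part of the repo):

```python
import os

# OPEN_ROUTER_KEY is used for chat completions via OpenRouter;
# OPENAI_API_KEY is used for embeddings (see models.py).
for key in ("OPEN_ROUTER_KEY", "OPENAI_API_KEY"):
    if not os.getenv(key):
        raise SystemExit(f"Missing environment variable: {key}")
```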
96 | ### Running the Project
97 |
98 | To run the benchmark:
99 |
100 | ```
101 | python main.py <model_name> [--single-threaded]
102 | ```
103 |
104 | Arguments:
105 | - `model_name`: (Required) Name of the model to benchmark (an OpenRouter model ID, e.g. `openai/gpt-4-turbo`)
106 | - `--single-threaded`: (Optional) Run in single-threaded mode
107 |
108 | Examples:
109 |
110 | 1. To run the benchmark for GPT-4 Turbo in multithreaded mode (default):
111 | ```
112 | python main.py openai/gpt-4-turbo
113 | ```
114 |
115 | 2. To run the benchmark for Claude 3 Sonnet in single-threaded mode:
116 | ```
117 | python main.py anthropic/claude-3-sonnet --single-threaded
118 | ```
119 |
120 | The script will execute the benchmark using the specified model and threading option. By default, the benchmark runs in multithreaded mode unless the `--single-threaded` flag is provided.
121 |
122 | ### API Keys
123 |
124 | This project requires two different API keys:
125 |
126 | 1. OpenRouter API key: Used for chat completions with various models.
127 | 2. OpenAI API key: Used for embedding text.
128 |
129 | Make sure both keys are set up as environment variables before running the project.
130 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from models import chat_with_model, embed
3 | from prompts import questions, create_gen_prompt, create_judge_prompt
4 | from colorama import Fore, Style
5 | import time
6 | from concurrent.futures import ThreadPoolExecutor, as_completed
7 | import threading
8 | import argparse
9 |
10 |
11 | def parse_arguments():
12 | parser = argparse.ArgumentParser(description="Benchmark a language model.")
13 | parser.add_argument("model_name", type=str, help="Name of the model to benchmark")
14 | parser.add_argument("--single-threaded", action="store_true", help="Run in single-threaded mode")
15 | return parser.parse_args()
16 |
17 |
18 | def benchmark_model(model_name, multithreaded=False):
19 | if multithreaded:
20 | return benchmark_model_multithreaded(model_name)
21 | else:
22 | return benchmark_model_sequential(model_name)
23 |
24 |
25 | def process_question(question, model_name):
26 | start_time = time.time()
27 | print(Fore.RED + question + Style.RESET_ALL)
28 | previous_answers = []
29 | question_novelty = 0
30 |
31 | try:
32 | while True:
33 | gen_prompt = create_gen_prompt(question, previous_answers)
34 | try:
35 | new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
36 | except Exception as e:
37 | print(Fore.RED + f"Error generating answer: {str(e)}" + Style.RESET_ALL)
38 | break
39 |
40 | judge_prompt = create_judge_prompt(question, new_answer)
41 | judge = "openai/gpt-4o-mini"
42 | try:
43 | judge_response = chat_with_model(prompt=judge_prompt, model=judge)
44 | except Exception as e:
45 | print(Fore.RED + f"Error getting judge response: {str(e)}" + Style.RESET_ALL)
46 | break
47 |
48 | coherence_score = int(judge_response.split("<coherence_score>")[
49 | 1].split("</coherence_score>")[0])
50 |
51 | if coherence_score <= 3:
52 | print(
53 | Fore.YELLOW + "Output is incoherent. Moving to next question." + Style.RESET_ALL)
54 | break
55 |
56 | novelty_score = get_novelty_score(new_answer, previous_answers)
57 |
58 | if novelty_score < 0.1:
59 | print(
60 | Fore.YELLOW + "Output is redundant. Moving to next question." + Style.RESET_ALL)
61 | break
62 |
63 | print(f"New Answer:\n{new_answer}")
64 | print(Fore.GREEN + f"Coherence Score: {coherence_score}")
65 | print(f"Novelty Score: {novelty_score}" + Style.RESET_ALL)
66 |
67 | previous_answers.append(new_answer)
68 | question_novelty += novelty_score
69 |
70 | except Exception as e:
71 | print(Fore.RED + f"Unexpected error processing question: {str(e)}" + Style.RESET_ALL)
72 |
73 | time_taken = time.time() - start_time
74 | print(Fore.BLUE)
75 | print(f"Total novelty score for this question: {question_novelty}")
76 | print(f"Time taken: {time_taken} seconds")
77 | print(Style.RESET_ALL)
78 |
79 | return question_novelty
80 |
81 |
82 | def get_novelty_score(new_answer: str, previous_answers: list):
83 | new_embedding = embed(new_answer)
84 |
85 | # If there are no previous answers, return maximum novelty
86 | if not previous_answers:
87 | return 1.0
88 |
89 | previous_embeddings = [embed(answer) for answer in previous_answers]
90 |
91 | similarities = [
92 | np.dot(new_embedding, prev_embedding) /
93 | (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
94 | for prev_embedding in previous_embeddings
95 | ]
96 |
97 | max_similarity = max(similarities)
98 | novelty = 1 - max_similarity
99 |
100 | return novelty
101 |
102 |
103 | def benchmark_model_multithreaded(model_name):
104 | novelty_score = 0
105 | print_lock = threading.Lock()
106 |
107 | with ThreadPoolExecutor(max_workers=len(questions)) as executor:
108 | future_to_question = {executor.submit(
109 | process_question, question, model_name): question for question in questions}
110 |
111 | for future in as_completed(future_to_question):
112 | question = future_to_question[future]
113 |
114 | question_novelty = future.result()
115 | with print_lock:
116 | novelty_score += question_novelty
117 |
118 | print(Fore.YELLOW)
119 | print(f"Total novelty score across all questions: {novelty_score}")
120 | print(Style.RESET_ALL)
121 |
122 | return novelty_score
123 |
124 |
125 | def benchmark_model_sequential(model_name):
126 | novelty_score = 0
127 |
128 | for question in questions:
129 | question_novelty = process_question(question, model_name)
130 | novelty_score += question_novelty
131 |
132 | print(Fore.YELLOW)
133 | print(f"Total novelty score across all questions: {novelty_score}")
134 | print(Style.RESET_ALL)
135 |
136 | return novelty_score
137 |
138 |
139 | if __name__ == "__main__":
140 | args = parse_arguments()
141 | benchmark_model(args.model_name, multithreaded=not args.single_threaded)
142 |
--------------------------------------------------------------------------------