├── weave_utils
    ├── __init__.py
    ├── scorers.py
    └── models.py
├── system_prompt.txt
├── pyproject.toml
├── README.md
├── LICENSE
├── run_benchmark.py
├── .gitignore
├── simple_bench_public_set.csv
└── simple_bench_public.json


/weave_utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/system_prompt.txt:
--------------------------------------------------------------------------------
1 | You are an expert at reasoning and you always pick the most realistic answer. Think step by step and output your reasoning followed by your final answer using the following format: Final Answer: X where X is one of the letters A, B, C, D, E, or F.


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "simplebench"
 3 | version = "0.1.0"
 4 | description = "Runner for the SimpleBench multiple-choice reasoning benchmark"
 5 | readme = "README.md"
 6 | requires-python = ">=3.10"
 7 | dependencies = [
 8 |     "anthropic>=0.40.0",
 9 |     "fire>=0.7.0",
10 |     "litellm>=1.54.1",
11 |     "openai>=1.57.2",
12 |     "python-dotenv>=1.0.1",
13 |     "weave>=0.51.23",
14 | ]
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Simple Bench
 2 | 
 3 | https://simple-bench.com/
 4 | 
 5 | ## Run Instructions
 6 | 
 7 | Run benchmark:
 8 | ```
 9 | python run_benchmark.py --model_name=gpt-4o --dataset_path=simple_bench_public.json
10 | ```
11 | 
12 | ## Setup Instructions
13 | 
14 | Clone the GitHub repo and `cd` into it.
15 | 
16 | Make sure you are using the correct Python version (3.10.11) inside a virtual environment:
17 | ```
18 | pyenv local 3.10.11
19 | python -m venv llm_env
20 | source llm_env/bin/activate
21 | ```
22 | 
23 | Install dependencies:
24 | 
25 | The best way to install dependencies is to use `uv`. If you don't have it installed in your environment, you can install it with `pip install uv`.
26 | 
27 | ``` 
28 | uv pip install -r pyproject.toml
29 | ```
30 | 
31 | Create a `.env` file containing the API keys for the providers you plan to use:
32 | ```
33 | OPENAI_API_KEY=<your key>
34 | ANTHROPIC_API_KEY=<your key>
35 | ...
36 | ```
37 | 


--------------------------------------------------------------------------------
/weave_utils/scorers.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | import weave
 3 | import re
 4 | 
 5 | 
 6 | @weave.op()
 7 | def extract_answer(output: str) -> str:
 8 |     match = re.search(r"Final Answer:\s*([A-F])", output.strip(), re.IGNORECASE)
 9 |     if match:
10 |         return match.group(1).upper()
11 |     else:
12 |         raise ValueError("No answer found in model output")
13 | 
14 | 
@weave.op()
def eval_majority_vote(output: List[str], answer: str):
    """Score a batch of model responses by strict majority vote.

    Args:
        output: The raw model responses for one question.
        answer: The expected answer letter.

    Returns:
        True when more than half of the responses that yielded an answer
        letter agree with ``answer``; False otherwise.

    Raises:
        ValueError: If no response yields an answer letter.
    """
    votes = []
    for candidate in output:
        try:
            letter = extract_answer(candidate)
        except ValueError:
            # Responses without a parseable answer do not take part in the vote.
            continue
        votes.append(letter)

    if not votes:
        raise ValueError("Failed to extract any valid answers from model outputs")

    matching_votes = votes.count(answer)
    return matching_votes > len(votes) / 2
28 | 
29 | 
@weave.op()
def eval_multi_choice(output: str, answer: str):
    """Score a single model response against the expected answer letter.

    Args:
        output: The raw model response.
        answer: The expected answer letter.

    Returns:
        True if the letter extracted from ``output`` equals ``answer``.

    Raises:
        ValueError: Propagated from ``extract_answer`` when the response
            contains no answer.
    """
    return extract_answer(output) == answer
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 simple-bench
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/run_benchmark.py:
--------------------------------------------------------------------------------
 1 | # python run_benchmark.py --model_name=gpt-4o-mini --dataset_path=output.json
 2 | 
 3 | from typing import Optional
 4 | import json
 5 | import weave
 6 | import asyncio
 7 | from fire import Fire
 8 | 
 9 | from dotenv import load_dotenv
10 | load_dotenv()
11 | 
12 | from weave_utils.models import LiteLLMModel, MajorityVoteModel
13 | from weave_utils.scorers import eval_majority_vote, eval_multi_choice
14 | 
15 | 
def load_dataset(file_path):
    """Load the evaluation examples from a benchmark JSON file.

    Args:
        file_path: Path to a JSON file whose top-level object has an
            ``"eval_data"`` key.

    Returns:
        The value stored under ``"eval_data"``.

    Raises:
        KeyError: If the file has no ``"eval_data"`` key.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data["eval_data"]
20 | 
21 | 
def run_benchmark(
    model_name: str = "gpt-4o-mini",
    dataset_path: str = "simple_bench_public.json",
    num_responses: int = 1,
    entity: Optional[str] = "simplebench",
    project: str = "simple_bench_public",
    temp: float = 0.7,
    max_tokens: int = 2048,
    top_p: float = 0.95,
    max_retries: int = 3,
    system_prompt_path: str = "system_prompt.txt",
):
    """
    Run a benchmark evaluation on a given model and dataset.

    Args:
        model_name (str): Name of the model to use for inference.
            Default is "gpt-4o-mini".
        dataset_path (str): Path to the dataset JSON file.
            Default is "simple_bench_public.json".
        num_responses (int): If greater than 1, majority voting will be applied.
            Default is 1 (no majority voting).
        entity (str | None): Optional Weave entity (org/user name) for
            evaluation tracking. When None, only the project name is
            passed to Weave. Default is "simplebench".
        project (str): The project name under the specified entity.
            Default is "simple_bench_public".
        temp (float): Temperature for the model.
            Default is 0.7.
        max_tokens (int): Maximum number of tokens to generate.
            Default is 2048.
        top_p (float): Top-p for the model.
            Default is 0.95.
        max_retries (int): Maximum number of retries for the model.
            Default is 3.
        system_prompt_path (str): Path to a text file containing the system
            prompt for the model. Default is "system_prompt.txt".

    Example:
        python run_benchmark.py --model_name=gpt-4o-mini --dataset_path=simple_bench_public.json --num_responses=3
    """

    if entity is not None:
        weave.init(f"{entity}/{project}")
    else:
        weave.init(f"{project}")

    # Majority-vote runs return a list of responses per question, so they
    # need the list-aware scorer.
    evaluation = weave.Evaluation(
        dataset=load_dataset(dataset_path),
        scorers=[eval_majority_vote if num_responses > 1 else eval_multi_choice],
        trials=1,
    )

    # Read the prompt as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(system_prompt_path, "r", encoding="utf-8") as f:
        system_prompt = f.read().strip()

    model = LiteLLMModel(
        model_name=model_name,
        temp=temp,
        max_tokens=max_tokens,
        top_p=top_p,
        max_retries=max_retries,
        system_prompt=system_prompt
    )

    if num_responses > 1:
        model = MajorityVoteModel(model=model, num_responses=num_responses)

    asyncio.run(evaluation.evaluate(model))
89 | 
90 | 
# Script entry point: hand run_benchmark to Fire so it can be driven from the
# command line (e.g. --model_name=... --dataset_path=...).
if __name__ == "__main__":
    Fire(run_benchmark)
93 | 


--------------------------------------------------------------------------------
/weave_utils/models.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import asyncio
  3 | import random
  4 | import weave
  5 | from typing import Optional
  6 | 
  7 | import time
  8 | from litellm import acompletion
  9 | 
 10 | from dotenv import load_dotenv
 11 | load_dotenv()
 12 | 
 13 | from openai import RateLimitError
 14 | 
 15 | 
# Maps the model names accepted by LiteLLMModel (validated in its __init__)
# to the model identifier string passed to litellm's acompletion().
MODEL_MAP = {
    "gpt-4o-mini": "gpt-4o-mini",
    "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
    "gpt-4o": "gpt-4o-2024-08-06",
    "gpt-4-turbo": "gpt-4-turbo",
    "o1-preview": "o1-preview",
    "o1-mini": "o1-mini",
    "claude-3-opus-20240229": "claude-3-opus-20240229",
    "command-r-plus": "command-r-plus-08-2024",
    "gemini-1.5-pro": "gemini/gemini-1.5-pro",
    "llama3-405b-instruct": "fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct",
    "claude-3-haiku": "claude-3-haiku-20240307",
    "gemini-1.5-pro-002": "gemini/gemini-1.5-pro-002",
    "mistral-large": "mistral/mistral-large-2407",
    "grok-2": "openrouter/x-ai/grok-2"
}

# Factor by which the retry delay grows (together with random jitter) after
# each RateLimitError in LiteLLMModel.predict.
EXPONENTIAL_BASE = 2
 34 | 
 35 | 
 36 | class MajorityVoteModel(weave.Model):
 37 |     model: weave.Model
 38 |     num_responses: int = 3
 39 |     
 40 |     @weave.op()
 41 |     async def predict(self, prompt: str):
 42 |         tasks = [self.model.predict(prompt) for _ in range(self.num_responses)]
 43 |         return await asyncio.gather(*tasks)
 44 | 
 45 | 
 46 | class LiteLLMModel(weave.Model):
 47 |     model_name: str
 48 |     system_prompt: Optional[str] = None
 49 |     temp: float = 0.7
 50 |     max_tokens: int = 2048
 51 |     top_p: float = 0.95
 52 |     max_retries: int = 3
 53 |     
 54 |     def __init__(self, **data):
 55 |         super().__init__(**data)
 56 |         # Add any additional initialization logic here
 57 |         if self.model_name not in MODEL_MAP:
 58 |             raise ValueError(f"Invalid model name: {self.model_name}")
 59 | 
 60 |         if "o1" in self.model_name: 
 61 |             self.temp = None
 62 |             self.top_p = None
 63 |             self.max_tokens = None
 64 | 
 65 |     
 66 |     @weave.op()
 67 |     async def predict(self, prompt: str):
 68 |         delay = 2
 69 | 
 70 |         for i in range(self.max_retries):
 71 |             try:
 72 |                 messages = []
 73 |                 if self.system_prompt is not None:
 74 |                     messages.append({
 75 |                         "role": "system",
 76 |                         "content": self.system_prompt
 77 |                     })
 78 |                 messages.append({
 79 |                     "role": "user",
 80 |                     "content": prompt
 81 |                 })
 82 |                 response = await acompletion(
 83 |                     model=MODEL_MAP[self.model_name],
 84 |                     messages=messages,
 85 |                     temperature=self.temp,
 86 |                     max_tokens=self.max_tokens,
 87 |                     top_p=self.top_p
 88 |                 )
 89 | 
 90 |                 if response.choices[0].message.content is not None:
 91 |                     return response.choices[0].message.content
 92 |                 else:
 93 |                     print(response)
 94 |                     raise Exception("No content in response")
 95 |             except RateLimitError as e:
 96 |                 delay *= EXPONENTIAL_BASE * (1 + random.random())
 97 |                 print(
 98 |                     f"RateLimitError, retrying after {round(delay, 2)} seconds, {i+1}-th retry...", e
 99 |                 )
100 |                 await asyncio.sleep(delay)
101 |                 continue
102 |             except Exception as e:
103 |                 print(f"Error in retry {i+1}, retrying...", e)
104 |                 continue
105 | 
106 |         raise Exception("Failed to get response after max retries")
107 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | simple_bench.csv
  2 | output.json
  3 | output_o1.json
  4 | run_data/
  5 | 
  6 | .python-version
  7 | 
  8 | # Byte-compiled / optimized / DLL files
  9 | __pycache__/
 10 | *.py[cod]
 11 | *$py.class
 12 | 
 13 | # C extensions
 14 | *.so
 15 | 
 16 | # Distribution / packaging
 17 | .Python
 18 | build/
 19 | develop-eggs/
 20 | dist/
 21 | downloads/
 22 | eggs/
 23 | .eggs/
 24 | lib/
 25 | lib64/
 26 | parts/
 27 | sdist/
 28 | var/
 29 | wheels/
 30 | share/python-wheels/
 31 | *.egg-info/
 32 | .installed.cfg
 33 | *.egg
 34 | MANIFEST
 35 | 
 36 | # PyInstaller
 37 | #  Usually these files are written by a python script from a template
 38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 39 | *.manifest
 40 | *.spec
 41 | 
 42 | # Installer logs
 43 | pip-log.txt
 44 | pip-delete-this-directory.txt
 45 | 
 46 | # Unit test / coverage reports
 47 | htmlcov/
 48 | .tox/
 49 | .nox/
 50 | .coverage
 51 | .coverage.*
 52 | .cache
 53 | nosetests.xml
 54 | coverage.xml
 55 | *.cover
 56 | *.py,cover
 57 | .hypothesis/
 58 | .pytest_cache/
 59 | cover/
 60 | 
 61 | # Translations
 62 | *.mo
 63 | *.pot
 64 | 
 65 | # Django stuff:
 66 | *.log
 67 | local_settings.py
 68 | db.sqlite3
 69 | db.sqlite3-journal
 70 | 
 71 | # Flask stuff:
 72 | instance/
 73 | .webassets-cache
 74 | 
 75 | # Scrapy stuff:
 76 | .scrapy
 77 | 
 78 | # Sphinx documentation
 79 | docs/_build/
 80 | 
 81 | # PyBuilder
 82 | .pybuilder/
 83 | target/
 84 | 
 85 | # Jupyter Notebook
 86 | .ipynb_checkpoints
 87 | 
 88 | # IPython
 89 | profile_default/
 90 | ipython_config.py
 91 | 
 92 | # pyenv
 93 | #   For a library or package, you might want to ignore these files since the code is
 94 | #   intended to run in multiple environments; otherwise, check them in:
 95 | # .python-version
 96 | 
 97 | # pipenv
 98 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 99 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
100 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
101 | #   install all needed dependencies.
102 | #Pipfile.lock
103 | 
104 | # poetry
105 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
107 | #   commonly ignored for libraries.
108 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109 | #poetry.lock
110 | 
111 | # pdm
112 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113 | #pdm.lock
114 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115 | #   in version control.
116 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
117 | .pdm.toml
118 | .pdm-python
119 | .pdm-build/
120 | 
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122 | __pypackages__/
123 | 
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 | 
128 | # SageMath parsed files
129 | *.sage.py
130 | 
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | llm_env/
137 | ENV/
138 | env.bak/
139 | venv.bak/
140 | 
141 | # Spyder project settings
142 | .spyderproject
143 | .spyproject
144 | 
145 | # Rope project settings
146 | .ropeproject
147 | 
148 | # mkdocs documentation
149 | /site
150 | 
151 | # mypy
152 | .mypy_cache/
153 | .dmypy.json
154 | dmypy.json
155 | 
156 | # Pyre type checker
157 | .pyre/
158 | 
159 | # pytype static type analyzer
160 | .pytype/
161 | 
162 | # Cython debug symbols
163 | cython_debug/
164 | 
165 | # PyCharm
166 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
167 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
168 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
169 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
170 | #.idea/
171 | v2_regular.json
172 | randomOrder.csv
173 | simplev2.csv
174 | 


--------------------------------------------------------------------------------
/simple_bench_public_set.csv:
--------------------------------------------------------------------------------
 1 | id,question_func,correct_answer,option1,option2,option3,option4,option5
 2 | 1,"Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?",0,5,11,20,30,10
 3 | 2,"A juggler throws a solid blue ball a meter in the air and then a solid purple ball (of the same size) two meters in the air. She then climbs to the top of a tall ladder carefully, balancing a yellow balloon on her head. Where is the purple ball most likely now, in relation to the blue ball?",at the same height as the blue ball,inside the blue ball,below the blue ball,above the blue ball,above the yellow balloon,at the same height as the yellow balloon
 4 | 3,"Jeff, Jo and Jim are in a 200m men's race, starting from the same position. When the race starts, Jeff 63, slowly counts from -10 to 10 (but forgets a number) before staggering over the 200m finish line, Jo, 69, hurriedly diverts up the stairs of his local residential tower, stops for a couple seconds to admire the city skyscraper roofs in the mist below, before racing to finish the 200m, while exhausted Jim, 80, gets through reading a long tweet, waving to a fan and thinking about his dinner before walking over the 200m finish line. [ _ ] likely finished last.",Jo likely finished last,Jim likely finished last,Jeff likely finished last,All of them finished simultaneously,"Jo and Jim likely finished last, at the same time","Jeff and Jim likely finished last, at the same time"
 5 | 4,"There are two sisters, Amy who always speaks mistruths and Sam who always lies. You don't know which is which. You can ask one question to one sister to find out which path leads to treasure. Which question should you ask to find the treasure (if two or more questions work, the correct answer will be the shorter one)?","""What path leads to the treasure?""",“What is your sister’s number?”,"""What is your sister’s name?”","""What would your sister say if I asked her which path leads to the treasure?""","""What path do you think I will take, if you were to guess?""","""What is in the treasure?"""
 6 | 5,"Peter needs CPR from his best friend Paul, the only person around. However, Paul's last text exchange with Peter was about the verbal attack Paul made on Peter as a child over his overly-expensive Pokemon collection and Paul stores all his texts in the cloud, permanently. Paul will [ _ ] help Peter.",definitely,ponder deeply over whether to,probably not,not,pretend to,half-heartedly
 7 | 6,"While Jen was miles away from care-free John, she hooked-up with Jack, through Tinder. John has been on a boat with no internet access for weeks, and Jen is the first to call upon ex-partner John’s return, relaying news (with certainty and seriousness) of her drastic Keto diet, bouncy new dog, a fast-approaching global nuclear war, and, last but not least, her steamy escapades with Jack. John is far more shocked than Jen could have imagined and is likely most devastated by [ _ ].",wider international events,the drastic diet,the dog without prior agreement,the escapades,the lack of internet,sea sickness
 8 | 7,"John is 24 and a kind, thoughtful and apologetic person. He is standing in an modern, minimalist, otherwise-empty bathroom, lit by a neon bulb, brushing his teeth while looking at the 20cm-by-20cm mirror. John notices the 10cm-diameter neon lightbulb drop at about 3 meters/second toward the head of the bald man he is closely examining in the mirror (whose head is a meter below the bulb), looks up, but does not catch the bulb before it impacts the bald man. The bald man curses, yells 'what an idiot!' and leaves the bathroom. Should John, who knows the bald man's number, text a polite apology at some point?","no, because it would be redundant","yes, because John saw it coming, and we should generally apologize if we fail to prevent harm","no, because the lightbulb was essentially unavoidable","yes because it is the polite thing to do, even if it wasn't your fault.","yes, it would be in character for him to send a polite text apologizing for the incident","yes, because it would potentially smooth over any lingering tension from the encounter"
 9 | 8,"On a shelf, there is only a green apple, red pear, and pink peach. Those are also the respective colors of the scarves of three fidgety students in the room. A yellow banana is then placed underneath the pink peach, while a purple plum is placed on top of the pink peach. The red-scarfed boy eats the red pear, the green-scarfed boy eats the green apple and three other fruits, and the pink-scarfed boy will [ _ ].",eat no fruits,eat the pink peach,"eat the pink, yellow and purple fruits",eat just the yellow banana,eat just the purple plum,eat two fruits
10 | 9,"Agatha makes a stack of 5 cold, fresh single-slice ham sandwiches (with no sauces or condiments) in Room A, then immediately uses duct tape to stick the top surface of the uppermost sandwich to the bottom of her walking stick. She then walks to Room B, with her walking stick, so how many whole sandwiches are there now, in each room?","4 whole sandwiches in room A, 0 whole sandwiches in Room B","4 whole sandwiches in room B, 1 whole sandwich in Room A",All 5 whole sandwiches in Room B,"4 whole sandwiches in Room B, 1 whole sandwiches in room A",All 5 whole sandwiches in Room A,no sandwiches anywhere
11 | 10,"A luxury sports-car is traveling north at 30km/h over a roadbridge, 250m long, which runs over a river that is flowing at 5km/h eastward. The wind is blowing at 1km/h westward, slow enough not to bother the pedestrians snapping photos of the car from both sides of the roadbridge as the car passes. A glove was stored in the trunk of the car, but slips out of a hole and drops out when the car is half-way over the bridge. Assume the car continues in the same direction at the same speed, and the wind and river continue to move as stated. 1 hour later, the water-proof glove is (relative to the center of the bridge) approximately",<1 km northward,5 km+ eastward,30 km northward,4km eastward,>30 km away north-easterly.,>30km away north-westerly


--------------------------------------------------------------------------------
/simple_bench_public.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "eval_data": [
 3 |     {
 4 |       "question_id": 1,
 5 |       "prompt": "Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?\nA. 30\nB. 0\nC. 20\nD. 10\nE. 11\nF. 5\n",
 6 |       "answer": "B"
 7 |     },
 8 |     {
 9 |       "question_id": 2,
10 |       "prompt": "A juggler throws a solid blue ball a meter in the air and then a solid purple ball (of the same size) two meters in the air. She then climbs to the top of a tall ladder carefully, balancing a yellow balloon on her head. Where is the purple ball most likely now, in relation to the blue ball?\nA. at the same height as the blue ball\nB. at the same height as the yellow balloon\nC. inside the blue ball\nD. above the yellow balloon\nE. below the blue ball\nF. above the blue ball\n",
11 |       "answer": "A"
12 |     },
13 |     {
14 |       "question_id": 3,
15 |       "prompt": "Jeff, Jo and Jim are in a 200m men's race, starting from the same position. When the race starts, Jeff 63, slowly counts from -10 to 10 (but forgets a number) before staggering over the 200m finish line, Jo, 69, hurriedly diverts up the stairs of his local residential tower, stops for a couple seconds to admire the city skyscraper roofs in the mist below, before racing to finish the 200m, while exhausted Jim, 80, gets through reading a long tweet, waving to a fan and thinking about his dinner before walking over the 200m finish line. [ _ ] likely finished last.\nA. Jo likely finished last\nB. Jeff and Jim likely finished last, at the same time\nC. Jim likely finished last\nD. Jeff likely finished last\nE. All of them finished simultaneously\nF. Jo and Jim likely finished last, at the same time\n",
16 |       "answer": "A"
17 |     },
18 |     {
19 |       "question_id": 4,
20 |       "prompt": "There are two sisters, Amy who always speaks mistruths and Sam who always lies. You don't know which is which. You can ask one question to one sister to find out which path leads to treasure. Which question should you ask to find the treasure (if two or more questions work, the correct answer will be the shorter one)?\nA. \"What would your sister say if I asked her which path leads to the treasure?\"\nB. \"What is your sister\u2019s name?\u201d\nC. \"What path leads to the treasure?\"\nD. \"What path do you think I will take, if you were to guess?\"\nE. \"What is in the treasure?\"\nF. \u201cWhat is your sister\u2019s number?\u201d\n",
21 |       "answer": "C"
22 |     },
23 |     {
24 |       "question_id": 5,
25 |       "prompt": "Peter needs CPR from his best friend Paul, the only person around. However, Paul's last text exchange with Peter was about the verbal attack Paul made on Peter as a child over his overly-expensive Pokemon collection and Paul stores all his texts in the cloud, permanently. Paul will [ _ ] help Peter.\nA. probably not\nB. definitely\nC. half-heartedly\nD. not\nE. pretend to\nF. ponder deeply over whether to\n",
26 |       "answer": "B"
27 |     },
28 |     {
29 |       "question_id": 6,
30 |       "prompt": "While Jen was miles away from care-free John, she hooked-up with Jack, through Tinder. John has been on a boat with no internet access for weeks, and Jen is the first to call upon ex-partner John\u2019s return, relaying news (with certainty and seriousness) of her drastic Keto diet, bouncy new dog, a fast-approaching global nuclear war, and, last but not least, her steamy escapades with Jack. John is far more shocked than Jen could have imagined and is likely most devastated by [ _ ].\nA. wider international events\nB. the lack of internet\nC. the dog without prior agreement\nD. sea sickness\nE. the drastic diet\nF. the escapades\n",
31 |       "answer": "A"
32 |     },
33 |     {
34 |       "question_id": 7,
35 |       "prompt": "John is 24 and a kind, thoughtful and apologetic person. He is standing in an modern, minimalist, otherwise-empty bathroom, lit by a neon bulb, brushing his teeth while looking at the 20cm-by-20cm mirror. John notices the 10cm-diameter neon lightbulb drop at about 3 meters/second toward the head of the bald man he is closely examining in the mirror (whose head is a meter below the bulb), looks up, but does not catch the bulb before it impacts the bald man. The bald man curses, yells 'what an idiot!' and leaves the bathroom. Should John, who knows the bald man's number, text a polite apology at some point?\nA. no, because the lightbulb was essentially unavoidable\nB. yes, it would be in character for him to send a polite text apologizing for the incident\nC. no, because it would be redundant\nD. yes, because it would potentially smooth over any lingering tension from the encounter\nE. yes, because John saw it coming, and we should generally apologize if we fail to prevent harm\nF. yes because it is the polite thing to do, even if it wasn't your fault.\n",
36 |       "answer": "C"
37 |     },
38 |     {
39 |       "question_id": 8,
40 |       "prompt": "On a shelf, there is only a green apple, red pear, and pink peach. Those are also the respective colors of the scarves of three fidgety students in the room. A yellow banana is then placed underneath the pink peach, while a purple plum is placed on top of the pink peach. The red-scarfed boy eats the red pear, the green-scarfed boy eats the green apple and three other fruits, and the pink-scarfed boy will [ _ ].\nA. eat just the yellow banana\nB. eat the pink, yellow and purple fruits\nC. eat just the purple plum\nD. eat the pink peach\nE. eat two fruits\nF. eat no fruits\n",
41 |       "answer": "F"
42 |     },
43 |     {
44 |       "question_id": 9,
45 |       "prompt": "Agatha makes a stack of 5 cold, fresh single-slice ham sandwiches (with no sauces or condiments) in Room A, then immediately uses duct tape to stick the top surface of the uppermost sandwich to the bottom of her walking stick. She then walks to Room B, with her walking stick, so how many whole sandwiches are there now, in each room?\nA. 4 whole sandwiches in room A, 0 whole sandwiches in Room B\nB. no sandwiches anywhere\nC. 4 whole sandwiches in room B, 1 whole sandwich in Room A\nD. All 5 whole sandwiches in Room B\nE. 4 whole sandwiches in Room B, 1 whole sandwiches in room A\nF. All 5 whole sandwiches in Room A\n",
46 |       "answer": "A"
47 |     },
48 |     {
49 |       "question_id": 10,
50 |       "prompt": "A luxury sports-car is traveling north at 30km/h over a roadbridge, 250m long, which runs over a river that is flowing at 5km/h eastward. The wind is blowing at 1km/h westward, slow enough not to bother the pedestrians snapping photos of the car from both sides of the roadbridge as the car passes. A glove was stored in the trunk of the car, but slips out of a hole and drops out when the car is half-way over the bridge. Assume the car continues in the same direction at the same speed, and the wind and river continue to move as stated. 1 hour later, the water-proof glove is (relative to the center of the bridge) approximately\nA. 4km eastward\nB. <1 km northward\nC. >30km away north-westerly\nD. 30 km northward\nE. >30 km away north-easterly.\nF. 5 km+ eastward\n",
51 |       "answer": "B"
52 |     }
53 |   ]
54 | }


--------------------------------------------------------------------------------