├── weave_utils ├── __init__.py ├── scorers.py └── models.py ├── system_prompt.txt ├── pyproject.toml ├── README.md ├── LICENSE ├── run_benchmark.py ├── .gitignore ├── simple_bench_public_set.csv └── simple_bench_public.json /weave_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /system_prompt.txt: -------------------------------------------------------------------------------- 1 | You are an expert at reasoning and you always pick the most realistic answer. Think step by step and output your reasoning followed by your final answer using the following format: Final Answer: X where X is one of the letters A, B, C, D, E, or F. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "simplebench" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "anthropic>=0.40.0", 9 | "fire>=0.7.0", 10 | "litellm>=1.54.1", 11 | "openai>=1.57.2", 12 | "python-dotenv>=1.0.1", 13 | "weave>=0.51.23", 14 | ] 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Simple Bench 2 | 3 | https://simple-bench.com/ 4 | 5 | ## Run Instructions 6 | 7 | Run benchmark: 8 | ``` 9 | python run_benchmark.py --model_name=gpt-4o --dataset_path=simple_bench_public.json 10 | ``` 11 | 12 | ## Setup Instructions 13 | 14 | Clone the github repo and cd into it. 15 | 16 | Make sure you have the correct python version (3.10.11) as a venv: 17 | ``` 18 | pyenv local 3.10.11 19 | python -m venv llm_env 20 | source llm_env/bin/activate 21 | ``` 22 | 23 | Install dependencies: 24 | 25 | The best way to install dependencies is to use `uv`. If you don't have it installed in your environment, you can install it with `pip install uv`. 26 | 27 | ``` 28 | uv pip install -r pyproject.toml 29 | ``` 30 | 31 | Create a `.env` file with the following: 32 | ``` 33 | OPENAI_API_KEY= 34 | ANTHROPIC_API_KEY= 35 | ... 36 | ``` 37 | -------------------------------------------------------------------------------- /weave_utils/scorers.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import weave 3 | import re 4 | 5 | 6 | @weave.op() 7 | def extract_answer(output: str) -> str: 8 | match = re.search(r"Final Answer:\s*([A-F])", output.strip(), re.IGNORECASE) 9 | if match: 10 | return match.group(1).upper() 11 | else: 12 | raise ValueError("No answer found in model output") 13 | 14 | 15 | @weave.op() 16 | def eval_majority_vote(output: List[str], answer: str): 17 | model_answers = [] 18 | for _output in output: 19 | try: 20 | model_answers.append(extract_answer(_output)) 21 | except ValueError: 22 | continue # Skip this output if extraction fails 23 | 24 | if not model_answers: 25 | raise ValueError("Failed to extract any valid answers from model outputs") 26 | 27 | return model_answers.count(answer) > len(model_answers) / 2 28 | 29 | 30 | @weave.op() 31 | def eval_multi_choice(output: str, answer: str): 32 | model_answer = extract_answer(output) 33 | return model_answer == answer 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 simple-bench 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /run_benchmark.py: -------------------------------------------------------------------------------- 1 | # python run_benchmark.py --model_name=gpt-4o-mini --dataset_path=output.json 2 | 3 | from typing import Optional 4 | import json 5 | import weave 6 | import asyncio 7 | from fire import Fire 8 | 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | from weave_utils.models import LiteLLMModel, MajorityVoteModel 13 | from weave_utils.scorers import eval_majority_vote, eval_multi_choice 14 | 15 | 16 | def load_dataset(file_path): 17 | with open(file_path, 'r') as file: 18 | data = json.load(file) 19 | return data['eval_data'] 20 | 21 | 22 | def run_benchmark( 23 | model_name: str = "gpt-4o-mini", 24 | dataset_path: str = "simple_bench_public.json", 25 | num_responses: int = 1, 26 | entity: str = "simplebench", 27 | project: str = "simple_bench_public", 28 | temp: float = 0.7, 29 | max_tokens: int = 2048, 30 | top_p: float = 0.95, 31 | max_retries: int = 3, 32 | system_prompt_path: str = "system_prompt.txt", 33 | ): 34 | """ 35 | Run a benchmark evaluation on a given model and dataset. 36 | 37 | Args: 38 | model_name (str): Name of the model to use for inference. 39 | Default is "gpt-4o-mini". 40 | dataset_path (str): Path to the dataset JSON file. 41 | Default is "simple_bench_public.json". 42 | num_responses (int): If greater than 1, majority voting will be applied. 43 | Default is 1 (no majority voting). 44 | entity (str): Optional Weave entity (org/user name) for evaluation tracking. 45 | project (str): The project name under the specified entity. 46 | Default is "simple_bench_public". 47 | temp (float): Temperature for the model. 48 | Default is 0.7. 49 | max_tokens (int): Maximum number of tokens to generate. 50 | Default is 2048. 51 | top_p (float): Top-p for the model. 52 | Default is 0.95. 53 | max_retries (int): Maximum number of retries for the model. 54 | Default is 3. 55 | system_prompt (str): System prompt for the model. 56 | Default is "You are an expert at reasoning and you always pick the most realistic answer. Think step by step and output your reasoning followed by your final answer using the following format: Final Answer: X where X is one of the letters A, B, C, D, E, or F." 57 | 58 | Example: 59 | python run_benchmark.py --model_name=gpt-4o-mini --dataset_path=simple_bench_public.json --num_responses=3 60 | """ 61 | 62 | if entity is not None: 63 | weave.init(f"{entity}/{project}") 64 | else: 65 | weave.init(f"{project}") 66 | 67 | evaluation = weave.Evaluation( 68 | dataset=load_dataset(dataset_path), 69 | scorers=[eval_majority_vote if num_responses > 1 else eval_multi_choice], 70 | trials=1, 71 | ) 72 | 73 | with open(system_prompt_path, "r") as f: 74 | system_prompt = f.read().strip() 75 | 76 | model = LiteLLMModel( 77 | model_name=model_name, 78 | temp=temp, 79 | max_tokens=max_tokens, 80 | top_p=top_p, 81 | max_retries=max_retries, 82 | system_prompt=system_prompt 83 | ) 84 | 85 | if num_responses > 1: 86 | model = MajorityVoteModel(model=model, num_responses=num_responses) 87 | 88 | asyncio.run(evaluation.evaluate(model)) 89 | 90 | 91 | if __name__ == "__main__": 92 | Fire(run_benchmark) 93 | -------------------------------------------------------------------------------- /weave_utils/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import random 4 | import weave 5 | from typing import Optional 6 | 7 | import time 8 | from litellm import acompletion 9 | 10 | from dotenv import load_dotenv 11 | load_dotenv() 12 | 13 | from openai import RateLimitError 14 | 15 | 16 | MODEL_MAP = { 17 | "gpt-4o-mini": "gpt-4o-mini", 18 | "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620", 19 | "gpt-4o": "gpt-4o-2024-08-06", 20 | "gpt-4-turbo": "gpt-4-turbo", 21 | "o1-preview": "o1-preview", 22 | "o1-mini": "o1-mini", 23 | "claude-3-opus-20240229": "claude-3-opus-20240229", 24 | "command-r-plus": "command-r-plus-08-2024", 25 | "gemini-1.5-pro": "gemini/gemini-1.5-pro", 26 | "llama3-405b-instruct": "fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct", 27 | "claude-3-haiku": "claude-3-haiku-20240307", 28 | "gemini-1.5-pro-002": "gemini/gemini-1.5-pro-002", 29 | "mistral-large": "mistral/mistral-large-2407", 30 | "grok-2": "openrouter/x-ai/grok-2" 31 | } 32 | 33 | EXPONENTIAL_BASE = 2 34 | 35 | 36 | class MajorityVoteModel(weave.Model): 37 | model: weave.Model 38 | num_responses: int = 3 39 | 40 | @weave.op() 41 | async def predict(self, prompt: str): 42 | tasks = [self.model.predict(prompt) for _ in range(self.num_responses)] 43 | return await asyncio.gather(*tasks) 44 | 45 | 46 | class LiteLLMModel(weave.Model): 47 | model_name: str 48 | system_prompt: Optional[str] = None 49 | temp: float = 0.7 50 | max_tokens: int = 2048 51 | top_p: float = 0.95 52 | max_retries: int = 3 53 | 54 | def __init__(self, **data): 55 | super().__init__(**data) 56 | # Add any additional initialization logic here 57 | if self.model_name not in MODEL_MAP: 58 | raise ValueError(f"Invalid model name: {self.model_name}") 59 | 60 | if "o1" in self.model_name: 61 | self.temp = None 62 | self.top_p = None 63 | self.max_tokens = None 64 | 65 | 66 | @weave.op() 67 | async def predict(self, prompt: str): 68 | delay = 2 69 | 70 | for i in range(self.max_retries): 71 | try: 72 | messages = [] 73 | if self.system_prompt is not None: 74 | messages.append({ 75 | "role": "system", 76 | "content": self.system_prompt 77 | }) 78 | messages.append({ 79 | "role": "user", 80 | "content": prompt 81 | }) 82 | response = await acompletion( 83 | model=MODEL_MAP[self.model_name], 84 | messages=messages, 85 | temperature=self.temp, 86 | max_tokens=self.max_tokens, 87 | top_p=self.top_p 88 | ) 89 | 90 | if response.choices[0].message.content is not None: 91 | return response.choices[0].message.content 92 | else: 93 | print(response) 94 | raise Exception("No content in response") 95 | except RateLimitError as e: 96 | delay *= EXPONENTIAL_BASE * (1 + random.random()) 97 | print( 98 | f"RateLimitError, retrying after {round(delay, 2)} seconds, {i+1}-th retry...", e 99 | ) 100 | await asyncio.sleep(delay) 101 | continue 102 | except Exception as e: 103 | print(f"Error in retry {i+1}, retrying...", e) 104 | continue 105 | 106 | raise Exception("Failed to get response after max retries") 107 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | simple_bench.csv 2 | output.json 3 | output_o1.json 4 | run_data/ 5 | 6 | .python-version 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 117 | .pdm.toml 118 | .pdm-python 119 | .pdm-build/ 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | llm_env/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | v2_regular.json 172 | randomOrder.csv 173 | simplev2.csv 174 | -------------------------------------------------------------------------------- /simple_bench_public_set.csv: -------------------------------------------------------------------------------- 1 | id,question_func,correct_answer,option1,option2,option3,option4,option5 2 | 1,"Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?",0,5,11,20,30,10 3 | 2,"A juggler throws a solid blue ball a meter in the air and then a solid purple ball (of the same size) two meters in the air. She then climbs to the top of a tall ladder carefully, balancing a yellow balloon on her head. Where is the purple ball most likely now, in relation to the blue ball?",at the same height as the blue ball,inside the blue ball,below the blue ball,above the blue ball,above the yellow balloon,at the same height as the yellow balloon 4 | 3,"Jeff, Jo and Jim are in a 200m men's race, starting from the same position. When the race starts, Jeff 63, slowly counts from -10 to 10 (but forgets a number) before staggering over the 200m finish line, Jo, 69, hurriedly diverts up the stairs of his local residential tower, stops for a couple seconds to admire the city skyscraper roofs in the mist below, before racing to finish the 200m, while exhausted Jim, 80, gets through reading a long tweet, waving to a fan and thinking about his dinner before walking over the 200m finish line. [ _ ] likely finished last.",Jo likely finished last,Jim likely finished last,Jeff likely finished last,All of them finished simultaneously,"Jo and Jim likely finished last, at the same time","Jeff and Jim likely finished last, at the same time" 5 | 4,"There are two sisters, Amy who always speaks mistruths and Sam who always lies. You don't know which is which. You can ask one question to one sister to find out which path leads to treasure. Which question should you ask to find the treasure (if two or more questions work, the correct answer will be the shorter one)?","""What path leads to the treasure?""",“What is your sister’s number?”,"""What is your sister’s name?”","""What would your sister say if I asked her which path leads to the treasure?""","""What path do you think I will take, if you were to guess?""","""What is in the treasure?""" 6 | 5,"Peter needs CPR from his best friend Paul, the only person around. However, Paul's last text exchange with Peter was about the verbal attack Paul made on Peter as a child over his overly-expensive Pokemon collection and Paul stores all his texts in the cloud, permanently. Paul will [ _ ] help Peter.",definitely,ponder deeply over whether to,probably not,not,pretend to,half-heartedly 7 | 6,"While Jen was miles away from care-free John, she hooked-up with Jack, through Tinder. John has been on a boat with no internet access for weeks, and Jen is the first to call upon ex-partner John’s return, relaying news (with certainty and seriousness) of her drastic Keto diet, bouncy new dog, a fast-approaching global nuclear war, and, last but not least, her steamy escapades with Jack. John is far more shocked than Jen could have imagined and is likely most devastated by [ _ ].",wider international events,the drastic diet,the dog without prior agreement,the escapades,the lack of internet,sea sickness 8 | 7,"John is 24 and a kind, thoughtful and apologetic person. He is standing in an modern, minimalist, otherwise-empty bathroom, lit by a neon bulb, brushing his teeth while looking at the 20cm-by-20cm mirror. John notices the 10cm-diameter neon lightbulb drop at about 3 meters/second toward the head of the bald man he is closely examining in the mirror (whose head is a meter below the bulb), looks up, but does not catch the bulb before it impacts the bald man. The bald man curses, yells 'what an idiot!' and leaves the bathroom. Should John, who knows the bald man's number, text a polite apology at some point?","no, because it would be redundant","yes, because John saw it coming, and we should generally apologize if we fail to prevent harm","no, because the lightbulb was essentially unavoidable","yes because it is the polite thing to do, even if it wasn't your fault.","yes, it would be in character for him to send a polite text apologizing for the incident","yes, because it would potentially smooth over any lingering tension from the encounter" 9 | 8,"On a shelf, there is only a green apple, red pear, and pink peach. Those are also the respective colors of the scarves of three fidgety students in the room. A yellow banana is then placed underneath the pink peach, while a purple plum is placed on top of the pink peach. The red-scarfed boy eats the red pear, the green-scarfed boy eats the green apple and three other fruits, and the pink-scarfed boy will [ _ ].",eat no fruits,eat the pink peach,"eat the pink, yellow and purple fruits",eat just the yellow banana,eat just the purple plum,eat two fruits 10 | 9,"Agatha makes a stack of 5 cold, fresh single-slice ham sandwiches (with no sauces or condiments) in Room A, then immediately uses duct tape to stick the top surface of the uppermost sandwich to the bottom of her walking stick. She then walks to Room B, with her walking stick, so how many whole sandwiches are there now, in each room?","4 whole sandwiches in room A, 0 whole sandwiches in Room B","4 whole sandwiches in room B, 1 whole sandwich in Room A",All 5 whole sandwiches in Room B,"4 whole sandwiches in Room B, 1 whole sandwiches in room A",All 5 whole sandwiches in Room A,no sandwiches anywhere 11 | 10,"A luxury sports-car is traveling north at 30km/h over a roadbridge, 250m long, which runs over a river that is flowing at 5km/h eastward. The wind is blowing at 1km/h westward, slow enough not to bother the pedestrians snapping photos of the car from both sides of the roadbridge as the car passes. A glove was stored in the trunk of the car, but slips out of a hole and drops out when the car is half-way over the bridge. Assume the car continues in the same direction at the same speed, and the wind and river continue to move as stated. 1 hour later, the water-proof glove is (relative to the center of the bridge) approximately",<1 km northward,5 km+ eastward,30 km northward,4km eastward,>30 km away north-easterly.,>30km away north-westerly -------------------------------------------------------------------------------- /simple_bench_public.json: -------------------------------------------------------------------------------- 1 | { 2 | "eval_data": [ 3 | { 4 | "question_id": 1, 5 | "prompt": "Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?\nA. 30\nB. 0\nC. 20\nD. 10\nE. 11\nF. 5\n", 6 | "answer": "B" 7 | }, 8 | { 9 | "question_id": 2, 10 | "prompt": "A juggler throws a solid blue ball a meter in the air and then a solid purple ball (of the same size) two meters in the air. She then climbs to the top of a tall ladder carefully, balancing a yellow balloon on her head. Where is the purple ball most likely now, in relation to the blue ball?\nA. at the same height as the blue ball\nB. at the same height as the yellow balloon\nC. inside the blue ball\nD. above the yellow balloon\nE. below the blue ball\nF. above the blue ball\n", 11 | "answer": "A" 12 | }, 13 | { 14 | "question_id": 3, 15 | "prompt": "Jeff, Jo and Jim are in a 200m men's race, starting from the same position. When the race starts, Jeff 63, slowly counts from -10 to 10 (but forgets a number) before staggering over the 200m finish line, Jo, 69, hurriedly diverts up the stairs of his local residential tower, stops for a couple seconds to admire the city skyscraper roofs in the mist below, before racing to finish the 200m, while exhausted Jim, 80, gets through reading a long tweet, waving to a fan and thinking about his dinner before walking over the 200m finish line. [ _ ] likely finished last.\nA. Jo likely finished last\nB. Jeff and Jim likely finished last, at the same time\nC. Jim likely finished last\nD. Jeff likely finished last\nE. All of them finished simultaneously\nF. Jo and Jim likely finished last, at the same time\n", 16 | "answer": "A" 17 | }, 18 | { 19 | "question_id": 4, 20 | "prompt": "There are two sisters, Amy who always speaks mistruths and Sam who always lies. You don't know which is which. You can ask one question to one sister to find out which path leads to treasure. Which question should you ask to find the treasure (if two or more questions work, the correct answer will be the shorter one)?\nA. \"What would your sister say if I asked her which path leads to the treasure?\"\nB. \"What is your sister\u2019s name?\u201d\nC. \"What path leads to the treasure?\"\nD. \"What path do you think I will take, if you were to guess?\"\nE. \"What is in the treasure?\"\nF. \u201cWhat is your sister\u2019s number?\u201d\n", 21 | "answer": "C" 22 | }, 23 | { 24 | "question_id": 5, 25 | "prompt": "Peter needs CPR from his best friend Paul, the only person around. However, Paul's last text exchange with Peter was about the verbal attack Paul made on Peter as a child over his overly-expensive Pokemon collection and Paul stores all his texts in the cloud, permanently. Paul will [ _ ] help Peter.\nA. probably not\nB. definitely\nC. half-heartedly\nD. not\nE. pretend to\nF. ponder deeply over whether to\n", 26 | "answer": "B" 27 | }, 28 | { 29 | "question_id": 6, 30 | "prompt": "While Jen was miles away from care-free John, she hooked-up with Jack, through Tinder. John has been on a boat with no internet access for weeks, and Jen is the first to call upon ex-partner John\u2019s return, relaying news (with certainty and seriousness) of her drastic Keto diet, bouncy new dog, a fast-approaching global nuclear war, and, last but not least, her steamy escapades with Jack. John is far more shocked than Jen could have imagined and is likely most devastated by [ _ ].\nA. wider international events\nB. the lack of internet\nC. the dog without prior agreement\nD. sea sickness\nE. the drastic diet\nF. the escapades\n", 31 | "answer": "A" 32 | }, 33 | { 34 | "question_id": 7, 35 | "prompt": "John is 24 and a kind, thoughtful and apologetic person. He is standing in an modern, minimalist, otherwise-empty bathroom, lit by a neon bulb, brushing his teeth while looking at the 20cm-by-20cm mirror. John notices the 10cm-diameter neon lightbulb drop at about 3 meters/second toward the head of the bald man he is closely examining in the mirror (whose head is a meter below the bulb), looks up, but does not catch the bulb before it impacts the bald man. The bald man curses, yells 'what an idiot!' and leaves the bathroom. Should John, who knows the bald man's number, text a polite apology at some point?\nA. no, because the lightbulb was essentially unavoidable\nB. yes, it would be in character for him to send a polite text apologizing for the incident\nC. no, because it would be redundant\nD. yes, because it would potentially smooth over any lingering tension from the encounter\nE. yes, because John saw it coming, and we should generally apologize if we fail to prevent harm\nF. yes because it is the polite thing to do, even if it wasn't your fault.\n", 36 | "answer": "C" 37 | }, 38 | { 39 | "question_id": 8, 40 | "prompt": "On a shelf, there is only a green apple, red pear, and pink peach. Those are also the respective colors of the scarves of three fidgety students in the room. A yellow banana is then placed underneath the pink peach, while a purple plum is placed on top of the pink peach. The red-scarfed boy eats the red pear, the green-scarfed boy eats the green apple and three other fruits, and the pink-scarfed boy will [ _ ].\nA. eat just the yellow banana\nB. eat the pink, yellow and purple fruits\nC. eat just the purple plum\nD. eat the pink peach\nE. eat two fruits\nF. eat no fruits\n", 41 | "answer": "F" 42 | }, 43 | { 44 | "question_id": 9, 45 | "prompt": "Agatha makes a stack of 5 cold, fresh single-slice ham sandwiches (with no sauces or condiments) in Room A, then immediately uses duct tape to stick the top surface of the uppermost sandwich to the bottom of her walking stick. She then walks to Room B, with her walking stick, so how many whole sandwiches are there now, in each room?\nA. 4 whole sandwiches in room A, 0 whole sandwiches in Room B\nB. no sandwiches anywhere\nC. 4 whole sandwiches in room B, 1 whole sandwich in Room A\nD. All 5 whole sandwiches in Room B\nE. 4 whole sandwiches in Room B, 1 whole sandwiches in room A\nF. All 5 whole sandwiches in Room A\n", 46 | "answer": "A" 47 | }, 48 | { 49 | "question_id": 10, 50 | "prompt": "A luxury sports-car is traveling north at 30km/h over a roadbridge, 250m long, which runs over a river that is flowing at 5km/h eastward. The wind is blowing at 1km/h westward, slow enough not to bother the pedestrians snapping photos of the car from both sides of the roadbridge as the car passes. A glove was stored in the trunk of the car, but slips out of a hole and drops out when the car is half-way over the bridge. Assume the car continues in the same direction at the same speed, and the wind and river continue to move as stated. 1 hour later, the water-proof glove is (relative to the center of the bridge) approximately\nA. 4km eastward\nB. <1 km northward\nC. >30km away north-westerly\nD. 30 km northward\nE. >30 km away north-easterly.\nF. 5 km+ eastward\n", 51 | "answer": "B" 52 | } 53 | ] 54 | } --------------------------------------------------------------------------------