├── src ├── evaluation │ ├── __init__.py │ ├── output_parser.py │ ├── metric_calculator.py │ └── batch_eval_openai.py ├── inference │ ├── __init__.py │ ├── multi_turn.py │ ├── multi_turn_openai.py │ ├── single_turn.py │ └── single_turn_openai.py ├── data │ ├── __init__.py │ ├── data_utils.py │ └── dataloader.py ├── __init__.py ├── model │ ├── __init__.py │ ├── vllm.py │ └── openai.py └── utils │ ├── __init__.py │ ├── sampling_params.py │ ├── common.py │ └── prompts.py ├── requirements.txt ├── assets ├── teaser.png └── discrepancies.png ├── outputs ├── evaluation │ ├── llama3_8b_zero-shot_metric.json │ └── llama3_8b_prompt-gold.json └── inference │ ├── gpt-3.5-turbo_prompt-gold.json │ ├── llama3_8b_prompt-gold.json │ ├── llama3_8b_prompt-pred.json │ └── gpt-3.5-turbo_zero-shot.json ├── scripts ├── evaluation │ ├── llama3_8b_prompt-gold_auto.sh │ └── llama3_8b_zero-shot.sh └── inference │ ├── gpt-3.5-turbo.sh │ └── llama3_8b.sh ├── .gitignore ├── README.md ├── LICENSE └── DATA_LICENSE /src/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiolimiter 2 | tqdm 3 | transformers 4 | datasets 5 | vllm 6 | openai -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaistAI/knowledge-reasoning/HEAD/assets/teaser.png -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_utils import * 2 | from .dataloader import HFDepthQALoader 3 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import data, evaluation, inference, model, utils 2 | 3 | utils.set_seed() 4 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .openai import OpenAIBatchClient, OpenAILLM 2 | from .vllm import VLLM 3 | -------------------------------------------------------------------------------- /assets/discrepancies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaistAI/knowledge-reasoning/HEAD/assets/discrepancies.png -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import * 2 | from .prompts import * 3 | from .sampling_params import * 4 | -------------------------------------------------------------------------------- /src/utils/sampling_params.py: -------------------------------------------------------------------------------- 1 | # Open source LLM inference parameters 2 | SAMPLING_PARAMS = { 3 | "max_tokens": 1024, 4 | "temperature": 1.0, 5 | "top_p": 0.9, 6 | "repetition_penalty": 1.03, 7 | } 8 | 9 | # OpenAI inference/evaluation parameters 10 | SAMPLING_PARAMS_OPENAI = {"max_tokens": 1024, "temperature": 1.0, "top_p": 0.9} 11 | -------------------------------------------------------------------------------- /src/data/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Callable, Dict 3 | 4 | 5 | def filter_data_dict( 6 | data: Dict[str, Dict], filter_fn: Callable[[Dict], bool] 7 | ) -> Dict[str, Dict]: 8 | return {key: value for key, value in data.items() if filter_fn(value)} 9 | 10 | 11 | def slice_data_dict(data: Dict[str, Dict], start: int, end: int) -> Dict[str, Dict]: 12 | return {key: value for key, value in list(data.items())[start:end]} 13 | -------------------------------------------------------------------------------- /outputs/evaluation/llama3_8b_zero-shot_metric.json: -------------------------------------------------------------------------------- 1 | { 2 | "Average Accuracy - Depth depth_1": 4.56021409455843, 3 | "Average Accuracy - Depth depth_2": 4.451253481894151, 4 | "Average Accuracy - Depth depth_3": 4.362637362637362, 5 | "Forward Discrepancy - Depth 2 <=> Depth 3": 0.09465020576131687, 6 | "Forward Discrepancy - Depth 1 <=> Depth 2": 0.08497327502429543, 7 | "Forward Discrepancy - Overall": 0.08682193396226415, 8 | "Backward Discrepancy - Depth 2 <=> Depth 3": 0.08855799373040753, 9 | "Backward Discrepancy - Depth 1 <=> Depth 2": 0.06630901287553648, 10 | "Backward Discrepancy - Overall": 0.07109164420485176 11 | } -------------------------------------------------------------------------------- /scripts/evaluation/llama3_8b_prompt-gold_auto.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY= 2 | 3 | # ------------------------- 1~3. Create, check status, retrieve results of batch request ------------------------ # 4 | python -m src.evaluation.batch_eval_openai \ 5 | --mode auto \ 6 | --eval_model_name gpt-4-0125-preview \ 7 | --prediction_file outputs/inference/llama3_8b_prompt-gold.json \ 8 | --description llama3_8b_prompt-gold \ 9 | --output_file outputs/evaluation/llama3_8b_prompt-gold.json 10 | 11 | # ---------------------- 4. 
Calculate evaluation metrics --------------------- # 12 | python -m src.evaluation.metric_calculator \ 13 | --input outputs/evaluation/llama3_8b_prompt-gold.json \ 14 | --output_file outputs/evaluation/llama3_8b_prompt-gold_metric.json -------------------------------------------------------------------------------- /scripts/inference/gpt-3.5-turbo.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY= 2 | 3 | 4 | python -m src.inference.single_turn_openai \ 5 | --model_name gpt-3.5-turbo-0125 \ 6 | --input kaist-ai/DepthQA \ 7 | --output_file outputs/inference/gpt-3.5-turbo_zero-shot.json \ 8 | --task_type zero-shot \ 9 | 10 | python -m src.inference.single_turn_openai \ 11 | --model_name gpt-3.5-turbo-0125 \ 12 | --input kaist-ai/DepthQA \ 13 | --output_file outputs/inference/gpt-3.5-turbo_prompt-gold.json \ 14 | --task_type prompt-gold \ 15 | 16 | python -m src.inference.single_turn_openai \ 17 | --model_name gpt-3.5-turbo-0125 \ 18 | --input outputs/inference/gpt-3.5-turbo_zero-shot.json \ 19 | --output_file outputs/inference/gpt-3.5-turbo_prompt-pred.json \ 20 | --task_type prompt-pred \ 21 | 22 | python -m src.inference.multi_turn_openai \ 23 | --model_name gpt-3.5-turbo-0125 \ 24 | --input kaist-ai/DepthQA \ 25 | --output_file outputs/inference/gpt-3.5-turbo_multi-turn.json -------------------------------------------------------------------------------- /scripts/inference/llama3_8b.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | export HF_HOME= 3 | NUM_GPUS=1 4 | 5 | python -m src.inference.single_turn \ 6 | --model_name meta-llama/Meta-Llama-3-8B-Instruct \ 7 | --input kaist-ai/DepthQA \ 8 | --output_file outputs/inference/llama3_8b_zero-shot.json \ 9 | --num_gpus $NUM_GPUS \ 10 | --task_type zero-shot \ 11 | 12 | python -m src.inference.single_turn \ 13 | --model_name meta-llama/Meta-Llama-3-8B-Instruct \ 14 | --input kaist-ai/DepthQA \ 15 | --output_file outputs/inference/llama3_8b_prompt-gold.json \ 16 | --num_gpus $NUM_GPUS \ 17 | --task_type prompt-gold \ 18 | 19 | python -m src.inference.single_turn \ 20 | --model_name meta-llama/Meta-Llama-3-8B-Instruct \ 21 | --input outputs/inference/llama3_8b_zero-shot.json \ 22 | --output_file outputs/inference/llama3_8b_prompt-pred.json \ 23 | --num_gpus $NUM_GPUS \ 24 | --task_type prompt-pred \ 25 | 26 | python -m src.inference.multi_turn \ 27 | --model_name meta-llama/Meta-Llama-3-8B-Instruct \ 28 | --input kaist-ai/DepthQA \ 29 | --output_file outputs/inference/llama3_8b_multi-turn.json \ 30 | --num_gpus $NUM_GPUS -------------------------------------------------------------------------------- /src/model/vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | import torch 5 | from vllm import LLM, SamplingParams 6 | 7 | 8 | class VLLM: 9 | def __init__(self, name, tokenizer_name=None, num_gpus=1): 10 | dtype = "float16" 11 | if torch.cuda.is_bf16_supported(): 12 | dtype = "bfloat16" 13 | 14 | self.name = name 15 | 16 | max_model_len = None 17 | 18 | print(f"Loading {name}...") 19 | self.model = LLM( 20 | model=self.name, 21 | tokenizer=tokenizer_name, 22 | dtype=dtype, 23 | max_model_len=max_model_len, 24 | trust_remote_code=True, 25 | tensor_parallel_size=num_gpus, 26 | download_dir=os.getenv("HF_HOME"), 27 | ) 28 | 29 | def get_tokenizer(self): 30 | return self.model.get_tokenizer() 31 | 32 | def completions( 33 | self, 
34 | prompts: List[str], 35 | use_tqdm=False, 36 | **kwargs, 37 | ): 38 | prompts = [prompt.strip() for prompt in prompts] 39 | params = SamplingParams(**kwargs) 40 | outputs = self.model.generate(prompts, params, use_tqdm=use_tqdm) 41 | outputs = [output.outputs[0].text.strip() for output in outputs] 42 | return outputs 43 | -------------------------------------------------------------------------------- /src/utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | 6 | 7 | def set_seed(seed: int = 42): 8 | random.seed(seed) 9 | torch.manual_seed(seed) 10 | 11 | 12 | def show_random_inputs(inputs): 13 | random_inputs = random.sample(inputs, 3) 14 | width = 20 15 | 16 | for input_str in random_inputs: 17 | print("-" * width) 18 | print("Example inputs:") 19 | print(input_str) 20 | print("-" * width) 21 | return inputs 22 | 23 | 24 | def get_output_path(output_file: str, default_output_dir: str): 25 | if os.path.dirname(output_file): 26 | return output_file 27 | else: 28 | return os.path.join(default_output_dir, output_file) 29 | 30 | 31 | def format_messages(system_prompt, user_prompt): 32 | messages = [ 33 | {"role": "system", "content": system_prompt}, 34 | {"role": "user", "content": user_prompt}, 35 | ] 36 | return messages 37 | 38 | 39 | def calculate_cost(model_name, input_token_cnt, output_token_cnt, batch_eval=True): 40 | def pricing_info(model): 41 | if model.startswith("gpt-4") and model.endswith("preview"): 42 | input_rate = 0.01 43 | output_rate = 0.03 44 | elif model == "gpt-4": 45 | input_rate = 0.03 46 | output_rate = 0.06 47 | elif model == "gpt-4-32k": 48 | input_rate = 0.06 49 | output_rate = 0.12 50 | elif model == "gpt-3.5-turbo-0125": 51 | input_rate = 0.0005 52 | output_rate = 0.0015 53 | elif model == "gpt-3.5-turbo-instruct": 54 | input_rate = 0.0015 55 | output_rate = 0.0020 56 | else: 57 | raise ValueError(f"Model {model} not supported.") 58 | return input_rate, output_rate 59 | 60 | input_rate, output_rate = pricing_info(model_name) 61 | if batch_eval: 62 | input_rate /= 2 63 | output_rate /= 2 64 | cost = input_rate * input_token_cnt / 1000 + output_rate * output_token_cnt / 1000 65 | return cost 66 | -------------------------------------------------------------------------------- /scripts/evaluation/llama3_8b_zero-shot.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY= 2 | 3 | # ------------------------- 1. Create a batch request ------------------------ # 4 | python -m src.evaluation.batch_eval_openai \ 5 | --mode create \ 6 | --eval_model_name gpt-4-0125-preview \ 7 | --prediction_file outputs/inference/llama3_8b_zero-shot.json \ 8 | --description llama3_8b_zero-shot 9 | # Printed output: 10 | # Batch(id='batch_ckmtDkk2bbpdEXp1KmM0vqk0', completion_window='24h', created_at=1719384093, endpoint='/v1/chat/completions', input_file_id='file-hbN7K9bwo42Gs6daIE2ivjQJ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1719470493, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'llama3_8b_zero-shot'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)) 11 | 12 | # ------------------------- 2. 
Check the status of a batch request ------------------------ # 13 | python -m src.evaluation.batch_eval_openai \ 14 | --mode check \ 15 | --batch_id batch_ckmtDkk2bbpdEXp1KmM0vqk0 16 | # Printed output (in progress): 17 | # Batch(id='batch_ckmtDkk2bbpdEXp1KmM0vqk0', completion_window='24h', created_at=1719384093, endpoint='/v1/chat/completions', input_file_id='file-hbN7K9bwo42Gs6daIE2ivjQJ', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1719470493, failed_at=None, finalizing_at=None, in_progress_at=1719384093, metadata={'description': 'llama3_8b_zero-shot'}, output_file_id=None, request_counts=BatchRequestCounts(completed=549, failed=0, total=1571)) 18 | 19 | python -m src.evaluation.batch_eval_openai \ 20 | --mode list 21 | # Printed output (completed): 22 | # llama3_8b_zero-shot 23 | # Batch ID: batch_ckmtDkk2bbpdEXp1KmM0vqk0 24 | # Status: completed 25 | # Output file ID: file-ffvIyZGGyWMuQoTSLRXgNb8X 26 | # BatchRequestCounts(completed=1571, failed=0, total=1571) 27 | 28 | # ------------------------- 3. Retrieve the results of a batch request ------------------------ # 29 | python -m src.evaluation.batch_eval_openai \ 30 | --mode retrieve \ 31 | --prediction_file outputs/inference/llama3_8b_zero-shot.json \ 32 | --batch_output_file_id file-ffvIyZGGyWMuQoTSLRXgNb8X \ 33 | --output_file outputs/evaluation/llama3_8b_zero-shot.json 34 | 35 | # ---------------------- 4. Calculate evaluation metrics --------------------- # 36 | python -m src.evaluation.metric_calculator \ 37 | --input outputs/evaluation/llama3_8b_zero-shot.json \ 38 | --output_file outputs/evaluation/llama3_8b_zero-shot_metric.json -------------------------------------------------------------------------------- /src/utils/prompts.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | # Zero-shot inference 4 | SYSTEM_PROMPT_ZERO_SHOT = ( 5 | "You are a helpful, respectful and honest assistant. Answer the question." 6 | ) 7 | 8 | USER_PROMPT_TEMPLATE_ZERO_SHOT = """ 9 | ###Question: 10 | {question} 11 | 12 | ###Answer: """ 13 | 14 | 15 | # Prompt (Gold.) or Prompt (Pred.) inference 16 | SYSTEM_PROMPT_CTX = "You are a helpful, respectful and honest assistant. Answer the question using the knowledge in given QA pairs." 17 | 18 | USER_PROMPT_TEMPLATE_CTX = """ 19 | ###QA pairs: 20 | {qa_pairs} 21 | ###Question: 22 | {question} 23 | 24 | ###Answer: """ 25 | 26 | 27 | USER_PROMPT_TEMPLATE_LAST_TURN = """ 28 | Based on previous questions and responses, answer the given question: 29 | 30 | ###Question: 31 | {question} 32 | 33 | ###Answer: """ 34 | 35 | 36 | # Evaluation 37 | SYSTEM_PROMPT_EVAL = "You are a fair judge assistant tasked with providing clear, objective feedback based on specific criteria, ensuring each assessment reflects the absolute standards set for performance." 38 | 39 | USER_PROMPT_TEMPLATE_EVAL = """ 40 | ###Task Description: 41 | An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given. 42 | 1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general. 43 | 2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric. 44 | 3. 
The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)" 45 | 4. Please do not generate any other opening, closing, and explanations. 46 | 47 | ###The instruction to evaluate: 48 | {instruction} 49 | 50 | ###Response to evaluate: 51 | {response} 52 | 53 | ###Reference Answer (Score 5): 54 | {reference_answer} 55 | 56 | ###Score Rubrics: 57 | [Is the response correct, accurate, and factual?] 58 | Score 1: The response is largely incorrect, inaccurate, and not factual. It demonstrates a fundamental misunderstanding of the query or topic, leading to irrelevant or completely erroneous information. 59 | Score 2: The response is partially correct but contains significant inaccuracies or factual errors. It shows some understanding of the query or topic but fails to provide a fully accurate or reliable answer. 60 | Score 3: The response is generally correct and factual but may include minor inaccuracies or lack of detail. It shows a good understanding of the query or topic but may miss some nuances or specific information. 61 | Score 4: The response is mostly correct, accurate, and factual. It demonstrates a strong understanding of the query or topic, with only minimal inaccuracies or omissions that do not significantly detract from the overall quality of the response. 62 | Score 5: The response is consistently correct, accurate, and entirely factual. It reflects a comprehensive understanding of the query or topic, providing detailed, precise, and fully reliable information without any inaccuracies or omissions. 63 | 64 | ###Feedback: """ 65 | -------------------------------------------------------------------------------- /src/evaluation/output_parser.py: -------------------------------------------------------------------------------- 1 | # Inspired by https://github.com/prometheus-eval/prometheus-eval/blob/main/eval/parser.py 2 | 3 | import re 4 | 5 | pattern = re.compile( 6 | r""" 7 | (?: # Non-capturing group for various result indicators 8 | \[RESULT\] # [RESULT] 9 | |Score # Score 10 | |\[SCORE\] # [SCORE] 11 | |\[RESULT\]: # [RESULT]: 12 | |Score: # Score: 13 | |score: # score: 14 | |Result: # Result: 15 | |\[Result\] # [Result] 16 | |score\s+of # score of 17 | |Feedback: # Feedback: 18 | |feedback: # feedback: 19 | ) 20 | \s* # Optional whitespace 21 | (?: # Non-capturing group for optional brackets or parentheses 22 | \(\s* # Opening parenthesis with optional whitespace 23 | |\[\s* # or opening square bracket with optional whitespace 24 | | # or nothing 25 | ) 26 | \s* # Optional whitespace 27 | (\d+) # Capturing group for one or more digits 28 | """, 29 | re.IGNORECASE | re.VERBOSE, 30 | ) 31 | 32 | 33 | def parse_judgment(judgment): 34 | matches = pattern.search(judgment) 35 | 36 | if matches: 37 | # Extract the first group that matches (ignoring None) 38 | result = next((int(match) for match in matches.groups() if match), None) 39 | if result is not None: 40 | feedback = ( 41 | judgment.split("[RESULT]")[0].strip() 42 | if "[RESULT]" in judgment 43 | else judgment 44 | ) 45 | return feedback, result 46 | 47 | return None, None 48 | 49 | 50 | if __name__ == "__main__": 51 | # Test cases 52 | test_cases = [ 53 | # Absolute mode test cases (a2a, a2r) 54 | ("Good job. [RESULT] 3", 3), 55 | ("Needs improvement. [RESULT] Score: 2", 2), 56 | ("Well done. [RESULT] Result: 4", 4), 57 | ("Average. [RESULT] 4/5", 4), 58 | ("Excellent. [RESULT] 5 out of 5", 5), 59 | ("Poor performance. 
[RESULT] score of 1", 1), 60 | ("Good job. [Result] 3", 3), 61 | ("Needs improvement. [Result] Score: 2", 2), 62 | ("Well done. [Result] Result: 4", 4), 63 | ("Average. [Result] 4/5", 4), 64 | ("Excellent. [Result] 5 out of 5", 5), 65 | ("Poor performance. [Result] score of 1", 1), 66 | ("Good job. [3]", 3), 67 | ("Good job. (Score 5)", 5), 68 | ("Good job. [Score 4]", 4), 69 | ("Good job. score: 3", 3), 70 | ("Good job. Score: 3", 3), 71 | ("Good job. score of 1", 1), 72 | ("Good job. [RESULT] (5)", 5), 73 | ] 74 | 75 | def run_tests(): 76 | failed_tests = [] # To keep track of failed tests 77 | 78 | for output, expected in test_cases: 79 | _, result = parse_judgment(output) 80 | if result != expected: 81 | failed_tests.append((output, expected, result)) 82 | 83 | if failed_tests: 84 | print("Some tests failed:") 85 | for output, expected, result in failed_tests: 86 | print(f" For input: '{output}', expected: {expected}, got: {result}") 87 | else: 88 | print("All tests passed!") 89 | 90 | run_tests() 91 | -------------------------------------------------------------------------------- /src/model/openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import openai 4 | from aiolimiter import AsyncLimiter 5 | from tqdm.asyncio import tqdm_asyncio 6 | from tqdm.auto import tqdm 7 | 8 | 9 | class OpenAILLM: 10 | def __init__( 11 | self, 12 | name, 13 | ): 14 | self.name = name 15 | self.batch_size = 100 16 | self.requests_per_minute = 100 17 | self.limiter = AsyncLimiter(self.requests_per_minute, 60) 18 | self.client = openai.AsyncOpenAI( 19 | api_key=os.environ.get("OPENAI_API_KEY"), 20 | ) 21 | 22 | async def get_completion_text_async(self, messages, **kwargs): 23 | async with self.limiter: 24 | try: 25 | # Assuming you have a session and client setup for OpenAI 26 | completion = await self.client.chat.completions.create( 27 | model=self.name, messages=messages, **kwargs 28 | ) 29 | content = completion.choices[0].message.content.strip() 30 | return content 31 | except openai.APIConnectionError as e: 32 | print("APIConnectionError: The server could not be reached") 33 | print( 34 | e.__cause__ 35 | ) # an underlying Exception, likely raised within httpx. 36 | except openai.RateLimitError as e: 37 | print( 38 | "RateLimitError: A 429 status code was received; we should back off a bit." 
39 | ) 40 | except openai.APIStatusError as e: 41 | print("APIStatusError: Another non-200-range status code was received") 42 | print(e.status_code) 43 | print(e.response) 44 | except Exception as e: 45 | print(f"Error during OpenAI API call: {e}") 46 | return "" # , {} 47 | 48 | async def completions( 49 | self, 50 | messages, 51 | **kwargs, 52 | ): 53 | assert isinstance(messages, list) 54 | assert list(messages[0][0].keys()) == ["role", "content"] 55 | 56 | result_responses = [] 57 | 58 | for start_idx in tqdm( 59 | range(0, len(messages), self.batch_size), desc="Processing batches" 60 | ): 61 | end_idx = start_idx + self.batch_size 62 | batch_prompts = messages[start_idx:end_idx] 63 | batch_responses = await tqdm_asyncio.gather( 64 | *[ 65 | self.get_completion_text_async(prompt, **kwargs) 66 | for prompt in batch_prompts 67 | ] 68 | ) 69 | result_responses.extend(batch_responses) 70 | 71 | return result_responses 72 | 73 | 74 | class OpenAIBatchClient: 75 | def __init__(self): 76 | self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 77 | 78 | def create_batch(self, input_file, description=None): 79 | batch_input_file = self.client.files.create( 80 | file=open(input_file, "rb"), purpose="batch" 81 | ) 82 | 83 | batch_input_file_id = batch_input_file.id 84 | 85 | batch = self.client.batches.create( 86 | input_file_id=batch_input_file_id, 87 | endpoint="/v1/chat/completions", 88 | completion_window="24h", 89 | metadata={"description": description}, 90 | ) 91 | return batch 92 | 93 | def cancel_batch(self, batch_id): 94 | self.client.batches.cancel(batch_id) 95 | 96 | def check_batch(self, batch_id): 97 | batch = self.client.batches.retrieve(batch_id) 98 | status = batch.status 99 | batch_output_file_id = batch.output_file_id 100 | return status, batch_output_file_id 101 | 102 | def list_batches(self): 103 | batches = self.client.batches.list() 104 | batches = sorted(batches, key=lambda x: x.created_at) 105 | for batch in batches: 106 | desc = batch.metadata.get("description", "") if batch.metadata else "" 107 | batch_id = batch.id 108 | status = batch.status 109 | if "cancel" in status: 110 | continue 111 | output_file_id = batch.output_file_id 112 | request_counts = batch.request_counts 113 | print("-" * 20) 114 | print(desc) 115 | print(f"\tBatch ID: {batch_id}") 116 | print(f"\tStatus: {status}") 117 | print(f"\tOutput file ID: {output_file_id}") 118 | print(f"\t{request_counts}") 119 | 120 | def retrieve_batch(self, batch_output_file_id): 121 | content = self.client.files.content(batch_output_file_id) 122 | return content 123 | 124 | 125 | if __name__ == "__main__": 126 | print("Hello, World!") 127 | 128 | model = OpenAILLM("gpt-3.5-turbo") 129 | 130 | responses = model.completions( 131 | model="gpt-3.5-turbo", 132 | messages=[ 133 | [{"role": "user", "content": "good morning? "}], 134 | [{"role": "user", "content": "what's the time? 
"}], 135 | ], 136 | ) 137 | import pdb 138 | 139 | pdb.set_trace() 140 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jsonl 2 | env_var.sh 3 | Makefile 4 | notebooks 5 | *.ipynb 6 | *.json 7 | human_eval 8 | 9 | # Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python 10 | # Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python 11 | 12 | ### Linux ### 13 | *~ 14 | 15 | # temporary files which can be created if a process still has a handle open of a deleted file 16 | .fuse_hidden* 17 | 18 | # KDE directory preferences 19 | .directory 20 | 21 | # Linux trash folder which might appear on any partition or disk 22 | .Trash-* 23 | 24 | # .nfs files are created when an open file is removed but is still being accessed 25 | .nfs* 26 | 27 | ### macOS ### 28 | # General 29 | .DS_Store 30 | .AppleDouble 31 | .LSOverride 32 | 33 | # Icon must end with two \r 34 | Icon 35 | 36 | 37 | # Thumbnails 38 | ._* 39 | 40 | # Files that might appear in the root of a volume 41 | .DocumentRevisions-V100 42 | .fseventsd 43 | .Spotlight-V100 44 | .TemporaryItems 45 | .Trashes 46 | .VolumeIcon.icns 47 | .com.apple.timemachine.donotpresent 48 | 49 | # Directories potentially created on remote AFP share 50 | .AppleDB 51 | .AppleDesktop 52 | Network Trash Folder 53 | Temporary Items 54 | .apdisk 55 | 56 | ### macOS Patch ### 57 | # iCloud generated files 58 | *.icloud 59 | 60 | ### Python ### 61 | # Byte-compiled / optimized / DLL files 62 | __pycache__/ 63 | *.py[cod] 64 | *$py.class 65 | 66 | # C extensions 67 | *.so 68 | 69 | # Distribution / packaging 70 | .Python 71 | build/ 72 | develop-eggs/ 73 | dist/ 74 | downloads/ 75 | eggs/ 76 | .eggs/ 77 | lib/ 78 | lib64/ 79 | parts/ 80 | sdist/ 81 | var/ 82 | wheels/ 83 | share/python-wheels/ 84 | *.egg-info/ 85 | .installed.cfg 86 | *.egg 87 | MANIFEST 88 | 89 | # PyInstaller 90 | # Usually these files are written by a python script from a template 91 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 92 | *.manifest 93 | *.spec 94 | 95 | # Installer logs 96 | pip-log.txt 97 | pip-delete-this-directory.txt 98 | 99 | # Unit test / coverage reports 100 | htmlcov/ 101 | .tox/ 102 | .nox/ 103 | .coverage 104 | .coverage.* 105 | .cache 106 | nosetests.xml 107 | coverage.xml 108 | *.cover 109 | *.py,cover 110 | .hypothesis/ 111 | .pytest_cache/ 112 | cover/ 113 | 114 | # Translations 115 | *.mo 116 | *.pot 117 | 118 | # Django stuff: 119 | *.log 120 | local_settings.py 121 | db.sqlite3 122 | db.sqlite3-journal 123 | 124 | # Flask stuff: 125 | instance/ 126 | .webassets-cache 127 | 128 | # Scrapy stuff: 129 | .scrapy 130 | 131 | # Sphinx documentation 132 | docs/_build/ 133 | 134 | # PyBuilder 135 | .pybuilder/ 136 | target/ 137 | 138 | # Jupyter Notebook 139 | .ipynb_checkpoints 140 | 141 | # IPython 142 | profile_default/ 143 | ipython_config.py 144 | 145 | # pyenv 146 | # For a library or package, you might want to ignore these files since the code is 147 | # intended to run in multiple environments; otherwise, check them in: 148 | # .python-version 149 | 150 | # pipenv 151 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
152 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 153 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 154 | # install all needed dependencies. 155 | #Pipfile.lock 156 | 157 | # poetry 158 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 159 | # This is especially recommended for binary packages to ensure reproducibility, and is more 160 | # commonly ignored for libraries. 161 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 162 | #poetry.lock 163 | 164 | # pdm 165 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 166 | #pdm.lock 167 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 168 | # in version control. 169 | # https://pdm.fming.dev/#use-with-ide 170 | .pdm.toml 171 | 172 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 173 | __pypackages__/ 174 | 175 | # Celery stuff 176 | celerybeat-schedule 177 | celerybeat.pid 178 | 179 | # SageMath parsed files 180 | *.sage.py 181 | 182 | # Environments 183 | .env 184 | .venv 185 | env/ 186 | venv/ 187 | ENV/ 188 | env.bak/ 189 | venv.bak/ 190 | 191 | # Spyder project settings 192 | .spyderproject 193 | .spyproject 194 | 195 | # Rope project settings 196 | .ropeproject 197 | 198 | # mkdocs documentation 199 | /site 200 | 201 | # mypy 202 | .mypy_cache/ 203 | .dmypy.json 204 | dmypy.json 205 | 206 | # Pyre type checker 207 | .pyre/ 208 | 209 | # pytype static type analyzer 210 | .pytype/ 211 | 212 | # Cython debug symbols 213 | cython_debug/ 214 | 215 | # PyCharm 216 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 217 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 218 | # and can be added to the global gitignore or merged into this file. For a more nuclear 219 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
220 | #.idea/ 221 | 222 | ### Python Patch ### 223 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 224 | poetry.toml 225 | 226 | # ruff 227 | .ruff_cache/ 228 | 229 | # LSP config files 230 | pyrightconfig.json 231 | 232 | # End of https://www.toptal.com/developers/gitignore/api/linux,macos,python -------------------------------------------------------------------------------- /src/data/dataloader.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Dict, Set 3 | 4 | from datasets import load_dataset 5 | 6 | 7 | class HFDepthQALoader: 8 | def __init__(self, hf_repo: str = "kaist-ai/DepthQA", split: str = "test"): 9 | self.hf_repo = hf_repo 10 | self.split = split 11 | self.questions: Dict[str, Dict] = {} # qid -> question dict 12 | self.nodes: Dict[str, Dict] = {} # nodeid -> node dict 13 | self.node_to_q: Dict[str, str] = {} # nodeid -> qid 14 | self.q_to_node: Dict[str, Set[str]] = defaultdict(set) # qid -> set of nodeids 15 | 16 | def load_data( 17 | self, except_questions: bool = False, remove_unused_columns: bool = True 18 | ): 19 | print(f"Loading data from {self.hf_repo}...") 20 | if not except_questions: 21 | # Load questions 22 | questions_dataset = load_dataset( 23 | self.hf_repo, "questions", split=self.split 24 | ) 25 | if remove_unused_columns: 26 | questions_dataset = questions_dataset.remove_columns( 27 | ["domain", "tutoreval_data", "augmented"] 28 | ) 29 | self.questions = {item["qid"]: item for item in questions_dataset} 30 | 31 | # Load nodes 32 | nodes_dataset = load_dataset(self.hf_repo, "nodes", split=self.split) 33 | self.nodes = {item["nodeid"]: item for item in nodes_dataset} 34 | 35 | # Load node_to_q mappings 36 | node_to_q_dataset = load_dataset(self.hf_repo, "node_to_q", split=self.split) 37 | for item in node_to_q_dataset: 38 | self.node_to_q[item["nodeid"]] = item["qid"] 39 | self.q_to_node[item["qid"]].add(item["nodeid"]) # 1-to-n mapping 40 | 41 | return self.questions, self.nodes, self.node_to_q 42 | 43 | def check_integrity(self) -> None: 44 | print("Checking graph integrity...") 45 | 46 | errors = [] 47 | 48 | def add_error(message): 49 | errors.append(message) 50 | 51 | # Check questions 52 | for qid in self.questions.keys(): 53 | # Check q_to_node mapping 54 | if not self.q_to_node.get(qid): 55 | add_error(f"Question {qid} not found in q_to_node") 56 | 57 | # Check nodes 58 | for nodeid in self.nodes.keys(): 59 | node = self.nodes[nodeid] 60 | depth = node["depth"] 61 | 62 | # Check group consistency 63 | group_nodeid = nodeid.split("_")[0] 64 | if node["group"] != group_nodeid: 65 | add_error(f"Inconsistent group in node {nodeid}") 66 | 67 | # Check direct_predecessors 68 | for predec_id in node["direct_predecessors"]: 69 | predec_node = self.nodes[predec_id] 70 | if not predec_node: 71 | add_error(f"Predecessor node {predec_id} of {nodeid} not found") 72 | else: 73 | predec_depth_nodeid = int(predec_id.split("_")[1][1:]) 74 | predec_depth = predec_node["depth"] 75 | if predec_depth_nodeid != predec_depth: 76 | add_error( 77 | f"Inconsistent depth in predecessor {predec_id}: {predec_depth_nodeid} in nodeid while {predec_depth} in depth field" 78 | ) 79 | if predec_depth != depth - 1: 80 | add_error( 81 | f"Predecessor {predec_id} of {nodeid} has incorrect depth" 82 | ) 83 | if nodeid not in predec_node["direct_successors"]: 84 | add_error( 85 | f"Node {nodeid} not in direct_successors of its predecessor 
{predec_id}" 86 | ) 87 | 88 | # Check direct_successors 89 | for succ_id in node["direct_successors"]: 90 | succ_node = self.nodes[succ_id] 91 | if not succ_node: 92 | add_error(f"Successor node {succ_id} of {nodeid} not found") 93 | else: 94 | succ_depth_nodeid = int(succ_id.split("_")[1][1:]) 95 | succ_depth = succ_node["depth"] 96 | if succ_depth_nodeid != succ_depth: 97 | add_error( 98 | f"Inconsistent depth in successor {succ_id}: {succ_depth_nodeid} in nodeid while {succ_depth} in depth field" 99 | ) 100 | if succ_depth != depth + 1: 101 | add_error( 102 | f"Successor {succ_id} of {nodeid} has incorrect depth" 103 | ) 104 | if nodeid not in succ_node["direct_predecessors"]: 105 | add_error( 106 | f"Node {nodeid} not in direct_predecessors of its successor {succ_id}" 107 | ) 108 | 109 | # Check node_to_q mapping 110 | if not self.node_to_q.get(nodeid): 111 | add_error(f"Node {nodeid} not found in node_to_q") 112 | 113 | # Check consistency between node_to_q, nodes, and questions 114 | for nodeid in self.node_to_q.keys(): 115 | qid = self.node_to_q.get(nodeid) 116 | if not self.questions.get(qid): 117 | add_error(f"qid {qid} in node_to_q not found in questions") 118 | if not self.nodes.get(nodeid): 119 | add_error(f"nodeid {nodeid} in node_to_q not found in nodes") 120 | if nodeid not in self.q_to_node.get(qid): 121 | add_error( 122 | f"Inconsistency: node_to_q[{nodeid}] = {qid}, but q_to_node[{qid}] = {self.q_to_node.get(qid)}" 123 | ) 124 | 125 | if errors: 126 | raise ValueError("Graph integrity check failed:\n" + "\n".join(errors)) 127 | else: 128 | print("Graph integrity check passed successfully.") 129 | 130 | 131 | if __name__ == "__main__": 132 | loader = HFDepthQALoader() 133 | loader.load_data() 134 | loader.check_integrity() 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical Deconstruction of LLM Reasoning: A Graph-Based Framework for Analyzing Knowledge Utilization 2 | 3 | This repository is the official implementation of [Hierarchical Deconstruction of LLM Reasoning: A Graph-Based Framework for Analyzing Knowledge Utilization](https://aclanthology.org/2024.emnlp-main.288/). 4 | 5 | - [📃 Paper](https://aclanthology.org/2024.emnlp-main.288/) 6 | - [🧵 Brief overview of paper (X thread)](https://x.com/miyoung_ko/status/1807753375196270716) 7 | - [🤗 Dataset (DepthQA)](https://huggingface.co/datasets/kaist-ai/DepthQA) 8 | 9 | ## TL;DR 10 | We investigate how large language models utilize knowledge for reasoning to solve complex questions, based on a method that deconstructs complex questions into a hierarchical graph. 11 | 12 | Each depth of knowledge required to answer the question represents different levels of complexity. | Some reasoning is required to answer a more complex question compared to a simpler question. 13 | ---- | ---- 14 | ![teaser](assets/teaser.png) | ![discrepancies](assets/discrepancies.png) 15 | 16 | 17 | ## Requirements 18 | 19 | Create a virtual environment with python>=3.9 and install the appropriate PyTorch version for your machine. 20 | 21 | In our project, we use a node of 4 x NVIDIA A6000 40GB GPUs with CUDA version 12.3. 
22 | 
 23 | ```bash
 24 | conda create -n myenv python=3.10
 25 | conda activate myenv
 26 | conda install pytorch pytorch-cuda=12.1 -c pytorch -c nvidia
 27 | ```
 28 | 
 29 | To install requirements:
 30 | 
 31 | ```setup
 32 | pip install -r requirements.txt
 33 | ```
 34 | 
 35 | ## Inference
 36 | 
 37 | You can experiment with multiple inference modes on our dataset, [DepthQA](https://huggingface.co/datasets/kaist-ai/DepthQA):
 38 | 
 39 | - Single-turn:
 40 |     - `zero-shot`: Only the target question is in the input.
 41 |     - `prompt-gold`: Before the target question, pairs of shallower questions (i.e., predecessors of the target question) and their gold answers are provided as context.
 42 |     - `prompt-pred`: Before the target question, pairs of shallower questions and the model's own predicted answers are provided as context.
 43 | - `multi-turn`: Shallower questions are provided as inputs in a multi-turn conversation, i.e., the model answers each shallower question one by one and is then presented with the target question.
 44 | 
 45 | Most Hugging Face `AutoModelForCausalLM` models can be run with [src/inference/single_turn.py](src/inference/single_turn.py) and [src/inference/multi_turn.py](src/inference/multi_turn.py), which integrate vLLM and use mixed precision.
 46 | 
 47 | For OpenAI models, use [src/inference/single_turn_openai.py](src/inference/single_turn_openai.py) and [src/inference/multi_turn_openai.py](src/inference/multi_turn_openai.py).
 48 | 
 49 | ### Example usage
 50 | 
 51 | To run inference with LLaMA 3 8B Instruct in all modes:
 52 | ```bash
 53 | bash scripts/inference/llama3_8b.sh
 54 | ```
 55 | 
 56 | To run inference with GPT-3.5 Turbo in all modes:
 57 | ```bash
 58 | bash scripts/inference/gpt-3.5-turbo.sh
 59 | ```
 60 | 
 61 | ## Evaluation
 62 | 
 63 | Following the LLM-as-a-Judge approach, we use `gpt-4-0125-preview` to score the correctness of model predictions. Specifically, we use the [Batch API](https://platform.openai.com/docs/guides/batch) for faster and cheaper evaluation. Our evaluation pipeline consists of four steps:
 64 | 
 65 | 1. Create a batch request
 66 | 2. Check the status of the batch request
 67 | 3. Retrieve the results of the batch request
 68 | 4. Calculate evaluation metrics
 69 |     - Average accuracy
 70 |     - Forward discrepancy
 71 |     - Backward discrepancy
 72 | 
 73 | where the first three steps are handled in [src/evaluation/batch_eval_openai.py](src/evaluation/batch_eval_openai.py) and the last step in [src/evaluation/metric_calculator.py](src/evaluation/metric_calculator.py).
 74 | 
 75 | ### Example usage
 76 | 
 77 | To analyze each step in the evaluation pipeline for LLaMA 3 8B Instruct `zero-shot` predictions, refer to the example commands and printed outputs in [scripts/evaluation/llama3_8b_zero-shot.sh](scripts/evaluation/llama3_8b_zero-shot.sh). 
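Each judgment returned by the judge model follows the output format required by the evaluation prompt (`Feedback: ... [RESULT] <integer 1-5>`), and the score is extracted with the parser in [src/evaluation/output_parser.py](src/evaluation/output_parser.py). A minimal sketch of that parsing step, using a made-up judgment string rather than real judge output:

```python
from src.evaluation.output_parser import parse_judgment

# Hypothetical judge response in the "Feedback: ... [RESULT] N" format
judgment = "Feedback: The response is mostly accurate but misses one nuance. [RESULT] 4"
feedback, score = parse_judgment(judgment)
print(score)  # 4
```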
78 | 79 | To run the entire pipeline of LLaMA 3 8B Instruct `prompt-gold` predictions automatically: 80 | ```bash 81 | bash scripts/evaluation/llama3_8b_prompt-gold_auto.sh 82 | ``` 83 | 84 | 85 | ## Citation 86 | ```bibtex 87 | @inproceedings{ko-etal-2024-hierarchical, 88 | title = "Hierarchical Deconstruction of {LLM} Reasoning: A Graph-Based Framework for Analyzing Knowledge Utilization", 89 | author = "Ko, Miyoung and 90 | Park, Sue Hyun and 91 | Park, Joonsuk and 92 | Seo, Minjoon", 93 | editor = "Al-Onaizan, Yaser and 94 | Bansal, Mohit and 95 | Chen, Yun-Nung", 96 | booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing", 97 | month = nov, 98 | year = "2024", 99 | address = "Miami, Florida, USA", 100 | publisher = "Association for Computational Linguistics", 101 | url = "https://aclanthology.org/2024.emnlp-main.288", 102 | doi = "10.18653/v1/2024.emnlp-main.288", 103 | pages = "4995--5027", 104 | abstract = "Despite the advances in large language models (LLMs), how they use their knowledge for reasoning is not yet well understood.In this study, we propose a method that deconstructs complex real-world questions into a graph, representing each question as a node with predecessors of background knowledge needed to solve the question. We develop the DepthQA dataset, deconstructing questions into three depths: (i) recalling conceptual knowledge, (ii) applying procedural knowledge, and (iii) analyzing strategic knowledge. Based on a hierarchical graph, we quantify forward discrepancy, a discrepancy in LLM performance on simpler sub-problems versus complex questions. We also measure backward discrepancy where LLMs answer complex questions but struggle with simpler ones. Our analysis shows that smaller models exhibit more discrepancies than larger models. Distinct patterns of discrepancies are observed across model capacity and possibility of training data memorization. Additionally, guiding models from simpler to complex questions through multi-turn interactions improves performance across model sizes, highlighting the importance of structured intermediate steps in knowledge reasoning. This work enhances our understanding of LLM reasoning and suggests ways to improve their problem-solving abilities.", 105 | } 106 | 107 | ``` 108 | -------------------------------------------------------------------------------- /src/evaluation/metric_calculator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from collections import defaultdict 5 | from argparse import ArgumentParser 6 | 7 | from src.data import HFDepthQALoader 8 | 9 | def compute_score(score_value): 10 | if isinstance(score_value, list): 11 | return sum(score_value) / len(score_value) 12 | elif isinstance(score_value, int): 13 | return int(score_value) 14 | else: 15 | return 0. 
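# Examples (illustrative): compute_score([5, 3]) -> 4.0 (mean over multiple judge scores),
# compute_score(4) -> 4 (single integer score), and any other value (e.g., a failed parse) -> 0.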
16 | 17 | def depth_score(results): 18 | scores = { 19 | "depth_1": [], 20 | "depth_2": [], 21 | "depth_3": [] 22 | } 23 | 24 | for inst in results.values(): 25 | score = inst["score"] 26 | scores[f'depth_{inst["depth"]}'].append( 27 | compute_score(score)) 28 | 29 | output = {} 30 | for depth, score in scores.items(): 31 | if len(score) == 0: 32 | output[f"Average Accuracy - Depth {depth}"] = 0 33 | else: 34 | output[f"Average Accuracy - Depth {depth}"] = np.mean(score) 35 | 36 | return output 37 | 38 | def forward_disc(nodes, node_results): 39 | depths = [(2,3), (1,2)] 40 | output = {} 41 | 42 | overall_gaps = 0 43 | overall_cnt = 0 44 | for (depth_a, depth_b) in depths: 45 | gaps = [] 46 | cnt = 0 47 | aggregate = defaultdict(list) 48 | score_dict = defaultdict(float) 49 | for nodeid, data in node_results.items(): 50 | if data["depth"] == depth_a: 51 | depth_a_score = compute_score(data["score"]) 52 | successors = nodes[nodeid]["direct_successors"] 53 | if len(successors) > 0: 54 | for depth_b_id in successors: 55 | depth_b_d = node_results[depth_b_id] 56 | depth_b_score = compute_score(depth_b_d["score"]) 57 | score_dict[depth_b_id] = depth_b_score 58 | aggregate[depth_b_id].append(depth_a_score) 59 | cnt +=1 60 | 61 | gaps = [] 62 | cnt = 0 63 | for bid in score_dict.keys(): 64 | cnt += (np.average(aggregate[bid]) >= 4) 65 | gap = (np.average(aggregate[bid]) - score_dict[bid])/4 66 | gaps.append(max(0, gap) * (np.average(aggregate[bid]) >= 4)) 67 | overall_gaps += sum(gaps) 68 | overall_cnt += cnt 69 | output[f"Forward Discrepancy - Depth {depth_a} <=> Depth {depth_b}"] = sum(gaps) / cnt 70 | 71 | output["Forward Discrepancy - Overall"] = overall_gaps / overall_cnt 72 | 73 | return output 74 | 75 | def backward_disc(nodes, node_results): 76 | depths = [(2,3), (1,2)] 77 | output = {} 78 | 79 | overall_gaps = 0 80 | overall_cnt = 0 81 | for (depth_a, depth_b) in depths: 82 | gaps = [] 83 | cnt = 0 84 | aggregate = defaultdict(list) 85 | score_dict = defaultdict(float) 86 | id_map = defaultdict(list) 87 | for nodeid, data in node_results.items(): 88 | if data["depth"] == depth_b: 89 | depth_b_score = compute_score(data["score"]) 90 | predecessors = nodes[nodeid]["direct_predecessors"] 91 | if len(predecessors) > 0: 92 | for depth_a_id in predecessors: 93 | depth_a_d = node_results[depth_a_id] 94 | depth_a_score = compute_score(depth_a_d["score"]) 95 | score_dict[depth_a_id] = depth_a_score 96 | aggregate[depth_a_id].append(depth_b_score) 97 | id_map[depth_a_id].append(id) 98 | cnt +=1 99 | 100 | gaps = [] 101 | cnt = 0 102 | 103 | for aid in score_dict.keys(): 104 | cnt += (aggregate[aid][0] >= 4) 105 | gap = (aggregate[aid][0] - score_dict[aid]) / 4 106 | gaps.append(max(0, gap) * (aggregate[aid][0] >= 4)) 107 | overall_gaps += sum(gaps) 108 | overall_cnt += cnt 109 | output[f"Backward Discrepancy - Depth {depth_a} <=> Depth {depth_b}"] = sum(gaps) / cnt 110 | 111 | output["Backward Discrepancy - Overall"] = overall_gaps / overall_cnt 112 | return output 113 | 114 | 115 | 116 | def main(args): 117 | 118 | # Load src data 119 | dataloader = HFDepthQALoader() 120 | questions, nodes, node_to_q = dataloader.load_data() 121 | 122 | q_to_node = defaultdict(list) 123 | for nodeid, qid in node_to_q.items(): 124 | q_to_node[qid].append(nodeid) 125 | 126 | # Load evaluation file 127 | with open(args.input, "r") as fr: 128 | results = json.load(fr) 129 | 130 | metrics = {} 131 | depth_metric = depth_score(results) 132 | metrics.update(depth_metric) 133 | 134 | node_results = {} 135 | if 
list(results.keys())[0] in q_to_node: 136 | # Convert Q to Node 137 | for qid, nids in q_to_node.items(): 138 | for nodeid in nids: 139 | node_results[nodeid] = results[qid] 140 | else: 141 | node_results = results 142 | 143 | fwd_metric = forward_disc(nodes, node_results) 144 | metrics.update(fwd_metric) 145 | bwd_metric = backward_disc(nodes, node_results) 146 | metrics.update(bwd_metric) 147 | # print(metrics) 148 | 149 | with open(args.output_file, "w") as fw: 150 | json.dump(metrics, fw) 151 | 152 | 153 | if __name__ == "__main__": 154 | parser = ArgumentParser() 155 | # I/O arguments 156 | parser.add_argument( 157 | "--src_input", 158 | type=str, 159 | default="kaist-ai/DepthQA", 160 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 161 | ) 162 | parser.add_argument( 163 | "--input", 164 | type=str, 165 | required=True, 166 | help="Input JSON file name to calculate metric." 167 | ) 168 | parser.add_argument( 169 | "--output_file", 170 | type=str, 171 | required=True, 172 | help="Output JSON file name. Unless the parent directory is specified, will be saved under outputs/inference by default.", 173 | ) 174 | parser.add_argument( 175 | "--force_rerun", 176 | action="store_true", 177 | help="Force rerun even if output file exists.", 178 | ) 179 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 180 | args = parser.parse_args() 181 | assert os.path.exists(args.input), "Cannot find input file." 182 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 183 | main(args) 184 | -------------------------------------------------------------------------------- /src/inference/multi_turn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import warnings 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | from collections import defaultdict 7 | from tqdm import tqdm 8 | 9 | from src.data import HFDepthQALoader, filter_data_dict, slice_data_dict 10 | from src.model import VLLM 11 | from src.utils import ( 12 | SAMPLING_PARAMS, 13 | SYSTEM_PROMPT_ZERO_SHOT, 14 | USER_PROMPT_TEMPLATE_ZERO_SHOT, 15 | USER_PROMPT_TEMPLATE_LAST_TURN, 16 | get_output_path, 17 | show_random_inputs, 18 | ) 19 | 20 | DEBUG = False 21 | 22 | 23 | 24 | def multiturn_completions( 25 | model, 26 | inputs: List[str], 27 | tokenizer, 28 | total_len 29 | ): 30 | system_prompt = SYSTEM_PROMPT_ZERO_SHOT 31 | chat_dict = defaultdict(str) 32 | 33 | # Adjust batch size to fit the number of inputs 34 | # VLLM supports adaptive batch size already 35 | total_len = len(inputs) 36 | 37 | # Process initial batches with progress bar 38 | print("Processing initial chat...") 39 | for i in tqdm( 40 | range(0, len(inputs)), total=total_len, desc="Initial Chat" 41 | ): 42 | chat_input = inputs[i] 43 | 44 | for j, inst in enumerate(chat_input): 45 | if j == 0: 46 | input_ = apply_template_chat(system_prompt, inst["input"], tokenizer) 47 | else: 48 | input_.append({"role": "user", "content": inst["input"]}) 49 | 50 | if inst["id"] in chat_dict: 51 | input_.append({"role": "assistant", "content": chat_dict[inst["id"]]}) 52 | continue 53 | 54 | tokenized_input = tokenizer.apply_chat_template( 55 | input_, tokenize=False, add_generation_prompt=True) 56 | 57 | output = model.completions([tokenized_input], **SAMPLING_PARAMS) 58 | chat_dict[inst["id"]] = output[0] 59 | input_.append({"role": "assistant", "content": output[0]}) 60 | 61 | # Final aggregation and 
printing 62 | outputs_len = len(chat_dict) 63 | print(f"Processed {outputs_len} / {total_len} instances.") 64 | 65 | if outputs_len < total_len: 66 | warnings.warn("Some instances failed.") 67 | warnings.warn("They will be written as None in the output file.") 68 | raise Exception( 69 | f"Failed to generate feedback for {total_len - outputs_len} instances." 70 | ) 71 | 72 | return chat_dict 73 | 74 | 75 | def apply_template_chat(system_prompt, user_prompt, tokenizer): 76 | if tokenizer.chat_template and "system" not in tokenizer.chat_template: 77 | messages = [ 78 | {"role": "user", "content": system_prompt + "\n" + user_prompt}, 79 | ] 80 | else: 81 | messages = [ 82 | {"role": "system", "content": system_prompt}, 83 | {"role": "user", "content": user_prompt}, 84 | ] 85 | return messages 86 | 87 | 88 | def prepare_inputs_multiturn( 89 | questions: Dict[str, Dict], 90 | nodes: Dict[str, Dict], 91 | node_to_q: Dict[str, str] 92 | ) -> List[str]: 93 | 94 | chat_inputs = [] 95 | 96 | for nodeid, node in nodes.items(): 97 | chat = [] 98 | target_question_d = questions[node_to_q[nodeid]] 99 | target_question = target_question_d["question"] 100 | 101 | for predec_nodeid in node["direct_predecessors"]: 102 | predec_question_d = questions[node_to_q[predec_nodeid]] 103 | predec_question = predec_question_d["question"] 104 | user_prompt = USER_PROMPT_TEMPLATE_ZERO_SHOT.format(question=predec_question) 105 | chat.append({"id":predec_nodeid, "input": user_prompt}) 106 | last_user_prompt = USER_PROMPT_TEMPLATE_LAST_TURN.format(question=target_question) 107 | chat.append({"id": nodeid, "input": last_user_prompt}) 108 | 109 | chat_inputs.append(chat) 110 | 111 | return chat_inputs 112 | 113 | 114 | def main(args): 115 | global DEBUG 116 | DEBUG = args.debug 117 | 118 | # Load data 119 | dataloader = HFDepthQALoader() 120 | questions, nodes, node_to_q = dataloader.load_data() 121 | print(f"Loaded {len(questions)} questions and {len(nodes)} nodes.") 122 | 123 | # Load model 124 | model = VLLM(args.model_name, num_gpus=args.num_gpus) 125 | tokenizer = model.get_tokenizer() 126 | 127 | nodes = filter_data_dict(nodes, lambda node: node["depth"] > 1) 128 | 129 | if DEBUG: 130 | nodes = slice_data_dict(nodes, start=0, end=5) 131 | 132 | inputs = prepare_inputs_multiturn( 133 | questions, 134 | nodes, 135 | node_to_q 136 | ) 137 | 138 | if DEBUG: 139 | inputs = inputs[:5] 140 | 141 | # Inference 142 | predictions = multiturn_completions(model, inputs, tokenizer, len(node_to_q)) 143 | 144 | # Save results 145 | results = {} 146 | for idx, nodeid in enumerate(predictions.keys()): 147 | results[nodeid] = questions[node_to_q[nodeid]] 148 | results[nodeid].update({"predicted_answer": predictions[nodeid]}) 149 | 150 | output_path = get_output_path( 151 | output_file=args.output_file, default_output_dir="../../outputs/inference" 152 | ) 153 | 154 | with open(output_path, "w") as f: 155 | json.dump(results, f, indent=4) 156 | 157 | 158 | if __name__ == "__main__": 159 | parser = ArgumentParser() 160 | # I/O arguments 161 | parser.add_argument( 162 | "--model_name", 163 | type=str, 164 | required=True, 165 | help="Name of model hosted in Hugging Face under AutoModelForCausalLM", 166 | ) 167 | parser.add_argument( 168 | "--input", 169 | type=str, 170 | default="kaist-ai/DepthQA", 171 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 172 | ) 173 | parser.add_argument( 174 | "--output_file", 175 | type=str, 176 | required=True, 177 | help="Output JSON file name. 
Unless the parent directory is specified, will be saved under outputs/inference by default.", 178 | ) 179 | parser.add_argument( 180 | "--force_rerun", 181 | action="store_true", 182 | help="Force rerun even if output file exists.", 183 | ) 184 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 185 | # Compute arguments 186 | parser.add_argument( 187 | "--num_gpus", 188 | type=int, 189 | default=2, 190 | help="Number of GPUs to use for inference. Note that we use bfloat16 if available and float16 otherwise.", 191 | ) 192 | args = parser.parse_args() 193 | 194 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 195 | 196 | output_path = get_output_path( 197 | output_file=args.output_file, default_output_dir="../../outputs/inference" 198 | ) 199 | assert not ( 200 | os.path.exists(output_path) and not args.force_rerun 201 | ), f"Output file {output_path} already exists. Skipping inference." 202 | 203 | main(args) 204 | -------------------------------------------------------------------------------- /src/inference/multi_turn_openai.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import warnings 5 | from argparse import ArgumentParser 6 | from typing import Dict, List 7 | from collections import defaultdict 8 | from tqdm import tqdm 9 | 10 | from src.data import HFDepthQALoader, filter_data_dict, slice_data_dict 11 | from src.model import OpenAILLM 12 | from src.utils import ( 13 | SAMPLING_PARAMS_OPENAI, 14 | SYSTEM_PROMPT_ZERO_SHOT, 15 | USER_PROMPT_TEMPLATE_ZERO_SHOT, 16 | USER_PROMPT_TEMPLATE_LAST_TURN, 17 | get_output_path 18 | ) 19 | 20 | DEBUG = False 21 | 22 | 23 | async def multiturn_completions( 24 | model, 25 | inputs, 26 | total_len, 27 | max_retries=5 28 | ): 29 | system_prompt = SYSTEM_PROMPT_ZERO_SHOT 30 | chat_dict = defaultdict(str) 31 | 32 | # Adjust batch size to fit the number of inputs 33 | # VLLM supports adaptive batch size already 34 | total_len = len(inputs) 35 | batched_outputs = [] 36 | 37 | # Process initial batches with progress bar 38 | print("Processing initial chat...") 39 | for i in tqdm( 40 | range(0, len(inputs)), total=total_len, desc="Initial Chat" 41 | ): 42 | chat_input = inputs[i] 43 | 44 | for j, inst in enumerate(chat_input): 45 | if j == 0: 46 | input_ = apply_template_chat(system_prompt, inst["input"]) 47 | else: 48 | input_.append({"role": "user", "content": inst["input"]}) 49 | 50 | if inst["id"] in chat_dict: 51 | input_.append({"role": "assistant", "content": chat_dict[inst["id"]]}) 52 | continue 53 | 54 | 55 | output = await model.completions([input_], **SAMPLING_PARAMS_OPENAI) 56 | chat_dict[inst["id"]] = output[0] 57 | input_.append({"role": "assistant", "content": output[0]}) 58 | 59 | batched_outputs.append(input_) 60 | 61 | # Identify failed instances and prepare for retries 62 | 63 | to_retry_inputs = [] 64 | to_retry_indices = [] 65 | 66 | for i, output in enumerate(batched_outputs): 67 | for chat in output: 68 | if "assistant" in chat: 69 | if chat["content"] is None: # Parsing failed 70 | to_retry_inputs.append(inputs[i]) 71 | to_retry_indices.append(i) 72 | 73 | # Retry logic with progress bar 74 | retries = 0 75 | while to_retry_inputs and retries < max_retries: 76 | retries += 1 77 | print(f"Retrying failed batches: Attempt {retries}/{max_retries}") 78 | retry_outputs = [] 79 | for i in tqdm( 80 | range(0, len(to_retry_inputs)), desc=f"Retry Attempt {retries}" 81 | ): 82 | chat_input = inputs[i] 
83 | for j, inst in enumerate(chat_input): 84 | if j == 0: 85 | input_ = apply_template_chat(system_prompt, inst["input"]) 86 | else: 87 | input_.append({"role": "user", "content": inst["input"]}) 88 | 89 | if inst["id"] in chat_dict and chat_dict[inst["id"]] is not None: 90 | input_.append({"role": "assistant", "content": chat_dict[inst["id"]]}) 91 | continue 92 | output = await model.completions([input_], **SAMPLING_PARAMS_OPENAI) 93 | chat_dict[inst["id"]] = output[0] 94 | input_.append({"role": "assistant", "content": output[0]}) 95 | 96 | retry_outputs.append(input_) # Collect retried chats for the bookkeeping below 97 | 98 | new_to_retry_inputs = [] 99 | new_to_retry_indices = [] 100 | for idx, (retry_idx, output) in enumerate(zip(to_retry_indices, retry_outputs)): 101 | if any(m["role"] == "assistant" and m["content"] is None for m in output): # Still failing 102 | new_to_retry_inputs.append(to_retry_inputs[idx]) 103 | new_to_retry_indices.append(to_retry_indices[idx]) 104 | else: 105 | batched_outputs[retry_idx] = output # Update with successful retry 106 | 107 | to_retry_inputs = new_to_retry_inputs 108 | to_retry_indices = new_to_retry_indices 109 | 110 | # Final aggregation and printing 111 | outputs_len = len(chat_dict) 112 | print(f"Processed {outputs_len} / {total_len} instances.") 113 | ''' 114 | if outputs_len < total_len: 115 | warnings.warn("Some instances failed.") 116 | warnings.warn("They will be written as None in the output file.") 117 | raise Exception( 118 | f"Failed to generate feedback for {total_len - outputs_len} instances." 119 | ) 120 | ''' 121 | return chat_dict 122 | 123 | 124 | def apply_template_chat(system_prompt, user_prompt): 125 | messages = [ 126 | {"role": "system", "content": system_prompt}, 127 | {"role": "user", "content": user_prompt}, 128 | ] 129 | return messages 130 | 131 | 132 | def prepare_inputs_multiturn( 133 | questions: Dict[str, Dict], 134 | nodes: Dict[str, Dict], 135 | node_to_q: Dict[str, str] 136 | ) -> List[List[Dict]]: 137 | 138 | chat_inputs = [] 139 | 140 | for nodeid, node in nodes.items(): 141 | chat = [] 142 | target_question_d = questions[node_to_q[nodeid]] 143 | target_question = target_question_d["question"] 144 | 145 | for predec_nodeid in node["direct_predecessors"]: 146 | predec_question_d = questions[node_to_q[predec_nodeid]] 147 | predec_question = predec_question_d["question"] 148 | user_prompt = USER_PROMPT_TEMPLATE_ZERO_SHOT.format(question=predec_question) 149 | chat.append({"id":predec_nodeid, "input": user_prompt}) 150 | last_user_prompt = USER_PROMPT_TEMPLATE_LAST_TURN.format(question=target_question) 151 | chat.append({"id": nodeid, "input": last_user_prompt}) 152 | 153 | chat_inputs.append(chat) 154 | 155 | return chat_inputs 156 | 157 | 158 | async def main(args): 159 | global DEBUG 160 | DEBUG = args.debug 161 | 162 | # Load data 163 | dataloader = HFDepthQALoader() 164 | questions, nodes, node_to_q = dataloader.load_data() 165 | print(f"Loaded {len(questions)} questions and {len(nodes)} nodes.") 166 | 167 | # Load model 168 | model = OpenAILLM(args.model_name) 169 | nodes = filter_data_dict(nodes, lambda node: node["depth"] > 1) 170 | 171 | if DEBUG: 172 | nodes = slice_data_dict(nodes, start=0, end=5) 173 | 174 | inputs = prepare_inputs_multiturn( 175 | questions, 176 | nodes, 177 | node_to_q 178 | ) 179 | 180 | if DEBUG: 181 | inputs = inputs[:5] 182 | 183 | # Inference 184 | predictions = await multiturn_completions(model, inputs, len(node_to_q)) 185 | 186 | # Save results 187 | results = {} 188 | for idx, nodeid in enumerate(predictions.keys()): 189 | results[nodeid] = questions[node_to_q[nodeid]] 190 | results[nodeid].update({"predicted_answer":
predictions[nodeid]}) 191 | 192 | output_path = get_output_path( 193 | output_file=args.output_file, default_output_dir="../../outputs/inference" 194 | ) 195 | 196 | with open(output_path, "w") as f: 197 | json.dump(results, f, indent=4) 198 | 199 | 200 | if __name__ == "__main__": 201 | parser = ArgumentParser() 202 | # I/O arguments 203 | parser.add_argument( 204 | "--model_name", 205 | type=str, 206 | required=True, 207 | help="Name of model hosted in Hugging Face under AutoModelForCausalLM", 208 | ) 209 | parser.add_argument( 210 | "--input", 211 | type=str, 212 | default="kaist-ai/DepthQA", 213 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 214 | ) 215 | parser.add_argument( 216 | "--output_file", 217 | type=str, 218 | required=True, 219 | help="Output JSON file name. Unless the parent directory is specified, will be saved under outputs/inference by default.", 220 | ) 221 | parser.add_argument( 222 | "--force_rerun", 223 | action="store_true", 224 | help="Force rerun even if output file exists.", 225 | ) 226 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 227 | 228 | args = parser.parse_args() 229 | 230 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 231 | 232 | output_path = get_output_path( 233 | output_file=args.output_file, default_output_dir="../../outputs/inference" 234 | ) 235 | assert not ( 236 | os.path.exists(output_path) and not args.force_rerun 237 | ), f"Output file {output_path} already exists. Skipping inference." 238 | 239 | asyncio.run(main(args)) 240 | -------------------------------------------------------------------------------- /src/inference/single_turn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import warnings 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | from tqdm import tqdm 8 | 9 | from src.data import HFDepthQALoader, filter_data_dict, slice_data_dict 10 | from src.model import VLLM 11 | from src.utils import ( 12 | SAMPLING_PARAMS, 13 | SYSTEM_PROMPT_CTX, 14 | SYSTEM_PROMPT_ZERO_SHOT, 15 | USER_PROMPT_TEMPLATE_CTX, 16 | USER_PROMPT_TEMPLATE_ZERO_SHOT, 17 | get_output_path, 18 | show_random_inputs, 19 | ) 20 | 21 | DEBUG = False 22 | 23 | 24 | # Model inference (Use offline batching) 25 | def batch_completions( 26 | model, 27 | inputs: List[str], 28 | batch_size, 29 | ): 30 | batched_outputs = [] 31 | 32 | # Adjust batch size to fit the number of inputs 33 | # VLLM supports adaptive batch size already 34 | total_batches = len(inputs) // batch_size + ( 35 | 1 if len(inputs) % batch_size > 0 else 0 36 | ) 37 | total_len = len(inputs) 38 | 39 | # Process initial batches with progress bar 40 | print("Processing initial batches...") 41 | for i in tqdm( 42 | range(0, len(inputs), batch_size), total=total_batches, desc="Initial Batches" 43 | ): 44 | batch_inputs = inputs[i : i + batch_size] 45 | batch_outputs = model.completions( 46 | batch_inputs, **SAMPLING_PARAMS, use_tqdm=True 47 | ) 48 | batched_outputs.extend(batch_outputs) 49 | 50 | # Final aggregation and printing 51 | outputs_len = len(batched_outputs) 52 | print(f"Processed {outputs_len}/{total_len} instances.") 53 | 54 | if outputs_len < total_len: 55 | warnings.warn("Some instances failed.") 56 | warnings.warn("They will be written as None in the output file.") 57 | raise Exception( 58 | f"Failed to generate feedback for {total_len - outputs_len} instances." 
59 | ) 60 | 61 | for i, output in enumerate(batched_outputs): 62 | if output == "": 63 | print("Empty output") 64 | batched_outputs[i] = None 65 | 66 | if DEBUG: 67 | print("Checking the results") 68 | for output in batched_outputs[:5]: 69 | print(output) 70 | 71 | return batched_outputs 72 | 73 | 74 | def apply_template_chat(system_prompt, user_prompt, tokenizer): 75 | if tokenizer.chat_template and "system" not in tokenizer.chat_template: 76 | messages = [ 77 | {"role": "user", "content": system_prompt + "\n" + user_prompt}, 78 | ] 79 | else: 80 | messages = [ 81 | {"role": "system", "content": system_prompt}, 82 | {"role": "user", "content": user_prompt}, 83 | ] 84 | return ( 85 | tokenizer.apply_chat_template( # automatically format to default chat template 86 | messages, tokenize=False, add_generation_prompt=True 87 | ) 88 | ) 89 | 90 | 91 | def prepare_inputs_zero_shot(questions: Dict[str, Dict], tokenizer) -> List[str]: 92 | inputs = [] 93 | system_prompt = SYSTEM_PROMPT_ZERO_SHOT 94 | for question_d in questions.values(): 95 | target_question = question_d["question"] 96 | user_prompt = USER_PROMPT_TEMPLATE_ZERO_SHOT.format(question=target_question) 97 | 98 | input_str = apply_template_chat(system_prompt, user_prompt, tokenizer) 99 | inputs.append(input_str) 100 | 101 | return inputs 102 | 103 | 104 | def prepare_inputs_ctx( 105 | questions: Dict[str, Dict], 106 | nodes: Dict[str, Dict], 107 | node_to_q: Dict[str, str], 108 | tokenizer, 109 | use_gold: bool, 110 | ) -> List[str]: 111 | inputs = [] 112 | system_prompt = SYSTEM_PROMPT_CTX 113 | for nodeid, node in nodes.items(): 114 | target_question_d = questions[node_to_q[nodeid]] 115 | target_question = target_question_d["question"] 116 | predecessor_pairs = "" 117 | for predec_nodeid in node["direct_predecessors"]: 118 | predec_question_d = questions[node_to_q[predec_nodeid]] 119 | predec_question = predec_question_d["question"] 120 | if use_gold: 121 | predec_answer = predec_question_d["answer"] 122 | else: 123 | predec_answer = predec_question_d["predicted_answer"] 124 | pair = f"Q: {predec_question}\nA: {predec_answer}\n" 125 | predecessor_pairs += pair 126 | user_prompt = USER_PROMPT_TEMPLATE_CTX.format( 127 | qa_pairs=predecessor_pairs, question=target_question 128 | ) 129 | 130 | input_str = apply_template_chat(system_prompt, user_prompt, tokenizer) 131 | inputs.append(input_str) 132 | 133 | return inputs 134 | 135 | 136 | def main(args): 137 | global DEBUG 138 | DEBUG = args.debug 139 | 140 | # Load data 141 | dataloader = HFDepthQALoader() 142 | if args.task_type == "prompt-pred": 143 | with open(args.input) as f: 144 | questions = json.load(f) 145 | _, nodes, node_to_q = dataloader.load_data(except_questions=True) 146 | else: 147 | questions, nodes, node_to_q = dataloader.load_data() 148 | print(f"Loaded {len(questions)} questions and {len(nodes)} nodes.") 149 | 150 | # Load model 151 | model = VLLM(args.model_name, num_gpus=args.num_gpus) 152 | tokenizer = model.get_tokenizer() 153 | 154 | # Prepare inputs 155 | if args.task_type == "zero-shot": 156 | if DEBUG: 157 | questions = slice_data_dict(questions, start=0, end=5) 158 | inputs = prepare_inputs_zero_shot(questions, tokenizer) 159 | else: 160 | nodes = filter_data_dict(nodes, lambda node: node["depth"] > 1) 161 | if DEBUG: 162 | nodes = slice_data_dict(nodes, start=0, end=5) 163 | inputs = prepare_inputs_ctx( 164 | questions, 165 | nodes, 166 | node_to_q, 167 | tokenizer, 168 | use_gold=args.task_type == "prompt-gold", 169 | ) 170 | show_random_inputs(inputs) 171 | 
172 | if DEBUG: 173 | inputs = inputs[:5] 174 | 175 | # Inference 176 | predictions = batch_completions(model, inputs, args.batch_size) 177 | 178 | # Save results 179 | results = {} 180 | if args.task_type == "zero-shot": 181 | for idx, (qid, question_d) in enumerate(questions.items()): 182 | results[qid] = question_d 183 | results[qid].update({"predicted_answer": predictions[idx]}) 184 | else: 185 | for idx, nodeid in enumerate(nodes.keys()): 186 | results[nodeid] = questions[node_to_q[nodeid]] 187 | results[nodeid].update({"predicted_answer": predictions[idx]}) 188 | 189 | output_path = get_output_path( 190 | output_file=args.output_file, default_output_dir="../../outputs/inference" 191 | ) 192 | with open(output_path, "w") as f: 193 | json.dump(results, f, indent=4) 194 | 195 | 196 | if __name__ == "__main__": 197 | parser = ArgumentParser() 198 | # I/O arguments 199 | parser.add_argument( 200 | "--model_name", 201 | type=str, 202 | required=True, 203 | help="Name of model hosted in Hugging Face under AutoModelForCausalLM", 204 | ) 205 | parser.add_argument( 206 | "--input", 207 | type=str, 208 | default="kaist-ai/DepthQA", 209 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 210 | ) 211 | parser.add_argument( 212 | "--output_file", 213 | type=str, 214 | required=True, 215 | help="Output JSON file name. Unless the parent directory is specified, will be saved under outputs/inference by default.", 216 | ) 217 | parser.add_argument( 218 | "--force_rerun", 219 | action="store_true", 220 | help="Force rerun even if output file exists.", 221 | ) 222 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 223 | # Compute arguments 224 | parser.add_argument( 225 | "--batch_size", type=int, default=8, help="Batch size for inference." 226 | ) 227 | parser.add_argument( 228 | "--num_gpus", 229 | type=int, 230 | default=2, 231 | help="Number of GPUs to use for inference. Note that we use bfloat16 if available and float16 otherwise.", 232 | ) 233 | # Prompt arguments 234 | parser.add_argument( 235 | "--task_type", 236 | type=str, 237 | default="zero-shot", 238 | choices=["zero-shot", "prompt-gold", "prompt-pred"], 239 | help="Task type for the model, which determines the input text.", 240 | ) 241 | args = parser.parse_args() 242 | 243 | assert not ( 244 | args.task_type == "prompt-pred" and not args.input.endswith(".json") 245 | ), "Input file for prompt-pred task should be a JSON file that contains zero-shot predictions." 246 | 247 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 248 | 249 | output_path = get_output_path( 250 | output_file=args.output_file, default_output_dir="../../outputs/inference" 251 | ) 252 | assert not ( 253 | os.path.exists(output_path) and not args.force_rerun 254 | ), f"Output file {output_path} already exists. Skipping inference." 255 | 256 | main(args) 257 | -------------------------------------------------------------------------------- /outputs/inference/gpt-3.5-turbo_prompt-gold.json: -------------------------------------------------------------------------------- 1 | { 2 | "1_d3": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. 
Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "Your understanding is on the right track! Both the Law of Large Numbers and the Central Limit Theorem are related concepts in statistics, but they focus on different aspects of sampling and estimation.\n\nThe Law of Large Numbers primarily deals with the behavior of sample means as the sample size increases. It states that as the sample size increases, the sample mean gets closer to the population mean. In other words, it emphasizes the convergence of sample statistics to population parameters as sample size grows. This is particularly useful for reducing sampling error and improving the accuracy of estimates.\n\nOn the other hand, the Central Limit Theorem focuses on the distribution of sample means. It states that regardless of the shape of the population distribution, the distribution of sample means will approach a normal distribution as the sample size increases. This is essential for making statistical inferences and justifying the use of certain statistical techniques.\n\nIn summary, while both concepts are related and highlight the importance of sample size in statistical analysis, the Law of Large Numbers pertains more to the convergence of sample statistics to population parameters, while the Central Limit Theorem relates to the distributional properties of sample means." 
8 | }, 9 | "1_d2_n1": { 10 | "qid": "d2_q1", 11 | "depth": 2, 12 | "question": "How can the Law of Large Numbers be applied to improve the accuracy of sample means?", 13 | "answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. According to this law, as the size of a sample increases, the sample mean will get closer to the population mean. This means that increasing the number of data points in your sample will make your sample mean a more accurate estimate of the actual population mean. It reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 14 | "predicted_answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. As the sample size grows, the mean of the sample will approach the true population mean more closely, reducing the margin of error and increasing the accuracy of the sample mean estimate. This is because a larger sample size provides a more representative distribution of the population, reducing the impact of random fluctuations and leading to more reliable and precise estimates." 15 | }, 16 | "1_d2_n2": { 17 | "qid": "d2_q2", 18 | "depth": 2, 19 | "question": "What is the significance of the distribution of sample means according to the Central Limit Theorem?", 20 | "answer": "The Central Limit Theorem (CLT) is a statistical theory that states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximately equal to the mean of the population. The distribution of sample means will approach a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.\n\nThe significance of this is twofold:\n\n1. It allows us to make inferences about the population from sample data. Even if the original population is not normally distributed, we can apply techniques assuming normality if the sample size is large enough.\n\n2. It underpins many statistical tests and procedures that assume a normal distribution, such as confidence intervals and hypothesis testing. Without the CLT, these techniques would not have a solid theoretical foundation.\n\nIn essence, the Central Limit Theorem allows us to apply probabilistic or inferential statistics to data in a robust and justified manner.", 21 | "predicted_answer": "The significance of the distribution of sample means according to the Central Limit Theorem is that when multiple random samples of a certain size are taken from a population, the distribution of the sample means will approximate a normal distribution, regardless of the shape of the population distribution. This allows for the application of various statistical analysis methods that rely on the assumption of normality, such as calculating confidence intervals and conducting hypothesis testing. The Central Limit Theorem provides a theoretical foundation for understanding how sample means behave and helps in making statistical inferences about populations based on sample data." 22 | }, 23 | "1_d2_n3": { 24 | "qid": "d2_q3", 25 | "depth": 2, 26 | "question": "How does sample size affect the accuracy of statistical estimates in relation to the Law of Large Numbers?", 27 | "answer": "The Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. 
In other words, the accuracy of statistical estimates improves with an increase in sample size. This is because larger samples tend to include a greater variety of observations, which therefore gives a better representation of the population. This minimizes the sampling error and the results are more reliable and valid. So, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 28 | "predicted_answer": "Sample size plays a crucial role in the accuracy of statistical estimates in relation to the Law of Large Numbers. As the sample size increases, the accuracy of statistical estimates also improves. This is because the Law of Large Numbers states that as the sample size grows, the sample mean gets closer to the population mean. Therefore, with a larger sample size, the estimates derived from the sample are more likely to represent the true population parameter accurately. In essence, a larger sample size reduces the impact of random chance and variability, leading to more reliable and precise statistical estimates." 29 | }, 30 | "1_d2_n4": { 31 | "qid": "d2_q4", 32 | "depth": 2, 33 | "question": "Explain how increasing the sample size influences the distribution of sample means as per the Central Limit Theorem?", 34 | "answer": "The Central Limit Theorem (CLT) states that if you have a population with any shape of distribution (even a non-normal distribution), the distribution of sample means obtained through sufficiently large samples drawn from that population will approximate a normal distribution. \n\nIn relation to sample size, CLT suggests that the larger the sample size, the closer the sample mean gets to the population mean. This is due to the fact that increasing the sample size reduces the standard error, thus narrowing the spread of the distribution. \n\nThe distribution of sample means becomes less skewed and more closely approximates a normal distribution as the sample size increases, regardless of the shape of the population distribution. This is one of the key aspects of the Central Limit theorem. Consequently, larger sample sizes lead to more reliable and valid inferences about the population.", 35 | "predicted_answer": "Increasing the sample size influences the distribution of sample means by aligning it more closely to a normal distribution, as per the Central Limit Theorem. When the sample size is increased, the sample mean distribution becomes narrower and taller, resembling a bell curve. This occurs because with a larger sample size, the variability in the sample means decreases, leading to a more normal distribution shape. Essentially, as the sample size increases, the distribution of sample means becomes more symmetrical, centered around the population mean, and conforms more to the characteristics of a normal distribution." 
36 | } 37 | } -------------------------------------------------------------------------------- /src/evaluation/batch_eval_openai.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import warnings 5 | from argparse import ArgumentParser 6 | from typing import Dict, List 7 | 8 | from src.model import OpenAIBatchClient 9 | from src.utils import ( 10 | SAMPLING_PARAMS_OPENAI, 11 | SYSTEM_PROMPT_EVAL, 12 | USER_PROMPT_TEMPLATE_EVAL, 13 | calculate_cost, 14 | format_messages, 15 | get_output_path, 16 | show_random_inputs, 17 | ) 18 | 19 | from .output_parser import parse_judgment 20 | 21 | DEBUG = False 22 | 23 | 24 | def prepare_inputs(predictions: Dict[str, Dict], eval_model_name: str) -> List[Dict]: 25 | inputs = [] 26 | system_prompt = SYSTEM_PROMPT_EVAL 27 | for id, record in predictions.items(): # id can be qid or nodeid 28 | instruction = record["question"].strip() 29 | reference_answer = record["answer"].strip() 30 | response = record["predicted_answer"].strip() 31 | user_prompt = USER_PROMPT_TEMPLATE_EVAL.format( 32 | instruction=instruction, 33 | reference_answer=reference_answer, 34 | response=response, 35 | ) 36 | messages = format_messages(system_prompt, user_prompt) 37 | inputs.append( 38 | { 39 | "custom_id": id, 40 | "method": "POST", 41 | "url": "/v1/chat/completions", 42 | "body": { 43 | "model": eval_model_name, 44 | "messages": messages, 45 | **SAMPLING_PARAMS_OPENAI, 46 | }, 47 | } 48 | ) 49 | return inputs 50 | 51 | 52 | def prepare_input_file(prediction_file, eval_model_name): 53 | with open(prediction_file) as f: 54 | predictions = json.load(f) 55 | 56 | inputs = prepare_inputs(predictions, eval_model_name) 57 | show_random_inputs(inputs) 58 | if DEBUG: 59 | inputs = inputs[:5] 60 | 61 | batch_eval_input_file = prediction_file.replace(".json", "_batch_eval_input.jsonl") 62 | with open(batch_eval_input_file, "w") as f: 63 | for input in inputs: 64 | f.write(json.dumps(input) + "\n") 65 | return batch_eval_input_file 66 | 67 | 68 | def prepare_output_file(outputs, prediction_file, output_file): 69 | with open(prediction_file) as f: 70 | predictions = json.load(f) 71 | 72 | input_token_cnt = 0 73 | output_token_cnt = 0 74 | 75 | batch_output_file = args.output_file.replace(".json", "_batch_eval_output.jsonl") 76 | batch_output_writer = open(batch_output_file, "w") 77 | for output in outputs.iter_lines(): 78 | batch_output_writer.write(output + "\n") 79 | 80 | output = json.loads(output) 81 | custom_id = output["custom_id"] 82 | judgment = output["response"]["body"]["choices"][0]["message"]["content"] 83 | feedback, score = parse_judgment(judgment) 84 | predictions[custom_id].update({"feedback": feedback, "score": score}) 85 | 86 | input_token_cnt += output["response"]["body"]["usage"]["prompt_tokens"] 87 | output_token_cnt += output["response"]["body"]["usage"]["completion_tokens"] 88 | batch_output_writer.close() 89 | 90 | with open(output_file, "w") as f: 91 | json.dump(predictions, f, indent=4) 92 | 93 | return input_token_cnt, output_token_cnt 94 | 95 | 96 | def main_individual(args): 97 | global DEBUG 98 | DEBUG = args.debug 99 | 100 | client = OpenAIBatchClient() 101 | if args.mode == "create": 102 | batch_eval_input_file = prepare_input_file( 103 | args.prediction_file, args.eval_model_name 104 | ) 105 | batch = client.create_batch(batch_eval_input_file, args.description) 106 | print(f"Batch created: {batch}") 107 | return batch.id 108 | 109 | elif args.mode == "check": 110 | status, 
batch_output_file_id = client.check_batch(args.batch_id) 111 | print(f"{args.batch_id} status: {status}") 112 | return status, batch_output_file_id 113 | 114 | elif args.mode == "cancel": 115 | client.cancel_batch(args.batch_id) 116 | 117 | elif args.mode == "list": 118 | client.list_batches() 119 | 120 | elif args.mode == "retrieve": 121 | outputs = client.retrieve_batch(args.batch_output_file_id) 122 | 123 | output_path = get_output_path( 124 | output_file=args.output_file, default_output_dir="../../outputs/evaluation" 125 | ) 126 | args.output_file = output_path 127 | 128 | input_token_cnt, output_token_cnt = prepare_output_file( 129 | outputs, args.prediction_file, args.output_file 130 | ) 131 | 132 | cost = calculate_cost(args.eval_model_name, input_token_cnt, output_token_cnt) 133 | print(f"Cost: {cost:.2f} USD") 134 | 135 | else: 136 | raise ValueError("Invalid mode") 137 | 138 | 139 | def main_auto(args): 140 | client = OpenAIBatchClient() 141 | 142 | # Step 1: Create batch request 143 | batch_eval_input_file = prepare_input_file( 144 | args.prediction_file, args.eval_model_name 145 | ) 146 | batch = client.create_batch(batch_eval_input_file, args.description) 147 | print(f"Batch created: {batch}") 148 | batch_id = batch.id 149 | 150 | # Step 2: Check status periodically 151 | while True: 152 | status, batch_output_file_id = client.check_batch(batch_id) 153 | print(f"Current status: {status}") 154 | 155 | if status == "completed": 156 | break 157 | elif status in ["failed", "cancelling", "cancelled", "expired"]: 158 | raise Exception(f"Batch failed with status: {status}") 159 | 160 | time.sleep(30) # Wait for 30 seconds before checking again 161 | print(f"Batch completed. Output file ID: {batch_output_file_id}") 162 | 163 | # Step 3: Retrieve results 164 | outputs = client.retrieve_batch(batch_output_file_id) 165 | output_path = get_output_path( 166 | output_file=args.output_file, default_output_dir="../../outputs/evaluation" 167 | ) 168 | args.output_file = output_path 169 | 170 | print(f"Retrieved results saved to {output_path}") 171 | input_token_cnt, output_token_cnt = prepare_output_file( 172 | outputs, args.prediction_file, args.output_file 173 | ) 174 | 175 | cost = calculate_cost(args.eval_model_name, input_token_cnt, output_token_cnt) 176 | print(f"Cost: {cost:.2f} USD") 177 | 178 | 179 | if __name__ == "__main__": 180 | parser = ArgumentParser() 181 | parser.add_argument( 182 | "--mode", 183 | type=str, 184 | choices=["auto", "create", "check", "cancel", "list", "retrieve"], 185 | ) 186 | # Arguments for creating or retrieving a batch job 187 | parser.add_argument( 188 | "--eval_model_name", 189 | type=str, 190 | default="gpt-4-0125-preview", 191 | help="OpenAI evaluator model name", 192 | ) 193 | # parser.add_argument("--dataset", type=str, default="kaist-ai/DepthQA", help="Dataset name in Hugging Face") # TODO: Allow retrieving metadata from dataset in case the dataset changes 194 | parser.add_argument( 195 | "--prediction_file", type=str, help="Output file containing model predictions" 196 | ) 197 | parser.add_argument("--description", type=str, help="Description of the batch job") 198 | parser.add_argument( 199 | "--batch_output_file_id", type=str, help="Output file ID of the batch job" 200 | ) 201 | parser.add_argument( 202 | "--output_file", 203 | type=str, 204 | help="Output JSON file to write the results. Unless the parent directory is specified, will be saved under outputs/inference by default. 
The same file but with a .jsonl extension will be created intermediately to store the batch job output.", 205 | ) 206 | # Argument for checking or canceling a batch job 207 | parser.add_argument("--batch_id", type=str, help="Batch ID of submitted job") 208 | # Argument for debugging 209 | parser.add_argument("--debug", action="store_true", help="Debug mode") 210 | args = parser.parse_args() 211 | 212 | if args.mode == "auto": 213 | warnings.warn( 214 | "Creating a batch job in auto mode will overwrite the output file. Make sure you want to proceed." 215 | ) 216 | assert args.prediction_file and args.prediction_file.endswith( 217 | ".json" 218 | ), "A valid JSON prediction file is required for creating a batch job" 219 | assert args.description, "Description is required for creating a batch job" 220 | assert args.output_file and args.output_file.endswith( 221 | ".json" 222 | ), "A valid JSON output file is required for retrieving a batch job" 223 | main_auto(args) 224 | else: 225 | if args.mode == "create": 226 | assert args.prediction_file and args.prediction_file.endswith( 227 | ".json" 228 | ), "A valid JSON prediction file is required for creating a batch job" 229 | assert args.description, "Description is required for creating a batch job" 230 | 231 | elif args.mode == "check" or args.mode == "cancel": 232 | assert args.batch_id and args.batch_id.startswith( 233 | "batch_" 234 | ), "A valid batch ID is required for checking a batch job" 235 | 236 | elif args.mode == "retrieve": 237 | assert args.batch_output_file_id and args.batch_output_file_id.startswith( 238 | "file-" 239 | ), "Output file ID is required for retrieving a batch job" 240 | assert args.output_file and args.output_file.endswith( 241 | ".json" 242 | ), "A valid JSON output file is required for retrieving a batch job" 243 | 244 | main_individual(args) 245 | -------------------------------------------------------------------------------- /src/inference/single_turn_openai.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import warnings 5 | from argparse import ArgumentParser 6 | from typing import Dict, List 7 | 8 | from tqdm import tqdm 9 | 10 | from src.data import HFDepthQALoader, filter_data_dict, slice_data_dict 11 | from src.model import OpenAILLM 12 | from src.utils import ( 13 | SAMPLING_PARAMS_OPENAI, 14 | SYSTEM_PROMPT_CTX, 15 | SYSTEM_PROMPT_ZERO_SHOT, 16 | USER_PROMPT_TEMPLATE_CTX, 17 | USER_PROMPT_TEMPLATE_ZERO_SHOT, 18 | format_messages, 19 | get_output_path, 20 | show_random_inputs, 21 | ) 22 | 23 | DEBUG = False 24 | 25 | 26 | # Model inference (Use offline batching) 27 | async def batch_completions_with_retries( 28 | model, 29 | inputs, 30 | batch_size, 31 | max_retries=5, 32 | ): 33 | 34 | batched_outputs = [] 35 | 36 | total_batches = len(inputs) // batch_size + ( 37 | 1 if len(inputs) % batch_size > 0 else 0 38 | ) 39 | total_len = len(inputs) 40 | 41 | # Process initial batches with progress bar 42 | print("Processing initial batches...") 43 | for i in tqdm( 44 | range(0, len(inputs), batch_size), total=total_batches, desc="Initial Batches" 45 | ): 46 | batch_inputs = inputs[i : i + batch_size] 47 | batch_outputs = await model.completions(batch_inputs, **SAMPLING_PARAMS_OPENAI) 48 | batched_outputs.extend(batch_outputs) 49 | 50 | # Identify failed instances and prepare for retries 51 | to_retry_inputs = [] 52 | to_retry_indices = [] 53 | for i, output in enumerate(batched_outputs): 54 | 55 | if output is None: # 
Parsing failed 56 | to_retry_inputs.append(inputs[i]) 57 | to_retry_indices.append(i) 58 | 59 | # Retry logic with progress bar 60 | retries = 0 61 | while to_retry_inputs and retries < max_retries: 62 | retries += 1 63 | print(f"Retrying failed batches: Attempt {retries}/{max_retries}") 64 | retry_outputs = [] 65 | for i in tqdm( 66 | range(0, len(to_retry_inputs), batch_size), desc=f"Retry Attempt {retries}" 67 | ): 68 | batch_inputs = to_retry_inputs[i : i + batch_size] 69 | batch_outputs = await model.completions( 70 | batch_inputs, **SAMPLING_PARAMS_OPENAI 71 | ) 72 | 73 | assert len(batch_outputs) == len(batch_inputs) 74 | retry_outputs.extend(batch_outputs) 75 | 76 | new_to_retry_inputs = [] 77 | new_to_retry_indices = [] 78 | for idx, (retry_idx, output) in enumerate(zip(to_retry_indices, retry_outputs)): 79 | 80 | if output is None: # Still failing 81 | new_to_retry_inputs.append(to_retry_inputs[idx]) 82 | new_to_retry_indices.append(to_retry_indices[idx]) 83 | else: 84 | batched_outputs[retry_idx] = output # Update with successful retry 85 | 86 | to_retry_inputs = new_to_retry_inputs 87 | to_retry_indices = new_to_retry_indices 88 | 89 | # Final aggregation and printing 90 | outputs_len = len(batched_outputs) 91 | print(f"Processed {outputs_len}/{total_len} instances.") 92 | 93 | if outputs_len < total_len: 94 | warnings.warn("Some instances failed to generate.") 95 | warnings.warn("They will be written as None in the output file.") 96 | raise Exception(f"Failed to generate for {total_len - outputs_len} instances.") 97 | 98 | predictions = [] 99 | 100 | for output in tqdm(batched_outputs, desc="Finalizing"): 101 | if output is not None: 102 | predictions.append(output) 103 | else: 104 | predictions.append(None) 105 | if DEBUG: 106 | print("Checking the results") 107 | for prediction in predictions[:5]: 108 | print(prediction) 109 | 110 | return predictions 111 | 112 | 113 | def prepare_inputs_zero_shot(questions: Dict[str, Dict]) -> List[str]: 114 | inputs = [] 115 | system_prompt = SYSTEM_PROMPT_ZERO_SHOT 116 | for question_d in questions.values(): 117 | target_question = question_d["question"] 118 | user_prompt = USER_PROMPT_TEMPLATE_ZERO_SHOT.format(question=target_question) 119 | messages = format_messages(system_prompt, user_prompt) 120 | inputs.append(messages) 121 | 122 | return inputs 123 | 124 | 125 | def prepare_inputs_ctx( 126 | questions: Dict[str, Dict], 127 | nodes: Dict[str, Dict], 128 | node_to_q: Dict[str, str], 129 | use_gold: bool, 130 | ) -> List[str]: 131 | inputs = [] 132 | system_prompt = SYSTEM_PROMPT_CTX 133 | for nodeid, node in nodes.items(): 134 | target_question_d = questions[node_to_q[nodeid]] 135 | target_question = target_question_d["question"] 136 | predecessor_pairs = "" 137 | for predec_nodeid in node["direct_predecessors"]: 138 | predec_question_d = questions[node_to_q[predec_nodeid]] 139 | predec_question = predec_question_d["question"] 140 | if use_gold: 141 | predec_answer = predec_question_d["answer"] 142 | else: 143 | predec_answer = predec_question_d["predicted_answer"] 144 | pair = f"Q: {predec_question}\nA: {predec_answer}\n" 145 | predecessor_pairs += pair 146 | user_prompt = USER_PROMPT_TEMPLATE_CTX.format( 147 | qa_pairs=predecessor_pairs, question=target_question 148 | ) 149 | 150 | messages = format_messages(system_prompt, user_prompt) 151 | inputs.append(messages) 152 | 153 | return inputs 154 | 155 | 156 | async def main(args): 157 | global DEBUG 158 | DEBUG = args.debug 159 | 160 | # Load data 161 | dataloader = 
HFDepthQALoader() 162 | if args.task_type == "prompt-pred": 163 | with open(args.input) as f: 164 | questions = json.load(f) 165 | _, nodes, node_to_q = dataloader.load_data(except_questions=True) 166 | else: 167 | questions, nodes, node_to_q = dataloader.load_data() 168 | print(f"Loaded {len(questions)} questions and {len(nodes)} nodes.") 169 | 170 | # Load model 171 | model = OpenAILLM(args.model_name) 172 | 173 | # Prepare inputs 174 | if args.task_type == "zero-shot": 175 | if DEBUG: 176 | questions = slice_data_dict(questions, start=0, end=5) 177 | inputs = prepare_inputs_zero_shot(questions) 178 | else: 179 | nodes = filter_data_dict(nodes, lambda node: node["depth"] > 1) 180 | if DEBUG: 181 | nodes = slice_data_dict(nodes, start=0, end=5) 182 | inputs = prepare_inputs_ctx( 183 | questions, 184 | nodes, 185 | node_to_q, 186 | use_gold=args.task_type == "prompt-gold", 187 | ) 188 | show_random_inputs(inputs) 189 | 190 | if DEBUG: 191 | inputs = inputs[:5] 192 | 193 | # Inference 194 | predictions = await batch_completions_with_retries(model, inputs, args.batch_size) 195 | 196 | # Save results 197 | results = {} 198 | if args.task_type == "zero-shot": 199 | for idx, (qid, question_d) in enumerate(questions.items()): 200 | results[qid] = question_d 201 | results[qid].update({"predicted_answer": predictions[idx]}) 202 | else: 203 | for idx, nodeid in enumerate(nodes.keys()): 204 | results[nodeid] = questions[node_to_q[nodeid]] 205 | results[nodeid].update({"predicted_answer": predictions[idx]}) 206 | 207 | output_path = get_output_path( 208 | output_file=args.output_file, default_output_dir="../../outputs/inference" 209 | ) 210 | with open(output_path, "w") as f: 211 | json.dump(results, f, indent=4) 212 | 213 | 214 | if __name__ == "__main__": 215 | parser = ArgumentParser() 216 | # I/O arguments 217 | parser.add_argument( 218 | "--model_name", 219 | type=str, 220 | required=True, 221 | help="Name of model hosted in OpenAI", 222 | ) 223 | parser.add_argument( 224 | "--input", 225 | type=str, 226 | default="kaist-ai/DepthQA", 227 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 228 | ) 229 | parser.add_argument( 230 | "--output_file", 231 | type=str, 232 | required=True, 233 | help="Output JSON file name. Will be saved under outputs/inference by default.", 234 | ) 235 | parser.add_argument( 236 | "--force_rerun", 237 | action="store_true", 238 | help="Force rerun even if output file exists.", 239 | ) 240 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 241 | # Compute arguments 242 | parser.add_argument( 243 | "--batch_size", type=int, default=8, help="Batch size for inference." 244 | ) 245 | # Prompt arguments 246 | parser.add_argument( 247 | "--task_type", 248 | type=str, 249 | default="zero-shot", 250 | choices=["zero-shot", "prompt-gold", "prompt-pred"], 251 | help="Task type for the model, which determines the input text.", 252 | ) 253 | args = parser.parse_args() 254 | 255 | assert not ( 256 | args.task_type == "prompt-pred" and not args.input.endswith(".json") 257 | ), "Input file for prompt-pred task should be a JSON file that contains zero-shot predictions." 258 | 259 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 
260 | 261 | output_path = get_output_path( 262 | output_file=args.output_file, default_output_dir="../../outputs/inference" 263 | ) 264 | assert not ( 265 | os.path.exists(output_path) and not args.force_rerun 266 | ), f"Output file {output_path} already exists. Skipping inference." 267 | 268 | asyncio.run(main(args)) 269 | -------------------------------------------------------------------------------- /outputs/inference/llama3_8b_prompt-gold.json: -------------------------------------------------------------------------------- 1 | { 2 | "1_d3": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. 
The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.\n\nSo, your understanding is partially correct, but it seems you were connecting the LLN to the aspect of the CLT that deals with the distribution of sample means becoming normal. While the LLN does indeed provide information about the convergence of sample means to the population mean, the CLT is a distinct concept that explains the specific shape of the distribution of sample means." 8 | }, 9 | "1_d2_n1": { 10 | "qid": "d2_q1", 11 | "depth": 2, 12 | "question": "How can the Law of Large Numbers be applied to improve the accuracy of sample means?", 13 | "answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. According to this law, as the size of a sample increases, the sample mean will get closer to the population mean. This means that increasing the number of data points in your sample will make your sample mean a more accurate estimate of the actual population mean. It reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 14 | "predicted_answer": "I can answer that for you!\n\nAccording to the Law of Large Numbers, increasing the sample size can improve the accuracy of sample means. As the size of the sample increases, the sample mean will get closer to the population mean, making it a more accurate estimate of the actual population mean. This means that having a larger sample size reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result." 15 | }, 16 | "1_d2_n2": { 17 | "qid": "d2_q2", 18 | "depth": 2, 19 | "question": "What is the significance of the distribution of sample means according to the Central Limit Theorem?", 20 | "answer": "The Central Limit Theorem (CLT) is a statistical theory that states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximately equal to the mean of the population. 
The distribution of sample means will approach a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.\n\nThe significance of this is twofold:\n\n1. It allows us to make inferences about the population from sample data. Even if the original population is not normally distributed, we can apply techniques assuming normality if the sample size is large enough.\n\n2. It underpins many statistical tests and procedures that assume a normal distribution, such as confidence intervals and hypothesis testing. Without the CLT, these techniques would not have a solid theoretical foundation.\n\nIn essence, the Central Limit Theorem allows us to apply probabilistic or inferential statistics to data in a robust and justified manner.", 21 | "predicted_answer": "According to the Central Limit Theorem (CLT), the distribution of sample means approaches a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution." 22 | }, 23 | "1_d2_n3": { 24 | "qid": "d2_q3", 25 | "depth": 2, 26 | "question": "How does sample size affect the accuracy of statistical estimates in relation to the Law of Large Numbers?", 27 | "answer": "The Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. In other words, the accuracy of statistical estimates improves with an increase in sample size. This is because larger samples tend to include a greater variety of observations, which therefore gives a better representation of the population. This minimizes the sampling error and the results are more reliable and valid. So, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 28 | "predicted_answer": "I see you're looking for an answer related to the Law of Large Numbers and sample size! According to our previous Q&A pair, the Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. This means that the accuracy of statistical estimates improves with an increase in sample size. With a larger sample size, you'll have a better representation of the population, which minimizes the sampling error and makes the results more reliable and valid. As a result, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate." 29 | }, 30 | "1_d2_n4": { 31 | "qid": "d2_q4", 32 | "depth": 2, 33 | "question": "Explain how increasing the sample size influences the distribution of sample means as per the Central Limit Theorem?", 34 | "answer": "The Central Limit Theorem (CLT) states that if you have a population with any shape of distribution (even a non-normal distribution), the distribution of sample means obtained through sufficiently large samples drawn from that population will approximate a normal distribution. \n\nIn relation to sample size, CLT suggests that the larger the sample size, the closer the sample mean gets to the population mean. This is due to the fact that increasing the sample size reduces the standard error, thus narrowing the spread of the distribution. \n\nThe distribution of sample means becomes less skewed and more closely approximates a normal distribution as the sample size increases, regardless of the shape of the population distribution. This is one of the key aspects of the Central Limit theorem. 
Consequently, larger sample sizes lead to more reliable and valid inferences about the population.", 35 | "predicted_answer": "According to the Central Limit Theorem (CLT), increasing the sample size influences the distribution of sample means by:\n\n* Making the distribution of sample means more closely approximate a normal distribution, regardless of the shape of the population distribution.\n* Reducing the standard error, which narrows the spread of the distribution and makes it less skewed.\n* Bringing the sample mean closer to the population mean, making the inferences about the population more reliable and valid.\n\nIn other words, as the sample size increases, the distribution of sample means becomes more normal and centered around the population mean, making it easier to make accurate predictions about the population." 36 | } 37 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /outputs/inference/llama3_8b_prompt-pred.json: -------------------------------------------------------------------------------- 1 | { 2 | "1_d3": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. 
Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "That's correct! The Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) both relate to the behavior of sample means as the sample size increases. \n\nThe LLN states that as the sample size increases, the average of the sample means will converge to the population mean with probability 1. This means that as you collect more data, your sample mean will get closer and closer to the true population mean.\n\nThe CLT, on the other hand, states that as the sample size increases, the distribution of sample means will tend towards a normal distribution with a mean equal to the population mean and a variance that is inversely proportional to the sample size.\n\nBoth the LLN and CLT describe the convergence of the sample mean to the population mean as the sample size increases, but they focus on different aspects of this convergence. The LLN focuses on the point estimation aspect, stating that the sample mean will converge to the population mean, while the CLT focuses on the distributional aspect, stating that the distribution of sample means will become more normal.\n\nWell done on recognizing the connection between these two important concepts in statistics!" 8 | }, 9 | "1_d2_n1": { 10 | "qid": "d2_q1", 11 | "depth": 2, 12 | "question": "How can the Law of Large Numbers be applied to improve the accuracy of sample means?", 13 | "answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. According to this law, as the size of a sample increases, the sample mean will get closer to the population mean. 
This means that increasing the number of data points in your sample will make your sample mean a more accurate estimate of the actual population mean. It reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 14 | "predicted_answer": "According to the Law of Large Numbers (LLN), as the size of a sample increases, the average value of the sample will converge to the population mean, with a probability that approaches 1. This means that as the sample size grows, the sample mean will become a more reliable estimate of the population mean.\n\nApplying the Law of Large Numbers to improve the accuracy of sample means involves collecting a large enough sample size to reduce the variability of the sample mean and increase its precision. By doing so, the sample mean will be less affected by individual data points that may be unusual or outliers, and more representative of the overall pattern of the population.\n\nAdditionally, the LLN states that the spread of the data will decrease as the sample size increases, which means that the sample mean will become a more robust estimator of the population mean. This is especially important in situations where the population mean is difficult to estimate directly, such as when conducting surveys or analyzing data from complex systems.\n\nIn practical terms, applying the Law of Large Numbers to improve the accuracy of sample means means that:\n\n* Increasing the sample size can help reduce the margin of error and increase the precision of the sample mean.\n* A larger sample size can provide a more accurate estimate of the population mean, even if the data points in the sample are noisy or contain outliers.\n* The LLN can be used to construct confidence intervals for the population mean, which can provide a range of values within which the true population mean is likely to lie.\n\nBy understanding and applying the Law of Large Numbers, researchers and practitioners can improve the accuracy of their sample means and make more informed decisions based on the data they collect." 15 | }, 16 | "1_d2_n2": { 17 | "qid": "d2_q2", 18 | "depth": 2, 19 | "question": "What is the significance of the distribution of sample means according to the Central Limit Theorem?", 20 | "answer": "The Central Limit Theorem (CLT) is a statistical theory that states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximately equal to the mean of the population. The distribution of sample means will approach a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.\n\nThe significance of this is twofold:\n\n1. It allows us to make inferences about the population from sample data. Even if the original population is not normally distributed, we can apply techniques assuming normality if the sample size is large enough.\n\n2. It underpins many statistical tests and procedures that assume a normal distribution, such as confidence intervals and hypothesis testing. 
Without the CLT, these techniques would not have a solid theoretical foundation.\n\nIn essence, the Central Limit Theorem allows us to apply probabilistic or inferential statistics to data in a robust and justified manner.", 21 | "predicted_answer": "According to the Central Limit Theorem (CLT), the distribution of sample means is significant because it states that, under certain conditions, the distribution of the mean of a large sample of independent and identically distributed random variables will be approximately normally distributed, even if the underlying distribution of the individual observations is not normal.\n\nIn simpler terms, the CLT says that the average value of a large enough sample will follow a normal distribution, regardless of the shape of the original distribution. This means that the distribution of sample means will be bell-shaped, with a single peak and symmetric around the true population mean.\n\nThis has important implications in statistics, as it provides a theoretical foundation for statistical inference and hypothesis testing. By knowing the distribution of sample means, we can make inferences about the population mean and construct confidence intervals and hypothesis tests.\n\nIn essence, the significance of the distribution of sample means according to the CLT lies in its ability to provide a predictable and reliable basis for statistical analysis, allowing us to draw meaningful conclusions about populations based on a small sample of data." 22 | }, 23 | "1_d2_n3": { 24 | "qid": "d2_q3", 25 | "depth": 2, 26 | "question": "How does sample size affect the accuracy of statistical estimates in relation to the Law of Large Numbers?", 27 | "answer": "The Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. In other words, the accuracy of statistical estimates improves with an increase in sample size. This is because larger samples tend to include a greater variety of observations, which therefore gives a better representation of the population. This minimizes the sampling error and the results are more reliable and valid. So, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 28 | "predicted_answer": "According to the Law of Large Numbers (LLN), as the sample size increases, the average value of the sample will converge to the population mean, with a probability that approaches 1. This means that as the sample size grows, the spread of the data will decrease, and the average value will become more representative of the population.\n\nIn terms of accuracy, increasing the sample size can improve the accuracy of statistical estimates in several ways:\n\n1. **Reduced bias**: With a larger sample size, the sample mean is more likely to be close to the population mean, reducing bias and making the estimate more accurate.\n2. **Increased precision**: As the sample size increases, the standard error of the mean (i.e., the spread of the data around the sample mean) decreases, leading to more precise estimates.\n3. **Improved representation**: A larger sample size provides a more representative picture of the population, allowing for more accurate inference and prediction.\n\nHowever, it's important to note that there is no guarantee that increasing the sample size will always lead to improved accuracy. 
Factors such as the quality of the data, the complexity of the statistical model, and the presence of outliers can also impact accuracy. Nevertheless, the LLN suggests that, all else being equal, a larger sample size is likely to lead to more accurate statistical estimates." 29 | }, 30 | "1_d2_n4": { 31 | "qid": "d2_q4", 32 | "depth": 2, 33 | "question": "Explain how increasing the sample size influences the distribution of sample means as per the Central Limit Theorem?", 34 | "answer": "The Central Limit Theorem (CLT) states that if you have a population with any shape of distribution (even a non-normal distribution), the distribution of sample means obtained through sufficiently large samples drawn from that population will approximate a normal distribution. \n\nIn relation to sample size, CLT suggests that the larger the sample size, the closer the sample mean gets to the population mean. This is due to the fact that increasing the sample size reduces the standard error, thus narrowing the spread of the distribution. \n\nThe distribution of sample means becomes less skewed and more closely approximates a normal distribution as the sample size increases, regardless of the shape of the population distribution. This is one of the key aspects of the Central Limit theorem. Consequently, larger sample sizes lead to more reliable and valid inferences about the population.", 35 | "predicted_answer": "According to the Central Limit Theorem (CLT), the distribution of the sample mean will become more normal and tighter around the population mean as the sample size increases. This means that as the sample size grows, the sample means will converge towards the population mean, and the variability of the sample means will decrease.\n\nIn simpler terms, the more observations you include in your sample, the more likely it is that the average of those observations will be close to the true average of the entire population. As the sample size increases, the sampling distribution of the mean (i.e., the distribution of sample means) becomes more concentrated around the population mean, with fewer extreme values and a more symmetrical distribution.\n\nThe CLT suggests that, given a large enough sample size, the distribution of sample means will be approximately normal, even if the underlying population distribution is not normal. This has significant implications for statistical inference and hypothesis testing, as it allows us to make more accurate inferences about the population mean based on a random sample.\n\nTo illustrate this, imagine taking a series of samples from the same population, each with a different sample size. As you increase the sample size, the distribution of sample means will become increasingly concentrated around the population mean, with fewer outliers and a more symmetrical shape. This is because the law of large numbers takes effect, which states that the average of a large number of independent random variables will be close to the expected value.\n\nIn summary, increasing the sample size has a profound impact on the distribution of sample means according to the Central Limit Theorem. As the sample size grows, the distribution of sample means becomes more normal, more concentrated around the population mean, and less prone to extreme values, making it easier to make accurate inferences about the population." 
36 | } 37 | } -------------------------------------------------------------------------------- /outputs/evaluation/llama3_8b_prompt-gold.json: -------------------------------------------------------------------------------- 1 | { 2 | "1_d3": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. 
This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.\n\nSo, your understanding is partially correct, but it seems you were connecting the LLN to the aspect of the CLT that deals with the distribution of sample means becoming normal. While the LLN does indeed provide information about the convergence of sample means to the population mean, the CLT is a distinct concept that explains the specific shape of the distribution of sample means.", 8 | "feedback": "Feedback: The response provided is consistently correct, accurate, and entirely factual. It accurately distinguishes between the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT), explaining their different applications in statistics. The explanation that the LLN concerns the convergence of sample means to the population mean, and the CLT describes the distribution of these sample means as normally distributed when the sample size is large, demonstrates a comprehensive understanding of the concepts. The response addresses the potential confusion between the LLN and CLT with clear, precise explanations, and no inaccuracies or omissions are present. This shows a deep understanding of the topic, meeting the highest standards set by the score rubric.", 9 | "score": 5 10 | }, 11 | "1_d2_n1": { 12 | "qid": "d2_q1", 13 | "depth": 2, 14 | "question": "How can the Law of Large Numbers be applied to improve the accuracy of sample means?", 15 | "answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. According to this law, as the size of a sample increases, the sample mean will get closer to the population mean. This means that increasing the number of data points in your sample will make your sample mean a more accurate estimate of the actual population mean. It reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 16 | "predicted_answer": "I can answer that for you!\n\nAccording to the Law of Large Numbers, increasing the sample size can improve the accuracy of sample means. As the size of the sample increases, the sample mean will get closer to the population mean, making it a more accurate estimate of the actual population mean. This means that having a larger sample size reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 17 | "feedback": "Feedback: The response accurately and correctly explains the application of the Law of Large Numbers in improving the accuracy of sample means. It highlights how increasing the sample size leads to a sample mean that is closer to the population mean, thereby reducing the impact of outliers and the effect of randomness. 
This explanation aligns well with the reference answer, demonstrating a comprehensive understanding of the query and topic without any inaccuracies or significant omissions. It provides a detailed, precise, and entirely factual explanation, satisfying the criteria for the highest score based on the rubrics.", 18 | "score": 5 19 | }, 20 | "1_d2_n2": { 21 | "qid": "d2_q2", 22 | "depth": 2, 23 | "question": "What is the significance of the distribution of sample means according to the Central Limit Theorem?", 24 | "answer": "The Central Limit Theorem (CLT) is a statistical theory that states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximately equal to the mean of the population. The distribution of sample means will approach a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.\n\nThe significance of this is twofold:\n\n1. It allows us to make inferences about the population from sample data. Even if the original population is not normally distributed, we can apply techniques assuming normality if the sample size is large enough.\n\n2. It underpins many statistical tests and procedures that assume a normal distribution, such as confidence intervals and hypothesis testing. Without the CLT, these techniques would not have a solid theoretical foundation.\n\nIn essence, the Central Limit Theorem allows us to apply probabilistic or inferential statistics to data in a robust and justified manner.", 25 | "predicted_answer": "According to the Central Limit Theorem (CLT), the distribution of sample means approaches a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.", 26 | "feedback": "Feedback: The response accurately explains the Central Limit Theorem (CLT), mentioning how the distribution of sample means approaches a normal distribution as sample size increases, which is correct and aligns with the fundamental concept of CLT. However, the response lacks the detailed explanation and significance provided in the reference answer, such as making inferences about the population and the foundation it provides for statistical tests and procedures. While the response is correct and shows a good understanding of CLT, it does not fully explore the implications and importance of the theorem, thus missing out on some depth and detail that would enhance its completeness and accuracy. Based on these observations, the response demonstrates a strong understanding of the CLT but falls short of providing a comprehensive explanation of its significance.", 27 | "score": 4 28 | }, 29 | "1_d2_n3": { 30 | "qid": "d2_q3", 31 | "depth": 2, 32 | "question": "How does sample size affect the accuracy of statistical estimates in relation to the Law of Large Numbers?", 33 | "answer": "The Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. In other words, the accuracy of statistical estimates improves with an increase in sample size. This is because larger samples tend to include a greater variety of observations, which therefore gives a better representation of the population. This minimizes the sampling error and the results are more reliable and valid. 
So, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 34 | "predicted_answer": "I see you're looking for an answer related to the Law of Large Numbers and sample size! According to our previous Q&A pair, the Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. This means that the accuracy of statistical estimates improves with an increase in sample size. With a larger sample size, you'll have a better representation of the population, which minimizes the sampling error and makes the results more reliable and valid. As a result, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 35 | "feedback": "Feedback: The response effectively communicates how the Law of Large Numbers relates to sample size and its impact on the accuracy of statistical estimates. It clearly states that as sample size increases, the sample mean approaches the population mean, which in turn enhances the accuracy of statistical estimates. This explanation mirrors the reference answer, emphasizing the reduction of sampling error and the minimization of the effect of randomness with larger sample sizes. The response is accurate, factual, and demonstrates a comprehensive understanding of the topic, aligning closely with the reference answer's content and structure. There are no notable inaccuracies or omissions that detract from the quality of the explanation provided.", 36 | "score": 5 37 | }, 38 | "1_d2_n4": { 39 | "qid": "d2_q4", 40 | "depth": 2, 41 | "question": "Explain how increasing the sample size influences the distribution of sample means as per the Central Limit Theorem?", 42 | "answer": "The Central Limit Theorem (CLT) states that if you have a population with any shape of distribution (even a non-normal distribution), the distribution of sample means obtained through sufficiently large samples drawn from that population will approximate a normal distribution. \n\nIn relation to sample size, CLT suggests that the larger the sample size, the closer the sample mean gets to the population mean. This is due to the fact that increasing the sample size reduces the standard error, thus narrowing the spread of the distribution. \n\nThe distribution of sample means becomes less skewed and more closely approximates a normal distribution as the sample size increases, regardless of the shape of the population distribution. This is one of the key aspects of the Central Limit theorem. 
Consequently, larger sample sizes lead to more reliable and valid inferences about the population.", 43 | "predicted_answer": "According to the Central Limit Theorem (CLT), increasing the sample size influences the distribution of sample means by:\n\n* Making the distribution of sample means more closely approximate a normal distribution, regardless of the shape of the population distribution.\n* Reducing the standard error, which narrows the spread of the distribution and makes it less skewed.\n* Bringing the sample mean closer to the population mean, making the inferences about the population more reliable and valid.\n\nIn other words, as the sample size increases, the distribution of sample means becomes more normal and centered around the population mean, making it easier to make accurate predictions about the population.", 44 | "feedback": "Feedback: The response accurately and effectively communicates how increasing the sample size influences the distribution of sample means in accordance with the Central Limit Theorem. It correctly identifies that larger sample sizes lead to a distribution of sample means that is more normal, less skewed, and more closely centered around the population mean. Additionally, the explanation regarding the reduction of the standard error and its effects on the distribution's spread directly aligns with the principles of the Central Limit Theorem. The response is clear, factually correct, and encompasses all the critical aspects related to the influence of sample size on the distribution of sample means as described in the reference answer. There are no significant inaccuracies, omissions, or factual errors detected in the evaluation of the given response according to the rubric provided.", 45 | "score": 5 46 | } 47 | } -------------------------------------------------------------------------------- /outputs/inference/gpt-3.5-turbo_zero-shot.json: -------------------------------------------------------------------------------- 1 | { 2 | "d3_q1": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. 
The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "Your understanding is on the right track. The Law of Large Numbers and the Central Limit Theorem are both important concepts in statistics and probability theory. \n\nThe Law of Large Numbers states that as the sample size of a population grows larger, the sample mean will tend to get closer to the population mean. In simpler terms, the average of a large number of independent measurements from the same distribution will converge to the true expected value of that distribution.\n\nOn the other hand, the Central Limit Theorem states that the distribution of the sample mean of a large number of independent, identically distributed random variables will be approximately normally distributed, regardless of the original distribution of the random variables. This theorem is crucial in statistical inference as it allows us to make assumptions about the sampling distribution of the sample mean.\n\nTherefore, while the Law of Large Numbers focuses on the convergence of sample mean to population mean as sample size grows, the Central Limit Theorem deals with the distribution of the sample mean itself. In summary, they are related concepts that complement each other in the field of statistics." 8 | }, 9 | "d3_q2": { 10 | "qid": "d3_q2", 11 | "depth": 3, 12 | "question": "I'm trying to understand constant acceleration in special relativity. Suppose I'm in a spaceship that starts moving with a constant accerelation of 10m/s^2, creating the illusion of having gravity equivalent to the Earth's onboard. How much time would have passed for outsiders, after I have been traveling for 10 years.", 13 | "answer": "To understand the time that would have passed for outsiders after traveling with a constant acceleration of 10m/s^2 for 10 years in a spaceship, we must delve into the principles of special relativity, specifically focusing on the concept of constant acceleration and its implications on time dilation.\n\nIn the context of special relativity, when a spaceship accelerates at a constant rate, the time experienced by those inside the spaceship and an outside observer diverges due to time dilation. The relationship between the proper time ($\\tau$) inside the spaceship (the time experienced by the passengers) and the coordinate time ($t$) observed from an inertial frame outside the spaceship can be derived using the formulas for constant proper acceleration. 
\n\nGiven that the spaceship accelerates with constant proper acceleration $a$, the relationship between the proper time $\\tau$ and the coordinate time $t$ can be expressed through the hyperbolic functions:\n\\[t = \\frac{1}{a} \\sinh(a\\tau)\\]\n\\[x = \\frac{1}{a} \\cosh(a\\tau)\\]\n\nwhere $t$ and $x$ describe the position of the spaceship in time and space from the perspective of an outside observer, and $a$ is the constant proper acceleration, and $\\tau$ is the proper time experienced inside the spaceship.\n\nTaking the proper acceleration $a = 10\\, \\text{m/s}^2$ and the proper time $\\tau = 10 \\text{ years}$, we first convert $\\tau$ into seconds to align with the SI unit system:\n\\[10 \\text{ years} = 10 \\times 365.25 \\times 24 \\times 3600 \\text{ seconds} \\approx 315,576,000 \\text{ seconds}\\]\n\nThe constant proper acceleration $a$ needs to be in the unit of $\\text{s}^{-2}$ for the formula, so:\n\\[a = 10\\, \\text{m/s}^2 = 10\\, \\text{m/s}^2 \\times \\frac{1}{c^2} = 10 \\times \\frac{1}{(3 \\times 10^8)^2} \\text{s}^{-2}\\]\n\nNow, applying the value of $\\tau$ to the equation for $t$, we get:\n\\[t = \\frac{1}{a} \\sinh(a\\tau)\\]\n\nGiven the very large value of $\\tau$ and the nature of the hyperbolic sine function, $\\sinh(a\\tau)$, the result will be an extremely large number, reflecting the significant time dilation effect at such high constant acceleration over a long period.\n\nFor computational purposes, without diving into the exact computation which requires handling the hyperbolic sine of a very large product of $a\\tau$, we rely on the key point provided that indicates the answer is roughly 10,000 years. This is derived from understanding that the hyperbolic sine function grows exponentially for large arguments, indicating that the time experienced outside the spaceship, or $t$, would be vastly greater than the time experienced aboard, $\\tau$.\n\nThus, for someone traveling inside a spaceship at a constant acceleration of 10m/s^2 for what feels like 10 years to them, approximately 10,000 years would have passed for an observer outside the spaceship. This showcases the profound effects of constant acceleration and time dilation as predicted by the theory of special relativity.", 14 | "predicted_answer": "In special relativity, time dilation occurs when an object is moving at constant velocity or with constant acceleration. When an object is accelerating, time dilation effects become more complex. However, we can still make an approximate calculation using the concept of proper time.\n\nIn your scenario, you are experiencing constant acceleration of 10m/s^2, creating the illusion of gravity equivalent to Earth's onboard. To an observer on Earth, due to time dilation effects, time would be passing slower for you compared to them.\n\nTo calculate the time that would have passed for outsiders after you have been traveling for 10 years (proper time), we need to take into account the effects of acceleration and the resulting time dilation. \n\nGiven the constant acceleration of 10m/s^2, the time dilation factor can be calculated using the formula:\n\n\\[ t' = \\frac{t}{\\sqrt{1 - \\frac{v^2}{c^2}}} \\]\n\nwhere:\n- \\( t \\) is the proper time experienced by you (10 years in this case),\n- \\( v \\) is the velocity of the spaceship relative to the Earth,\n- \\( c \\) is the speed of light.\n\nSince the spaceship is accelerating, the velocity will be changing. 
Without knowing the specific velocity at any given point during the 10-year journey, we can't provide an exact calculation for the total time that has passed for outsiders.\n\nTherefore, to provide a precise answer, we would need more information about the velocity profile of the spaceship during the 10-year journey." 15 | }, 16 | "d3_q3": { 17 | "qid": "d3_q3", 18 | "depth": 3, 19 | "question": "I do not understand why $B=[a,b]\times[c,d] \\subset {\\mathbb{R}}^2$ is compact. Here is my attempt at a proof: suppose I have a sequence $( (x_k,y_k) )_{k=1}^{\\infty}$ in $B$. According to Bolzano--Weierstrass, the sequences $(x_k)_{k=1}^{\\infty}$ and $(y_k)_{k=1}^{\\infty}$ both have a convergent subsequence. This means that I can find $(k_i)_{I=1}^{\\infty}$ and $(l_j)_{j=1}^\\infty$ two increasing sequences of integers such that $(x_{k_i})_{I=1}^\\infty$ and $(y_{l_j})_{j=1}^\\infty$ converge. So $( (x_{k_i},y_{l_i}) )_{I=1}^\\infty$ converges. But I do not understand why $( (x_{k_i},y_{l_i}) )_{I=1}^\\infty$ is a subsequence of $( (x_k,y_k) )_{k=1}^\\infty$?", 20 | "answer": "The misunderstanding in your attempt lies in the assumption that the sequence $\\{ (x_{k_i}, y_{l_i}) \\}_{i=1}^{\\infty}$ is a subsequence of the original sequence $\\{ (x_k, y_k) \\}_{k=1}^{\\infty}$. This assumption is incorrect because the indices $k_i$ and $l_j$ chosen for the subsequences of $x_k$ and $y_k$ respectively, do not necessarily align to extract a joint subsequence of pairs from the original sequence. To clarify, let's break down the correct approach to ensure that we extract a legitimate subsequence from the original sequence that demonstrates compactness.\n\nFirst, from the sequence $\\{x_k\\}_{k=1}^{\\infty}$, we extract a convergent subsequence $\\{x_{k_i}\\}_{i=1}^{\\infty}$. This step is correctly based on the Bolzano-Weierstrass theorem, which guarantees that any bounded sequence in $\\mathbb{R}$ has a convergent subsequence.\n\nSubsequently, instead of directly extracting a subsequence from $\\{y_k\\}_{k=1}^{\\infty}$, we focus on $\\{y_{k_i}\\}_{i=1}^{\\infty}$, which is the sequence of $y$-coordinates corresponding to the $x$-coordinate subsequence we have already chosen. Since $\\{y_{k_i}\\}_{i=1}^{\\infty}$ is also bounded (because it is derived from the bounded set $B$), we can apply the Bolzano-Weierstrass theorem again to extract a subsequence $\\{y_{k_{i_j}}\\}_{j=1}^{\\infty}$ that converges.\n\nCrucially, since $\\{y_{k_{i_j}}\\}_{j=1}^{\\infty}$ is a subsequence of $\\{y_{k_i}\\}_{i=1}^{\\infty}$, and $\\{x_{k_{i_j}}\\}_{j=1}^{\\infty}$ is simultaneously a subsequence of $\\{x_{k_i}\\}_{i=1}^{\\infty}$, the pair $\\{ (x_{k_{i_j}}, y_{k_{i_j}}) \\}_{j=1}^{\\infty}$ forms a valid subsequence of the original sequence $\\{ (x_k, y_k) \\}_{k=1}^{\\infty}$. This is because both subsequences of $x$ and $y$ coordinates are extracted using the same sequence of indices $k_{i_j}$.\n\nThis correct approach ensures that we have a legitimate subsequence of the original sequence, demonstrating the compactness of $B=[a,b]\\times[c,d] \\subset \\mathbb{R}^2$. The compactness is shown by proving that every sequence in $B$ has a convergent subsequence whose limit is also within $B$, satisfying the definition of compactness in metric spaces.", 21 | "predicted_answer": "Your proof is on the right track, but you made a mistake in concluding that $( (x_{k_i},y_{l_i}) )_{I=1}^\\infty$ is a subsequence of $( (x_k,y_k) )_{k=1}^\\infty$. 
In order for $( (x_{k_i},y_{l_i}) )_{I=1}^\\infty$ to be a subsequence of $( (x_k,y_k) )_{k=1}^\\infty$, you need to show that there exists a strictly increasing sequence of indices $(m_n)_{n=1}^{\\infty}$ such that $( (x_{m_n},y_{m_n}) )_{n=1}^{\\infty}$ is a subsequence of $( (x_k,y_k) )_{k=1}^{\\infty}$.\n\nTo correct your proof, consider defining a new sequence by taking elements from $(x_k,y_k)$ at indices given by the increasing sequence $(\\max(k_i,l_i))_{i=1}^{\\infty}$. This will give you a subsequence of $(x_k,y_k)$ that converges. Therefore, $B=[a,b]\\times[c,d]$ is compact." 22 | }, 23 | "d3_q4": { 24 | "qid": "d3_q4", 25 | "depth": 3, 26 | "question": "I thought that animals from different species could not produce viable offspring. However, a horse and a donkey can produce viable offspring. How can this contradiction be explained?", 27 | "answer": "The observation that a horse and a donkey can produce offspring, specifically a mule, might seem at first to contradict the biological definition of species. According to this definition, a species is generally characterized by a group of individuals that are actually or potentially capable of interbreeding to produce viable and fertile offspring. This means that, under normal circumstances, members of the same species can mate and their offspring can also reproduce.\n\nHowever, the case of the horse and donkey mating to produce a mule serves as an exception to the general rule rather than a contradiction. While horses and donkeys are indeed different species, they are closely related enough that they can mate and produce hybrid offspring, which we refer to as mules. The crucial point here is that, although mules are viable in the sense that they are alive and can grow to adulthood, they are generally sterile. This sterility means that mules cannot produce offspring of their own when they mate. The inability of mules to reproduce is a key aspect of why horses and donkeys are still considered separate species. \n\nTherefore, the production of a mule by a horse and a donkey does not challenge the biological definition of species. Instead, it highlights an exception where two different species can produce viable but sterile offspring. This exception emphasizes the complexity of nature and the fact that while biological definitions and rules provide a framework for understanding, there are always special cases that require further explanation.", 28 | "predicted_answer": "The ability of a horse and a donkey to produce viable offspring, known as a mule, is an exception to the general rule that animals from different species cannot produce fertile offspring. Horses and donkeys are closely related species that belong to the same genus, Equus. While they have a different number of chromosomes (64 for a horse and 62 for a donkey), they are similar enough genetically to produce viable mules. Mules are generally sterile because of the uneven number of chromosomes, which makes it difficult for them to produce viable gametes." 29 | }, 30 | "d3_q5": { 31 | "qid": "d3_q5", 32 | "depth": 3, 33 | "question": "What does differences in wave propagation velocity along the basilar membrane have to do with the ear's ability to Fourier transform mechanical vibrations?", 34 | "answer": "The ability of the ear to perform a Fourier transform on mechanical vibrations, essentially breaking down complex sounds into their constituent frequencies, is intricately tied to the differences in wave propagation velocity along the basilar membrane within the cochlea. 
To understand this relationship, it's crucial to grasp the basic physics principle that relates frequency \\((f)\\), velocity \\((v)\\), and wavelength \\((\\lambda)\\) through the equation \\(f = \\frac{v}{\\lambda}\\). This equation indicates that the frequency of a wave is equal to its velocity divided by its wavelength.\n\nIn the context of the basilar membrane in the cochlea, this relationship has profound implications. The basilar membrane varies in stiffness from one end to the other: it is stiffer at the base (near the oval window where sound enters from the middle ear) and less stiff at the apex (the far end of the cochlea). This gradient in stiffness leads to a gradient in wave propagation velocity along the membrane. Higher-frequency sounds, which have shorter wavelengths, cause maximal resonant vibrations at the stiffer base, where the velocity of wave propagation is higher. Conversely, lower-frequency sounds, with their longer wavelengths, travel further along the membrane and cause maximal resonant vibrations closer to the less stiff apex, where the wave propagation velocity is slower.\n\nThe implications of this are remarkable for auditory perception. As sound waves enter the cochlea and travel along the basilar membrane, their components of different frequencies resonate at different locations along the membrane. This spatial differentiation of frequencies essentially performs a mechanical Fourier transform, segregating the sound into its constituent frequencies. The cochlea translates these vibrations into neural signals, with specific neurons firing in response to vibrations at their respective locations along the membrane. Therefore, the ear's ability to differentiate between frequencies - a foundational aspect of hearing - is directly tied to the differences in wave propagation velocity along the basilar membrane, allowing the cochlea to act as a spectral analyzer for sound.", 35 | "predicted_answer": "The differences in wave propagation velocity along the basilar membrane are essential for the ear's ability to perform a type of frequency analysis known as the Fourier transform of mechanical vibrations. The basilar membrane in the inner ear is tonotopically organized, meaning that different regions of the membrane respond to different frequencies of sound. \n\nAs sound waves travel through the ear canal and reach the cochlea, they cause the basilar membrane to vibrate. The variation in the stiffness and width of the membrane causes different parts of the membrane to vibrate at different frequencies. This results in a frequency-to-place mapping along the basilar membrane, where high frequencies are encoded at the base (stiff and narrow region) and low frequencies at the apex (more flexible and wide region).\n\nBy analyzing the differences in wave propagation velocity along the basilar membrane, the ear is able to decompose complex sound waves into their individual frequency components. This process, akin to a Fourier transform, allows the brain to perceive and distinguish various pitches and frequencies present in the incoming sound waves, which is fundamental for our sense of hearing." 
36 | } 37 | } -------------------------------------------------------------------------------- /DATA_LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. 
More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. 
Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. 
Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. 
You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. 
Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public licenses. 379 | Notwithstanding, Creative Commons may elect to apply one of its public 380 | licenses to material it publishes and in those instances will be 381 | considered the “Licensor.” The text of the Creative Commons public 382 | licenses is dedicated to the public domain under the CC0 Public Domain 383 | Dedication. Except for the limited purpose of indicating that material 384 | is shared under a Creative Commons public license or as otherwise 385 | permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the public 393 | licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. --------------------------------------------------------------------------------