├── src ├── evaluation │ ├── __init__.py │ ├── output_parser.py │ ├── metric_calculator.py │ └── batch_eval_openai.py ├── inference │ ├── __init__.py │ ├── multi_turn.py │ ├── multi_turn_openai.py │ ├── single_turn.py │ └── single_turn_openai.py ├── data │ ├── __init__.py │ ├── data_utils.py │ └── dataloader.py ├── __init__.py ├── model │ ├── __init__.py │ ├── vllm.py │ └── openai.py └── utils │ ├── __init__.py │ ├── sampling_params.py │ ├── common.py │ └── prompts.py ├── requirements.txt ├── assets ├── teaser.png └── discrepancies.png ├── outputs ├── evaluation │ ├── llama3_8b_zero-shot_metric.json │ └── llama3_8b_prompt-gold.json └── inference │ ├── gpt-3.5-turbo_prompt-gold.json │ ├── llama3_8b_prompt-gold.json │ ├── llama3_8b_prompt-pred.json │ └── gpt-3.5-turbo_zero-shot.json ├── scripts ├── evaluation │ ├── llama3_8b_prompt-gold_auto.sh │ └── llama3_8b_zero-shot.sh └── inference │ ├── gpt-3.5-turbo.sh │ └── llama3_8b.sh ├── .gitignore ├── README.md ├── LICENSE └── DATA_LICENSE /src/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiolimiter 2 | tqdm 3 | transformers 4 | datasets 5 | vllm 6 | openai -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaistAI/knowledge-reasoning/HEAD/assets/teaser.png -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_utils import * 2 | from .dataloader import HFDepthQALoader 3 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import data, evaluation, inference, model, utils 2 | 3 | utils.set_seed() 4 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .openai import OpenAIBatchClient, OpenAILLM 2 | from .vllm import VLLM 3 | -------------------------------------------------------------------------------- /assets/discrepancies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaistAI/knowledge-reasoning/HEAD/assets/discrepancies.png -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import * 2 | from .prompts import * 3 | from .sampling_params import * 4 | -------------------------------------------------------------------------------- /src/utils/sampling_params.py: -------------------------------------------------------------------------------- 1 | # Open source LLM inference parameters 2 | SAMPLING_PARAMS = { 3 | "max_tokens": 1024, 4 | "temperature": 1.0, 5 | "top_p": 0.9, 6 | "repetition_penalty": 1.03, 7 | } 8 | 9 | # OpenAI inference/evaluation parameters 10 | SAMPLING_PARAMS_OPENAI = {"max_tokens": 1024, "temperature": 1.0, "top_p": 0.9} 11 | -------------------------------------------------------------------------------- /src/data/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Callable, Dict 3 | 4 | 5 | def filter_data_dict( 6 | data: Dict[str, Dict], filter_fn: Callable[[Dict], bool] 7 | ) -> Dict[str, Dict]: 8 | return {key: value for key, value in data.items() if filter_fn(value)} 9 | 10 | 11 | def slice_data_dict(data: Dict[str, Dict], start: int, end: int) -> Dict[str, Dict]: 12 | return {key: value for key, value in list(data.items())[start:end]} 13 | -------------------------------------------------------------------------------- /outputs/evaluation/llama3_8b_zero-shot_metric.json: -------------------------------------------------------------------------------- 1 | { 2 | "Average Accuracy - Depth depth_1": 4.56021409455843, 3 | "Average Accuracy - Depth depth_2": 4.451253481894151, 4 | "Average Accuracy - Depth depth_3": 4.362637362637362, 5 | "Forward Discrepancy - Depth 2 <=> Depth 3": 0.09465020576131687, 6 | "Forward Discrepancy - Depth 1 <=> Depth 2": 0.08497327502429543, 7 | "Forward Discrepancy - Overall": 0.08682193396226415, 8 | "Backward Discrepancy - Depth 2 <=> Depth 3": 0.08855799373040753, 9 | "Backward Discrepancy - Depth 1 <=> Depth 2": 0.06630901287553648, 10 | "Backward Discrepancy - Overall": 0.07109164420485176 11 | } -------------------------------------------------------------------------------- /scripts/evaluation/llama3_8b_prompt-gold_auto.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY= 2 | 3 | # ------------------------- 1~3. Create, check status, retrieve results of batch request ------------------------ # 4 | python -m src.evaluation.batch_eval_openai \ 5 | --mode auto \ 6 | --eval_model_name gpt-4-0125-preview \ 7 | --prediction_file outputs/inference/llama3_8b_prompt-gold.json \ 8 | --description llama3_8b_prompt-gold \ 9 | --output_file outputs/evaluation/llama3_8b_prompt-gold.json 10 | 11 | # ---------------------- 4. 
Calculate evaluation metrics --------------------- # 12 | python -m src.evaluation.metric_calculator \ 13 | --input outputs/evaluation/llama3_8b_prompt-gold.json \ 14 | --output_file outputs/evaluation/llama3_8b_prompt-gold_metric.json -------------------------------------------------------------------------------- /scripts/inference/gpt-3.5-turbo.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY= 2 | 3 | 4 | python -m src.inference.single_turn_openai \ 5 | --model_name gpt-3.5-turbo-0125 \ 6 | --input kaist-ai/DepthQA \ 7 | --output_file outputs/inference/gpt-3.5-turbo_zero-shot.json \ 8 | --task_type zero-shot \ 9 | 10 | python -m src.inference.single_turn_openai \ 11 | --model_name gpt-3.5-turbo-0125 \ 12 | --input kaist-ai/DepthQA \ 13 | --output_file outputs/inference/gpt-3.5-turbo_prompt-gold.json \ 14 | --task_type prompt-gold \ 15 | 16 | python -m src.inference.single_turn_openai \ 17 | --model_name gpt-3.5-turbo-0125 \ 18 | --input outputs/inference/gpt-3.5-turbo_zero-shot.json \ 19 | --output_file outputs/inference/gpt-3.5-turbo_prompt-pred.json \ 20 | --task_type prompt-pred \ 21 | 22 | python -m src.inference.multi_turn_openai \ 23 | --model_name gpt-3.5-turbo-0125 \ 24 | --input kaist-ai/DepthQA \ 25 | --output_file outputs/inference/gpt-3.5-turbo_multi-turn.json -------------------------------------------------------------------------------- /scripts/inference/llama3_8b.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | export HF_HOME= 3 | NUM_GPUS=1 4 | 5 | python -m src.inference.single_turn \ 6 | --model_name meta-llama/Meta-Llama-3-8B-Instruct \ 7 | --input kaist-ai/DepthQA \ 8 | --output_file outputs/inference/llama3_8b_zero-shot.json \ 9 | --num_gpus $NUM_GPUS \ 10 | --task_type zero-shot \ 11 | 12 | python -m src.inference.single_turn \ 13 | --model_name meta-llama/Meta-Llama-3-8B-Instruct \ 14 | --input kaist-ai/DepthQA \ 15 | --output_file outputs/inference/llama3_8b_prompt-gold.json \ 16 | --num_gpus $NUM_GPUS \ 17 | --task_type prompt-gold \ 18 | 19 | python -m src.inference.single_turn \ 20 | --model_name meta-llama/Meta-Llama-3-8B-Instruct \ 21 | --input outputs/inference/llama3_8b_zero-shot.json \ 22 | --output_file outputs/inference/llama3_8b_prompt-pred.json \ 23 | --num_gpus $NUM_GPUS \ 24 | --task_type prompt-pred \ 25 | 26 | python -m src.inference.multi_turn \ 27 | --model_name meta-llama/Meta-Llama-3-8B-Instruct \ 28 | --input kaist-ai/DepthQA \ 29 | --output_file outputs/inference/llama3_8b_multi-turn.json \ 30 | --num_gpus $NUM_GPUS -------------------------------------------------------------------------------- /src/model/vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | import torch 5 | from vllm import LLM, SamplingParams 6 | 7 | 8 | class VLLM: 9 | def __init__(self, name, tokenizer_name=None, num_gpus=1): 10 | dtype = "float16" 11 | if torch.cuda.is_bf16_supported(): 12 | dtype = "bfloat16" 13 | 14 | self.name = name 15 | 16 | max_model_len = None 17 | 18 | print(f"Loading {name}...") 19 | self.model = LLM( 20 | model=self.name, 21 | tokenizer=tokenizer_name, 22 | dtype=dtype, 23 | max_model_len=max_model_len, 24 | trust_remote_code=True, 25 | tensor_parallel_size=num_gpus, 26 | download_dir=os.getenv("HF_HOME"), 27 | ) 28 | 29 | def get_tokenizer(self): 30 | return self.model.get_tokenizer() 31 | 32 | def completions( 33 | self, 
34 | prompts: List[str], 35 | use_tqdm=False, 36 | **kwargs, 37 | ): 38 | prompts = [prompt.strip() for prompt in prompts] 39 | params = SamplingParams(**kwargs) 40 | outputs = self.model.generate(prompts, params, use_tqdm=use_tqdm) 41 | outputs = [output.outputs[0].text.strip() for output in outputs] 42 | return outputs 43 | -------------------------------------------------------------------------------- /src/utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | 6 | 7 | def set_seed(seed: int = 42): 8 | random.seed(seed) 9 | torch.manual_seed(seed) 10 | 11 | 12 | def show_random_inputs(inputs): 13 | random_inputs = random.sample(inputs, 3) 14 | width = 20 15 | 16 | for input_str in random_inputs: 17 | print("-" * width) 18 | print("Example inputs:") 19 | print(input_str) 20 | print("-" * width) 21 | return inputs 22 | 23 | 24 | def get_output_path(output_file: str, default_output_dir: str): 25 | if os.path.dirname(output_file): 26 | return output_file 27 | else: 28 | return os.path.join(default_output_dir, output_file) 29 | 30 | 31 | def format_messages(system_prompt, user_prompt): 32 | messages = [ 33 | {"role": "system", "content": system_prompt}, 34 | {"role": "user", "content": user_prompt}, 35 | ] 36 | return messages 37 | 38 | 39 | def calculate_cost(model_name, input_token_cnt, output_token_cnt, batch_eval=True): 40 | def pricing_info(model): 41 | if model.startswith("gpt-4") and model.endswith("preview"): 42 | input_rate = 0.01 43 | output_rate = 0.03 44 | elif model == "gpt-4": 45 | input_rate = 0.03 46 | output_rate = 0.06 47 | elif model == "gpt-4-32k": 48 | input_rate = 0.06 49 | output_rate = 0.12 50 | elif model == "gpt-3.5-turbo-0125": 51 | input_rate = 0.0005 52 | output_rate = 0.0015 53 | elif model == "gpt-3.5-turbo-instruct": 54 | input_rate = 0.0015 55 | output_rate = 0.0020 56 | else: 57 | raise ValueError(f"Model {model} not supported.") 58 | return input_rate, output_rate 59 | 60 | input_rate, output_rate = pricing_info(model_name) 61 | if batch_eval: 62 | input_rate /= 2 63 | output_rate /= 2 64 | cost = input_rate * input_token_cnt / 1000 + output_rate * output_token_cnt / 1000 65 | return cost 66 | -------------------------------------------------------------------------------- /scripts/evaluation/llama3_8b_zero-shot.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY= 2 | 3 | # ------------------------- 1. Create a batch request ------------------------ # 4 | python -m src.evaluation.batch_eval_openai \ 5 | --mode create \ 6 | --eval_model_name gpt-4-0125-preview \ 7 | --prediction_file outputs/inference/llama3_8b_zero-shot.json \ 8 | --description llama3_8b_zero-shot 9 | # Printed output: 10 | # Batch(id='batch_ckmtDkk2bbpdEXp1KmM0vqk0', completion_window='24h', created_at=1719384093, endpoint='/v1/chat/completions', input_file_id='file-hbN7K9bwo42Gs6daIE2ivjQJ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1719470493, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'llama3_8b_zero-shot'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0)) 11 | 12 | # ------------------------- 2. 
Check the status of a batch request ------------------------ # 13 | python -m src.evaluation.batch_eval_openai \ 14 | --mode check \ 15 | --batch_id batch_ckmtDkk2bbpdEXp1KmM0vqk0 16 | # Printed output (in progress): 17 | # Batch(id='batch_ckmtDkk2bbpdEXp1KmM0vqk0', completion_window='24h', created_at=1719384093, endpoint='/v1/chat/completions', input_file_id='file-hbN7K9bwo42Gs6daIE2ivjQJ', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1719470493, failed_at=None, finalizing_at=None, in_progress_at=1719384093, metadata={'description': 'llama3_8b_zero-shot'}, output_file_id=None, request_counts=BatchRequestCounts(completed=549, failed=0, total=1571)) 18 | 19 | python -m src.evaluation.batch_eval_openai \ 20 | --mode list 21 | # Printed output (completed): 22 | # llama3_8b_zero-shot 23 | # Batch ID: batch_ckmtDkk2bbpdEXp1KmM0vqk0 24 | # Status: completed 25 | # Output file ID: file-ffvIyZGGyWMuQoTSLRXgNb8X 26 | # BatchRequestCounts(completed=1571, failed=0, total=1571) 27 | 28 | # ------------------------- 3. Retrieve the results of a batch request ------------------------ # 29 | python -m src.evaluation.batch_eval_openai \ 30 | --mode retrieve \ 31 | --prediction_file outputs/inference/llama3_8b_zero-shot.json \ 32 | --batch_output_file_id file-ffvIyZGGyWMuQoTSLRXgNb8X \ 33 | --output_file outputs/evaluation/llama3_8b_zero-shot.json 34 | 35 | # ---------------------- 4. Calculate evaluation metrics --------------------- # 36 | python -m src.evaluation.metric_calculator \ 37 | --input outputs/evaluation/llama3_8b_zero-shot.json \ 38 | --output_file outputs/evaluation/llama3_8b_zero-shot_metric.json -------------------------------------------------------------------------------- /src/utils/prompts.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | # Zero-shot inference 4 | SYSTEM_PROMPT_ZERO_SHOT = ( 5 | "You are a helpful, respectful and honest assistant. Answer the question." 6 | ) 7 | 8 | USER_PROMPT_TEMPLATE_ZERO_SHOT = """ 9 | ###Question: 10 | {question} 11 | 12 | ###Answer: """ 13 | 14 | 15 | # Prompt (Gold.) or Prompt (Pred.) inference 16 | SYSTEM_PROMPT_CTX = "You are a helpful, respectful and honest assistant. Answer the question using the knowledge in given QA pairs." 17 | 18 | USER_PROMPT_TEMPLATE_CTX = """ 19 | ###QA pairs: 20 | {qa_pairs} 21 | ###Question: 22 | {question} 23 | 24 | ###Answer: """ 25 | 26 | 27 | USER_PROMPT_TEMPLATE_LAST_TURN = """ 28 | Based on previous questions and responses, answer the given question: 29 | 30 | ###Question: 31 | {question} 32 | 33 | ###Answer: """ 34 | 35 | 36 | # Evaluation 37 | SYSTEM_PROMPT_EVAL = "You are a fair judge assistant tasked with providing clear, objective feedback based on specific criteria, ensuring each assessment reflects the absolute standards set for performance." 38 | 39 | USER_PROMPT_TEMPLATE_EVAL = """ 40 | ###Task Description: 41 | An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given. 42 | 1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general. 43 | 2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric. 44 | 3. 
The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)" 45 | 4. Please do not generate any other opening, closing, and explanations. 46 | 47 | ###The instruction to evaluate: 48 | {instruction} 49 | 50 | ###Response to evaluate: 51 | {response} 52 | 53 | ###Reference Answer (Score 5): 54 | {reference_answer} 55 | 56 | ###Score Rubrics: 57 | [Is the response correct, accurate, and factual?] 58 | Score 1: The response is largely incorrect, inaccurate, and not factual. It demonstrates a fundamental misunderstanding of the query or topic, leading to irrelevant or completely erroneous information. 59 | Score 2: The response is partially correct but contains significant inaccuracies or factual errors. It shows some understanding of the query or topic but fails to provide a fully accurate or reliable answer. 60 | Score 3: The response is generally correct and factual but may include minor inaccuracies or lack of detail. It shows a good understanding of the query or topic but may miss some nuances or specific information. 61 | Score 4: The response is mostly correct, accurate, and factual. It demonstrates a strong understanding of the query or topic, with only minimal inaccuracies or omissions that do not significantly detract from the overall quality of the response. 62 | Score 5: The response is consistently correct, accurate, and entirely factual. It reflects a comprehensive understanding of the query or topic, providing detailed, precise, and fully reliable information without any inaccuracies or omissions. 63 | 64 | ###Feedback: """ 65 | -------------------------------------------------------------------------------- /src/evaluation/output_parser.py: -------------------------------------------------------------------------------- 1 | # Inspired by https://github.com/prometheus-eval/prometheus-eval/blob/main/eval/parser.py 2 | 3 | import re 4 | 5 | pattern = re.compile( 6 | r""" 7 | (?: # Non-capturing group for various result indicators 8 | \[RESULT\] # [RESULT] 9 | |Score # Score 10 | |\[SCORE\] # [SCORE] 11 | |\[RESULT\]: # [RESULT]: 12 | |Score: # Score: 13 | |score: # score: 14 | |Result: # Result: 15 | |\[Result\] # [Result] 16 | |score\s+of # score of 17 | |Feedback: # Feedback: 18 | |feedback: # feedback: 19 | ) 20 | \s* # Optional whitespace 21 | (?: # Non-capturing group for optional brackets or parentheses 22 | \(\s* # Opening parenthesis with optional whitespace 23 | |\[\s* # or opening square bracket with optional whitespace 24 | | # or nothing 25 | ) 26 | \s* # Optional whitespace 27 | (\d+) # Capturing group for one or more digits 28 | """, 29 | re.IGNORECASE | re.VERBOSE, 30 | ) 31 | 32 | 33 | def parse_judgment(judgment): 34 | matches = pattern.search(judgment) 35 | 36 | if matches: 37 | # Extract the first group that matches (ignoring None) 38 | result = next((int(match) for match in matches.groups() if match), None) 39 | if result is not None: 40 | feedback = ( 41 | judgment.split("[RESULT]")[0].strip() 42 | if "[RESULT]" in judgment 43 | else judgment 44 | ) 45 | return feedback, result 46 | 47 | return None, None 48 | 49 | 50 | if __name__ == "__main__": 51 | # Test cases 52 | test_cases = [ 53 | # Absolute mode test cases (a2a, a2r) 54 | ("Good job. [RESULT] 3", 3), 55 | ("Needs improvement. [RESULT] Score: 2", 2), 56 | ("Well done. [RESULT] Result: 4", 4), 57 | ("Average. [RESULT] 4/5", 4), 58 | ("Excellent. [RESULT] 5 out of 5", 5), 59 | ("Poor performance. 
[RESULT] score of 1", 1), 60 | ("Good job. [Result] 3", 3), 61 | ("Needs improvement. [Result] Score: 2", 2), 62 | ("Well done. [Result] Result: 4", 4), 63 | ("Average. [Result] 4/5", 4), 64 | ("Excellent. [Result] 5 out of 5", 5), 65 | ("Poor performance. [Result] score of 1", 1), 66 | ("Good job. [3]", 3), 67 | ("Good job. (Score 5)", 5), 68 | ("Good job. [Score 4]", 4), 69 | ("Good job. score: 3", 3), 70 | ("Good job. Score: 3", 3), 71 | ("Good job. score of 1", 1), 72 | ("Good job. [RESULT] (5)", 5), 73 | ] 74 | 75 | def run_tests(): 76 | failed_tests = [] # To keep track of failed tests 77 | 78 | for output, expected in test_cases: 79 | _, result = parse_judgment(output) 80 | if result != expected: 81 | failed_tests.append((output, expected, result)) 82 | 83 | if failed_tests: 84 | print("Some tests failed:") 85 | for output, expected, result in failed_tests: 86 | print(f" For input: '{output}', expected: {expected}, got: {result}") 87 | else: 88 | print("All tests passed!") 89 | 90 | run_tests() 91 | -------------------------------------------------------------------------------- /src/model/openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import openai 4 | from aiolimiter import AsyncLimiter 5 | from tqdm.asyncio import tqdm_asyncio 6 | from tqdm.auto import tqdm 7 | 8 | 9 | class OpenAILLM: 10 | def __init__( 11 | self, 12 | name, 13 | ): 14 | self.name = name 15 | self.batch_size = 100 16 | self.requests_per_minute = 100 17 | self.limiter = AsyncLimiter(self.requests_per_minute, 60) 18 | self.client = openai.AsyncOpenAI( 19 | api_key=os.environ.get("OPENAI_API_KEY"), 20 | ) 21 | 22 | async def get_completion_text_async(self, messages, **kwargs): 23 | async with self.limiter: 24 | try: 25 | # Assuming you have a session and client setup for OpenAI 26 | completion = await self.client.chat.completions.create( 27 | model=self.name, messages=messages, **kwargs 28 | ) 29 | content = completion.choices[0].message.content.strip() 30 | return content 31 | except openai.APIConnectionError as e: 32 | print("APIConnectionError: The server could not be reached") 33 | print( 34 | e.__cause__ 35 | ) # an underlying Exception, likely raised within httpx. 36 | except openai.RateLimitError as e: 37 | print( 38 | "RateLimitError: A 429 status code was received; we should back off a bit." 
39 | ) 40 | except openai.APIStatusError as e: 41 | print("APIStatusError: Another non-200-range status code was received") 42 | print(e.status_code) 43 | print(e.response) 44 | except Exception as e: 45 | print(f"Error during OpenAI API call: {e}") 46 | return "" # , {} 47 | 48 | async def completions( 49 | self, 50 | messages, 51 | **kwargs, 52 | ): 53 | assert isinstance(messages, list) 54 | assert list(messages[0][0].keys()) == ["role", "content"] 55 | 56 | result_responses = [] 57 | 58 | for start_idx in tqdm( 59 | range(0, len(messages), self.batch_size), desc="Processing batches" 60 | ): 61 | end_idx = start_idx + self.batch_size 62 | batch_prompts = messages[start_idx:end_idx] 63 | batch_responses = await tqdm_asyncio.gather( 64 | *[ 65 | self.get_completion_text_async(prompt, **kwargs) 66 | for prompt in batch_prompts 67 | ] 68 | ) 69 | result_responses.extend(batch_responses) 70 | 71 | return result_responses 72 | 73 | 74 | class OpenAIBatchClient: 75 | def __init__(self): 76 | self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 77 | 78 | def create_batch(self, input_file, description=None): 79 | batch_input_file = self.client.files.create( 80 | file=open(input_file, "rb"), purpose="batch" 81 | ) 82 | 83 | batch_input_file_id = batch_input_file.id 84 | 85 | batch = self.client.batches.create( 86 | input_file_id=batch_input_file_id, 87 | endpoint="/v1/chat/completions", 88 | completion_window="24h", 89 | metadata={"description": description}, 90 | ) 91 | return batch 92 | 93 | def cancel_batch(self, batch_id): 94 | self.client.batches.cancel(batch_id) 95 | 96 | def check_batch(self, batch_id): 97 | batch = self.client.batches.retrieve(batch_id) 98 | status = batch.status 99 | batch_output_file_id = batch.output_file_id 100 | return status, batch_output_file_id 101 | 102 | def list_batches(self): 103 | batches = self.client.batches.list() 104 | batches = sorted(batches, key=lambda x: x.created_at) 105 | for batch in batches: 106 | desc = batch.metadata.get("description", "") if batch.metadata else "" 107 | batch_id = batch.id 108 | status = batch.status 109 | if "cancel" in status: 110 | continue 111 | output_file_id = batch.output_file_id 112 | request_counts = batch.request_counts 113 | print("-" * 20) 114 | print(desc) 115 | print(f"\tBatch ID: {batch_id}") 116 | print(f"\tStatus: {status}") 117 | print(f"\tOutput file ID: {output_file_id}") 118 | print(f"\t{request_counts}") 119 | 120 | def retrieve_batch(self, batch_output_file_id): 121 | content = self.client.files.content(batch_output_file_id) 122 | return content 123 | 124 | 125 | if __name__ == "__main__": 126 | print("Hello, World!") 127 | 128 | model = OpenAILLM("gpt-3.5-turbo") 129 | 130 | responses = model.completions( 131 | model="gpt-3.5-turbo", 132 | messages=[ 133 | [{"role": "user", "content": "good morning? "}], 134 | [{"role": "user", "content": "what's the time? 
"}], 135 | ], 136 | ) 137 | import pdb 138 | 139 | pdb.set_trace() 140 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jsonl 2 | env_var.sh 3 | Makefile 4 | notebooks 5 | *.ipynb 6 | *.json 7 | human_eval 8 | 9 | # Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python 10 | # Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python 11 | 12 | ### Linux ### 13 | *~ 14 | 15 | # temporary files which can be created if a process still has a handle open of a deleted file 16 | .fuse_hidden* 17 | 18 | # KDE directory preferences 19 | .directory 20 | 21 | # Linux trash folder which might appear on any partition or disk 22 | .Trash-* 23 | 24 | # .nfs files are created when an open file is removed but is still being accessed 25 | .nfs* 26 | 27 | ### macOS ### 28 | # General 29 | .DS_Store 30 | .AppleDouble 31 | .LSOverride 32 | 33 | # Icon must end with two \r 34 | Icon 35 | 36 | 37 | # Thumbnails 38 | ._* 39 | 40 | # Files that might appear in the root of a volume 41 | .DocumentRevisions-V100 42 | .fseventsd 43 | .Spotlight-V100 44 | .TemporaryItems 45 | .Trashes 46 | .VolumeIcon.icns 47 | .com.apple.timemachine.donotpresent 48 | 49 | # Directories potentially created on remote AFP share 50 | .AppleDB 51 | .AppleDesktop 52 | Network Trash Folder 53 | Temporary Items 54 | .apdisk 55 | 56 | ### macOS Patch ### 57 | # iCloud generated files 58 | *.icloud 59 | 60 | ### Python ### 61 | # Byte-compiled / optimized / DLL files 62 | __pycache__/ 63 | *.py[cod] 64 | *$py.class 65 | 66 | # C extensions 67 | *.so 68 | 69 | # Distribution / packaging 70 | .Python 71 | build/ 72 | develop-eggs/ 73 | dist/ 74 | downloads/ 75 | eggs/ 76 | .eggs/ 77 | lib/ 78 | lib64/ 79 | parts/ 80 | sdist/ 81 | var/ 82 | wheels/ 83 | share/python-wheels/ 84 | *.egg-info/ 85 | .installed.cfg 86 | *.egg 87 | MANIFEST 88 | 89 | # PyInstaller 90 | # Usually these files are written by a python script from a template 91 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 92 | *.manifest 93 | *.spec 94 | 95 | # Installer logs 96 | pip-log.txt 97 | pip-delete-this-directory.txt 98 | 99 | # Unit test / coverage reports 100 | htmlcov/ 101 | .tox/ 102 | .nox/ 103 | .coverage 104 | .coverage.* 105 | .cache 106 | nosetests.xml 107 | coverage.xml 108 | *.cover 109 | *.py,cover 110 | .hypothesis/ 111 | .pytest_cache/ 112 | cover/ 113 | 114 | # Translations 115 | *.mo 116 | *.pot 117 | 118 | # Django stuff: 119 | *.log 120 | local_settings.py 121 | db.sqlite3 122 | db.sqlite3-journal 123 | 124 | # Flask stuff: 125 | instance/ 126 | .webassets-cache 127 | 128 | # Scrapy stuff: 129 | .scrapy 130 | 131 | # Sphinx documentation 132 | docs/_build/ 133 | 134 | # PyBuilder 135 | .pybuilder/ 136 | target/ 137 | 138 | # Jupyter Notebook 139 | .ipynb_checkpoints 140 | 141 | # IPython 142 | profile_default/ 143 | ipython_config.py 144 | 145 | # pyenv 146 | # For a library or package, you might want to ignore these files since the code is 147 | # intended to run in multiple environments; otherwise, check them in: 148 | # .python-version 149 | 150 | # pipenv 151 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
152 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 153 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 154 | # install all needed dependencies. 155 | #Pipfile.lock 156 | 157 | # poetry 158 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 159 | # This is especially recommended for binary packages to ensure reproducibility, and is more 160 | # commonly ignored for libraries. 161 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 162 | #poetry.lock 163 | 164 | # pdm 165 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 166 | #pdm.lock 167 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 168 | # in version control. 169 | # https://pdm.fming.dev/#use-with-ide 170 | .pdm.toml 171 | 172 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 173 | __pypackages__/ 174 | 175 | # Celery stuff 176 | celerybeat-schedule 177 | celerybeat.pid 178 | 179 | # SageMath parsed files 180 | *.sage.py 181 | 182 | # Environments 183 | .env 184 | .venv 185 | env/ 186 | venv/ 187 | ENV/ 188 | env.bak/ 189 | venv.bak/ 190 | 191 | # Spyder project settings 192 | .spyderproject 193 | .spyproject 194 | 195 | # Rope project settings 196 | .ropeproject 197 | 198 | # mkdocs documentation 199 | /site 200 | 201 | # mypy 202 | .mypy_cache/ 203 | .dmypy.json 204 | dmypy.json 205 | 206 | # Pyre type checker 207 | .pyre/ 208 | 209 | # pytype static type analyzer 210 | .pytype/ 211 | 212 | # Cython debug symbols 213 | cython_debug/ 214 | 215 | # PyCharm 216 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 217 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 218 | # and can be added to the global gitignore or merged into this file. For a more nuclear 219 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
220 | #.idea/ 221 | 222 | ### Python Patch ### 223 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 224 | poetry.toml 225 | 226 | # ruff 227 | .ruff_cache/ 228 | 229 | # LSP config files 230 | pyrightconfig.json 231 | 232 | # End of https://www.toptal.com/developers/gitignore/api/linux,macos,python -------------------------------------------------------------------------------- /src/data/dataloader.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Dict, Set 3 | 4 | from datasets import load_dataset 5 | 6 | 7 | class HFDepthQALoader: 8 | def __init__(self, hf_repo: str = "kaist-ai/DepthQA", split: str = "test"): 9 | self.hf_repo = hf_repo 10 | self.split = split 11 | self.questions: Dict[str, Dict] = {} # qid -> question dict 12 | self.nodes: Dict[str, Dict] = {} # nodeid -> node dict 13 | self.node_to_q: Dict[str, str] = {} # nodeid -> qid 14 | self.q_to_node: Dict[str, Set[str]] = defaultdict(set) # qid -> set of nodeids 15 | 16 | def load_data( 17 | self, except_questions: bool = False, remove_unused_columns: bool = True 18 | ): 19 | print(f"Loading data from {self.hf_repo}...") 20 | if not except_questions: 21 | # Load questions 22 | questions_dataset = load_dataset( 23 | self.hf_repo, "questions", split=self.split 24 | ) 25 | if remove_unused_columns: 26 | questions_dataset = questions_dataset.remove_columns( 27 | ["domain", "tutoreval_data", "augmented"] 28 | ) 29 | self.questions = {item["qid"]: item for item in questions_dataset} 30 | 31 | # Load nodes 32 | nodes_dataset = load_dataset(self.hf_repo, "nodes", split=self.split) 33 | self.nodes = {item["nodeid"]: item for item in nodes_dataset} 34 | 35 | # Load node_to_q mappings 36 | node_to_q_dataset = load_dataset(self.hf_repo, "node_to_q", split=self.split) 37 | for item in node_to_q_dataset: 38 | self.node_to_q[item["nodeid"]] = item["qid"] 39 | self.q_to_node[item["qid"]].add(item["nodeid"]) # 1-to-n mapping 40 | 41 | return self.questions, self.nodes, self.node_to_q 42 | 43 | def check_integrity(self) -> None: 44 | print("Checking graph integrity...") 45 | 46 | errors = [] 47 | 48 | def add_error(message): 49 | errors.append(message) 50 | 51 | # Check questions 52 | for qid in self.questions.keys(): 53 | # Check q_to_node mapping 54 | if not self.q_to_node.get(qid): 55 | add_error(f"Question {qid} not found in q_to_node") 56 | 57 | # Check nodes 58 | for nodeid in self.nodes.keys(): 59 | node = self.nodes[nodeid] 60 | depth = node["depth"] 61 | 62 | # Check group consistency 63 | group_nodeid = nodeid.split("_")[0] 64 | if node["group"] != group_nodeid: 65 | add_error(f"Inconsistent group in node {nodeid}") 66 | 67 | # Check direct_predecessors 68 | for predec_id in node["direct_predecessors"]: 69 | predec_node = self.nodes[predec_id] 70 | if not predec_node: 71 | add_error(f"Predecessor node {predec_id} of {nodeid} not found") 72 | else: 73 | predec_depth_nodeid = int(predec_id.split("_")[1][1:]) 74 | predec_depth = predec_node["depth"] 75 | if predec_depth_nodeid != predec_depth: 76 | add_error( 77 | f"Inconsistent depth in predecessor {predec_id}: {predec_depth_nodeid} in nodeid while {predec_depth} in depth field" 78 | ) 79 | if predec_depth != depth - 1: 80 | add_error( 81 | f"Predecessor {predec_id} of {nodeid} has incorrect depth" 82 | ) 83 | if nodeid not in predec_node["direct_successors"]: 84 | add_error( 85 | f"Node {nodeid} not in direct_successors of its predecessor 
{predec_id}" 86 | ) 87 | 88 | # Check direct_successors 89 | for succ_id in node["direct_successors"]: 90 | succ_node = self.nodes[succ_id] 91 | if not succ_node: 92 | add_error(f"Successor node {succ_id} of {nodeid} not found") 93 | else: 94 | succ_depth_nodeid = int(succ_id.split("_")[1][1:]) 95 | succ_depth = succ_node["depth"] 96 | if succ_depth_nodeid != succ_depth: 97 | add_error( 98 | f"Inconsistent depth in successor {succ_id}: {succ_depth_nodeid} in nodeid while {succ_depth} in depth field" 99 | ) 100 | if succ_depth != depth + 1: 101 | add_error( 102 | f"Successor {succ_id} of {nodeid} has incorrect depth" 103 | ) 104 | if nodeid not in succ_node["direct_predecessors"]: 105 | add_error( 106 | f"Node {nodeid} not in direct_predecessors of its successor {succ_id}" 107 | ) 108 | 109 | # Check node_to_q mapping 110 | if not self.node_to_q.get(nodeid): 111 | add_error(f"Node {nodeid} not found in node_to_q") 112 | 113 | # Check consistency between node_to_q, nodes, and questions 114 | for nodeid in self.node_to_q.keys(): 115 | qid = self.node_to_q.get(nodeid) 116 | if not self.questions.get(qid): 117 | add_error(f"qid {qid} in node_to_q not found in questions") 118 | if not self.nodes.get(nodeid): 119 | add_error(f"nodeid {nodeid} in node_to_q not found in nodes") 120 | if nodeid not in self.q_to_node.get(qid): 121 | add_error( 122 | f"Inconsistency: node_to_q[{nodeid}] = {qid}, but q_to_node[{qid}] = {self.q_to_node.get(qid)}" 123 | ) 124 | 125 | if errors: 126 | raise ValueError("Graph integrity check failed:\n" + "\n".join(errors)) 127 | else: 128 | print("Graph integrity check passed successfully.") 129 | 130 | 131 | if __name__ == "__main__": 132 | loader = HFDepthQALoader() 133 | loader.load_data() 134 | loader.check_integrity() 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical Deconstruction of LLM Reasoning: A Graph-Based Framework for Analyzing Knowledge Utilization 2 | 3 | This repository is the official implementation of [Hierarchical Deconstruction of LLM Reasoning: A Graph-Based Framework for Analyzing Knowledge Utilization](https://aclanthology.org/2024.emnlp-main.288/). 4 | 5 | - [📃 Paper](https://aclanthology.org/2024.emnlp-main.288/) 6 | - [🧵 Brief overview of paper (X thread)](https://x.com/miyoung_ko/status/1807753375196270716) 7 | - [🤗 Dataset (DepthQA)](https://huggingface.co/datasets/kaist-ai/DepthQA) 8 | 9 | ## TL;DR 10 | We investigate how large language models utilize knowledge for reasoning to solve complex questions, based on a method that deconstructs complex questions into a hierarchical graph. 11 | 12 | Each depth of knowledge required to answer the question represents different levels of complexity. | Some reasoning is required to answer a more complex question compared to a simpler question. 13 | ---- | ---- 14 | ![teaser](assets/teaser.png) | ![discrepancies](assets/discrepancies.png) 15 | 16 | 17 | ## Requirements 18 | 19 | Create a virtual environment with python>=3.9 and install the appropriate PyTorch version for your machine. 20 | 21 | In our project, we use a node of 4 x NVIDIA A6000 40GB GPUs with CUDA version 12.3. 
22 | 
 23 | ```bash
 24 | conda create -n myenv python=3.10
 25 | conda activate myenv
 26 | conda install pytorch pytorch-cuda=12.1 -c pytorch -c nvidia
 27 | ```
 28 | 
 29 | To install requirements:
 30 | 
 31 | ```setup
 32 | pip install -r requirements.txt
 33 | ```
 34 | 
 35 | ## Inference
 36 | 
 37 | You can experiment with multiple inference modes on our dataset, [DepthQA](https://huggingface.co/datasets/kaist-ai/DepthQA):
 38 | 
 39 | - Single-turn:
 40 |     - `zero-shot`: Only the target question is in the input.
 41 |     - `prompt-gold`: Before the target question, pairs of shallower questions (i.e., predecessors of the target question) and their gold answers are provided as context.
 42 |     - `prompt-pred`: Before the target question, pairs of shallower questions and the model's own predicted answers are provided as context.
 43 | - `multi-turn`: Shallower questions are provided as inputs in a multi-turn conversation, i.e., the model answers each shallower question one by one and is then presented with the target question.
 44 | 
 45 | Most Hugging Face `AutoModelForCausalLM` models can be run with [src/inference/single_turn.py](src/inference/single_turn.py) and [src/inference/multi_turn.py](src/inference/multi_turn.py), which integrate vLLM and use mixed precision.
 46 | 
 47 | For OpenAI models, use [src/inference/single_turn_openai.py](src/inference/single_turn_openai.py) and [src/inference/multi_turn_openai.py](src/inference/multi_turn_openai.py).
 48 | 
 49 | ### Example usage
 50 | 
 51 | To run inference with LLaMA 3 8B Instruct in all modes:
 52 | ```bash
 53 | bash scripts/inference/llama3_8b.sh
 54 | ```
 55 | 
 56 | To run inference with GPT-3.5 Turbo in all modes:
 57 | ```bash
 58 | bash scripts/inference/gpt-3.5-turbo.sh
 59 | ```
 60 | 
 61 | ## Evaluation
 62 | 
 63 | Following the LLM-as-a-Judge approach, we use `gpt-4-0125-preview` to score the correctness of model predictions. Specifically, we use the [Batch API](https://platform.openai.com/docs/guides/batch) for faster and cheaper evaluation. Our evaluation pipeline consists of four steps:
 64 | 
 65 | 1. Create a batch request
 66 | 2. Check the status of the batch request
 67 | 3. Retrieve the results of the batch request
 68 | 4. Calculate evaluation metrics
 69 |     - Average accuracy
 70 |     - Forward discrepancy
 71 |     - Backward discrepancy
 72 | 
 73 | where the first three steps are handled in [src/evaluation/batch_eval_openai.py](src/evaluation/batch_eval_openai.py) and the last step in [src/evaluation/metric_calculator.py](src/evaluation/metric_calculator.py).
 74 | 
 75 | ### Example usage
 76 | 
 77 | To analyze each step in the evaluation pipeline for LLaMA 3 8B Instruct `zero-shot` predictions, refer to the example commands and printed outputs in [scripts/evaluation/llama3_8b_zero-shot.sh](scripts/evaluation/llama3_8b_zero-shot.sh). 
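Each judgment returned by the judge model follows the output format required by the evaluation prompt (`Feedback: ... [RESULT] <integer 1-5>`), and the score is extracted with the parser in [src/evaluation/output_parser.py](src/evaluation/output_parser.py). A minimal sketch of that parsing step, using a made-up judgment string rather than real judge output:

```python
from src.evaluation.output_parser import parse_judgment

# Hypothetical judge response in the "Feedback: ... [RESULT] N" format
judgment = "Feedback: The response is mostly accurate but misses one nuance. [RESULT] 4"
feedback, score = parse_judgment(judgment)
print(score)  # 4
```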
78 | 79 | To run the entire pipeline of LLaMA 3 8B Instruct `prompt-gold` predictions automatically: 80 | ```bash 81 | bash scripts/evaluation/llama3_8b_prompt-gold_auto.sh 82 | ``` 83 | 84 | 85 | ## Citation 86 | ```bibtex 87 | @inproceedings{ko-etal-2024-hierarchical, 88 | title = "Hierarchical Deconstruction of {LLM} Reasoning: A Graph-Based Framework for Analyzing Knowledge Utilization", 89 | author = "Ko, Miyoung and 90 | Park, Sue Hyun and 91 | Park, Joonsuk and 92 | Seo, Minjoon", 93 | editor = "Al-Onaizan, Yaser and 94 | Bansal, Mohit and 95 | Chen, Yun-Nung", 96 | booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing", 97 | month = nov, 98 | year = "2024", 99 | address = "Miami, Florida, USA", 100 | publisher = "Association for Computational Linguistics", 101 | url = "https://aclanthology.org/2024.emnlp-main.288", 102 | doi = "10.18653/v1/2024.emnlp-main.288", 103 | pages = "4995--5027", 104 | abstract = "Despite the advances in large language models (LLMs), how they use their knowledge for reasoning is not yet well understood.In this study, we propose a method that deconstructs complex real-world questions into a graph, representing each question as a node with predecessors of background knowledge needed to solve the question. We develop the DepthQA dataset, deconstructing questions into three depths: (i) recalling conceptual knowledge, (ii) applying procedural knowledge, and (iii) analyzing strategic knowledge. Based on a hierarchical graph, we quantify forward discrepancy, a discrepancy in LLM performance on simpler sub-problems versus complex questions. We also measure backward discrepancy where LLMs answer complex questions but struggle with simpler ones. Our analysis shows that smaller models exhibit more discrepancies than larger models. Distinct patterns of discrepancies are observed across model capacity and possibility of training data memorization. Additionally, guiding models from simpler to complex questions through multi-turn interactions improves performance across model sizes, highlighting the importance of structured intermediate steps in knowledge reasoning. This work enhances our understanding of LLM reasoning and suggests ways to improve their problem-solving abilities.", 105 | } 106 | 107 | ``` 108 | -------------------------------------------------------------------------------- /src/evaluation/metric_calculator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from collections import defaultdict 5 | from argparse import ArgumentParser 6 | 7 | from src.data import HFDepthQALoader 8 | 9 | def compute_score(score_value): 10 | if isinstance(score_value, list): 11 | return sum(score_value) / len(score_value) 12 | elif isinstance(score_value, int): 13 | return int(score_value) 14 | else: 15 | return 0. 
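# Examples (illustrative): compute_score([5, 3]) -> 4.0 (mean over multiple judge scores),
# compute_score(4) -> 4 (single integer score), and any other value (e.g., a failed parse) -> 0.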
16 | 17 | def depth_score(results): 18 | scores = { 19 | "depth_1": [], 20 | "depth_2": [], 21 | "depth_3": [] 22 | } 23 | 24 | for inst in results.values(): 25 | score = inst["score"] 26 | scores[f'depth_{inst["depth"]}'].append( 27 | compute_score(score)) 28 | 29 | output = {} 30 | for depth, score in scores.items(): 31 | if len(score) == 0: 32 | output[f"Average Accuracy - Depth {depth}"] = 0 33 | else: 34 | output[f"Average Accuracy - Depth {depth}"] = np.mean(score) 35 | 36 | return output 37 | 38 | def forward_disc(nodes, node_results): 39 | depths = [(2,3), (1,2)] 40 | output = {} 41 | 42 | overall_gaps = 0 43 | overall_cnt = 0 44 | for (depth_a, depth_b) in depths: 45 | gaps = [] 46 | cnt = 0 47 | aggregate = defaultdict(list) 48 | score_dict = defaultdict(float) 49 | for nodeid, data in node_results.items(): 50 | if data["depth"] == depth_a: 51 | depth_a_score = compute_score(data["score"]) 52 | successors = nodes[nodeid]["direct_successors"] 53 | if len(successors) > 0: 54 | for depth_b_id in successors: 55 | depth_b_d = node_results[depth_b_id] 56 | depth_b_score = compute_score(depth_b_d["score"]) 57 | score_dict[depth_b_id] = depth_b_score 58 | aggregate[depth_b_id].append(depth_a_score) 59 | cnt +=1 60 | 61 | gaps = [] 62 | cnt = 0 63 | for bid in score_dict.keys(): 64 | cnt += (np.average(aggregate[bid]) >= 4) 65 | gap = (np.average(aggregate[bid]) - score_dict[bid])/4 66 | gaps.append(max(0, gap) * (np.average(aggregate[bid]) >= 4)) 67 | overall_gaps += sum(gaps) 68 | overall_cnt += cnt 69 | output[f"Forward Discrepancy - Depth {depth_a} <=> Depth {depth_b}"] = sum(gaps) / cnt 70 | 71 | output["Forward Discrepancy - Overall"] = overall_gaps / overall_cnt 72 | 73 | return output 74 | 75 | def backward_disc(nodes, node_results): 76 | depths = [(2,3), (1,2)] 77 | output = {} 78 | 79 | overall_gaps = 0 80 | overall_cnt = 0 81 | for (depth_a, depth_b) in depths: 82 | gaps = [] 83 | cnt = 0 84 | aggregate = defaultdict(list) 85 | score_dict = defaultdict(float) 86 | id_map = defaultdict(list) 87 | for nodeid, data in node_results.items(): 88 | if data["depth"] == depth_b: 89 | depth_b_score = compute_score(data["score"]) 90 | predecessors = nodes[nodeid]["direct_predecessors"] 91 | if len(predecessors) > 0: 92 | for depth_a_id in predecessors: 93 | depth_a_d = node_results[depth_a_id] 94 | depth_a_score = compute_score(depth_a_d["score"]) 95 | score_dict[depth_a_id] = depth_a_score 96 | aggregate[depth_a_id].append(depth_b_score) 97 | id_map[depth_a_id].append(id) 98 | cnt +=1 99 | 100 | gaps = [] 101 | cnt = 0 102 | 103 | for aid in score_dict.keys(): 104 | cnt += (aggregate[aid][0] >= 4) 105 | gap = (aggregate[aid][0] - score_dict[aid]) / 4 106 | gaps.append(max(0, gap) * (aggregate[aid][0] >= 4)) 107 | overall_gaps += sum(gaps) 108 | overall_cnt += cnt 109 | output[f"Backward Discrepancy - Depth {depth_a} <=> Depth {depth_b}"] = sum(gaps) / cnt 110 | 111 | output["Backward Discrepancy - Overall"] = overall_gaps / overall_cnt 112 | return output 113 | 114 | 115 | 116 | def main(args): 117 | 118 | # Load src data 119 | dataloader = HFDepthQALoader() 120 | questions, nodes, node_to_q = dataloader.load_data() 121 | 122 | q_to_node = defaultdict(list) 123 | for nodeid, qid in node_to_q.items(): 124 | q_to_node[qid].append(nodeid) 125 | 126 | # Load evaluation file 127 | with open(args.input, "r") as fr: 128 | results = json.load(fr) 129 | 130 | metrics = {} 131 | depth_metric = depth_score(results) 132 | metrics.update(depth_metric) 133 | 134 | node_results = {} 135 | if 
list(results.keys())[0] in q_to_node: 136 | # Convert Q to Node 137 | for qid, nids in q_to_node.items(): 138 | for nodeid in nids: 139 | node_results[nodeid] = results[qid] 140 | else: 141 | node_results = results 142 | 143 | fwd_metric = forward_disc(nodes, node_results) 144 | metrics.update(fwd_metric) 145 | bwd_metric = backward_disc(nodes, node_results) 146 | metrics.update(bwd_metric) 147 | # print(metrics) 148 | 149 | with open(args.output_file, "w") as fw: 150 | json.dump(metrics, fw) 151 | 152 | 153 | if __name__ == "__main__": 154 | parser = ArgumentParser() 155 | # I/O arguments 156 | parser.add_argument( 157 | "--src_input", 158 | type=str, 159 | default="kaist-ai/DepthQA", 160 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 161 | ) 162 | parser.add_argument( 163 | "--input", 164 | type=str, 165 | required=True, 166 | help="Input JSON file name to calculate metric." 167 | ) 168 | parser.add_argument( 169 | "--output_file", 170 | type=str, 171 | required=True, 172 | help="Output JSON file name. Unless the parent directory is specified, will be saved under outputs/inference by default.", 173 | ) 174 | parser.add_argument( 175 | "--force_rerun", 176 | action="store_true", 177 | help="Force rerun even if output file exists.", 178 | ) 179 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 180 | args = parser.parse_args() 181 | assert os.path.exists(args.input), "Cannot find input file." 182 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 183 | main(args) 184 | -------------------------------------------------------------------------------- /src/inference/multi_turn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import warnings 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | from collections import defaultdict 7 | from tqdm import tqdm 8 | 9 | from src.data import HFDepthQALoader, filter_data_dict, slice_data_dict 10 | from src.model import VLLM 11 | from src.utils import ( 12 | SAMPLING_PARAMS, 13 | SYSTEM_PROMPT_ZERO_SHOT, 14 | USER_PROMPT_TEMPLATE_ZERO_SHOT, 15 | USER_PROMPT_TEMPLATE_LAST_TURN, 16 | get_output_path, 17 | show_random_inputs, 18 | ) 19 | 20 | DEBUG = False 21 | 22 | 23 | 24 | def multiturn_completions( 25 | model, 26 | inputs: List[str], 27 | tokenizer, 28 | total_len 29 | ): 30 | system_prompt = SYSTEM_PROMPT_ZERO_SHOT 31 | chat_dict = defaultdict(str) 32 | 33 | # Adjust batch size to fit the number of inputs 34 | # VLLM supports adaptive batch size already 35 | total_len = len(inputs) 36 | 37 | # Process initial batches with progress bar 38 | print("Processing initial chat...") 39 | for i in tqdm( 40 | range(0, len(inputs)), total=total_len, desc="Initial Chat" 41 | ): 42 | chat_input = inputs[i] 43 | 44 | for j, inst in enumerate(chat_input): 45 | if j == 0: 46 | input_ = apply_template_chat(system_prompt, inst["input"], tokenizer) 47 | else: 48 | input_.append({"role": "user", "content": inst["input"]}) 49 | 50 | if inst["id"] in chat_dict: 51 | input_.append({"role": "assistant", "content": chat_dict[inst["id"]]}) 52 | continue 53 | 54 | tokenized_input = tokenizer.apply_chat_template( 55 | input_, tokenize=False, add_generation_prompt=True) 56 | 57 | output = model.completions([tokenized_input], **SAMPLING_PARAMS) 58 | chat_dict[inst["id"]] = output[0] 59 | input_.append({"role": "assistant", "content": output[0]}) 60 | 61 | # Final aggregation and 
printing 62 | outputs_len = len(chat_dict) 63 | print(f"Processed {outputs_len} / {total_len} instances.") 64 | 65 | if outputs_len < total_len: 66 | warnings.warn("Some instances failed.") 67 | warnings.warn("They will be written as None in the output file.") 68 | raise Exception( 69 | f"Failed to generate feedback for {total_len - outputs_len} instances." 70 | ) 71 | 72 | return chat_dict 73 | 74 | 75 | def apply_template_chat(system_prompt, user_prompt, tokenizer): 76 | if tokenizer.chat_template and "system" not in tokenizer.chat_template: 77 | messages = [ 78 | {"role": "user", "content": system_prompt + "\n" + user_prompt}, 79 | ] 80 | else: 81 | messages = [ 82 | {"role": "system", "content": system_prompt}, 83 | {"role": "user", "content": user_prompt}, 84 | ] 85 | return messages 86 | 87 | 88 | def prepare_inputs_multiturn( 89 | questions: Dict[str, Dict], 90 | nodes: Dict[str, Dict], 91 | node_to_q: Dict[str, str] 92 | ) -> List[str]: 93 | 94 | chat_inputs = [] 95 | 96 | for nodeid, node in nodes.items(): 97 | chat = [] 98 | target_question_d = questions[node_to_q[nodeid]] 99 | target_question = target_question_d["question"] 100 | 101 | for predec_nodeid in node["direct_predecessors"]: 102 | predec_question_d = questions[node_to_q[predec_nodeid]] 103 | predec_question = predec_question_d["question"] 104 | user_prompt = USER_PROMPT_TEMPLATE_ZERO_SHOT.format(question=predec_question) 105 | chat.append({"id":predec_nodeid, "input": user_prompt}) 106 | last_user_prompt = USER_PROMPT_TEMPLATE_LAST_TURN.format(question=target_question) 107 | chat.append({"id": nodeid, "input": last_user_prompt}) 108 | 109 | chat_inputs.append(chat) 110 | 111 | return chat_inputs 112 | 113 | 114 | def main(args): 115 | global DEBUG 116 | DEBUG = args.debug 117 | 118 | # Load data 119 | dataloader = HFDepthQALoader() 120 | questions, nodes, node_to_q = dataloader.load_data() 121 | print(f"Loaded {len(questions)} questions and {len(nodes)} nodes.") 122 | 123 | # Load model 124 | model = VLLM(args.model_name, num_gpus=args.num_gpus) 125 | tokenizer = model.get_tokenizer() 126 | 127 | nodes = filter_data_dict(nodes, lambda node: node["depth"] > 1) 128 | 129 | if DEBUG: 130 | nodes = slice_data_dict(nodes, start=0, end=5) 131 | 132 | inputs = prepare_inputs_multiturn( 133 | questions, 134 | nodes, 135 | node_to_q 136 | ) 137 | 138 | if DEBUG: 139 | inputs = inputs[:5] 140 | 141 | # Inference 142 | predictions = multiturn_completions(model, inputs, tokenizer, len(node_to_q)) 143 | 144 | # Save results 145 | results = {} 146 | for idx, nodeid in enumerate(predictions.keys()): 147 | results[nodeid] = questions[node_to_q[nodeid]] 148 | results[nodeid].update({"predicted_answer": predictions[nodeid]}) 149 | 150 | output_path = get_output_path( 151 | output_file=args.output_file, default_output_dir="../../outputs/inference" 152 | ) 153 | 154 | with open(output_path, "w") as f: 155 | json.dump(results, f, indent=4) 156 | 157 | 158 | if __name__ == "__main__": 159 | parser = ArgumentParser() 160 | # I/O arguments 161 | parser.add_argument( 162 | "--model_name", 163 | type=str, 164 | required=True, 165 | help="Name of model hosted in Hugging Face under AutoModelForCausalLM", 166 | ) 167 | parser.add_argument( 168 | "--input", 169 | type=str, 170 | default="kaist-ai/DepthQA", 171 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 172 | ) 173 | parser.add_argument( 174 | "--output_file", 175 | type=str, 176 | required=True, 177 | help="Output JSON file name. 
Unless the parent directory is specified, will be saved under outputs/inference by default.", 178 | ) 179 | parser.add_argument( 180 | "--force_rerun", 181 | action="store_true", 182 | help="Force rerun even if output file exists.", 183 | ) 184 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 185 | # Compute arguments 186 | parser.add_argument( 187 | "--num_gpus", 188 | type=int, 189 | default=2, 190 | help="Number of GPUs to use for inference. Note that we use bfloat16 if available and float16 otherwise.", 191 | ) 192 | args = parser.parse_args() 193 | 194 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 195 | 196 | output_path = get_output_path( 197 | output_file=args.output_file, default_output_dir="../../outputs/inference" 198 | ) 199 | assert not ( 200 | os.path.exists(output_path) and not args.force_rerun 201 | ), f"Output file {output_path} already exists. Skipping inference." 202 | 203 | main(args) 204 | -------------------------------------------------------------------------------- /src/inference/multi_turn_openai.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import warnings 5 | from argparse import ArgumentParser 6 | from typing import Dict, List 7 | from collections import defaultdict 8 | from tqdm import tqdm 9 | 10 | from src.data import HFDepthQALoader, filter_data_dict, slice_data_dict 11 | from src.model import OpenAILLM 12 | from src.utils import ( 13 | SAMPLING_PARAMS_OPENAI, 14 | SYSTEM_PROMPT_ZERO_SHOT, 15 | USER_PROMPT_TEMPLATE_ZERO_SHOT, 16 | USER_PROMPT_TEMPLATE_LAST_TURN, 17 | get_output_path 18 | ) 19 | 20 | DEBUG = False 21 | 22 | 23 | async def multiturn_completions( 24 | model, 25 | inputs, 26 | total_len, 27 | max_retries=5 28 | ): 29 | system_prompt = SYSTEM_PROMPT_ZERO_SHOT 30 | chat_dict = defaultdict(str) 31 | 32 | # Adjust batch size to fit the number of inputs 33 | # VLLM supports adaptive batch size already 34 | total_len = len(inputs) 35 | batched_outputs = [] 36 | 37 | # Process initial batches with progress bar 38 | print("Processing initial chat...") 39 | for i in tqdm( 40 | range(0, len(inputs)), total=total_len, desc="Initial Chat" 41 | ): 42 | chat_input = inputs[i] 43 | 44 | for j, inst in enumerate(chat_input): 45 | if j == 0: 46 | input_ = apply_template_chat(system_prompt, inst["input"]) 47 | else: 48 | input_.append({"role": "user", "content": inst["input"]}) 49 | 50 | if inst["id"] in chat_dict: 51 | input_.append({"role": "assistant", "content": chat_dict[inst["id"]]}) 52 | continue 53 | 54 | 55 | output = await model.completions([input_], **SAMPLING_PARAMS_OPENAI) 56 | chat_dict[inst["id"]] = output[0] 57 | input_.append({"role": "assistant", "content": output[0]}) 58 | 59 | batched_outputs.append(input_) 60 | 61 | # Identify failed instances and prepare for retries 62 | 63 | to_retry_inputs = [] 64 | to_retry_indices = [] 65 | 66 | for i, output in enumerate(batched_outputs): 67 | for chat in output: 68 | if "assistant" in chat: 69 | if chat["content"] is None: # Parsing failed 70 | to_retry_inputs.append(inputs[i]) 71 | to_retry_indices.append(i) 72 | 73 | # Retry logic with progress bar 74 | retries = 0 75 | while to_retry_inputs and retries < max_retries: 76 | retries += 1 77 | print(f"Retrying failed batches: Attempt {retries}/{max_retries}") 78 | retry_outputs = [] 79 | for i in tqdm( 80 | range(0, len(to_retry_inputs)), desc=f"Retry Attempt {retries}" 81 | ): 82 | chat_input = inputs[i] 
83 | for j, inst in enumerate(chat_input): 84 | if j == 0: 85 | input_ = apply_template_chat(system_prompt, inst["input"]) 86 | else: 87 | input_.append({"role": "user", "content": inst["input"]}) 88 | 89 | if inst["id"] in chat_dict and chat_dict[inst["id"]] is not None: 90 | input_.append({"role": "assistant", "content": chat_dict[inst["id"]]}) 91 | continue 92 | output = await model.completions([input_], **SAMPLING_PARAMS_OPENAI) 93 | chat_dict[inst["id"]] = output[0] 94 | input_.append({"role": "assistant", "content": output[0]}) 95 | 96 | retry_outputs.append(input_) # Collect retried chats for the bookkeeping below 97 | 98 | new_to_retry_inputs = [] 99 | new_to_retry_indices = [] 100 | for idx, (retry_idx, output) in enumerate(zip(to_retry_indices, retry_outputs)): 101 | if any(m["role"] == "assistant" and m["content"] is None for m in output): # Still failing 102 | new_to_retry_inputs.append(to_retry_inputs[idx]) 103 | new_to_retry_indices.append(to_retry_indices[idx]) 104 | else: 105 | batched_outputs[retry_idx] = output # Update with successful retry 106 | 107 | to_retry_inputs = new_to_retry_inputs 108 | to_retry_indices = new_to_retry_indices 109 | 110 | # Final aggregation and printing 111 | outputs_len = len(chat_dict) 112 | print(f"Processed {outputs_len} / {total_len} instances.") 113 | ''' 114 | if outputs_len < total_len: 115 | warnings.warn("Some instances failed.") 116 | warnings.warn("They will be written as None in the output file.") 117 | raise Exception( 118 | f"Failed to generate feedback for {total_len - outputs_len} instances." 119 | ) 120 | ''' 121 | return chat_dict 122 | 123 | 124 | def apply_template_chat(system_prompt, user_prompt): 125 | messages = [ 126 | {"role": "system", "content": system_prompt}, 127 | {"role": "user", "content": user_prompt}, 128 | ] 129 | return messages 130 | 131 | 132 | def prepare_inputs_multiturn( 133 | questions: Dict[str, Dict], 134 | nodes: Dict[str, Dict], 135 | node_to_q: Dict[str, str] 136 | ) -> List[List[Dict]]: 137 | 138 | chat_inputs = [] 139 | 140 | for nodeid, node in nodes.items(): 141 | chat = [] 142 | target_question_d = questions[node_to_q[nodeid]] 143 | target_question = target_question_d["question"] 144 | 145 | for predec_nodeid in node["direct_predecessors"]: 146 | predec_question_d = questions[node_to_q[predec_nodeid]] 147 | predec_question = predec_question_d["question"] 148 | user_prompt = USER_PROMPT_TEMPLATE_ZERO_SHOT.format(question=predec_question) 149 | chat.append({"id":predec_nodeid, "input": user_prompt}) 150 | last_user_prompt = USER_PROMPT_TEMPLATE_LAST_TURN.format(question=target_question) 151 | chat.append({"id": nodeid, "input": last_user_prompt}) 152 | 153 | chat_inputs.append(chat) 154 | 155 | return chat_inputs 156 | 157 | 158 | async def main(args): 159 | global DEBUG 160 | DEBUG = args.debug 161 | 162 | # Load data 163 | dataloader = HFDepthQALoader() 164 | questions, nodes, node_to_q = dataloader.load_data() 165 | print(f"Loaded {len(questions)} questions and {len(nodes)} nodes.") 166 | 167 | # Load model 168 | model = OpenAILLM(args.model_name) 169 | nodes = filter_data_dict(nodes, lambda node: node["depth"] > 1) 170 | 171 | if DEBUG: 172 | nodes = slice_data_dict(nodes, start=0, end=5) 173 | 174 | inputs = prepare_inputs_multiturn( 175 | questions, 176 | nodes, 177 | node_to_q 178 | ) 179 | 180 | if DEBUG: 181 | inputs = inputs[:5] 182 | 183 | # Inference 184 | predictions = await multiturn_completions(model, inputs, len(node_to_q)) 185 | 186 | # Save results 187 | results = {} 188 | for idx, nodeid in enumerate(predictions.keys()): 189 | results[nodeid] = questions[node_to_q[nodeid]] 190 | results[nodeid].update({"predicted_answer":
predictions[nodeid]}) 191 | 192 | output_path = get_output_path( 193 | output_file=args.output_file, default_output_dir="../../outputs/inference" 194 | ) 195 | 196 | with open(output_path, "w") as f: 197 | json.dump(results, f, indent=4) 198 | 199 | 200 | if __name__ == "__main__": 201 | parser = ArgumentParser() 202 | # I/O arguments 203 | parser.add_argument( 204 | "--model_name", 205 | type=str, 206 | required=True, 207 | help="Name of model hosted in Hugging Face under AutoModelForCausalLM", 208 | ) 209 | parser.add_argument( 210 | "--input", 211 | type=str, 212 | default="kaist-ai/DepthQA", 213 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 214 | ) 215 | parser.add_argument( 216 | "--output_file", 217 | type=str, 218 | required=True, 219 | help="Output JSON file name. Unless the parent directory is specified, will be saved under outputs/inference by default.", 220 | ) 221 | parser.add_argument( 222 | "--force_rerun", 223 | action="store_true", 224 | help="Force rerun even if output file exists.", 225 | ) 226 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 227 | 228 | args = parser.parse_args() 229 | 230 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 231 | 232 | output_path = get_output_path( 233 | output_file=args.output_file, default_output_dir="../../outputs/inference" 234 | ) 235 | assert not ( 236 | os.path.exists(output_path) and not args.force_rerun 237 | ), f"Output file {output_path} already exists. Skipping inference." 238 | 239 | asyncio.run(main(args)) 240 | -------------------------------------------------------------------------------- /src/inference/single_turn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import warnings 4 | from argparse import ArgumentParser 5 | from typing import Dict, List 6 | 7 | from tqdm import tqdm 8 | 9 | from src.data import HFDepthQALoader, filter_data_dict, slice_data_dict 10 | from src.model import VLLM 11 | from src.utils import ( 12 | SAMPLING_PARAMS, 13 | SYSTEM_PROMPT_CTX, 14 | SYSTEM_PROMPT_ZERO_SHOT, 15 | USER_PROMPT_TEMPLATE_CTX, 16 | USER_PROMPT_TEMPLATE_ZERO_SHOT, 17 | get_output_path, 18 | show_random_inputs, 19 | ) 20 | 21 | DEBUG = False 22 | 23 | 24 | # Model inference (Use offline batching) 25 | def batch_completions( 26 | model, 27 | inputs: List[str], 28 | batch_size, 29 | ): 30 | batched_outputs = [] 31 | 32 | # Adjust batch size to fit the number of inputs 33 | # VLLM supports adaptive batch size already 34 | total_batches = len(inputs) // batch_size + ( 35 | 1 if len(inputs) % batch_size > 0 else 0 36 | ) 37 | total_len = len(inputs) 38 | 39 | # Process initial batches with progress bar 40 | print("Processing initial batches...") 41 | for i in tqdm( 42 | range(0, len(inputs), batch_size), total=total_batches, desc="Initial Batches" 43 | ): 44 | batch_inputs = inputs[i : i + batch_size] 45 | batch_outputs = model.completions( 46 | batch_inputs, **SAMPLING_PARAMS, use_tqdm=True 47 | ) 48 | batched_outputs.extend(batch_outputs) 49 | 50 | # Final aggregation and printing 51 | outputs_len = len(batched_outputs) 52 | print(f"Processed {outputs_len}/{total_len} instances.") 53 | 54 | if outputs_len < total_len: 55 | warnings.warn("Some instances failed.") 56 | warnings.warn("They will be written as None in the output file.") 57 | raise Exception( 58 | f"Failed to generate feedback for {total_len - outputs_len} instances." 
59 | ) 60 | 61 | for i, output in enumerate(batched_outputs): 62 | if output == "": 63 | print("Empty output") 64 | batched_outputs[i] = None 65 | 66 | if DEBUG: 67 | print("Checking the results") 68 | for output in batched_outputs[:5]: 69 | print(output) 70 | 71 | return batched_outputs 72 | 73 | 74 | def apply_template_chat(system_prompt, user_prompt, tokenizer): 75 | if tokenizer.chat_template and "system" not in tokenizer.chat_template: 76 | messages = [ 77 | {"role": "user", "content": system_prompt + "\n" + user_prompt}, 78 | ] 79 | else: 80 | messages = [ 81 | {"role": "system", "content": system_prompt}, 82 | {"role": "user", "content": user_prompt}, 83 | ] 84 | return ( 85 | tokenizer.apply_chat_template( # automatically format to default chat template 86 | messages, tokenize=False, add_generation_prompt=True 87 | ) 88 | ) 89 | 90 | 91 | def prepare_inputs_zero_shot(questions: Dict[str, Dict], tokenizer) -> List[str]: 92 | inputs = [] 93 | system_prompt = SYSTEM_PROMPT_ZERO_SHOT 94 | for question_d in questions.values(): 95 | target_question = question_d["question"] 96 | user_prompt = USER_PROMPT_TEMPLATE_ZERO_SHOT.format(question=target_question) 97 | 98 | input_str = apply_template_chat(system_prompt, user_prompt, tokenizer) 99 | inputs.append(input_str) 100 | 101 | return inputs 102 | 103 | 104 | def prepare_inputs_ctx( 105 | questions: Dict[str, Dict], 106 | nodes: Dict[str, Dict], 107 | node_to_q: Dict[str, str], 108 | tokenizer, 109 | use_gold: bool, 110 | ) -> List[str]: 111 | inputs = [] 112 | system_prompt = SYSTEM_PROMPT_CTX 113 | for nodeid, node in nodes.items(): 114 | target_question_d = questions[node_to_q[nodeid]] 115 | target_question = target_question_d["question"] 116 | predecessor_pairs = "" 117 | for predec_nodeid in node["direct_predecessors"]: 118 | predec_question_d = questions[node_to_q[predec_nodeid]] 119 | predec_question = predec_question_d["question"] 120 | if use_gold: 121 | predec_answer = predec_question_d["answer"] 122 | else: 123 | predec_answer = predec_question_d["predicted_answer"] 124 | pair = f"Q: {predec_question}\nA: {predec_answer}\n" 125 | predecessor_pairs += pair 126 | user_prompt = USER_PROMPT_TEMPLATE_CTX.format( 127 | qa_pairs=predecessor_pairs, question=target_question 128 | ) 129 | 130 | input_str = apply_template_chat(system_prompt, user_prompt, tokenizer) 131 | inputs.append(input_str) 132 | 133 | return inputs 134 | 135 | 136 | def main(args): 137 | global DEBUG 138 | DEBUG = args.debug 139 | 140 | # Load data 141 | dataloader = HFDepthQALoader() 142 | if args.task_type == "prompt-pred": 143 | with open(args.input) as f: 144 | questions = json.load(f) 145 | _, nodes, node_to_q = dataloader.load_data(except_questions=True) 146 | else: 147 | questions, nodes, node_to_q = dataloader.load_data() 148 | print(f"Loaded {len(questions)} questions and {len(nodes)} nodes.") 149 | 150 | # Load model 151 | model = VLLM(args.model_name, num_gpus=args.num_gpus) 152 | tokenizer = model.get_tokenizer() 153 | 154 | # Prepare inputs 155 | if args.task_type == "zero-shot": 156 | if DEBUG: 157 | questions = slice_data_dict(questions, start=0, end=5) 158 | inputs = prepare_inputs_zero_shot(questions, tokenizer) 159 | else: 160 | nodes = filter_data_dict(nodes, lambda node: node["depth"] > 1) 161 | if DEBUG: 162 | nodes = slice_data_dict(nodes, start=0, end=5) 163 | inputs = prepare_inputs_ctx( 164 | questions, 165 | nodes, 166 | node_to_q, 167 | tokenizer, 168 | use_gold=args.task_type == "prompt-gold", 169 | ) 170 | show_random_inputs(inputs) 171 | 
172 | if DEBUG: 173 | inputs = inputs[:5] 174 | 175 | # Inference 176 | predictions = batch_completions(model, inputs, args.batch_size) 177 | 178 | # Save results 179 | results = {} 180 | if args.task_type == "zero-shot": 181 | for idx, (qid, question_d) in enumerate(questions.items()): 182 | results[qid] = question_d 183 | results[qid].update({"predicted_answer": predictions[idx]}) 184 | else: 185 | for idx, nodeid in enumerate(nodes.keys()): 186 | results[nodeid] = questions[node_to_q[nodeid]] 187 | results[nodeid].update({"predicted_answer": predictions[idx]}) 188 | 189 | output_path = get_output_path( 190 | output_file=args.output_file, default_output_dir="../../outputs/inference" 191 | ) 192 | with open(output_path, "w") as f: 193 | json.dump(results, f, indent=4) 194 | 195 | 196 | if __name__ == "__main__": 197 | parser = ArgumentParser() 198 | # I/O arguments 199 | parser.add_argument( 200 | "--model_name", 201 | type=str, 202 | required=True, 203 | help="Name of model hosted in Hugging Face under AutoModelForCausalLM", 204 | ) 205 | parser.add_argument( 206 | "--input", 207 | type=str, 208 | default="kaist-ai/DepthQA", 209 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 210 | ) 211 | parser.add_argument( 212 | "--output_file", 213 | type=str, 214 | required=True, 215 | help="Output JSON file name. Unless the parent directory is specified, will be saved under outputs/inference by default.", 216 | ) 217 | parser.add_argument( 218 | "--force_rerun", 219 | action="store_true", 220 | help="Force rerun even if output file exists.", 221 | ) 222 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 223 | # Compute arguments 224 | parser.add_argument( 225 | "--batch_size", type=int, default=8, help="Batch size for inference." 226 | ) 227 | parser.add_argument( 228 | "--num_gpus", 229 | type=int, 230 | default=2, 231 | help="Number of GPUs to use for inference. Note that we use bfloat16 if available and float16 otherwise.", 232 | ) 233 | # Prompt arguments 234 | parser.add_argument( 235 | "--task_type", 236 | type=str, 237 | default="zero-shot", 238 | choices=["zero-shot", "prompt-gold", "prompt-pred"], 239 | help="Task type for the model, which determines the input text.", 240 | ) 241 | args = parser.parse_args() 242 | 243 | assert not ( 244 | args.task_type == "prompt-pred" and not args.input.endswith(".json") 245 | ), "Input file for prompt-pred task should be a JSON file that contains zero-shot predictions." 246 | 247 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 248 | 249 | output_path = get_output_path( 250 | output_file=args.output_file, default_output_dir="../../outputs/inference" 251 | ) 252 | assert not ( 253 | os.path.exists(output_path) and not args.force_rerun 254 | ), f"Output file {output_path} already exists. Skipping inference." 255 | 256 | main(args) 257 | -------------------------------------------------------------------------------- /outputs/inference/gpt-3.5-turbo_prompt-gold.json: -------------------------------------------------------------------------------- 1 | { 2 | "1_d3": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. 
Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "Your understanding is on the right track! Both the Law of Large Numbers and the Central Limit Theorem are related concepts in statistics, but they focus on different aspects of sampling and estimation.\n\nThe Law of Large Numbers primarily deals with the behavior of sample means as the sample size increases. It states that as the sample size increases, the sample mean gets closer to the population mean. In other words, it emphasizes the convergence of sample statistics to population parameters as sample size grows. This is particularly useful for reducing sampling error and improving the accuracy of estimates.\n\nOn the other hand, the Central Limit Theorem focuses on the distribution of sample means. It states that regardless of the shape of the population distribution, the distribution of sample means will approach a normal distribution as the sample size increases. This is essential for making statistical inferences and justifying the use of certain statistical techniques.\n\nIn summary, while both concepts are related and highlight the importance of sample size in statistical analysis, the Law of Large Numbers pertains more to the convergence of sample statistics to population parameters, while the Central Limit Theorem relates to the distributional properties of sample means." 
8 | }, 9 | "1_d2_n1": { 10 | "qid": "d2_q1", 11 | "depth": 2, 12 | "question": "How can the Law of Large Numbers be applied to improve the accuracy of sample means?", 13 | "answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. According to this law, as the size of a sample increases, the sample mean will get closer to the population mean. This means that increasing the number of data points in your sample will make your sample mean a more accurate estimate of the actual population mean. It reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 14 | "predicted_answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. As the sample size grows, the mean of the sample will approach the true population mean more closely, reducing the margin of error and increasing the accuracy of the sample mean estimate. This is because a larger sample size provides a more representative distribution of the population, reducing the impact of random fluctuations and leading to more reliable and precise estimates." 15 | }, 16 | "1_d2_n2": { 17 | "qid": "d2_q2", 18 | "depth": 2, 19 | "question": "What is the significance of the distribution of sample means according to the Central Limit Theorem?", 20 | "answer": "The Central Limit Theorem (CLT) is a statistical theory that states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximately equal to the mean of the population. The distribution of sample means will approach a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.\n\nThe significance of this is twofold:\n\n1. It allows us to make inferences about the population from sample data. Even if the original population is not normally distributed, we can apply techniques assuming normality if the sample size is large enough.\n\n2. It underpins many statistical tests and procedures that assume a normal distribution, such as confidence intervals and hypothesis testing. Without the CLT, these techniques would not have a solid theoretical foundation.\n\nIn essence, the Central Limit Theorem allows us to apply probabilistic or inferential statistics to data in a robust and justified manner.", 21 | "predicted_answer": "The significance of the distribution of sample means according to the Central Limit Theorem is that when multiple random samples of a certain size are taken from a population, the distribution of the sample means will approximate a normal distribution, regardless of the shape of the population distribution. This allows for the application of various statistical analysis methods that rely on the assumption of normality, such as calculating confidence intervals and conducting hypothesis testing. The Central Limit Theorem provides a theoretical foundation for understanding how sample means behave and helps in making statistical inferences about populations based on sample data." 22 | }, 23 | "1_d2_n3": { 24 | "qid": "d2_q3", 25 | "depth": 2, 26 | "question": "How does sample size affect the accuracy of statistical estimates in relation to the Law of Large Numbers?", 27 | "answer": "The Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. 
In other words, the accuracy of statistical estimates improves with an increase in sample size. This is because larger samples tend to include a greater variety of observations, which therefore gives a better representation of the population. This minimizes the sampling error and the results are more reliable and valid. So, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 28 | "predicted_answer": "Sample size plays a crucial role in the accuracy of statistical estimates in relation to the Law of Large Numbers. As the sample size increases, the accuracy of statistical estimates also improves. This is because the Law of Large Numbers states that as the sample size grows, the sample mean gets closer to the population mean. Therefore, with a larger sample size, the estimates derived from the sample are more likely to represent the true population parameter accurately. In essence, a larger sample size reduces the impact of random chance and variability, leading to more reliable and precise statistical estimates." 29 | }, 30 | "1_d2_n4": { 31 | "qid": "d2_q4", 32 | "depth": 2, 33 | "question": "Explain how increasing the sample size influences the distribution of sample means as per the Central Limit Theorem?", 34 | "answer": "The Central Limit Theorem (CLT) states that if you have a population with any shape of distribution (even a non-normal distribution), the distribution of sample means obtained through sufficiently large samples drawn from that population will approximate a normal distribution. \n\nIn relation to sample size, CLT suggests that the larger the sample size, the closer the sample mean gets to the population mean. This is due to the fact that increasing the sample size reduces the standard error, thus narrowing the spread of the distribution. \n\nThe distribution of sample means becomes less skewed and more closely approximates a normal distribution as the sample size increases, regardless of the shape of the population distribution. This is one of the key aspects of the Central Limit theorem. Consequently, larger sample sizes lead to more reliable and valid inferences about the population.", 35 | "predicted_answer": "Increasing the sample size influences the distribution of sample means by aligning it more closely to a normal distribution, as per the Central Limit Theorem. When the sample size is increased, the sample mean distribution becomes narrower and taller, resembling a bell curve. This occurs because with a larger sample size, the variability in the sample means decreases, leading to a more normal distribution shape. Essentially, as the sample size increases, the distribution of sample means becomes more symmetrical, centered around the population mean, and conforms more to the characteristics of a normal distribution." 
36 | } 37 | } -------------------------------------------------------------------------------- /src/evaluation/batch_eval_openai.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import warnings 5 | from argparse import ArgumentParser 6 | from typing import Dict, List 7 | 8 | from src.model import OpenAIBatchClient 9 | from src.utils import ( 10 | SAMPLING_PARAMS_OPENAI, 11 | SYSTEM_PROMPT_EVAL, 12 | USER_PROMPT_TEMPLATE_EVAL, 13 | calculate_cost, 14 | format_messages, 15 | get_output_path, 16 | show_random_inputs, 17 | ) 18 | 19 | from .output_parser import parse_judgment 20 | 21 | DEBUG = False 22 | 23 | 24 | def prepare_inputs(predictions: Dict[str, Dict], eval_model_name: str) -> List[Dict]: 25 | inputs = [] 26 | system_prompt = SYSTEM_PROMPT_EVAL 27 | for id, record in predictions.items(): # id can be qid or nodeid 28 | instruction = record["question"].strip() 29 | reference_answer = record["answer"].strip() 30 | response = record["predicted_answer"].strip() 31 | user_prompt = USER_PROMPT_TEMPLATE_EVAL.format( 32 | instruction=instruction, 33 | reference_answer=reference_answer, 34 | response=response, 35 | ) 36 | messages = format_messages(system_prompt, user_prompt) 37 | inputs.append( 38 | { 39 | "custom_id": id, 40 | "method": "POST", 41 | "url": "/v1/chat/completions", 42 | "body": { 43 | "model": eval_model_name, 44 | "messages": messages, 45 | **SAMPLING_PARAMS_OPENAI, 46 | }, 47 | } 48 | ) 49 | return inputs 50 | 51 | 52 | def prepare_input_file(prediction_file, eval_model_name): 53 | with open(prediction_file) as f: 54 | predictions = json.load(f) 55 | 56 | inputs = prepare_inputs(predictions, eval_model_name) 57 | show_random_inputs(inputs) 58 | if DEBUG: 59 | inputs = inputs[:5] 60 | 61 | batch_eval_input_file = prediction_file.replace(".json", "_batch_eval_input.jsonl") 62 | with open(batch_eval_input_file, "w") as f: 63 | for input in inputs: 64 | f.write(json.dumps(input) + "\n") 65 | return batch_eval_input_file 66 | 67 | 68 | def prepare_output_file(outputs, prediction_file, output_file): 69 | with open(prediction_file) as f: 70 | predictions = json.load(f) 71 | 72 | input_token_cnt = 0 73 | output_token_cnt = 0 74 | 75 | batch_output_file = args.output_file.replace(".json", "_batch_eval_output.jsonl") 76 | batch_output_writer = open(batch_output_file, "w") 77 | for output in outputs.iter_lines(): 78 | batch_output_writer.write(output + "\n") 79 | 80 | output = json.loads(output) 81 | custom_id = output["custom_id"] 82 | judgment = output["response"]["body"]["choices"][0]["message"]["content"] 83 | feedback, score = parse_judgment(judgment) 84 | predictions[custom_id].update({"feedback": feedback, "score": score}) 85 | 86 | input_token_cnt += output["response"]["body"]["usage"]["prompt_tokens"] 87 | output_token_cnt += output["response"]["body"]["usage"]["completion_tokens"] 88 | batch_output_writer.close() 89 | 90 | with open(output_file, "w") as f: 91 | json.dump(predictions, f, indent=4) 92 | 93 | return input_token_cnt, output_token_cnt 94 | 95 | 96 | def main_individual(args): 97 | global DEBUG 98 | DEBUG = args.debug 99 | 100 | client = OpenAIBatchClient() 101 | if args.mode == "create": 102 | batch_eval_input_file = prepare_input_file( 103 | args.prediction_file, args.eval_model_name 104 | ) 105 | batch = client.create_batch(batch_eval_input_file, args.description) 106 | print(f"Batch created: {batch}") 107 | return batch.id 108 | 109 | elif args.mode == "check": 110 | status, 
batch_output_file_id = client.check_batch(args.batch_id) 111 | print(f"{args.batch_id} status: {status}") 112 | return status, batch_output_file_id 113 | 114 | elif args.mode == "cancel": 115 | client.cancel_batch(args.batch_id) 116 | 117 | elif args.mode == "list": 118 | client.list_batches() 119 | 120 | elif args.mode == "retrieve": 121 | outputs = client.retrieve_batch(args.batch_output_file_id) 122 | 123 | output_path = get_output_path( 124 | output_file=args.output_file, default_output_dir="../../outputs/evaluation" 125 | ) 126 | args.output_file = output_path 127 | 128 | input_token_cnt, output_token_cnt = prepare_output_file( 129 | outputs, args.prediction_file, args.output_file 130 | ) 131 | 132 | cost = calculate_cost(args.eval_model_name, input_token_cnt, output_token_cnt) 133 | print(f"Cost: {cost:.2f} USD") 134 | 135 | else: 136 | raise ValueError("Invalid mode") 137 | 138 | 139 | def main_auto(args): 140 | client = OpenAIBatchClient() 141 | 142 | # Step 1: Create batch request 143 | batch_eval_input_file = prepare_input_file( 144 | args.prediction_file, args.eval_model_name 145 | ) 146 | batch = client.create_batch(batch_eval_input_file, args.description) 147 | print(f"Batch created: {batch}") 148 | batch_id = batch.id 149 | 150 | # Step 2: Check status periodically 151 | while True: 152 | status, batch_output_file_id = client.check_batch(batch_id) 153 | print(f"Current status: {status}") 154 | 155 | if status == "completed": 156 | break 157 | elif status in ["failed", "cancelling", "cancelled", "expired"]: 158 | raise Exception(f"Batch failed with status: {status}") 159 | 160 | time.sleep(30) # Wait for 30 seconds before checking again 161 | print(f"Batch completed. Output file ID: {batch_output_file_id}") 162 | 163 | # Step 3: Retrieve results 164 | outputs = client.retrieve_batch(batch_output_file_id) 165 | output_path = get_output_path( 166 | output_file=args.output_file, default_output_dir="../../outputs/evaluation" 167 | ) 168 | args.output_file = output_path 169 | 170 | print(f"Retrieved results saved to {output_path}") 171 | input_token_cnt, output_token_cnt = prepare_output_file( 172 | outputs, args.prediction_file, args.output_file 173 | ) 174 | 175 | cost = calculate_cost(args.eval_model_name, input_token_cnt, output_token_cnt) 176 | print(f"Cost: {cost:.2f} USD") 177 | 178 | 179 | if __name__ == "__main__": 180 | parser = ArgumentParser() 181 | parser.add_argument( 182 | "--mode", 183 | type=str, 184 | choices=["auto", "create", "check", "cancel", "list", "retrieve"], 185 | ) 186 | # Arguments for creating or retrieving a batch job 187 | parser.add_argument( 188 | "--eval_model_name", 189 | type=str, 190 | default="gpt-4-0125-preview", 191 | help="OpenAI evaluator model name", 192 | ) 193 | # parser.add_argument("--dataset", type=str, default="kaist-ai/DepthQA", help="Dataset name in Hugging Face") # TODO: Allow retrieving metadata from dataset in case the dataset changes 194 | parser.add_argument( 195 | "--prediction_file", type=str, help="Output file containing model predictions" 196 | ) 197 | parser.add_argument("--description", type=str, help="Description of the batch job") 198 | parser.add_argument( 199 | "--batch_output_file_id", type=str, help="Output file ID of the batch job" 200 | ) 201 | parser.add_argument( 202 | "--output_file", 203 | type=str, 204 | help="Output JSON file to write the results. Unless the parent directory is specified, will be saved under outputs/inference by default. 
The same file but with a .jsonl extension will be created intermediately to store the batch job output.", 205 | ) 206 | # Argument for checking or canceling a batch job 207 | parser.add_argument("--batch_id", type=str, help="Batch ID of submitted job") 208 | # Argument for debugging 209 | parser.add_argument("--debug", action="store_true", help="Debug mode") 210 | args = parser.parse_args() 211 | 212 | if args.mode == "auto": 213 | warnings.warn( 214 | "Creating a batch job in auto mode will overwrite the output file. Make sure you want to proceed." 215 | ) 216 | assert args.prediction_file and args.prediction_file.endswith( 217 | ".json" 218 | ), "A valid JSON prediction file is required for creating a batch job" 219 | assert args.description, "Description is required for creating a batch job" 220 | assert args.output_file and args.output_file.endswith( 221 | ".json" 222 | ), "A valid JSON output file is required for retrieving a batch job" 223 | main_auto(args) 224 | else: 225 | if args.mode == "create": 226 | assert args.prediction_file and args.prediction_file.endswith( 227 | ".json" 228 | ), "A valid JSON prediction file is required for creating a batch job" 229 | assert args.description, "Description is required for creating a batch job" 230 | 231 | elif args.mode == "check" or args.mode == "cancel": 232 | assert args.batch_id and args.batch_id.startswith( 233 | "batch_" 234 | ), "A valid batch ID is required for checking a batch job" 235 | 236 | elif args.mode == "retrieve": 237 | assert args.batch_output_file_id and args.batch_output_file_id.startswith( 238 | "file-" 239 | ), "Output file ID is required for retrieving a batch job" 240 | assert args.output_file and args.output_file.endswith( 241 | ".json" 242 | ), "A valid JSON output file is required for retrieving a batch job" 243 | 244 | main_individual(args) 245 | -------------------------------------------------------------------------------- /src/inference/single_turn_openai.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import warnings 5 | from argparse import ArgumentParser 6 | from typing import Dict, List 7 | 8 | from tqdm import tqdm 9 | 10 | from src.data import HFDepthQALoader, filter_data_dict, slice_data_dict 11 | from src.model import OpenAILLM 12 | from src.utils import ( 13 | SAMPLING_PARAMS_OPENAI, 14 | SYSTEM_PROMPT_CTX, 15 | SYSTEM_PROMPT_ZERO_SHOT, 16 | USER_PROMPT_TEMPLATE_CTX, 17 | USER_PROMPT_TEMPLATE_ZERO_SHOT, 18 | format_messages, 19 | get_output_path, 20 | show_random_inputs, 21 | ) 22 | 23 | DEBUG = False 24 | 25 | 26 | # Model inference (Use offline batching) 27 | async def batch_completions_with_retries( 28 | model, 29 | inputs, 30 | batch_size, 31 | max_retries=5, 32 | ): 33 | 34 | batched_outputs = [] 35 | 36 | total_batches = len(inputs) // batch_size + ( 37 | 1 if len(inputs) % batch_size > 0 else 0 38 | ) 39 | total_len = len(inputs) 40 | 41 | # Process initial batches with progress bar 42 | print("Processing initial batches...") 43 | for i in tqdm( 44 | range(0, len(inputs), batch_size), total=total_batches, desc="Initial Batches" 45 | ): 46 | batch_inputs = inputs[i : i + batch_size] 47 | batch_outputs = await model.completions(batch_inputs, **SAMPLING_PARAMS_OPENAI) 48 | batched_outputs.extend(batch_outputs) 49 | 50 | # Identify failed instances and prepare for retries 51 | to_retry_inputs = [] 52 | to_retry_indices = [] 53 | for i, output in enumerate(batched_outputs): 54 | 55 | if output is None: # 
Parsing failed 56 | to_retry_inputs.append(inputs[i]) 57 | to_retry_indices.append(i) 58 | 59 | # Retry logic with progress bar 60 | retries = 0 61 | while to_retry_inputs and retries < max_retries: 62 | retries += 1 63 | print(f"Retrying failed batches: Attempt {retries}/{max_retries}") 64 | retry_outputs = [] 65 | for i in tqdm( 66 | range(0, len(to_retry_inputs), batch_size), desc=f"Retry Attempt {retries}" 67 | ): 68 | batch_inputs = to_retry_inputs[i : i + batch_size] 69 | batch_outputs = await model.completions( 70 | batch_inputs, **SAMPLING_PARAMS_OPENAI 71 | ) 72 | 73 | assert len(batch_outputs) == len(batch_inputs) 74 | retry_outputs.extend(batch_outputs) 75 | 76 | new_to_retry_inputs = [] 77 | new_to_retry_indices = [] 78 | for idx, (retry_idx, output) in enumerate(zip(to_retry_indices, retry_outputs)): 79 | 80 | if output is None: # Still failing 81 | new_to_retry_inputs.append(to_retry_inputs[idx]) 82 | new_to_retry_indices.append(to_retry_indices[idx]) 83 | else: 84 | batched_outputs[retry_idx] = output # Update with successful retry 85 | 86 | to_retry_inputs = new_to_retry_inputs 87 | to_retry_indices = new_to_retry_indices 88 | 89 | # Final aggregation and printing 90 | outputs_len = len(batched_outputs) 91 | print(f"Processed {outputs_len}/{total_len} instances.") 92 | 93 | if outputs_len < total_len: 94 | warnings.warn("Some instances failed to generate.") 95 | warnings.warn("They will be written as None in the output file.") 96 | raise Exception(f"Failed to generate for {total_len - outputs_len} instances.") 97 | 98 | predictions = [] 99 | 100 | for output in tqdm(batched_outputs, desc="Finalizing"): 101 | if output is not None: 102 | predictions.append(output) 103 | else: 104 | predictions.append(None) 105 | if DEBUG: 106 | print("Checking the results") 107 | for prediction in predictions[:5]: 108 | print(prediction) 109 | 110 | return predictions 111 | 112 | 113 | def prepare_inputs_zero_shot(questions: Dict[str, Dict]) -> List[str]: 114 | inputs = [] 115 | system_prompt = SYSTEM_PROMPT_ZERO_SHOT 116 | for question_d in questions.values(): 117 | target_question = question_d["question"] 118 | user_prompt = USER_PROMPT_TEMPLATE_ZERO_SHOT.format(question=target_question) 119 | messages = format_messages(system_prompt, user_prompt) 120 | inputs.append(messages) 121 | 122 | return inputs 123 | 124 | 125 | def prepare_inputs_ctx( 126 | questions: Dict[str, Dict], 127 | nodes: Dict[str, Dict], 128 | node_to_q: Dict[str, str], 129 | use_gold: bool, 130 | ) -> List[str]: 131 | inputs = [] 132 | system_prompt = SYSTEM_PROMPT_CTX 133 | for nodeid, node in nodes.items(): 134 | target_question_d = questions[node_to_q[nodeid]] 135 | target_question = target_question_d["question"] 136 | predecessor_pairs = "" 137 | for predec_nodeid in node["direct_predecessors"]: 138 | predec_question_d = questions[node_to_q[predec_nodeid]] 139 | predec_question = predec_question_d["question"] 140 | if use_gold: 141 | predec_answer = predec_question_d["answer"] 142 | else: 143 | predec_answer = predec_question_d["predicted_answer"] 144 | pair = f"Q: {predec_question}\nA: {predec_answer}\n" 145 | predecessor_pairs += pair 146 | user_prompt = USER_PROMPT_TEMPLATE_CTX.format( 147 | qa_pairs=predecessor_pairs, question=target_question 148 | ) 149 | 150 | messages = format_messages(system_prompt, user_prompt) 151 | inputs.append(messages) 152 | 153 | return inputs 154 | 155 | 156 | async def main(args): 157 | global DEBUG 158 | DEBUG = args.debug 159 | 160 | # Load data 161 | dataloader = 
HFDepthQALoader() 162 | if args.task_type == "prompt-pred": 163 | with open(args.input) as f: 164 | questions = json.load(f) 165 | _, nodes, node_to_q = dataloader.load_data(except_questions=True) 166 | else: 167 | questions, nodes, node_to_q = dataloader.load_data() 168 | print(f"Loaded {len(questions)} questions and {len(nodes)} nodes.") 169 | 170 | # Load model 171 | model = OpenAILLM(args.model_name) 172 | 173 | # Prepare inputs 174 | if args.task_type == "zero-shot": 175 | if DEBUG: 176 | questions = slice_data_dict(questions, start=0, end=5) 177 | inputs = prepare_inputs_zero_shot(questions) 178 | else: 179 | nodes = filter_data_dict(nodes, lambda node: node["depth"] > 1) 180 | if DEBUG: 181 | nodes = slice_data_dict(nodes, start=0, end=5) 182 | inputs = prepare_inputs_ctx( 183 | questions, 184 | nodes, 185 | node_to_q, 186 | use_gold=args.task_type == "prompt-gold", 187 | ) 188 | show_random_inputs(inputs) 189 | 190 | if DEBUG: 191 | inputs = inputs[:5] 192 | 193 | # Inference 194 | predictions = await batch_completions_with_retries(model, inputs, args.batch_size) 195 | 196 | # Save results 197 | results = {} 198 | if args.task_type == "zero-shot": 199 | for idx, (qid, question_d) in enumerate(questions.items()): 200 | results[qid] = question_d 201 | results[qid].update({"predicted_answer": predictions[idx]}) 202 | else: 203 | for idx, nodeid in enumerate(nodes.keys()): 204 | results[nodeid] = questions[node_to_q[nodeid]] 205 | results[nodeid].update({"predicted_answer": predictions[idx]}) 206 | 207 | output_path = get_output_path( 208 | output_file=args.output_file, default_output_dir="../../outputs/inference" 209 | ) 210 | with open(output_path, "w") as f: 211 | json.dump(results, f, indent=4) 212 | 213 | 214 | if __name__ == "__main__": 215 | parser = ArgumentParser() 216 | # I/O arguments 217 | parser.add_argument( 218 | "--model_name", 219 | type=str, 220 | required=True, 221 | help="Name of model hosted in OpenAI", 222 | ) 223 | parser.add_argument( 224 | "--input", 225 | type=str, 226 | default="kaist-ai/DepthQA", 227 | help="Dataset name in Hugging Face (for zero-shot) or local zero-shot JSON output file (for prompt-*)", 228 | ) 229 | parser.add_argument( 230 | "--output_file", 231 | type=str, 232 | required=True, 233 | help="Output JSON file name. Will be saved under outputs/inference by default.", 234 | ) 235 | parser.add_argument( 236 | "--force_rerun", 237 | action="store_true", 238 | help="Force rerun even if output file exists.", 239 | ) 240 | parser.add_argument("--debug", action="store_true", help="Debug mode.") 241 | # Compute arguments 242 | parser.add_argument( 243 | "--batch_size", type=int, default=8, help="Batch size for inference." 244 | ) 245 | # Prompt arguments 246 | parser.add_argument( 247 | "--task_type", 248 | type=str, 249 | default="zero-shot", 250 | choices=["zero-shot", "prompt-gold", "prompt-pred"], 251 | help="Task type for the model, which determines the input text.", 252 | ) 253 | args = parser.parse_args() 254 | 255 | assert not ( 256 | args.task_type == "prompt-pred" and not args.input.endswith(".json") 257 | ), "Input file for prompt-pred task should be a JSON file that contains zero-shot predictions." 258 | 259 | assert args.output_file.endswith(".json"), "Output file must be a JSON file." 
260 | 261 | output_path = get_output_path( 262 | output_file=args.output_file, default_output_dir="../../outputs/inference" 263 | ) 264 | assert not ( 265 | os.path.exists(output_path) and not args.force_rerun 266 | ), f"Output file {output_path} already exists. Skipping inference." 267 | 268 | asyncio.run(main(args)) 269 | -------------------------------------------------------------------------------- /outputs/inference/llama3_8b_prompt-gold.json: -------------------------------------------------------------------------------- 1 | { 2 | "1_d3": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. 
The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.\n\nSo, your understanding is partially correct, but it seems you were connecting the LLN to the aspect of the CLT that deals with the distribution of sample means becoming normal. While the LLN does indeed provide information about the convergence of sample means to the population mean, the CLT is a distinct concept that explains the specific shape of the distribution of sample means." 8 | }, 9 | "1_d2_n1": { 10 | "qid": "d2_q1", 11 | "depth": 2, 12 | "question": "How can the Law of Large Numbers be applied to improve the accuracy of sample means?", 13 | "answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. According to this law, as the size of a sample increases, the sample mean will get closer to the population mean. This means that increasing the number of data points in your sample will make your sample mean a more accurate estimate of the actual population mean. It reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 14 | "predicted_answer": "I can answer that for you!\n\nAccording to the Law of Large Numbers, increasing the sample size can improve the accuracy of sample means. As the size of the sample increases, the sample mean will get closer to the population mean, making it a more accurate estimate of the actual population mean. This means that having a larger sample size reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result." 15 | }, 16 | "1_d2_n2": { 17 | "qid": "d2_q2", 18 | "depth": 2, 19 | "question": "What is the significance of the distribution of sample means according to the Central Limit Theorem?", 20 | "answer": "The Central Limit Theorem (CLT) is a statistical theory that states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximately equal to the mean of the population. 
The distribution of sample means will approach a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.\n\nThe significance of this is twofold:\n\n1. It allows us to make inferences about the population from sample data. Even if the original population is not normally distributed, we can apply techniques assuming normality if the sample size is large enough.\n\n2. It underpins many statistical tests and procedures that assume a normal distribution, such as confidence intervals and hypothesis testing. Without the CLT, these techniques would not have a solid theoretical foundation.\n\nIn essence, the Central Limit Theorem allows us to apply probabilistic or inferential statistics to data in a robust and justified manner.", 21 | "predicted_answer": "According to the Central Limit Theorem (CLT), the distribution of sample means approaches a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution." 22 | }, 23 | "1_d2_n3": { 24 | "qid": "d2_q3", 25 | "depth": 2, 26 | "question": "How does sample size affect the accuracy of statistical estimates in relation to the Law of Large Numbers?", 27 | "answer": "The Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. In other words, the accuracy of statistical estimates improves with an increase in sample size. This is because larger samples tend to include a greater variety of observations, which therefore gives a better representation of the population. This minimizes the sampling error and the results are more reliable and valid. So, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 28 | "predicted_answer": "I see you're looking for an answer related to the Law of Large Numbers and sample size! According to our previous Q&A pair, the Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. This means that the accuracy of statistical estimates improves with an increase in sample size. With a larger sample size, you'll have a better representation of the population, which minimizes the sampling error and makes the results more reliable and valid. As a result, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate." 29 | }, 30 | "1_d2_n4": { 31 | "qid": "d2_q4", 32 | "depth": 2, 33 | "question": "Explain how increasing the sample size influences the distribution of sample means as per the Central Limit Theorem?", 34 | "answer": "The Central Limit Theorem (CLT) states that if you have a population with any shape of distribution (even a non-normal distribution), the distribution of sample means obtained through sufficiently large samples drawn from that population will approximate a normal distribution. \n\nIn relation to sample size, CLT suggests that the larger the sample size, the closer the sample mean gets to the population mean. This is due to the fact that increasing the sample size reduces the standard error, thus narrowing the spread of the distribution. \n\nThe distribution of sample means becomes less skewed and more closely approximates a normal distribution as the sample size increases, regardless of the shape of the population distribution. This is one of the key aspects of the Central Limit theorem. 
Consequently, larger sample sizes lead to more reliable and valid inferences about the population.", 35 | "predicted_answer": "According to the Central Limit Theorem (CLT), increasing the sample size influences the distribution of sample means by:\n\n* Making the distribution of sample means more closely approximate a normal distribution, regardless of the shape of the population distribution.\n* Reducing the standard error, which narrows the spread of the distribution and makes it less skewed.\n* Bringing the sample mean closer to the population mean, making the inferences about the population more reliable and valid.\n\nIn other words, as the sample size increases, the distribution of sample means becomes more normal and centered around the population mean, making it easier to make accurate predictions about the population." 36 | } 37 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /outputs/inference/llama3_8b_prompt-pred.json: -------------------------------------------------------------------------------- 1 | { 2 | "1_d3": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. 
Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "That's correct! The Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) both relate to the behavior of sample means as the sample size increases. \n\nThe LLN states that as the sample size increases, the average of the sample means will converge to the population mean with probability 1. This means that as you collect more data, your sample mean will get closer and closer to the true population mean.\n\nThe CLT, on the other hand, states that as the sample size increases, the distribution of sample means will tend towards a normal distribution with a mean equal to the population mean and a variance that is inversely proportional to the sample size.\n\nBoth the LLN and CLT describe the convergence of the sample mean to the population mean as the sample size increases, but they focus on different aspects of this convergence. The LLN focuses on the point estimation aspect, stating that the sample mean will converge to the population mean, while the CLT focuses on the distributional aspect, stating that the distribution of sample means will become more normal.\n\nWell done on recognizing the connection between these two important concepts in statistics!" 8 | }, 9 | "1_d2_n1": { 10 | "qid": "d2_q1", 11 | "depth": 2, 12 | "question": "How can the Law of Large Numbers be applied to improve the accuracy of sample means?", 13 | "answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. According to this law, as the size of a sample increases, the sample mean will get closer to the population mean. 
This means that increasing the number of data points in your sample will make your sample mean a more accurate estimate of the actual population mean. It reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 14 | "predicted_answer": "According to the Law of Large Numbers (LLN), as the size of a sample increases, the average value of the sample will converge to the population mean, with a probability that approaches 1. This means that as the sample size grows, the sample mean will become a more reliable estimate of the population mean.\n\nApplying the Law of Large Numbers to improve the accuracy of sample means involves collecting a large enough sample size to reduce the variability of the sample mean and increase its precision. By doing so, the sample mean will be less affected by individual data points that may be unusual or outliers, and more representative of the overall pattern of the population.\n\nAdditionally, the LLN states that the spread of the data will decrease as the sample size increases, which means that the sample mean will become a more robust estimator of the population mean. This is especially important in situations where the population mean is difficult to estimate directly, such as when conducting surveys or analyzing data from complex systems.\n\nIn practical terms, applying the Law of Large Numbers to improve the accuracy of sample means means that:\n\n* Increasing the sample size can help reduce the margin of error and increase the precision of the sample mean.\n* A larger sample size can provide a more accurate estimate of the population mean, even if the data points in the sample are noisy or contain outliers.\n* The LLN can be used to construct confidence intervals for the population mean, which can provide a range of values within which the true population mean is likely to lie.\n\nBy understanding and applying the Law of Large Numbers, researchers and practitioners can improve the accuracy of their sample means and make more informed decisions based on the data they collect." 15 | }, 16 | "1_d2_n2": { 17 | "qid": "d2_q2", 18 | "depth": 2, 19 | "question": "What is the significance of the distribution of sample means according to the Central Limit Theorem?", 20 | "answer": "The Central Limit Theorem (CLT) is a statistical theory that states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximately equal to the mean of the population. The distribution of sample means will approach a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.\n\nThe significance of this is twofold:\n\n1. It allows us to make inferences about the population from sample data. Even if the original population is not normally distributed, we can apply techniques assuming normality if the sample size is large enough.\n\n2. It underpins many statistical tests and procedures that assume a normal distribution, such as confidence intervals and hypothesis testing. 
Without the CLT, these techniques would not have a solid theoretical foundation.\n\nIn essence, the Central Limit Theorem allows us to apply probabilistic or inferential statistics to data in a robust and justified manner.", 21 | "predicted_answer": "According to the Central Limit Theorem (CLT), the distribution of sample means is significant because it states that, under certain conditions, the distribution of the mean of a large sample of independent and identically distributed random variables will be approximately normally distributed, even if the underlying distribution of the individual observations is not normal.\n\nIn simpler terms, the CLT says that the average value of a large enough sample will follow a normal distribution, regardless of the shape of the original distribution. This means that the distribution of sample means will be bell-shaped, with a single peak and symmetric around the true population mean.\n\nThis has important implications in statistics, as it provides a theoretical foundation for statistical inference and hypothesis testing. By knowing the distribution of sample means, we can make inferences about the population mean and construct confidence intervals and hypothesis tests.\n\nIn essence, the significance of the distribution of sample means according to the CLT lies in its ability to provide a predictable and reliable basis for statistical analysis, allowing us to draw meaningful conclusions about populations based on a small sample of data." 22 | }, 23 | "1_d2_n3": { 24 | "qid": "d2_q3", 25 | "depth": 2, 26 | "question": "How does sample size affect the accuracy of statistical estimates in relation to the Law of Large Numbers?", 27 | "answer": "The Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. In other words, the accuracy of statistical estimates improves with an increase in sample size. This is because larger samples tend to include a greater variety of observations, which therefore gives a better representation of the population. This minimizes the sampling error and the results are more reliable and valid. So, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 28 | "predicted_answer": "According to the Law of Large Numbers (LLN), as the sample size increases, the average value of the sample will converge to the population mean, with a probability that approaches 1. This means that as the sample size grows, the spread of the data will decrease, and the average value will become more representative of the population.\n\nIn terms of accuracy, increasing the sample size can improve the accuracy of statistical estimates in several ways:\n\n1. **Reduced bias**: With a larger sample size, the sample mean is more likely to be close to the population mean, reducing bias and making the estimate more accurate.\n2. **Increased precision**: As the sample size increases, the standard error of the mean (i.e., the spread of the data around the sample mean) decreases, leading to more precise estimates.\n3. **Improved representation**: A larger sample size provides a more representative picture of the population, allowing for more accurate inference and prediction.\n\nHowever, it's important to note that there is no guarantee that increasing the sample size will always lead to improved accuracy. 
Factors such as the quality of the data, the complexity of the statistical model, and the presence of outliers can also impact accuracy. Nevertheless, the LLN suggests that, all else being equal, a larger sample size is likely to lead to more accurate statistical estimates." 29 | }, 30 | "1_d2_n4": { 31 | "qid": "d2_q4", 32 | "depth": 2, 33 | "question": "Explain how increasing the sample size influences the distribution of sample means as per the Central Limit Theorem?", 34 | "answer": "The Central Limit Theorem (CLT) states that if you have a population with any shape of distribution (even a non-normal distribution), the distribution of sample means obtained through sufficiently large samples drawn from that population will approximate a normal distribution. \n\nIn relation to sample size, CLT suggests that the larger the sample size, the closer the sample mean gets to the population mean. This is due to the fact that increasing the sample size reduces the standard error, thus narrowing the spread of the distribution. \n\nThe distribution of sample means becomes less skewed and more closely approximates a normal distribution as the sample size increases, regardless of the shape of the population distribution. This is one of the key aspects of the Central Limit theorem. Consequently, larger sample sizes lead to more reliable and valid inferences about the population.", 35 | "predicted_answer": "According to the Central Limit Theorem (CLT), the distribution of the sample mean will become more normal and tighter around the population mean as the sample size increases. This means that as the sample size grows, the sample means will converge towards the population mean, and the variability of the sample means will decrease.\n\nIn simpler terms, the more observations you include in your sample, the more likely it is that the average of those observations will be close to the true average of the entire population. As the sample size increases, the sampling distribution of the mean (i.e., the distribution of sample means) becomes more concentrated around the population mean, with fewer extreme values and a more symmetrical distribution.\n\nThe CLT suggests that, given a large enough sample size, the distribution of sample means will be approximately normal, even if the underlying population distribution is not normal. This has significant implications for statistical inference and hypothesis testing, as it allows us to make more accurate inferences about the population mean based on a random sample.\n\nTo illustrate this, imagine taking a series of samples from the same population, each with a different sample size. As you increase the sample size, the distribution of sample means will become increasingly concentrated around the population mean, with fewer outliers and a more symmetrical shape. This is because the law of large numbers takes effect, which states that the average of a large number of independent random variables will be close to the expected value.\n\nIn summary, increasing the sample size has a profound impact on the distribution of sample means according to the Central Limit Theorem. As the sample size grows, the distribution of sample means becomes more normal, more concentrated around the population mean, and less prone to extreme values, making it easier to make accurate inferences about the population." 
36 | } 37 | } -------------------------------------------------------------------------------- /outputs/evaluation/llama3_8b_prompt-gold.json: -------------------------------------------------------------------------------- 1 | { 2 | "1_d3": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. 
This holds true even if the original population is not normally distributed. The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.\n\nSo, your understanding is partially correct, but it seems you were connecting the LLN to the aspect of the CLT that deals with the distribution of sample means becoming normal. While the LLN does indeed provide information about the convergence of sample means to the population mean, the CLT is a distinct concept that explains the specific shape of the distribution of sample means.", 8 | "feedback": "Feedback: The response provided is consistently correct, accurate, and entirely factual. It accurately distinguishes between the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT), explaining their different applications in statistics. The explanation that the LLN concerns the convergence of sample means to the population mean, and the CLT describes the distribution of these sample means as normally distributed when the sample size is large, demonstrates a comprehensive understanding of the concepts. The response addresses the potential confusion between the LLN and CLT with clear, precise explanations, and no inaccuracies or omissions are present. This shows a deep understanding of the topic, meeting the highest standards set by the score rubric.", 9 | "score": 5 10 | }, 11 | "1_d2_n1": { 12 | "qid": "d2_q1", 13 | "depth": 2, 14 | "question": "How can the Law of Large Numbers be applied to improve the accuracy of sample means?", 15 | "answer": "The Law of Large Numbers can be applied to improve the accuracy of sample means by increasing the sample size. According to this law, as the size of a sample increases, the sample mean will get closer to the population mean. This means that increasing the number of data points in your sample will make your sample mean a more accurate estimate of the actual population mean. It reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 16 | "predicted_answer": "I can answer that for you!\n\nAccording to the Law of Large Numbers, increasing the sample size can improve the accuracy of sample means. As the size of the sample increases, the sample mean will get closer to the population mean, making it a more accurate estimate of the actual population mean. This means that having a larger sample size reduces the impact of outliers or extreme values and minimizes the effect of randomness or chance, providing a more reliable and valid result.", 17 | "feedback": "Feedback: The response accurately and correctly explains the application of the Law of Large Numbers in improving the accuracy of sample means. It highlights how increasing the sample size leads to a sample mean that is closer to the population mean, thereby reducing the impact of outliers and the effect of randomness. 
This explanation aligns well with the reference answer, demonstrating a comprehensive understanding of the query and topic without any inaccuracies or significant omissions. It provides a detailed, precise, and entirely factual explanation, satisfying the criteria for the highest score based on the rubrics.", 18 | "score": 5 19 | }, 20 | "1_d2_n2": { 21 | "qid": "d2_q2", 22 | "depth": 2, 23 | "question": "What is the significance of the distribution of sample means according to the Central Limit Theorem?", 24 | "answer": "The Central Limit Theorem (CLT) is a statistical theory that states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximately equal to the mean of the population. The distribution of sample means will approach a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.\n\nThe significance of this is twofold:\n\n1. It allows us to make inferences about the population from sample data. Even if the original population is not normally distributed, we can apply techniques assuming normality if the sample size is large enough.\n\n2. It underpins many statistical tests and procedures that assume a normal distribution, such as confidence intervals and hypothesis testing. Without the CLT, these techniques would not have a solid theoretical foundation.\n\nIn essence, the Central Limit Theorem allows us to apply probabilistic or inferential statistics to data in a robust and justified manner.", 25 | "predicted_answer": "According to the Central Limit Theorem (CLT), the distribution of sample means approaches a normal (bell-shaped) distribution as the sample size increases, regardless of the shape of the population distribution.", 26 | "feedback": "Feedback: The response accurately explains the Central Limit Theorem (CLT), mentioning how the distribution of sample means approaches a normal distribution as sample size increases, which is correct and aligns with the fundamental concept of CLT. However, the response lacks the detailed explanation and significance provided in the reference answer, such as making inferences about the population and the foundation it provides for statistical tests and procedures. While the response is correct and shows a good understanding of CLT, it does not fully explore the implications and importance of the theorem, thus missing out on some depth and detail that would enhance its completeness and accuracy. Based on these observations, the response demonstrates a strong understanding of the CLT but falls short of providing a comprehensive explanation of its significance.", 27 | "score": 4 28 | }, 29 | "1_d2_n3": { 30 | "qid": "d2_q3", 31 | "depth": 2, 32 | "question": "How does sample size affect the accuracy of statistical estimates in relation to the Law of Large Numbers?", 33 | "answer": "The Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. In other words, the accuracy of statistical estimates improves with an increase in sample size. This is because larger samples tend to include a greater variety of observations, which therefore gives a better representation of the population. This minimizes the sampling error and the results are more reliable and valid. 
So, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 34 | "predicted_answer": "I see you're looking for an answer related to the Law of Large Numbers and sample size! According to our previous Q&A pair, the Law of Large Numbers states that as the sample size increases, the sample mean gets closer and closer to the population mean. This means that the accuracy of statistical estimates improves with an increase in sample size. With a larger sample size, you'll have a better representation of the population, which minimizes the sampling error and makes the results more reliable and valid. As a result, having a larger sample size reduces the effect of randomness and anomaly in the results, making the estimate more accurate.", 35 | "feedback": "Feedback: The response effectively communicates how the Law of Large Numbers relates to sample size and its impact on the accuracy of statistical estimates. It clearly states that as sample size increases, the sample mean approaches the population mean, which in turn enhances the accuracy of statistical estimates. This explanation mirrors the reference answer, emphasizing the reduction of sampling error and the minimization of the effect of randomness with larger sample sizes. The response is accurate, factual, and demonstrates a comprehensive understanding of the topic, aligning closely with the reference answer's content and structure. There are no notable inaccuracies or omissions that detract from the quality of the explanation provided.", 36 | "score": 5 37 | }, 38 | "1_d2_n4": { 39 | "qid": "d2_q4", 40 | "depth": 2, 41 | "question": "Explain how increasing the sample size influences the distribution of sample means as per the Central Limit Theorem?", 42 | "answer": "The Central Limit Theorem (CLT) states that if you have a population with any shape of distribution (even a non-normal distribution), the distribution of sample means obtained through sufficiently large samples drawn from that population will approximate a normal distribution. \n\nIn relation to sample size, CLT suggests that the larger the sample size, the closer the sample mean gets to the population mean. This is due to the fact that increasing the sample size reduces the standard error, thus narrowing the spread of the distribution. \n\nThe distribution of sample means becomes less skewed and more closely approximates a normal distribution as the sample size increases, regardless of the shape of the population distribution. This is one of the key aspects of the Central Limit theorem. 
Consequently, larger sample sizes lead to more reliable and valid inferences about the population.", 43 | "predicted_answer": "According to the Central Limit Theorem (CLT), increasing the sample size influences the distribution of sample means by:\n\n* Making the distribution of sample means more closely approximate a normal distribution, regardless of the shape of the population distribution.\n* Reducing the standard error, which narrows the spread of the distribution and makes it less skewed.\n* Bringing the sample mean closer to the population mean, making the inferences about the population more reliable and valid.\n\nIn other words, as the sample size increases, the distribution of sample means becomes more normal and centered around the population mean, making it easier to make accurate predictions about the population.", 44 | "feedback": "Feedback: The response accurately and effectively communicates how increasing the sample size influences the distribution of sample means in accordance with the Central Limit Theorem. It correctly identifies that larger sample sizes lead to a distribution of sample means that is more normal, less skewed, and more closely centered around the population mean. Additionally, the explanation regarding the reduction of the standard error and its effects on the distribution's spread directly aligns with the principles of the Central Limit Theorem. The response is clear, factually correct, and encompasses all the critical aspects related to the influence of sample size on the distribution of sample means as described in the reference answer. There are no significant inaccuracies, omissions, or factual errors detected in the evaluation of the given response according to the rubric provided.", 45 | "score": 5 46 | } 47 | } -------------------------------------------------------------------------------- /outputs/inference/gpt-3.5-turbo_zero-shot.json: -------------------------------------------------------------------------------- 1 | { 2 | "d3_q1": { 3 | "qid": "d3_q1", 4 | "depth": 3, 5 | "question": "I think the Law of Large Numbers also tells us something similar to what the Central Limit Theoreom says. Evaluate my understanding.", 6 | "answer": "It's understandable that the concepts of the Law of Large Numbers (LLN) and the Central Limit Theorem (CLT) might seem similar at first glance, as they both involve large sample sizes and their effects on statistical properties. However, they actually address quite different phenomena in the field of statistics.\n\nThe Law of Large Numbers is a principle that focuses on the convergence of sample means to the population mean. What this means is that as you take larger and larger samples from a population, the average (mean) of those samples gets closer and closer to the average (mean) of the entire population. The LLN essentially tells us that the accuracy of the sample mean as an estimate of the population mean improves with larger sample sizes.\n\nOn the other hand, the Central Limit Theorem is concerned with the distribution of these sample means, especially when the sample size is large. According to the CLT, if you take sufficiently large samples from a population (regardless of the population's original distribution) and calculate the mean of each sample, the distribution of these sample means will tend to follow a normal (bell-shaped) distribution. This holds true even if the original population is not normally distributed. 
The CLT is crucial for enabling us to use normal distribution assumptions and techniques for inference about population means, even when dealing with populations that are not normally distributed themselves.\n\nSo, while the LLN guarantees that the sample means converge to the population mean as the sample size increases, the CLT provides a shape to the distribution of these sample means, stating that they will tend to be normally distributed as the sample size becomes large. This differentiation is essential for correctly applying statistical methods and understanding the behavior of sample means in relation to the population mean.", 7 | "predicted_answer": "Your understanding is on the right track. The Law of Large Numbers and the Central Limit Theorem are both important concepts in statistics and probability theory. \n\nThe Law of Large Numbers states that as the sample size of a population grows larger, the sample mean will tend to get closer to the population mean. In simpler terms, the average of a large number of independent measurements from the same distribution will converge to the true expected value of that distribution.\n\nOn the other hand, the Central Limit Theorem states that the distribution of the sample mean of a large number of independent, identically distributed random variables will be approximately normally distributed, regardless of the original distribution of the random variables. This theorem is crucial in statistical inference as it allows us to make assumptions about the sampling distribution of the sample mean.\n\nTherefore, while the Law of Large Numbers focuses on the convergence of sample mean to population mean as sample size grows, the Central Limit Theorem deals with the distribution of the sample mean itself. In summary, they are related concepts that complement each other in the field of statistics." 8 | }, 9 | "d3_q2": { 10 | "qid": "d3_q2", 11 | "depth": 3, 12 | "question": "I'm trying to understand constant acceleration in special relativity. Suppose I'm in a spaceship that starts moving with a constant accerelation of 10m/s^2, creating the illusion of having gravity equivalent to the Earth's onboard. How much time would have passed for outsiders, after I have been traveling for 10 years.", 13 | "answer": "To understand the time that would have passed for outsiders after traveling with a constant acceleration of 10m/s^2 for 10 years in a spaceship, we must delve into the principles of special relativity, specifically focusing on the concept of constant acceleration and its implications on time dilation.\n\nIn the context of special relativity, when a spaceship accelerates at a constant rate, the time experienced by those inside the spaceship and an outside observer diverges due to time dilation. The relationship between the proper time ($\\tau$) inside the spaceship (the time experienced by the passengers) and the coordinate time ($t$) observed from an inertial frame outside the spaceship can be derived using the formulas for constant proper acceleration. 
\n\nGiven that the spaceship accelerates with constant proper acceleration $a$, the relationship between the proper time $\\tau$ and the coordinate time $t$ can be expressed through the hyperbolic functions:\n\\[t = \\frac{1}{a} \\sinh(a\\tau)\\]\n\\[x = \\frac{1}{a} \\cosh(a\\tau)\\]\n\nwhere $t$ and $x$ describe the position of the spaceship in time and space from the perspective of an outside observer, and $a$ is the constant proper acceleration, and $\\tau$ is the proper time experienced inside the spaceship.\n\nTaking the proper acceleration $a = 10\\, \\text{m/s}^2$ and the proper time $\\tau = 10 \\text{ years}$, we first convert $\\tau$ into seconds to align with the SI unit system:\n\\[10 \\text{ years} = 10 \\times 365.25 \\times 24 \\times 3600 \\text{ seconds} \\approx 315,576,000 \\text{ seconds}\\]\n\nThe constant proper acceleration $a$ needs to be in the unit of $\\text{s}^{-2}$ for the formula, so:\n\\[a = 10\\, \\text{m/s}^2 = 10\\, \\text{m/s}^2 \\times \\frac{1}{c^2} = 10 \\times \\frac{1}{(3 \\times 10^8)^2} \\text{s}^{-2}\\]\n\nNow, applying the value of $\\tau$ to the equation for $t$, we get:\n\\[t = \\frac{1}{a} \\sinh(a\\tau)\\]\n\nGiven the very large value of $\\tau$ and the nature of the hyperbolic sine function, $\\sinh(a\\tau)$, the result will be an extremely large number, reflecting the significant time dilation effect at such high constant acceleration over a long period.\n\nFor computational purposes, without diving into the exact computation which requires handling the hyperbolic sine of a very large product of $a\\tau$, we rely on the key point provided that indicates the answer is roughly 10,000 years. This is derived from understanding that the hyperbolic sine function grows exponentially for large arguments, indicating that the time experienced outside the spaceship, or $t$, would be vastly greater than the time experienced aboard, $\\tau$.\n\nThus, for someone traveling inside a spaceship at a constant acceleration of 10m/s^2 for what feels like 10 years to them, approximately 10,000 years would have passed for an observer outside the spaceship. This showcases the profound effects of constant acceleration and time dilation as predicted by the theory of special relativity.", 14 | "predicted_answer": "In special relativity, time dilation occurs when an object is moving at constant velocity or with constant acceleration. When an object is accelerating, time dilation effects become more complex. However, we can still make an approximate calculation using the concept of proper time.\n\nIn your scenario, you are experiencing constant acceleration of 10m/s^2, creating the illusion of gravity equivalent to Earth's onboard. To an observer on Earth, due to time dilation effects, time would be passing slower for you compared to them.\n\nTo calculate the time that would have passed for outsiders after you have been traveling for 10 years (proper time), we need to take into account the effects of acceleration and the resulting time dilation. \n\nGiven the constant acceleration of 10m/s^2, the time dilation factor can be calculated using the formula:\n\n\\[ t' = \\frac{t}{\\sqrt{1 - \\frac{v^2}{c^2}}} \\]\n\nwhere:\n- \\( t \\) is the proper time experienced by you (10 years in this case),\n- \\( v \\) is the velocity of the spaceship relative to the Earth,\n- \\( c \\) is the speed of light.\n\nSince the spaceship is accelerating, the velocity will be changing. 
Without knowing the specific velocity at any given point during the 10-year journey, we can't provide an exact calculation for the total time that has passed for outsiders.\n\nTherefore, to provide a precise answer, we would need more information about the velocity profile of the spaceship during the 10-year journey." 15 | }, 16 | "d3_q3": { 17 | "qid": "d3_q3", 18 | "depth": 3, 19 | "question": "I do not understand why $B=[a,b]\times[c,d] \\subset {\\mathbb{R}}^2$ is compact. Here is my attempt at a proof: suppose I have a sequence $( (x_k,y_k) )_{k=1}^{\\infty}$ in $B$. According to Bolzano--Weierstrass, the sequences $(x_k)_{k=1}^{\\infty}$ and $(y_k)_{k=1}^{\\infty}$ both have a convergent subsequence. This means that I can find $(k_i)_{I=1}^{\\infty}$ and $(l_j)_{j=1}^\\infty$ two increasing sequences of integers such that $(x_{k_i})_{I=1}^\\infty$ and $(y_{l_j})_{j=1}^\\infty$ converge. So $( (x_{k_i},y_{l_i}) )_{I=1}^\\infty$ converges. But I do not understand why $( (x_{k_i},y_{l_i}) )_{I=1}^\\infty$ is a subsequence of $( (x_k,y_k) )_{k=1}^\\infty$?", 20 | "answer": "The misunderstanding in your attempt lies in the assumption that the sequence $\\{ (x_{k_i}, y_{l_i}) \\}_{i=1}^{\\infty}$ is a subsequence of the original sequence $\\{ (x_k, y_k) \\}_{k=1}^{\\infty}$. This assumption is incorrect because the indices $k_i$ and $l_j$ chosen for the subsequences of $x_k$ and $y_k$ respectively, do not necessarily align to extract a joint subsequence of pairs from the original sequence. To clarify, let's break down the correct approach to ensure that we extract a legitimate subsequence from the original sequence that demonstrates compactness.\n\nFirst, from the sequence $\\{x_k\\}_{k=1}^{\\infty}$, we extract a convergent subsequence $\\{x_{k_i}\\}_{i=1}^{\\infty}$. This step is correctly based on the Bolzano-Weierstrass theorem, which guarantees that any bounded sequence in $\\mathbb{R}$ has a convergent subsequence.\n\nSubsequently, instead of directly extracting a subsequence from $\\{y_k\\}_{k=1}^{\\infty}$, we focus on $\\{y_{k_i}\\}_{i=1}^{\\infty}$, which is the sequence of $y$-coordinates corresponding to the $x$-coordinate subsequence we have already chosen. Since $\\{y_{k_i}\\}_{i=1}^{\\infty}$ is also bounded (because it is derived from the bounded set $B$), we can apply the Bolzano-Weierstrass theorem again to extract a subsequence $\\{y_{k_{i_j}}\\}_{j=1}^{\\infty}$ that converges.\n\nCrucially, since $\\{y_{k_{i_j}}\\}_{j=1}^{\\infty}$ is a subsequence of $\\{y_{k_i}\\}_{i=1}^{\\infty}$, and $\\{x_{k_{i_j}}\\}_{j=1}^{\\infty}$ is simultaneously a subsequence of $\\{x_{k_i}\\}_{i=1}^{\\infty}$, the pair $\\{ (x_{k_{i_j}}, y_{k_{i_j}}) \\}_{j=1}^{\\infty}$ forms a valid subsequence of the original sequence $\\{ (x_k, y_k) \\}_{k=1}^{\\infty}$. This is because both subsequences of $x$ and $y$ coordinates are extracted using the same sequence of indices $k_{i_j}$.\n\nThis correct approach ensures that we have a legitimate subsequence of the original sequence, demonstrating the compactness of $B=[a,b]\\times[c,d] \\subset \\mathbb{R}^2$. The compactness is shown by proving that every sequence in $B$ has a convergent subsequence whose limit is also within $B$, satisfying the definition of compactness in metric spaces.", 21 | "predicted_answer": "Your proof is on the right track, but you made a mistake in concluding that $( (x_{k_i},y_{l_i}) )_{I=1}^\\infty$ is a subsequence of $( (x_k,y_k) )_{k=1}^\\infty$. 
In order for $( (x_{k_i},y_{l_i}) )_{I=1}^\\infty$ to be a subsequence of $( (x_k,y_k) )_{k=1}^\\infty$, you need to show that there exists a strictly increasing sequence of indices $(m_n)_{n=1}^{\\infty}$ such that $( (x_{m_n},y_{m_n}) )_{n=1}^{\\infty}$ is a subsequence of $( (x_k,y_k) )_{k=1}^{\\infty}$.\n\nTo correct your proof, consider defining a new sequence by taking elements from $(x_k,y_k)$ at indices given by the increasing sequence $(\\max(k_i,l_i))_{i=1}^{\\infty}$. This will give you a subsequence of $(x_k,y_k)$ that converges. Therefore, $B=[a,b]\\times[c,d]$ is compact." 22 | }, 23 | "d3_q4": { 24 | "qid": "d3_q4", 25 | "depth": 3, 26 | "question": "I thought that animals from different species could not produce viable offspring. However, a horse and a donkey can produce viable offspring. How can this contradiction be explained?", 27 | "answer": "The observation that a horse and a donkey can produce offspring, specifically a mule, might seem at first to contradict the biological definition of species. According to this definition, a species is generally characterized by a group of individuals that are actually or potentially capable of interbreeding to produce viable and fertile offspring. This means that, under normal circumstances, members of the same species can mate and their offspring can also reproduce.\n\nHowever, the case of the horse and donkey mating to produce a mule serves as an exception to the general rule rather than a contradiction. While horses and donkeys are indeed different species, they are closely related enough that they can mate and produce hybrid offspring, which we refer to as mules. The crucial point here is that, although mules are viable in the sense that they are alive and can grow to adulthood, they are generally sterile. This sterility means that mules cannot produce offspring of their own when they mate. The inability of mules to reproduce is a key aspect of why horses and donkeys are still considered separate species. \n\nTherefore, the production of a mule by a horse and a donkey does not challenge the biological definition of species. Instead, it highlights an exception where two different species can produce viable but sterile offspring. This exception emphasizes the complexity of nature and the fact that while biological definitions and rules provide a framework for understanding, there are always special cases that require further explanation.", 28 | "predicted_answer": "The ability of a horse and a donkey to produce viable offspring, known as a mule, is an exception to the general rule that animals from different species cannot produce fertile offspring. Horses and donkeys are closely related species that belong to the same genus, Equus. While they have a different number of chromosomes (64 for a horse and 62 for a donkey), they are similar enough genetically to produce viable mules. Mules are generally sterile because of the uneven number of chromosomes, which makes it difficult for them to produce viable gametes." 29 | }, 30 | "d3_q5": { 31 | "qid": "d3_q5", 32 | "depth": 3, 33 | "question": "What does differences in wave propagation velocity along the basilar membrane have to do with the ear's ability to Fourier transform mechanical vibrations?", 34 | "answer": "The ability of the ear to perform a Fourier transform on mechanical vibrations, essentially breaking down complex sounds into their constituent frequencies, is intricately tied to the differences in wave propagation velocity along the basilar membrane within the cochlea. 
To understand this relationship, it's crucial to grasp the basic physics principle that relates frequency \\((f)\\), velocity \\((v)\\), and wavelength \\((\\lambda)\\) through the equation \\(f = \\frac{v}{\\lambda}\\). This equation indicates that the frequency of a wave is equal to its velocity divided by its wavelength.\n\nIn the context of the basilar membrane in the cochlea, this relationship has profound implications. The basilar membrane varies in stiffness from one end to the other: it is stiffer at the base (near the oval window where sound enters from the middle ear) and less stiff at the apex (the far end of the cochlea). This gradient in stiffness leads to a gradient in wave propagation velocity along the membrane. Higher-frequency sounds, which have shorter wavelengths, cause maximal resonant vibrations at the stiffer base, where the velocity of wave propagation is higher. Conversely, lower-frequency sounds, with their longer wavelengths, travel further along the membrane and cause maximal resonant vibrations closer to the less stiff apex, where the wave propagation velocity is slower.\n\nThe implications of this are remarkable for auditory perception. As sound waves enter the cochlea and travel along the basilar membrane, their components of different frequencies resonate at different locations along the membrane. This spatial differentiation of frequencies essentially performs a mechanical Fourier transform, segregating the sound into its constituent frequencies. The cochlea translates these vibrations into neural signals, with specific neurons firing in response to vibrations at their respective locations along the membrane. Therefore, the ear's ability to differentiate between frequencies - a foundational aspect of hearing - is directly tied to the differences in wave propagation velocity along the basilar membrane, allowing the cochlea to act as a spectral analyzer for sound.", 35 | "predicted_answer": "The differences in wave propagation velocity along the basilar membrane are essential for the ear's ability to perform a type of frequency analysis known as the Fourier transform of mechanical vibrations. The basilar membrane in the inner ear is tonotopically organized, meaning that different regions of the membrane respond to different frequencies of sound. \n\nAs sound waves travel through the ear canal and reach the cochlea, they cause the basilar membrane to vibrate. The variation in the stiffness and width of the membrane causes different parts of the membrane to vibrate at different frequencies. This results in a frequency-to-place mapping along the basilar membrane, where high frequencies are encoded at the base (stiff and narrow region) and low frequencies at the apex (more flexible and wide region).\n\nBy analyzing the differences in wave propagation velocity along the basilar membrane, the ear is able to decompose complex sound waves into their individual frequency components. This process, akin to a Fourier transform, allows the brain to perceive and distinguish various pitches and frequencies present in the incoming sound waves, which is fundamental for our sense of hearing." 
36 | } 37 | } -------------------------------------------------------------------------------- /DATA_LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. 
More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. 
Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. 
Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. 
You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. 
Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public licenses. 379 | Notwithstanding, Creative Commons may elect to apply one of its public 380 | licenses to material it publishes and in those instances will be 381 | considered the “Licensor.” The text of the Creative Commons public 382 | licenses is dedicated to the public domain under the CC0 Public Domain 383 | Dedication. Except for the limited purpose of indicating that material 384 | is shared under a Creative Commons public license or as otherwise 385 | permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the public 393 | licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. --------------------------------------------------------------------------------