├── README.md
├── figure2.svg
└── src
    ├── abstract_helper.py
    ├── classify_task.py
    ├── generate_analysis.py
    ├── generate_code_prompt.py
    ├── prompts
        ├── code_prompt_generation_from_explanation.txt
        ├── code_prompt_generation_from_explanation_2.txt
        ├── nl_prompt_generation_from_scratch_shot.txt
        ├── nl_prompt_generation_from_scratch_shot_2.txt
        └── task_classification_prompt.txt
    ├── run.sh
    ├── scoring_single_prompt.py
    ├── tasks
        ├── dyck_languages
        │   ├── data.json
        │   ├── generated_prompts
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot.txt
        │   │   ├── a_explanation_3shot_by_gpt.json
        │   │   ├── a_explanation_3shot_by_gpt.txt
        │   │   ├── classification_result_gpt_3shot.json
        │   │   └── classification_result_gpt_3shot.txt
        │   ├── helper.py
        │   ├── prompts
        │   │   ├── code_prompt_cot_edited.txt
        │   │   ├── code_prompt_cot_edited_2.txt
        │   │   ├── code_prompt_generation_example_prompt.txt
        │   │   ├── example_classification.txt
        │   │   ├── explanation.txt
        │   │   ├── explanation_2.txt
        │   │   └── nl_prompt_generation_example_prompt.txt
        │   ├── results
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_codellama-13b.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_codellama-34b.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_codellama.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_gpt.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_gpt4.json
        │   │   └── a_code_prompt_from_explanation_planner_gpt_3shot_llama-13b.json
        │   └── scoring_prompt_template_code.txt
        ├── geometric_shapes
        │   ├── data.json
        │   ├── generated_prompts
        │   │   ├── classification_result_gpt_3shot.json
        │   │   ├── classification_result_gpt_3shot.txt
        │   │   ├── explanation_from_scratch_3shot_gpt.json
        │   │   ├── explanation_from_scratch_3shot_gpt.txt
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot.json
        │   │   └── new_edited_code_prompt_from_explanation_planner_gpt_3shot.txt
        │   ├── helper.py
        │   ├── prompts
        │   │   ├── code_prompt_cot_edited.txt
        │   │   ├── code_prompt_cot_edited_2.txt
        │   │   ├── code_prompt_generation_example_prompt.txt
        │   │   ├── example_classification.txt
        │   │   ├── explanation.txt
        │   │   ├── explanation_2.txt
        │   │   └── nl_prompt_generation_example_prompt.txt
        │   ├── results
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama-13b.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama-34b.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama_inst_token.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_gpt.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_gpt4.json
        │   │   └── new_edited_code_prompt_from_explanation_planner_gpt_3shot_llama-13b.json
        │   └── scoring_prompt_template_code.txt
        ├── navigate
        │   ├── data.json
        │   ├── generated_prompts
        │   │   ├── classification_result_gpt_3shot.json
        │   │   ├── classification_result_gpt_3shot.txt
        │   │   ├── explanation_from_scratch_3shot_gpt.json
        │   │   ├── explanation_from_scratch_3shot_gpt.txt
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot.json
        │   │   └── new_edited_code_prompt_from_explanation_planner_gpt_3shot.txt
        │   ├── helper.py
        │   ├── prompts
        │   │   ├── code_prompt_cot_edited.txt
        │   │   ├── code_prompt_cot_edited_2.txt
        │   │   ├── code_prompt_generation_example_prompt.txt
        │   │   ├── example_classification.txt
        │   │   ├── explanation.txt
        │   │   ├── explanation_2.txt
        │   │   └── nl_prompt_generation_example_prompt.txt
        │   ├── results
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama-13b.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama-34b.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama_inst_token.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_gpt.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_gpt4.json
        │   │   └── new_edited_code_prompt_from_explanation_planner_gpt_3shot_llama-13b.json
        │   └── scoring_prompt_template_code.txt
        ├── object_counting
        │   ├── data.json
        │   ├── helper.py
        │   ├── prompts
        │   │   ├── code_prompt_cot_edited.txt
        │   │   ├── code_prompt_cot_edited_2.txt
        │   │   ├── explanation.txt
        │   │   └── explanation_2.txt
        │   └── scoring_prompt_template_code.txt
        ├── penguins_in_a_table
        │   ├── data.json
        │   ├── helper.py
        │   ├── prompts
        │   │   ├── code_prompt_cot_edited.txt
        │   │   ├── code_prompt_cot_edited_2.txt
        │   │   ├── example_classification.txt
        │   │   ├── explanation.txt
        │   │   └── explanation_2.txt
        │   └── scoring_prompt_template_code.txt
        ├── reasoning_about_colored_objects
        │   ├── data.json
        │   ├── generated_prompts
        │   │   ├── classification_result_gpt_3shot.json
        │   │   ├── classification_result_gpt_3shot.txt
        │   │   ├── code_prompt_from_explanation_planner_gpt_3shot.json
        │   │   ├── code_prompt_from_explanation_planner_gpt_3shot.txt
        │   │   ├── explanation_3shot_by_gpt.json
        │   │   └── explanation_3shot_by_gpt.txt
        │   ├── helper.py
        │   ├── prompts
        │   │   ├── code_prompt_cot_edited.txt
        │   │   ├── code_prompt_cot_edited_2.txt
        │   │   ├── code_prompt_generation_example_prompt.txt
        │   │   ├── example_classification.txt
        │   │   ├── explanation.txt
        │   │   ├── explanation_2.txt
        │   │   └── nl_prompt_generation_example_prompt.txt
        │   ├── results
        │   │   ├── code_prompt_from_explanation_planner_gpt_3shot_codellama-13b.json
        │   │   ├── code_prompt_from_explanation_planner_gpt_3shot_codellama-34b.json
        │   │   ├── code_prompt_from_explanation_planner_gpt_3shot_codellama.json
        │   │   ├── code_prompt_from_explanation_planner_gpt_3shot_gpt.json
        │   │   ├── code_prompt_from_explanation_planner_gpt_3shot_gpt4.json
        │   │   └── code_prompt_from_explanation_planner_gpt_3shot_llama-13b.json
        │   └── scoring_prompt_template_code.txt
        ├── temporal_sequences
        │   ├── data.json
        │   ├── generated_prompts
        │   │   ├── classification_result_gpt_3shot.json
        │   │   ├── classification_result_gpt_3shot.txt
        │   │   ├── explanation_from_scratch_3shot_gpt.json
        │   │   ├── explanation_from_scratch_3shot_gpt.txt
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot.json
        │   │   └── new_edited_code_prompt_from_explanation_planner_gpt_3shot.txt
        │   ├── helper.py
        │   ├── prompts
        │   │   ├── code_prompt_cot_edited.txt
        │   │   ├── code_prompt_cot_edited_2.txt
        │   │   ├── code_prompt_generation_example_prompt.txt
        │   │   ├── explanation.txt
        │   │   ├── explanation_2.txt
        │   │   └── nl_prompt_generation_example_prompt.txt
        │   ├── results
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama-13b.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama-34b.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama_inst_token.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_gpt.json
        │   │   ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_gpt4.json
        │   │   └── new_edited_code_prompt_from_explanation_planner_gpt_3shot_llama-13b.json
        │   └── scoring_prompt_template_code.txt
        ├── tracking_shuffled_objectives
        │   ├── data.json
        │   ├── generated_prompts
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot.txt
        │   │   ├── a_explanation_3shot_by_gpt.json
        │   │   ├── a_explanation_3shot_by_gpt.txt
        │   │   ├── classification_result_gpt_3shot.json
        │   │   └── classification_result_gpt_3shot.txt
        │   ├── helper.py
        │   ├── prompts
        │   │   ├── code_prompt_cot_edited.txt
        │   │   ├── code_prompt_cot_edited_2.txt
        │   │   ├── code_prompt_generation_example_prompt.txt
        │   │   ├── example_classification.txt
        │   │   ├── explanation.txt
        │   │   ├── explanation_2.txt
        │   │   └── nl_prompt_generation_example_prompt.txt
        │   ├── results
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_codellama-13b.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_codellama.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_gpt.json
        │   │   ├── a_code_prompt_from_explanation_planner_gpt_3shot_gpt4.json
        │   │   └── a_code_prompt_from_explanation_planner_gpt_3shot_llama-13b.json
        │   └── scoring_prompt_template_code.txt
        └── web_of_lies
        │   ├── data.json
        │   ├── generated_prompts
        │       ├── classification_result_gpt_3shot.json
        │       ├── classification_result_gpt_3shot.txt
        │       ├── explanation_from_scratch_3shot_gpt.json
        │       ├── explanation_from_scratch_3shot_gpt.txt
        │       ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot.json
        │       └── new_edited_code_prompt_from_explanation_planner_gpt_3shot.txt
        │   ├── helper.py
        │   ├── prompts
        │       ├── code_prompt_cot_edited.txt
        │       ├── code_prompt_cot_edited_2.txt
        │       ├── code_prompt_generation_example_prompt.txt
        │       ├── explanation.txt
        │       ├── explanation_2.txt
        │       └── nl_prompt_generation_example_prompt.txt
        │   ├── results
        │       ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama-13b.json
        │       ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama-34b.json
        │       ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_codellama_inst_token.json
        │       ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_gpt.json
        │       ├── new_edited_code_prompt_from_explanation_planner_gpt_3shot_gpt4.json
        │       └── new_edited_code_prompt_from_explanation_planner_gpt_3shot_llama-13b.json
        │   └── scoring_prompt_template_code.txt
    └── utils.py


/README.md:
--------------------------------------------------------------------------------
 1 | # LanguageModelsasCompilers
 2 | ---
 3 | Official implementation of [Language Models as Compilers: Simulating the Execution Of Pseudocode Improves Algorithmic Reasoning in Language Models](https://arxiv.org/abs/2404.02575).
 4 | 
 5 | 
 6 | ![framework overview](./figure2.svg)
 7 | 
 8 | ## Requirements
 9 | To run our code, you need an account to access OpenAI API. Generating a pseudocode does not cost much, but running inferences on all instances of a task requires about $10 ~ $20. Also, you need the latest version of vLLM.
10 | 
11 | 
12 | ## Phase 1: Think - Generating a Task-level Pseudocode
13 | The goal of this phase is to generate a pseudocode prompt that can be applied to all instances of a given task. For that, we conduct following steps:
14 | 
15 | 1. Constructing a meta prompt.
16 | 2. Generating an analysis from the example questions of the task. 
17 | 3. Generating a pseudocode based on the analysis.
18 | 
19 | We provide human-written analyses and pseudocodes in `tasks/{task_name}/prompt` folder. Running `generate_analysis.py` will generate an analysis on the selected task and it will be placed at `tasks/{task_name}/generated_prompts` folder.
20 | 
21 | Then, run `generate_code_prompt.py` to generate a pseudocode for your task. You can check the generated pseudocode prompt at `task/{task_name}/generated_prompts` folder.
22 | 
23 | ## Phase 2: Execute - Simulating the Execution of the Pseudocode
24 | In this phase, we tailor the pseudocode to each instance for conducting reasoning.
25 | 
26 | Run `scoring_single_prompt.py` with passing the path of the generated pseudocode propmt as an argument.
27 | After the process is finished, you can check the result file in JSON format in `tasks/{task_name}/results` folder.
28 | 
29 | ## Running the Whole Process at Once
30 | After changing the working directory to `src`, run `bash run.sh` to execute the whole process for all tasks we have experimented.
31 | 
32 | 
33 | 
34 | 
35 | ## Contact
36 | If you have any inquiries, please feel free to raise an issue or reach out to us via email at: mapoout@yonsei.ac.kr. We're here to assist you!
37 | 
38 | ## Citation
39 | 
40 | If you find this useful, please consider citing our paper:
41 | ```
42 | @article{chae2024language,
43 |   title={Language Models as Compilers: Simulating Pseudocode Execution Improves Algorithmic Reasoning in Language Models},
44 |   author={Chae, Hyungjoo and Kim, Yeonghyeon and Kim, Seungone and Ong, Kai Tzu-iunn and Kwak, Beong-woo and Kim, Moohyeon and Kim, Seonghwan and Kwon, Taeyoon and Chung, Jiwan and Yu, Youngjae and others},
45 |   journal={arXiv preprint arXiv:2404.02575},
46 |   year={2024}
47 | }
48 | ```  
49 | 
50 | 


--------------------------------------------------------------------------------
/src/abstract_helper.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset
 2 | 
 3 | class AbstractHelper:
 4 |     def __init__(self, args):
 5 |         self.args = args
 6 |         self.train_data = None
 7 |         self.test_data = None
 8 | 
 9 |     
10 |     def evaluate_prediction(self, output, data):
11 |         return
12 |     
13 |     
14 |     def load_data(self):
15 |         train_data = [d for d in load_dataset(self.args.dataset_name)["train"]]
16 |         test_data = [d for d in load_dataset(self.args.dataset_name)['test']]
17 |         if self.args.num_sample > 0:
18 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
19 | 
20 |         self.train_data = train_data
21 |         self.test_data = test_data
22 |         return train_data, test_data
23 |     
24 |     def load_and_prepare_data(self, dataset_split):
25 |         return
26 |     


--------------------------------------------------------------------------------
/src/classify_task.py:
--------------------------------------------------------------------------------
  1 | import sys, os
  2 | import argparse
  3 | import random
  4 | 
  5 | sys.path.append(os.environ.get('PROJECTPATH'))
  6 | from src.utils import *
  7 | 
  8 | 
  9 | def parse_args():
 10 |     parser = argparse.ArgumentParser()
 11 |     parser.add_argument("--task", type=str, choices=LIST_OF_TASKS)
 12 |     parser.add_argument("--task_version", type=str)
 13 |     parser.add_argument("--planner_llm", type=str, default="gpt")
 14 |     parser.add_argument("--shot", type=int, default=3)
 15 |     parser.add_argument("--port", type=str, default="8001")
 16 |     parser.add_argument("--num_sample", type=int, default=-1)
 17 |     return parser.parse_args()
 18 | 
 19 | def load_prompt():
 20 |     with open("prompts/task_classification_prompt.txt", "r") as f:
 21 |         prompt = f.read()
 22 |     return prompt
 23 | 
 24 | def load_target_exmplar(args):
 25 |     helper = helper_dict[args.task](args)
 26 |     _, data = helper.load_data()
 27 |     data = helper.load_and_prepare_data('test')
 28 |     template = "[Example 5]\n"
 29 |     template += "Example task instances:\n"
 30 |     template += "\n\n".join([sample['input'] for sample in random.sample(data, args.shot)])
 31 |     template += "\n\nReason for the decision & answer:"
 32 |     return template
 33 | 
 34 | def load_exemplars(args):
 35 |     free_format_tasks = ["dyck_languages", "navigate", "tracking_shuffled_objectives"]
 36 |     variables_tracking_tasks = ["penguins_in_a_table", "reasoning_about_colored_objects", "geometric_shapes"]
 37 |     if args.task in free_format_tasks:
 38 |         free_format_tasks.remove(args.task)
 39 |     elif args.task in variables_tracking_tasks:
 40 |         variables_tracking_tasks.remove(args.task)
 41 |     
 42 |     tasks = [sample for sample in random.sample(free_format_tasks, 2)] + [sample for sample in random.sample(variables_tracking_tasks, 2)]
 43 |     random.shuffle(tasks)
 44 |     exemplars = ""
 45 |     for i, task in enumerate(tasks):
 46 |         with open(f"tasks/{task}/prompts/example_classification.txt", "r") as f:
 47 |             exemplar = f.read()
 48 |         exemplars += f"[Example {i+1}]\n" + exemplar + "\n\n"
 49 |     return exemplars
 50 | 
 51 | def load_final_prompt(args):
 52 |     template = load_prompt()
 53 |     exemplars = load_exemplars(args)
 54 |     target_exemplar = load_target_exmplar(args)
 55 |     return template.format(exemplar=exemplars+target_exemplar)
 56 | 
 57 | async def main(args):
 58 |     model_name_dict = {
 59 |         "codellama": "codellama/CodeLlama-7b-Instruct-hf",
 60 |         "mistral": "mistralai/Mistral-7B-Instruct-v0.2",
 61 |         "gpt": "gpt-3.5-turbo",
 62 |         "gpt4": "gpt-4",
 63 |         "codellama-13b": "codellama/CodeLlama-13b-Instruct-hf",
 64 |         "codellama-34b": "codellama/CodeLlama-34b-Instruct-hf",
 65 |     }
 66 |     model_name = model_name_dict[args.planner_llm]
 67 |     if 'gpt' in args.planner_llm:
 68 |         scr_llm = ChatOpenAI(
 69 |                             model_name=model_name,
 70 |                             # temperature=args.temperature,
 71 |                             temperature = 0.0,
 72 |                             max_retries=100,
 73 |                             stop=["[Example"],
 74 |                             max_tokens=3000
 75 |                         )
 76 |     else:
 77 |         scr_llm = OpenAI(
 78 |                 model_name=model_name,
 79 |                 temperature=0.0,
 80 |                 max_retries=100,
 81 |                 openai_api_key='EMPTY',
 82 |                 openai_api_base=f"http://localhost:{args.port}/v1",
 83 |                 stop=["[Example"],
 84 |                 max_tokens= 2000
 85 |             ) 
 86 |    
 87 |     prompt = load_final_prompt(args)
 88 |     
 89 |     # check if directory exists. If not, make directory.
 90 |     if not os.path.exists(f'tasks/{args.task}/generated_prompts'):
 91 |         os.makedirs(f'tasks/{args.task}/generated_prompts')
 92 | 
 93 |     # paths
 94 |     save_path = f"tasks/{args.task}/generated_prompts/classification_result_{args.planner_llm}_{args.shot}shot.json"
 95 |         
 96 |     list_of_model_inputs = [
 97 |         prompt
 98 |     ]
 99 |     outputs = await generate_concurrently(scr_llm, list_of_model_inputs, None)
100 |     outputs = [output.strip() for output in outputs]
101 |     print("\n".join(outputs))
102 | 
103 |     with open(save_path, "w") as f:
104 |         json.dump({"input": list_of_model_inputs[0].split("\n"),"output": [o.split("\n") for o in outputs]}, f, indent=4)
105 |     
106 |     with open(save_path.replace(".json", ".txt"), "w") as f:
107 |         f.write(outputs[0])
108 | 
109 | if __name__ == "__main__":
110 |     args = parse_args()
111 |     asyncio.run(main(args))


--------------------------------------------------------------------------------
/src/generate_analysis.py:
--------------------------------------------------------------------------------
  1 | import sys, os
  2 | import argparse
  3 | import random
  4 | 
  5 | sys.path.append(os.environ.get('PROJECTPATH'))
  6 | from src.utils import *
  7 | 
  8 | 
  9 | def parse_args():
 10 |     parser = argparse.ArgumentParser()
 11 |     parser.add_argument("--task", type=str, choices=LIST_OF_TASKS)
 12 |     parser.add_argument("--task_version", type=str)
 13 |     parser.add_argument("--planner_llm", type=str, default="gpt")
 14 |     parser.add_argument("--use_free_format", action="store_true", default=False, help="Depends on the result of the task classification")
 15 |     parser.add_argument("--reproduce", action="store_true", default=True, help="If you want to reproduce our results, use this argument.")
 16 |     parser.add_argument("--shot", type=int, default=3)
 17 |     parser.add_argument("--port", type=str, default="8000")
 18 |     return parser.parse_args()
 19 | 
 20 | def load_prompt(args):
 21 |     with open(f"prompts/nl_prompt_generation_from_scratch_shot{'_2' if args.use_free_format else ''}.txt", "r") as f:
 22 |         prompt = f.read()
 23 |     return prompt
 24 | 
 25 | def load_demonstration(task_name):
 26 |     with open(f"tasks/{task_name}/data.json", "r") as f:
 27 |         data = json.load(f)['examples']
 28 |     exemplars = "\n\n".join([sample['input'] for sample in random.sample(data, 5)])
 29 |     return exemplars
 30 | 
 31 | def load_example_instances_and_code_prompt(task_name, args):
 32 |     example_questions = load_demonstration(task_name)
 33 |     with open(f"tasks/{task_name}/prompts/explanation{'_2' if args.use_free_format else ''}.txt", "r") as f:
 34 |             code_prompt = f.read()
 35 |     return example_questions, code_prompt
 36 | 
 37 | def parsing(text):
 38 |     text = text.strip()
 39 |     if "\n\n" in text:
 40 |         text = text.split("\n\n")[0].strip()
 41 |     text = "\n".join(text.split("\n")[:10])
 42 |     return text
 43 | 
 44 | async def main(args):
 45 |     model_name_dict = {
 46 |         "codellama": "codellama/CodeLlama-7b-Instruct-hf",
 47 |         "mistral": "mistralai/Mistral-7B-Instruct-v0.2",
 48 |         "gpt": "gpt-3.5-turbo",
 49 |         "gpt4": "gpt-4",
 50 |         "codellama-13b": "codellama/CodeLlama-13b-Instruct-hf",
 51 |         "codellama-34b": "codellama/CodeLlama-34b-Instruct-hf",
 52 |     }
 53 |     model_name = model_name_dict[args.planner_llm]
 54 |     if 'gpt' in args.planner_llm:
 55 |         scr_llm = ChatOpenAI(
 56 |                             model_name=model_name,
 57 |                             # temperature=args.temperature,
 58 |                             temperature = 0.0,
 59 |                             max_retries=100,
 60 |                             stop=["[Example"],
 61 |                             max_tokens=3000
 62 |                         )
 63 |     else:
 64 |         scr_llm = OpenAI(
 65 |                 model_name=model_name,
 66 |                 temperature=0.0,
 67 |                 max_retries=100,
 68 |                 openai_api_key='EMPTY',
 69 |                 openai_api_base=f"http://localhost:{args.port}/v1",
 70 |                 stop=["[Example"],
 71 |                 max_tokens= 2000
 72 |             ) 
 73 |    
 74 |     prompt = load_prompt(args)
 75 |     
 76 |     output_formats = {
 77 |         'object_counting': "A positive integer",
 78 |         'temporal_sequences': "'(A)', '(B)', '(C)', ...",
 79 |         'reasoning_about_colored_objects': "'(A)', '(B)', '(C)', ...",
 80 |         'tracking_shuffled_objectives': "'(A)', '(B)', '(C)', ...",
 81 |         'dyck_languages': "A string of closing brakets seperated with a space.",
 82 |         'web_of_lies': "'Yes' or 'No'",
 83 |         'navigate': "'Yes' or 'No'",
 84 |         'geometric_shapes': "'(A)', '(B)', '(C)', ...",
 85 |         "penguins_in_a_table": "'(A)', '(B)', '(C)', ...",
 86 |     }
 87 |     if args.use_free_format:
 88 |         example_tasks = ["penguins_in_a_table", "reasoning_about_colored_objects", "geometric_shapes", "navigate"]
 89 |     else:
 90 |         example_tasks = list(output_formats.keys())
 91 |     
 92 |     helper = helper_dict[args.task](args)
 93 |     # check if directory exists. If not, make directory.
 94 |     if not os.path.exists(f'tasks/{args.task}/generated_prompts'):
 95 |         os.makedirs(f'tasks/{args.task}/generated_prompts')
 96 |     
 97 |     if args.task in example_tasks:
 98 |         example_tasks.remove(args.task)
 99 |     sampled_example_tasks = random.sample(example_tasks, args.shot)
100 | 
101 |     exemplar = "[Example 1]\nExample task instances:\n"
102 |     for i, task in enumerate(sampled_example_tasks):
103 |         task_instance, code_prompt = load_example_instances_and_code_prompt(task,args)
104 |         exemplar += task_instance + "\n\nOutput Format:\n" + output_formats[task] + "\n\nExplanation:\n" + code_prompt
105 |         exemplar += f"\n\n[Example {i+2}]\nExample task instances:\n"
106 |     task_instance = load_demonstration(args.task)
107 |     exemplar += task_instance + "\n\nOutput Format:\n" + output_formats[args.task] + "\n\nExplanation:"
108 | 
109 |     # paths
110 |     save_path = f"tasks/{args.task}/generated_prompts/a_explanation_{args.shot}shot_by_{args.planner_llm}.json"
111 |     if args.use_free_format:
112 |         save_path = save_path.replace(".json", "_use_free_format.json")
113 |     
114 |     if args.reproduce:
115 |         with open(f"tasks/{args.task}/prompts/nl_prompt_generation_example_prompt.txt", "r") as f:
116 |             list_of_model_inputs = [f.read()]
117 |     else:
118 |         list_of_model_inputs = [
119 |         prompt.format(exemplars=exemplar, function_name=helper.function_name)
120 |     ]
121 |     outputs = await generate_concurrently(scr_llm, list_of_model_inputs, None)
122 |     outputs = [o.strip() for o in outputs]
123 |     print(outputs)
124 | 
125 | 
126 |     with open(save_path, "w") as f:
127 |         json.dump({"input": list_of_model_inputs[0].split("\n"),"output": [o for o in outputs]}, f, indent=4)
128 |     
129 |     with open(save_path.replace(".json", ".txt"), "w") as f:
130 |         f.write(outputs[0])
131 | 
132 | if __name__ == "__main__":
133 |     args = parse_args()
134 |     asyncio.run(main(args))
135 | 


--------------------------------------------------------------------------------
/src/generate_code_prompt.py:
--------------------------------------------------------------------------------
  1 | import sys, os
  2 | import argparse
  3 | import random
  4 | 
  5 | sys.path.append(os.environ.get('PROJECTPATH'))
  6 | from src.utils import *
  7 | 
  8 | 
  9 | def parse_args():
 10 |     parser = argparse.ArgumentParser()
 11 |     parser.add_argument("--task", type=str, choices=LIST_OF_TASKS)
 12 |     parser.add_argument("--task_version", type=str)
 13 |     parser.add_argument("--planner_llm", type=str, default="gpt")
 14 |     parser.add_argument("--shot", type=int, default=3)
 15 |     parser.add_argument("--port", type=str, default="8000")
 16 |     parser.add_argument("--use_free_format", action="store_true", default=False)
 17 |     parser.add_argument("--reproduce", action="store_true", default=True, help="If you want to reproduce our results, use this argument.")
 18 |     parser.add_argument("--num_sample", type=int, default=-1)
 19 |     return parser.parse_args()
 20 | 
 21 | def load_prompt(args):
 22 |     with open(f"prompts/code_prompt_generation_from_explanation{'_2' if args.use_free_format else ''}.txt", "r") as f:
 23 |         prompt = f.read()
 24 |     return prompt
 25 | 
 26 | def load_description(task):
 27 |     with open(f"../BIG-Bench-Hard/cot-prompts/{task}.txt", "r") as f:
 28 |         prompt = f.read().split("\n")[2]
 29 |     return prompt
 30 | 
 31 | def load_plan(task_name, args):
 32 |     with open(f"tasks/{task_name}/prompts/explanation{'_2' if args.use_free_format else ''}.txt", "r") as f:
 33 |         plan = f.read()
 34 |     return plan
 35 |     
 36 | def load_target_plan(args):
 37 |     with open(f"tasks/{args.task}/generated_prompts/a_explanation_{args.shot}shot_by_{args.planner_llm}{'_use_free_format' if args.use_free_format else ''}.txt", "r") as f:
 38 |         plan = f.read()
 39 |     return plan
 40 | 
 41 | def load_demonstration(task, args):
 42 |     helper = helper_dict[task](args)
 43 |     _, data = helper.load_data()
 44 |     data = helper.load_and_prepare_data('test')
 45 |     template = "input_text = \"{input_text}\"\n"
 46 |     template += "final_answer = {function_name}(input_text)\n"
 47 |     template += "print(\"Final answer:\" + final_answer)"
 48 |     exemplars = "\n\n".join([template.format(input_text=sample['input_text'], function_name=helper.function_name) for sample in random.sample(data, args.shot)])
 49 |     return exemplars
 50 | 
 51 | def load_example_instances_and_code_prompt(task, args):
 52 |     example_questions = load_demonstration(task, args)
 53 |     with open(f"tasks/{task}/prompts/code_prompt_cot_edited{'_2' if args.use_free_format else ''}.txt", "r") as f:
 54 |         code_prompt = f.read()
 55 |     return example_questions, code_prompt
 56 | 
 57 | def remove_example_usage(code_text):
 58 |     if "[Example 5]" in code_text:
 59 |         code_text = code_text.split("[Example 5]")[0].strip()
 60 |     last_def_index = code_text.rfind("def ")
 61 |     if last_def_index == -1:
 62 |         return code_text
 63 |         
 64 |     last_return_index = code_text.rfind("  return ", last_def_index)
 65 |     if last_return_index == -1:
 66 |         last_return_index = code_text.rfind("  pass", last_def_index)
 67 |     cut_index = code_text.find("\n", last_return_index)
 68 |     if cut_index == -1:
 69 |         return code_text
 70 |     return code_text[:cut_index+1].strip()
 71 | 
 72 | 
 73 | async def main(args):
 74 |     model_name_dict = {
 75 |         "codellama": "codellama/CodeLlama-7b-Instruct-hf",
 76 |         "mistral": "mistralai/Mistral-7B-Instruct-v0.2",
 77 |         "gpt": "gpt-3.5-turbo",
 78 |         "gpt4": "gpt-4",
 79 |         "codellama-13b": "codellama/CodeLlama-13b-Instruct-hf",
 80 |         "codellama-34b": "codellama/CodeLlama-34b-Instruct-hf",
 81 |     }
 82 |     model_name = model_name_dict[args.planner_llm]
 83 |     if 'gpt' in args.planner_llm:
 84 |         scr_llm = ChatOpenAI(
 85 |                             model_name=model_name,
 86 |                             # temperature=args.temperature,
 87 |                             temperature = 0.0,
 88 |                             max_retries=100,
 89 |                             stop=["[Example"],
 90 |                             max_tokens=3000
 91 |                         )
 92 |     else:
 93 |         scr_llm = OpenAI(
 94 |                 model_name=model_name,
 95 |                 temperature=0.0,
 96 |                 max_retries=100,
 97 |                 openai_api_key='EMPTY',
 98 |                 openai_api_base=f"http://localhost:{args.port}/v1",
 99 |                 stop=["[Example"],
100 |                 max_tokens= 2000
101 |             ) 
102 |    
103 |     prompt = load_prompt(args)
104 |     output_format_dict = {
105 |         'object_counting': "A positive integer",
106 |         'temporal_sequences': "'(A)', '(B)', '(C)', ...",
107 |         'reasoning_about_colored_objects': "'(A)', '(B)', '(C)', ...",
108 |         'tracking_shuffled_objectives': "'(A)', '(B)', '(C)', ...",
109 |         'dyck_languages': "A string of closing brakets seperated with a space.",
110 |         'web_of_lies': "'Yes' or 'No'",
111 |         'navigate': "'Yes' or 'No'",
112 |         'geometric_shapes': "'(A)', '(B)', '(C)', ...",
113 |         "penguins_in_a_table": "'(A)', '(B)', '(C)', ...",
114 |     }
115 |     helper = helper_dict[args.task](args)
116 |     
117 |     # check if directory exists. If not, make directory.
118 |     if not os.path.exists(f'tasks/{args.task}/generated_prompts'):
119 |         os.makedirs(f'tasks/{args.task}/generated_prompts')
120 |     
121 |     if not args.reproduce:
122 |         example_tasks_list = list(output_format_dict.keys())
123 |         if args.task in example_tasks_list:
124 |             example_tasks_list.remove(args.task)
125 |         sampled_example_tasks = random.sample(example_tasks_list, args.shot)
126 | 
127 |         exemplar = "[Example 1]\nTask description:\n"
128 |         for i, task in enumerate(sampled_example_tasks):
129 |             description = load_description(task)
130 |             task_instance, code_prompt = load_example_instances_and_code_prompt(task, args)
131 |             plan = load_plan(task, args)
132 |             exemplar += description + "\n\nExample task instances and the code usage:\n" + task_instance + "\n\nFormat of the Final answer:\n" + output_format_dict[task] + f"\n\nExplanation:\n{plan}"+"\n\nCode prompt:\n" + code_prompt
133 |             exemplar += f"\n\n[Example {i+2}]\nTask description:\n"
134 |         description = load_description(args.task)
135 |         task_instance = load_demonstration(args.task, args)
136 |         target_plan = load_target_plan(args)
137 |         exemplar += description + "\n\nExample task instances and the code usage:\n" + task_instance + "\n\nFormat of the Final answer:\n" + output_format_dict[task] + f"\n\nExplanation:\n{target_plan}"+"\n\nCode prompt:"
138 | 
139 |     # paths
140 |     save_path = f"tasks/{args.task}/generated_prompts/a_code_prompt_from_explanation_planner_{args.planner_llm}_{args.shot}shot.json"
141 |     if args.use_free_format:
142 |         save_path = save_path.replace(".json", "_use_free_format.json")
143 |     if args.reproduce:
144 |         save_path = save_path.replace(".json", "_reproduce.json")
145 | 
146 |     if args.reproduce:
147 |         with open(f"tasks/{args.task}/prompts/code_prompt_generation_example_prompt.txt", "r") as f:
148 |             list_of_model_inputs = [f.read()]
149 |     else:
150 |         list_of_model_inputs = [
151 |         prompt.format(exemplars=exemplar, function_name=helper.function_name)
152 |     ]
153 |     
154 |     outputs = await generate_concurrently(scr_llm, list_of_model_inputs, None)
155 |     outputs = [remove_example_usage(output) for output in outputs]
156 |     print("\n".join(outputs))
157 | 
158 | 
159 |     with open(save_path, "w") as f:
160 |         json.dump({"input": list_of_model_inputs[0].split("\n"),"output": [o.split("\n") for o in outputs]}, f, indent=4)
161 |     
162 |     with open(save_path.replace(".json", ".txt"), "w") as f:
163 |         f.write(outputs[0])
164 | 
165 | if __name__ == "__main__":
166 |     args = parse_args()
167 |     asyncio.run(main(args))
168 | 


--------------------------------------------------------------------------------
/src/prompts/code_prompt_generation_from_explanation.txt:
--------------------------------------------------------------------------------
1 | Generate the code prompt for the last task using the similar style of the example codes. Add enough print() functions following the provided steps  in the provided explanation to output intermediate reasoning steps towards the answer and keep track of important variables. Implement the code prompt as much as you can and describe the logic in code following the provided explanation but do not make a code that is biased toward a single task example instance. For example, do not use hard-coded variables that are obtained from task instances (e.g., using specific name of person in the question). The code prompt must be able to be applied to various instances of same task. When returning the final answer, carefully consider the output format. Especially, for the multiple choice questions, the final answer should be one of the given options. The main function name should be '{function_name}'. Along with the main function, you may want to define some helper functions that might be helpful for implementing the '{function_name}'. But you don't have to explicitly implement the helper functions, but just define them with function name and a single-line explanation in comment. When constructing the main function, actively use 'if else statement' to take different reasoning paths based on the condition, 'loop' to efficiently process the repititive instructions, 'dictionary' to keep track of connections between important variables, and 'print()' to output enough intermediate reasoning steps towards the answer and keep tracking changes of important variables.
2 | 
3 | {exemplars}


--------------------------------------------------------------------------------
/src/prompts/code_prompt_generation_from_explanation_2.txt:
--------------------------------------------------------------------------------
1 | Generate the code prompt for the last task using the similar style of the example codes. Implement the code prompt as much as you can and describe the logic in code following the provided explanation but do not make a code that is biased toward a single task example instance. For example, do not use hard-coded variables that are obtained from task instances (e.g., using specific name of person in the question). The code prompt must be able to be applied to various instances of same task. When returning the final answer, carefully consider the output format. Especially, for the multiple choice questions, the final answer should be one of the given options. The main function name should be '{function_name}'. Along with the main function, you may want to define some helper functions that might be helpful for implementing the '{function_name}'. But you don't have to explicitly implement the helper functions, but just define them with function name and a single-line explanation in comment. When constructing the main function, actively use 'if else statement' to take different reasoning paths based on the condition, 'loop' to efficiently process the repititive instructions, 'dictionary' to keep track of connections between important variables, and 'print()' to output enough intermediate reasoning steps towards the answer and keep tracking changes of important variables. In a free format reasoning step, you don't have to specify the logic, but you should use an abstract fucntion like 'derive_answer_with_explantion' which outputs both reasons and proper answer for the situation. For the variables tracking step, you should use 'for loop' to process the multiple statements or instructions step by step, 'print()' to output intermediate steps to keep track of changes of variables.
2 | 
3 | {exemplars}


--------------------------------------------------------------------------------
/src/prompts/nl_prompt_generation_from_scratch_shot.txt:
--------------------------------------------------------------------------------
1 | Generate an explanation, analyzation, and plan to generate code prompt for the last task considering the example task instances. Your plan should show enough intermediate reasoning steps towards the answer. Construct the plan as much as you can and describe the logic specifically. When constructing the plan for the code prompt, actively use 'if else statement' to take different reasoning paths based on the condition, 'loop' to efficiently process the repititive instructions, 'dictionary' to keep track of connections between important variables.
2 | 
3 | {exemplars}


--------------------------------------------------------------------------------
/src/prompts/nl_prompt_generation_from_scratch_shot_2.txt:
--------------------------------------------------------------------------------
1 | Generate an explanation, analysis, and plan to generate code prompt for the last task considering the example task instances. Your plan should show enough intermediate reasoning steps towards the answer and will be applied only the each of the instances. Construct the plan as much as you can and describe the logic specifically. When constructing the plan for the code prompt, actively use 'if else statement' to take different reasoning paths based on the condition, 'loop' to efficiently process the repititive instructions, 'dictionary' to keep track of connections between important variables, and 'print()' to output enough intermediate reasoning steps towards the answer and keep tracking changes of important variables. When analyzing the task, you should carefully decide whether the task needs variable tracking, the free format reasoning approach, or both of them. The variables tracking is effective when the task has multiple instructions or statements to process. For the free format reasoning, it works well when the task needs a complex or diverse logic to implement. In a free format reasoning step, you don't have to specify the logic, but you should specify an abstract fucntion like 'derive_answer_with_explantion' which outputs both reasons and proper answer for the situation.
2 | 
3 | {exemplars}


--------------------------------------------------------------------------------
/src/prompts/task_classification_prompt.txt:
--------------------------------------------------------------------------------
1 | Choose between the variables tracking and the free format reasoning which strategy would be more effective than the other one when tackling the last task considering the example task instances and explain why. The variables tracking is effective when the task has multiple instructions or statements to process and need to check the important variables for each of the step. For the free format reasoning, it works well when the task needs a complex or diverse logic that is difficult to be expressed in a specific method. In a free format reasoning step, you don't have to specify the logic, but you should specify an abstract fucntion like 'derive_answer_with_explantion' which outputs both reasons and proper answer for the situation. For the variables tracking step, you should use 'for loop' to process the multiple statements or instructions step by step, 'print()' to output intermediate steps to keep track of changes of variables.
2 | 
3 | {exemplar}


--------------------------------------------------------------------------------
/src/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | export PROJECTPATH="" # your path
 3 | export OPENAI_API_KEY="" # your API key
 4 | 
 5 | tasks=("dyck_languages" "navigate" "geometric_shapes" "reasoning_about_colored_objects" "temporal_sequences" "tracking_shuffled_objectives" "web_of_lies")
 6 | 
 7 | for task in "${tasks[@]}"; do
 8 |     python generate_analysis.py --task $task 
 9 |     python generate_code_prompt.py --task $task 
10 |     python scoring_single_prompt.py --task $task --scr_model_name gpt --prompt_path tasks/$task/generated_prompts/a_code_prompt_from_explanation_planner_gpt_3shot_reproduce.txt --code_prompt
11 | done


--------------------------------------------------------------------------------
/src/scoring_single_prompt.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | from langchain_community.chat_models import ChatOpenAI
  3 | from langchain_community.llms import OpenAI
  4 | from tqdm import tqdm
  5 | import asyncio
  6 | import json
  7 | import random
  8 | import copy
  9 | import sys
 10 | import os
 11 | 
 12 | sys.path.append(os.environ.get('PROJECTPATH'))
 13 | from src.utils import *
 14 | 
 15 | def parse_args():
 16 |     parser = argparse.ArgumentParser()
 17 |     parser.add_argument("--scr_model_name", type=str, default="codellama")
 18 |     parser.add_argument("--score_type", type=str, default="pass_rate")
 19 |     parser.add_argument("--code_prompt", action="store_true", help="Enable code prompt if flag is present")
 20 |     parser.add_argument("--num_sample", type=int, default=-1)
 21 |     parser.add_argument("--port", type=int, default=8000)
 22 |     parser.add_argument("--prompt_path", type=str)
 23 |     parser.add_argument("--compositional_inference", action="store_true")
 24 |     parser.add_argument("--inst_token", action="store_true")
 25 |     parser.add_argument("--use_extraction", action="store_true")
 26 |     parser.add_argument("--nl_type", type=str, default="direct", choices=['direct', 'cot', 'pns'])
 27 |     parser.add_argument("-r", "--retry_threshold", type=int, default=10 )
 28 |     parser.add_argument("--task", type=str, required=True)
 29 |     return parser.parse_args()
 30 | 
 31 | def load_prompt(args):
 32 |     with open(args.prompt_path, "r") as f:
 33 |         prompt = f.read() 
 34 |     return prompt
 35 | 
 36 | def load_templates(args):
 37 |     prompt_type = "code" if args.code_prompt else "nl"
 38 |     if args.code_prompt:
 39 |         with open(f"tasks/{args.task}/scoring_prompt_template_{prompt_type}.txt", "r") as f:
 40 |             scoring_prompt = f.read()
 41 |     else:
 42 |         with open(f"tasks/{args.task}/scoring_prompt_template_{prompt_type}_{args.nl_type}.txt", "r") as f:
 43 |             scoring_prompt = f.read()
 44 |     return scoring_prompt
 45 | 
 46 | 
 47 | async def calculate_score_for_optimized_prompt(llm, data, scoring_prompt, optimized_prompt, helper):
 48 |     ''' evaluate an optimized instruction using scorer model '''
 49 |     # construct model inputs using instances in evaluation set
 50 |     list_of_model_inputs = [scoring_prompt.format(input_text=d['input_text'], prompt=optimized_prompt, function_name=helper.function_name) for d in data]
 51 |     
 52 |     outputs = await generate_concurrently(llm, list_of_model_inputs, args)
 53 |     if helper.args.use_extraction:
 54 |         list_of_model_inputs = [mi + "\n" + output[:output.rfind("Final answer")].strip() + "\n" + "Therefore, the answer is " if not args.code_prompt else mi + "\n" + output[:output.rfind("Final answer")].strip() + "\n" + "Final answer:" for mi, output in zip(list_of_model_inputs, outputs)]
 55 |         progress_tracker = {'completed': 0, 'dynamic_threshold': int(len(list_of_model_inputs) * 0.8)}
 56 |         tasks = [
 57 |             async_generate(llm, i, mi, args, progress_tracker) for i, mi in enumerate(list_of_model_inputs)
 58 |         ]
 59 |         results = []
 60 |         for f in tqdm_async(asyncio.as_completed(tasks), total=len(tasks)):
 61 |             result = await f
 62 |             results.append(result)
 63 | 
 64 |         sorted_results = sorted(results, key=lambda x: x[0])
 65 |         outputs = [output for index, output in sorted_results]
 66 |     result_score, individual_score = helper.evaluate_prediction(outputs)
 67 | 
 68 |     return result_score, individual_score, outputs, list_of_model_inputs
 69 | 
 70 | ################################################################
 71 | # MAIN FUNCTION
 72 | ################################################################
 73 | async def main(args):
 74 | 
 75 |     model_name_dict = {
 76 |         "codellama": "codellama/CodeLlama-7b-Instruct-hf",
 77 |         "mistral": "mistralai/Mistral-7B-Instruct-v0.2",
 78 |         "gpt": "gpt-3.5-turbo",
 79 |         "gpt4": "gpt-4",
 80 |         "codellama-13b": "codellama/CodeLlama-13b-Instruct-hf",
 81 |         "llama-13b": "meta-llama/Llama-2-13b-hf",
 82 |         "codellama-34b": "codellama/CodeLlama-34b-Instruct-hf",
 83 |     }
 84 |     model_name = model_name_dict[args.scr_model_name]
 85 |     if 'gpt' in args.scr_model_name:
 86 |         scr_llm = ChatOpenAI(
 87 |                             model_name=model_name,
 88 |                             # temperature=args.temperature,
 89 |                             temperature = 0.0,
 90 |                             max_retries=100,
 91 |                             max_tokens=1500,
 92 |                         )
 93 |     else:
 94 |         if args.compositional_inference:
 95 |             num_tokens = 200
 96 |         elif args.use_extraction:
 97 |             num_tokens = 500
 98 |         else:
 99 |             num_tokens = 1500
100 |         scr_llm = OpenAI(
101 |                 model_name=model_name,
102 |                 temperature=0.0,
103 |                 max_retries=100,
104 |                 openai_api_key='EMPTY',
105 |                 openai_api_base=f"http://localhost:{args.port}/v1",
106 |                 max_tokens= num_tokens,
107 |                 stop = ["\n"] if args.compositional_inference else None
108 |             ) 
109 | 
110 |     task_helper = helper_dict[args.task](args)
111 |     # load data and templates
112 |     _, test_data = task_helper.load_data()
113 |     _, test_data = task_helper.load_and_prepare_data("train"), task_helper.load_and_prepare_data("test")
114 |     best_prompt = load_prompt(args)
115 | 
116 |     # load template for meta prompt
117 |     scoring_prompt = load_templates(args)
118 |     if "cp_wo_comment_semantics" in args.prompt_path:
119 |         scoring_prompt = scoring_prompt.replace(task_helper.function_name, "function1")
120 |     if args.inst_token:
121 |         scoring_prompt = "[INST]\n" + scoring_prompt + "\n[/INST]"
122 | 
123 |     # check if directory exists. If not, make directory.
124 |     if not os.path.exists(f'tasks/{args.task}/results'):
125 |         os.makedirs(f'tasks/{args.task}/results')
126 | 
127 |     # paths
128 |     # save_path = f"{args.task}/results/"+f"{args.scr_model_name.replace('/', '-')}_"+args.prompt_path.split("/")[-1].replace(".txt","")+f"_sample{args.num_sample}.json"
129 |     save_path = f"tasks/{args.task}/results/{args.prompt_path.split('/')[-1].split('.')[0]}_{args.scr_model_name}{'_compositional' if args.compositional_inference else ''}{'_inst_token' if args.inst_token else ''}{'_extraction' if args.use_extraction else ''}" + ".json"
130 |     if not args.code_prompt:
131 |         save_path = save_path.replace(".json", f"_{args.nl_type}.json")
132 |     print(save_path)
133 |     print(args.compositional_inference)
134 |     # evaluate newly generated instructions using the scorer model
135 |     score_dic, individual_score, raw_prediction, list_of_model_inputs = await calculate_score_for_optimized_prompt(scr_llm, test_data, scoring_prompt, best_prompt, task_helper)
136 |     output = dict()
137 |     output['prompt'] = best_prompt.split("\n")
138 |     output['score'] = score_dic
139 |     output['inference'] = [{"input": list_of_model_inputs[i].split("\n"), "output": raw_prediction[i].strip().split("\n"), "score": individual_score[i]} for i in range(len(raw_prediction))] 
140 |     with open(save_path, "w") as f:
141 |         json.dump(output, f, indent=4)
142 | 
143 | if __name__ == "__main__":
144 |     args = parse_args()
145 |     asyncio.run(main(args))
146 | 


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/generated_prompts/a_code_prompt_from_explanation_planner_gpt_3shot.txt:
--------------------------------------------------------------------------------
 1 | def complete_dyck_languages(input_text):
 2 |     # Step 1: Parse the input text to extract the sequence of parentheses.
 3 |     parentheses_sequence = extract_parentheses(input_text)
 4 |     print("Parentheses sequence:", parentheses_sequence)
 5 | 
 6 |     # Step 2: Initialize a stack to keep track of opening parentheses that need closing.
 7 |     stack = []
 8 | 
 9 |     # Step 3: Iterate over each character in the sequence to handle opening and closing parentheses.
10 |     for i, char in enumerate(parentheses_sequence):
11 |         if char in ['(', '[', '{', '<']:
12 |             stack.append(char)
13 |             print(f"Step ({i}) - Pushed {char} to stack")
14 |         elif char in [')', ']', '}', '>']:
15 |             if len(stack) == 0:
16 |                 print(f"Step ({i}) - No matching opening bracket for {char}")
17 |             else:
18 |                 opening_bracket = stack.pop()
19 |                 print(f"Step ({i}) - Matched {opening_bracket} with {char}")
20 |     
21 |     # Step 4: Check if the stack is empty after processing the entire sequence.
22 |     if len(stack) > 0:
23 |         print("Remaining unmatched brackets in stack:", stack)
24 |         # Step 5: Generate the necessary closing brackets to match the remaining opening brackets.
25 |         closing_brackets = generate_closing_brackets(stack)
26 |         print("Generated closing brackets:", closing_brackets)
27 |     else:
28 |         print("All brackets are properly matched.")
29 | 
30 |     # Step 6: Return the generated closing brackets as the final answer.
31 |     return closing_brackets


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/generated_prompts/a_explanation_3shot_by_gpt.txt:
--------------------------------------------------------------------------------
 1 | This task involves completing a sequence of parentheses to ensure they are closed properly. The challenge is to understand the nesting structure of the parentheses and add the appropriate closing brackets to match each opening bracket.
 2 | 
 3 | Analyzing the Task:
 4 | 1. Parentheses Matching: Each opening parenthesis should have a corresponding closing parenthesis to ensure the sequence is valid.
 5 | 
 6 | 2. Nested Parentheses: The parentheses can be nested within each other, so it's crucial to maintain the correct nesting structure.
 7 | 
 8 | 3. Proper Closure: The goal is to add the necessary closing brackets to complete the sequence and make sure all parentheses are closed properly.
 9 | 
10 | Constructing the Code Prompt:
11 | 1. Parsing Input: Extract the sequence of parentheses from the input text.
12 | 
13 | 2. Stack Implementation: Use a stack data structure to keep track of the opening parentheses that need closing.
14 | 
15 | 3. Iterating Over the Sequence: Iterate over each character in the sequence and push opening parentheses onto the stack.
16 | 
17 | 4. [Important] Closing Parentheses: When encountering a closing parenthesis, pop the corresponding opening parenthesis from the stack to ensure proper closure.
18 | 
19 | 5. Generating Output: After processing the entire sequence, the stack should be empty if all parentheses are closed properly. If not, add the necessary closing brackets to match the remaining opening brackets.
20 | 
21 | 6. Returning the Result: Output the generated closing brackets as a string, separated by spaces, to complete the sequence.


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/generated_prompts/classification_result_gpt_3shot.txt:
--------------------------------------------------------------------------------
1 | For these tasks, the key is to ensure that the parentheses are closed properly in the given sequences. This requires a systematic approach to keep track of the opening and closing parentheses. Therefore, the variables tracking method, using a for loop to process each character in the sequence and keeping track of the opening and closing parentheses, would be more effective in solving these tasks. So the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class DyckLanguagesHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(DyckLanguagesHelper, self).__init__(args)
15 |         self.function_name = "complete_dyck_languages"
16 |     
17 |     def evaluate_prediction(self, outputs):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |        
21 |         for oi, o in enumerate(outputs):
22 |             o = o.lower()
23 |             if "final answer:" in o:
24 |                 model_output = o.split("final answer:")[-1].strip()
25 |             else:
26 |                 model_output = o.strip()
27 |             label = self.test_data[oi]['answer'].strip()
28 |             is_pass = True
29 |             for c in model_output.replace(" ", ""):
30 |                 if c not in ["(", ")", "{", "}", "[", "]", "<", ">"]:
31 |                     is_pass = False
32 |             agg_pass.append(is_pass)
33 |             if label.replace(" ", "") == model_output.replace(" ", ""):
34 |                 agg_task_accuracy.append(True)
35 |             else:
36 |                 agg_task_accuracy.append(False)
37 |         
38 |         task_accuracy = np.mean(agg_task_accuracy).item()
39 |         pass_rate = sum(agg_pass)/len(agg_pass)
40 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i]} for i in range(len(agg_task_accuracy))]
41 |         result_score = {
42 |             "pass_rate": pass_rate,
43 |             "task_accuracy": task_accuracy
44 |         }
45 |         return result_score, individual_score
46 |     
47 |     def load_data(self):
48 |         data_name = "tasks/dyck_languages/data.json"
49 |         with open(data_name, "r") as f:
50 |             data = json.load(f)['examples']
51 |         train_data = [d for d in data]
52 |         test_data = [d for d in data]
53 |         if self.args.num_sample != -1:
54 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
55 | 
56 |         self.train_data = train_data
57 |         self.test_data = test_data
58 |         return train_data, test_data
59 |     
60 |     def load_and_prepare_data(self, dataset_split):
61 |         dataset = self.train_data if dataset_split == "train" else self.test_data
62 |         all_processed_data = []
63 |         for data in tqdm(dataset):
64 |             cur_data = {k:v for k,v in data.items()}
65 |             cur_data['input_text'] = cur_data['input'].split("Input: ")[-1]
66 |             cur_data['answer'] = cur_data['target']
67 |             cur_data['label'] = cur_data['target']
68 |             all_processed_data.append(cur_data)
69 |         
70 |         if dataset_split == "train":
71 |             self.train_data = all_processed_data
72 |         else:
73 |             self.test_data = all_processed_data
74 |             
75 |         return all_processed_data
76 |         


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def complete_dyck_languages(input_text):
 2 |     # Step 1: Initialize a stack to keep track of open parentheses and split the input text to identify and define all types of open parentheses in the text.
 3 |     stack = []
 4 |     character_list = input_text.split()
 5 |     open_to_close_parenthesis_dict = {"(": ")", "<": ">", "{": "}", "[": "]"}
 6 |     opening_parenthesis = ["(", "<", "{", "["]
 7 |     print(f"Parse characters in the input and initialize a stack to track of open parentheses. \nCurrent stack: {stack}. Parsed characters: {character_list}") 
 8 |     
 9 |     
10 |     # Step 2: Through iteration over the input characters, identify opening parentheses among the input characters and add them to the stack.
11 |     print("Check if a character is an opening parenthesis while iterating over the input characters.")
12 |     for char in character_list:
13 |         if char in opening_parenthesis:
14 | 		        print(f"Iteration {i+1}: Current character {char} is an opening parenthesis.")
15 |             stack.append(char)
16 |             print(f"Thus, we append {char} to the stack. Current stack after insertion: {', '.join(stack)}")
17 |         
18 |         # Step 3: For each open parentheses, find the corresponding closing parentheses and close the open parentheses.
19 |         else:
20 |             print(f"Iteration {i+1}: Current character {char} is not an opening parenthesis.\n Thus we delete the last item {stack[-1]} from the stack\n current stack before deletion: {" ".join(stack)} -> updated stack after deletion: {' '.join(stack[:-1]) if stack else 'empty'}")
21 |             stack.pop() # Remove the last added open parentheses assuming a correct match.
22 |     
23 |     # Step 4: Generate the sequence of closing parentheses based on remaining open parentheses in the stack. 
24 |     print(f"The resulting stack is {' '.join(stack)}.")
25 |     print(f"We will need to pop out {' '.join(stack[::-1])} one by one in that order.")
26 |     closing_list = [parentheses_pairs[opening] for opening in stack[::-1]]
27 |     
28 |     # Step 5: Output the completed sequence. Generate the input sequence concatenated with the generated closing sequence of parentheses, ensuring a well-formed structure.
29 |     return " ".join(closing_list) 


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def complete_dyck_languages(input_text):
 2 |     # Step 1: Initialize a stack to keep track of open parentheses and split the input text to identify and define all types of open parentheses in the text.
 3 |     stack = []
 4 |     character_list = input_text.split()
 5 |     open_to_close_parenthesis_dict = {"(": ")", "<": ">", "{": "}", "[": "]"}
 6 |     opening_parenthesis = ["(", "<", "{", "["]
 7 |     print(f"Parse characters in the input and initialize a stack to track of open parentheses. \nCurrent stack: {stack}. Parsed characters: {character_list}") 
 8 |     
 9 |     
10 |     # Step 2: [Varibales tracking] Iterate over the input characters.
11 |     print("Check if a character is an opening parenthesis while iterating over the input characters.")
12 |     for char in character_list:
13 |         # Step 2: If it's an opening parenthesis, push it onto the stack.
14 |         if char in opening_parenthesis:
15 | 		        print(f"Iteration {i+1}: Current character {char} is an opening parenthesis.")
16 |             stack.append(char)
17 |             print(f"Thus, we append {char} to the stack. Current stack after insertion: {', '.join(stack)}")
18 |         
19 |         # Step 2: If it's a closing parenthesis, check against the last opening parenthesis in the stack for a match, and pop the stack if it matches.
20 |         else:
21 |             print(f"Iteration {i+1}: Current character {char} is not an opening parenthesis.\n Thus we delete the last item {stack[-1]} from the stack\n current stack before deletion: {" ".join(stack)} -> updated stack after deletion: {' '.join(stack[:-1]) if stack else 'empty'}")
22 |             stack.pop()
23 |     
24 |     # Step 3: After processing all input characters, any remaining open parentheses in the stack need to be closed in reverse order (LIFO).
25 |     print(f"The resulting stack is {' '.join(stack)}.")
26 |     print(f"We will need to pop out {' '.join(stack[::-1])} one by one in that order.")
27 |     closing_list = [parentheses_pairs[opening] for opening in stack[::-1]]
28 |     
29 |     # Step 5: Formulate the sequence of closing parentheses seperated with a space and return it as an answer.
30 |     return " ".join(closing_list) 


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/prompts/example_classification.txt:
--------------------------------------------------------------------------------
1 | Example task instances:
2 | Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
3 | 
4 | Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]
5 | 
6 | Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }
7 | 
8 | Reason for the decision & Answer:
9 | In order to find unclosed parentheses, tracking the opening and closing parentheses is really important. Therefore, the variables tracking is an effective way to solve the task. So the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | This task involves completing a sequence of opening and closing parentheses, ensuring that they are correctly matched and the entire sequence is well-formed. This is a classic problem in computer science related to parsing and syntactic analysis, often solved using stack data structures.
 2 | 
 3 | Analyzing the Task:
 4 | 1. Parentheses Matching: Each opening parenthesis must have a corresponding closing one. The sequence must adhere to the proper nesting and order of parentheses.
 5 | 
 6 | 2. Tracking Open Parentheses: A stack is ideal for keeping track of open parentheses, as it allows us to process them in a last-in, first-out (LIFO) manner.
 7 | 
 8 | 3. Completing the Sequence: The goal is to add the appropriate closing parentheses to complete the sequence.
 9 | 
10 | Constructing the Code Prompt:
11 | 1. Initialize a Stack: Use a stack to keep track of opening parentheses. This is crucial for understanding which parentheses are open and need closing.
12 | 
13 | 2. Split and Parse Input: Split the input string into individual characters for easy processing. Identify types of parentheses and their corresponding closing counterparts.
14 | 
15 | 3. [Important] Iterate and Process Characters: Iterate over the input characters. To keep track of the stack, make sure printing out the stack variable using print(). For each character:
16 |    - If it's an opening parenthesis, push it onto the stack.
17 |    - If it's a closing parenthesis, check against the last opening parenthesis in the stack for a match, and pop the stack if it matches.
18 | 
19 | 4. Generate Closing Sequence: After processing all input characters, any remaining open parentheses in the stack need to be closed in reverse order (LIFO).
20 | 
21 | 5. Construct and Return the Completed Sequence: Formulate the sequence of closing parentheses to append to the original sequence, ensuring that the overall structure is properly formed.


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | This task involves completing a sequence of opening and closing parentheses, ensuring that they are correctly matched and the entire sequence is well-formed. This is a classic problem in computer science related to parsing and syntactic analysis, often solved using stack data structures.
 2 | 
 3 | Analyzing the Task:
 4 | In order to find unclosed parentheses, tracking the opening and closing parentheses is really important. Therefore, the variables tracking is an effective way to solve the task.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 1. Initialize a Stack and Split Input: Use a stack to keep track of opening parentheses. This is crucial for understanding which parentheses are open and need closing. Split the input string into individual characters for easy processing. Identify types of parentheses and their corresponding closing counterparts.
 8 | 
 9 | 2. [Variables tracking] Iterate and Process Characters: Iterate over the input characters by using 'for loop'. To keep track of the stack, make sure printing out the stack variable using print(). For each character:
10 |    - If it's an opening parenthesis, push it onto the stack.
11 |    - If it's a closing parenthesis, check against the last opening parenthesis in the stack for a match, and pop the stack if it matches.
12 | 
13 | 3. Generate Closing Sequence: After processing all input characters, any remaining open parentheses in the stack need to be closed in reverse order (LIFO).
14 | 
15 | 4. Return the Required Closing Parentheses Sequence: Formulate the sequence of closing parentheses seperated with a space and return it as an answer.


--------------------------------------------------------------------------------
/src/tasks/dyck_languages/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | input_text = "{input_text}"
3 | final_answer = complete_dyck_languages(input_text)
4 | print("Final answer:"+ final_answer)
5 | Generate the expected execution output (output from all print() functions) of the code. You don't have to actually run the code and do not care about 'not implemented error'.
6 | Exprected output:


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/generated_prompts/classification_result_gpt_3shot.txt:
--------------------------------------------------------------------------------
1 | For these tasks, the key is to analyze the SVG path elements and identify the shape that is being drawn. This requires a more structured approach to track the coordinates and lines in the path element, making the variables tracking strategy more effective. So the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/generated_prompts/explanation_from_scratch_3shot_gpt.txt:
--------------------------------------------------------------------------------
 1 | Explanation:
 2 | This task involves analyzing SVG path elements to determine the shape that is being drawn. The path elements consist of a series of commands that define the movements and curves in the SVG image. By understanding these commands and their implications, we can identify the shape being drawn.
 3 | 
 4 | Analyzing the Task:
 5 | 1. SVG Path Commands: The SVG path element contains commands like M (move to), L (line to), A (arc), etc., each of which contributes to the overall shape being drawn.
 6 | 
 7 | 2. Interpreting Path Commands: Each command in the path element corresponds to a specific action, such as moving to a point, drawing a line, or creating an arc. Understanding these commands is crucial to identifying the shape.
 8 | 
 9 | 3. Recognizing Geometric Shapes: Different combinations of path commands result in different geometric shapes like circles, triangles, rectangles, etc. We need to match the commands to the shapes they represent.
10 | 
11 | Constructing the Code Prompt:
12 | 1. Parsing SVG Path Element: Extract the path commands from the SVG path element provided in the task. This involves splitting the string and identifying each command.
13 | 
14 | 2. Processing Path Commands: Iterate over each command in the path element and execute the corresponding action. This involves:
15 | - Moving to a new point (M command).
16 | - Drawing a line to a point (L command).
17 | - Drawing an arc (A command).
18 | - Other commands as necessary.
19 | 
20 | 3. [Important] Tracking Coordinates: Keep track of the coordinates as the path commands are executed. This involves updating the current position based on the commands.
21 | 
22 | 4. Determining the Shape: After processing all commands, analyze the resulting path to determine the shape being drawn. This can be done by comparing the final path with the characteristics of known shapes.
23 | 
24 | 5. Matching with Provided Options: Compare the identified shape with the options provided in the task to select the correct answer.
25 | 
26 | 6. Returning the Result: Return the identified shape as the output in the specified format ('(A)', '(B)', '(C)', ...).
27 | 
28 | By following these steps and accurately interpreting the SVG path commands, we can determine the shape being drawn and select the correct option from the given choices.


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/generated_prompts/new_edited_code_prompt_from_explanation_planner_gpt_3shot.txt:
--------------------------------------------------------------------------------
 1 | def recognize_shape_from_svg(svg_path_element):
 2 |     # Step 1: Parse the SVG path element to extract the path commands. Output the extracted path commands.
 3 |     path_commands = extract_path_commands(svg_path_element)
 4 |     print("Extracted path commands:", path_commands)
 5 | 
 6 |     # Step 2: Process each path command to execute the corresponding action. Keep track of the coordinates as the commands are executed.
 7 |     current_position = (0, 0)
 8 |     for command in path_commands:
 9 |         action, coordinates = process_path_command(command, current_position)
10 |         print(f"Executing command: {command} - Action: {action} - New coordinates: {coordinates}")
11 |         current_position = coordinates
12 | 
13 |     # Step 3: Determine the shape based on the processed path commands. This involves analyzing the final path drawn.
14 |     identified_shape = identify_shape(path_commands)
15 |     print("Identified shape:", identified_shape)
16 | 
17 |     # Step 4: Match the identified shape with the provided options to select the correct answer.
18 |     options = extract_options(svg_path_element)
19 |     answer = match_shape_with_options(identified_shape, options)
20 | 
21 |     # Step 5: Return the identified shape as the final answer.
22 |     return answer


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class GeometricShapesHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(GeometricShapesHelper, self).__init__(args)
15 |         self.function_name = "recognize_shape_from_svg"
16 |     
17 |     def evaluate_prediction(self, outputs):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |         agg_answer = []
21 |         agg_prediction = []
22 |        
23 |         for oi, o in enumerate(outputs):
24 |             o = o.replace("`","").replace(".", "").replace("'","").replace('"', "")
25 |             if "Final answer:" in o:
26 |                 model_output = o.split("Final answer:")[-1].strip()
27 |             else:
28 |                 model_output = o.strip()
29 |             label = self.test_data[oi]['answer']
30 |             option_dic = {option.split()[0]: option for option in self.test_data[oi]['options']}
31 |             is_pass = False
32 |             for option in option_dic.values():
33 |                 if model_output in option and model_output:
34 |                     is_pass = True
35 |                     break
36 |             agg_pass.append(is_pass)
37 |             agg_answer.append(label)
38 |             agg_prediction.append(model_output)
39 |             if model_output in option_dic[label] and model_output:
40 |                 agg_task_accuracy.append(True)
41 |             else:
42 |                 agg_task_accuracy.append(False)
43 |         
44 |         task_accuracy = np.mean(agg_task_accuracy).item()
45 |         pass_rate = sum(agg_pass)/len(agg_pass)
46 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i], "answer": agg_answer[i], "prediction": agg_prediction[i]} for i in range(len(agg_task_accuracy))]
47 |         result_score = {
48 |             "pass_rate": pass_rate,
49 |             "task_accuracy": task_accuracy
50 |         }
51 |         return result_score, individual_score
52 |     
53 |     def load_data(self):
54 |         data_name = "tasks/geometric_shapes/data.json"
55 |         with open(data_name, "r") as f:
56 |             data = json.load(f)['examples']
57 |         train_data = [d for d in data]
58 |         test_data = [d for d in data]
59 |         if self.args.num_sample != -1:
60 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
61 | 
62 |         self.train_data = train_data
63 |         self.test_data = test_data
64 |         return train_data, test_data
65 |     
66 |     def load_and_prepare_data(self, dataset_split):
67 |         dataset = self.train_data if dataset_split == "train" else self.test_data
68 |         all_processed_data = []
69 |         for data in tqdm(dataset):
70 |             cur_data = {k:v for k,v in data.items()}
71 |             cur_data['input_text'] = cur_data['input']
72 |             cur_data['answer'] = cur_data['target']
73 |             cur_data['label'] = cur_data['target']
74 |             cur_data['options'] = cur_data['input'].split("\nOptions:\n")[-1].split("\n")
75 |             all_processed_data.append(cur_data)
76 |         
77 |         if dataset_split == "train":
78 |             self.train_data = all_processed_data
79 |         else:
80 |             self.test_data = all_processed_data
81 |             
82 |         return all_processed_data
83 |         


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def recognize_shape_from_svg(input_text):
 2 | 		# Step 1: Get the SVG path data from the input text and generate the extracted SVG path.
 3 |     paths = parse_path(input_text)
 4 |     print("SVG paths:\n ", paths)
 5 | 		
 6 | 		# Step 2: Initialize a coordinate map that maps each coordinate with the other connected coordinates and the connection type.
 7 |     coordinate_map = dict()
 8 | 
 9 |     # Step 3: Update the coordinate map referring to the each SVG path.
10 |     for i, path in enumerate(paths):
11 |       coordinate_map = update_coordinate_map(coordinate_map, path)
12 |       print(f"Step {i} - path: {path}, updated coordinate map: {coordinate_map}")
13 | 
14 | 		# Step 4: Conduct calculation to analyze each characteristic of the shape.
15 |     analysis_results_dict = analyze_characteristics(coordinate_map)
16 |     print(f"Anlysis results: {analysis_results_dict}")
17 | 
18 | 		# Step 5: Identify a geometric shape with reasons using the completed coordinates map and the analysis results.
19 |     reason_for_the_decision, name_of_the_shape = identify_shape_with_explanation(coordinate_map, analysis_results_dict)
20 |     print(f"Reason for the decision: {reason_for_the_decision}")
21 |     print(f"Thus, the shape of the path is {name_of_the_shape}.")
22 | 
23 | 		# Step 6: Find the corresponding option from the given options and only output the label of the option as the final answer to the question.
24 |     options = parse_options(input_text)
25 |     print(f"Options: {options}")
26 |     answer = None
27 |     for option in options:
28 |       if name_of_the_shape in option:
29 |         answer = option[:3]
30 |     
31 |     return answer


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def recognize_shape_from_svg(input_text):
 2 | 		# Step 1: Get the SVG path data from the input text and generate the extracted SVG path.
 3 |     paths = parse_path(input_text)
 4 |     print("SVG paths:\n ", paths)
 5 | 		
 6 | 		# Step 2: Initialize a coordinate map that maps each coordinate with the other connected coordinates and the connection type.
 7 |     coordinate_map = dict()
 8 | 
 9 |     # Step 3: [Variables tracking] Update the coordinate map referring to the each SVG path.
10 |     for i, path in enumerate(paths):
11 |       coordinate_map = update_coordinate_map(coordinate_map, path)
12 |       print(f"Step {i} - path: {path}, updated coordinate map: {coordinate_map}")
13 | 
14 | 		# Step 4: Conduct calculation to analyze each characteristic of the shape.
15 |     analysis_results_dict = analyze_characteristics(coordinate_map)
16 |     print(f"Anlysis results: {analysis_results_dict}")
17 | 
18 | 		# Step 5: [Free format reasoning] Identify a geometric shape with reasons using the completed coordinates map and the analysis results.
19 |     reason_for_the_decision, name_of_the_shape = identify_shape_with_explanation(coordinate_map, analysis_results_dict)
20 |     print(f"Reason for the decision: {reason_for_the_decision}")
21 |     print(f"Thus, the shape of the path is {name_of_the_shape}.")
22 | 
23 | 		# Step 6: Find the corresponding option from the given options and only output the label of the option as the final answer to the question.
24 |     options = parse_options(input_text)
25 |     print(f"Options: {options}")
26 |     answer = None
27 |     for option in options:
28 |       if name_of_the_shape in option:
29 |         answer = option[:3]
30 |     
31 |     return answer


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/prompts/example_classification.txt:
--------------------------------------------------------------------------------
 1 | Example task instances:
 2 | Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
 3 | Options:
 4 | (A) circle
 5 | (B) heptagon
 6 | (C) hexagon
 7 | (D) kite
 8 | (E) line
 9 | (F) octagon
10 | (G) pentagon
11 | (H) rectangle
12 | (I) sector
13 | (J) triangle
14 | 
15 | Q: This SVG path element <path d="M 14.19,26.04 L 51.43,39.21 L 58.44,36.69 L 56.63,30.17 L 48.53,26.66 L 14.19,26.04"/> draws a
16 | Options:
17 | (A) circle
18 | (B) heptagon
19 | (C) hexagon
20 | (D) kite
21 | (E) line
22 | (F) octagon
23 | (G) pentagon
24 | (H) rectangle
25 | (I) sector
26 | (J) triangle
27 | 
28 | Q: This SVG path element <path d="M 41.00,43.00 L 37.00,34.00 L 41.00,33.00 L 45.00,34.00 L 41.00,43.00"/> draws a
29 | Options:
30 | (A) circle
31 | (B) heptagon
32 | (C) hexagon
33 | (D) kite
34 | (E) line
35 | (F) octagon
36 | (G) pentagon
37 | (H) rectangle
38 | (I) sector
39 | (J) triangle
40 | 
41 | Reason for the decision & Answer:
42 | After analyzing the given SVG paths, finding attributes and characteristics of the shape needs a really complicated operation that is difficult to specify. Thus, the free format reasoning is needed to solve the task. So the answer is Free format reasoning.


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | This task involves identifying the geometric shape formed by an SVG path. SVG (Scalable Vector Graphics) is a language for describing two-dimensional graphics in XML. The path element is used to define a path that can consist of lines, curves, arcs, and more. Understanding and interpreting these paths to determine the resulting shape is the challenge.
 2 | 
 3 | Analyzing the Task:
 4 | 1. Interpreting SVG Path Commands: Understanding the "M" (moveto) and "L" (lineto) commands in SVG is essential. These commands dictate the starting point and subsequent points of the shape.
 5 | 
 6 | 2. Analyze the SVG Path Commands: By generating a detailed explanation on the Path, the code should generate an analysis of the shape of the path.
 7 | 
 8 | 3. Advanced Geometry Analysis: For complex shapes, more sophisticated geometric analysis might be needed, such as calculating side lengths or angles.
 9 | 
10 | Constructing the Code Prompt:
11 | 1. Extracting SVG Path Data: The first step involves parsing the input text to extract the SVG path data.
12 | 
13 | 2. Initializing a coordinate map: Create a coordinate map that maps each coordinate with the other connected coordinates 
14 | 
15 | 3. Updating the coordinate map: Iterate over the parsed SVG paths, update the coordinate map referring to the each SVG path.
16 | 
17 | 4. Analyzing the characteristics of the shape referring to the coordinate map: According to the coordinate map, make an analysis over the shape.
18 | 
19 | 5. [important] Shape Identification: Identify reasons and the proper shape using the completed coordinates map and the analysis results. Use an abstract function like 'identify_shape_with_explanation' which outputs both the reason and the shape.
20 | 
21 | 6. Matching and Outputting the Answer: Parse the options from the input text, find the corresponding option from the given options, and only output the label of the option as the final answer to the question.


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | This task involves identifying the geometric shape formed by an SVG path. SVG (Scalable Vector Graphics) is a language for describing two-dimensional graphics in XML. The path element is used to define a path that can consist of lines, curves, arcs, and more. Understanding and interpreting these paths to determine the resulting shape is the challenge.
 2 | 
 3 | Analyzing the Task:
 4 | In order to understand the given SVG paths, it is crucial to keep track of the each coordinates, lines, and curves in the SVG paths. Therefore, variables tracking is effective to solve the task. However, after analyzing the given SVG paths, finding attributes and characteristics of the shape could need a really complicated operation. Thus, the free format reasoning is also needed to solve the task.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 1. Extracting SVG Path Data: The first step involves parsing the input text to extract the SVG path data.
 8 | 
 9 | 2. Initializing a coordinate map: Create a coordinate map that maps each coordinate with the other connected coordinates 
10 | 
11 | 3. [Variables tracking] Updating the coordinate map: Iterate over the parsed SVG paths by using 'for loop', update the coordinate map referring to the each SVG path. To keep track of the changes in the coordinate map, use 'print()' to print the intermediate changes.
12 | 
13 | 4. Analyzing the characteristics of the shape referring to the coordinate map: According to the coordinate map, make an analysis over the shape.
14 | 
15 | 5. [Free format reasoning] Shape Identification: Identify reasons and the proper shape using the completed coordinates map and the analysis results. Use an abstract function like 'identify_shape_with_explanation' which outputs both the reason and the shape.
16 | 
17 | 6. Matching and Outputting the Answer: Parse the options from the input text, find the corresponding option from the given options, and only output the label of the option as the final answer to the question.


--------------------------------------------------------------------------------
/src/tasks/geometric_shapes/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | input_text = "{input_text}"
3 | final_answer = recognize_shape_from_svg(input_text)
4 | print("Final answer:"+ final_answer)
5 | Generate the expected execution output (output from all print() functions) of the code. You don't have to actually run the code and do not care about 'not implemented error'.
6 | Output:
7 | 


--------------------------------------------------------------------------------
/src/tasks/navigate/generated_prompts/classification_result_gpt_3shot.txt:
--------------------------------------------------------------------------------
1 | For these tasks, the key is to follow the sequence of steps and determine if they lead back to the starting point. This requires tracking the movements and directions taken throughout the instructions. Therefore, the variables tracking method would be more effective in solving these problems as it allows for a systematic approach to keep track of the movements and directions. So the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/navigate/generated_prompts/explanation_from_scratch_3shot_gpt.txt:
--------------------------------------------------------------------------------
 1 | Explanation:
 2 | This task involves following a series of instructions related to movement and direction to determine if the final position is the same as the starting point. The challenge lies in accurately tracking the movements and rotations to deduce the final position.
 3 | 
 4 | Analyzing the Task:
 5 | 1. Movement Tracking: Keep track of the steps taken in each direction (forward, backward, left, right) to determine the final position.
 6 | 
 7 | 2. Directional Changes: Account for any rotations (turning left or right) that may alter the orientation during movement.
 8 | 
 9 | 3. Spatial Reasoning: Apply logical reasoning to calculate the final position based on the cumulative effect of the movements and rotations.
10 | 
11 | Constructing the Code Prompt:
12 | 1. Extracting Instructions: Parse the input text to extract the sequence of movements and rotations.
13 | 
14 | 2. Processing Movements:
15 |    - Initialize variables to track the current position (x, y coordinates) and orientation (direction faced).
16 |    - Iterate over each instruction, updating the position and orientation accordingly.
17 | 
18 | 3. [Important] Determining the Final Position:
19 |    - Calculate the final position based on the cumulative effect of movements and rotations.
20 |    - Check if the final position matches the starting point to determine if the participant returns to the starting point.
21 | 
22 | 4. Matching with Options: Compare the final position with the starting point to determine if the participant returns to the starting point.
23 | 
24 | 5. Returning the Final Answer: Output 'Yes' if the participant returns to the starting point, 'No' otherwise.


--------------------------------------------------------------------------------
/src/tasks/navigate/generated_prompts/new_edited_code_prompt_from_explanation_planner_gpt_3shot.txt:
--------------------------------------------------------------------------------
 1 | def ends_up_at_start(input_text):
 2 |     # Step 1: Extract the sequence of movements and rotations from the input text.
 3 |     instructions = extract_instructions(input_text)
 4 |     print("Instructions:", instructions)
 5 | 
 6 |     # Step 2: Initialize variables to track the current position and orientation.
 7 |     x, y = 0, 0
 8 |     orientation = "N"
 9 |     print(f"Initial position: ({x}, {y}), Orientation: {orientation}")
10 | 
11 |     # Step 3: Process each instruction to update the position and orientation accordingly.
12 |     for instruction in instructions:
13 |         x, y, orientation = process_instruction(instruction, x, y, orientation)
14 |         print(f"Instruction: {instruction} -> Position: ({x}, {y}), Orientation: {orientation}")
15 | 
16 |     # Step 4: Determine the final position after following all instructions.
17 |     final_position = (x, y)
18 |     print("Final Position:", final_position)
19 | 
20 |     # Step 5: Check if the final position matches the starting point to determine if the participant returns to the starting point.
21 |     if final_position == (0, 0):
22 |         return 'Yes'
23 |     else:
24 |         return 'No'


--------------------------------------------------------------------------------
/src/tasks/navigate/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class NavigateHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(NavigateHelper, self).__init__(args)
15 |         self.function_name = "ends_up_at_start"
16 |     
17 |     def evaluate_prediction(self, outputs):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |         model_outputs = []
21 | 
22 |         for oi, o in enumerate(outputs):
23 |             o = o.lower()
24 |             if "answer:" in o:
25 |                 model_output = o.split("answer:")[-1].strip()
26 |             else:
27 |                 model_output = o.strip()
28 |             label = self.test_data[oi]['answer'].lower()
29 |             is_pass = False
30 |             for ans in ['yes', 'no']:
31 |                 if ans in model_output[:5]:
32 |                     is_pass = True
33 |                     break
34 |             agg_pass.append(is_pass)
35 |             if label in model_output[:5]:
36 |                 agg_task_accuracy.append(True)
37 |             else:
38 |                 agg_task_accuracy.append(False)
39 |             model_outputs.append(model_output)
40 |         
41 |         task_accuracy = np.mean(agg_task_accuracy).item()
42 |         pass_rate = sum(agg_pass)/len(agg_pass)
43 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i], "prediction": model_outputs[i], "answer": self.test_data[i]['answer']} for i in range(len(agg_task_accuracy))]
44 |         result_score = {
45 |             "pass_rate": pass_rate,
46 |             "task_accuracy": task_accuracy
47 |         }
48 |         return result_score, individual_score
49 |     
50 |     def load_data(self):
51 |         data_name = "tasks/navigate/data.json"
52 |         with open(data_name, "r") as f:
53 |             data = json.load(f)['examples']
54 |         train_data = [d for d in data]
55 |         test_data = [d for d in data]
56 |         if self.args.num_sample != -1:
57 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
58 | 
59 |         self.train_data = train_data
60 |         self.test_data = test_data
61 |         return train_data, test_data
62 |     
63 |     def load_and_prepare_data(self, dataset_split):
64 |         dataset = self.train_data if dataset_split == "train" else self.test_data
65 |         all_processed_data = []
66 |         for data in tqdm(dataset):
67 |             cur_data = {k:v for k,v in data.items()}
68 |             cur_data['input_text'] = cur_data['input'][92:-20] if "Always face forward. " in cur_data['input'] else cur_data['input'][71:-20]
69 |             cur_data['answer'] = cur_data['target']
70 |             cur_data['label'] = cur_data['target']
71 |             all_processed_data.append(cur_data)
72 |         
73 |         if dataset_split == "train":
74 |             self.train_data = all_processed_data
75 |         else:
76 |             self.test_data = all_processed_data
77 |             
78 |         return all_processed_data
79 |         


--------------------------------------------------------------------------------
/src/tasks/navigate/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def ends_up_at_start(input_text):
 2 |     # Step 1: Initialize coordinates and direction by setting the starting point at (0, 0) and face north.
 3 |     cur_x, cur_y = 0, 0
 4 |     cur_direction = 0
 5 | 
 6 |     # Step 2: Identify and list up instructions from the input text.
 7 |     instructions = parse_instructions(input_text)
 8 |     
 9 |     # Step 3: Process each instruction and update the current coordinates and direction. In order to keep track of changes, output the instruction, current and updated coordinates and direction.
10 |     for i, instruction in enumerate(instructions):
11 |         new_x, new_y, new_direction = process_instruction(instruction, cur_x, cur_y, cur_direction) # process instruction to calculate new position and direction
12 |         print(f"Step {i}: {instruction} - current coordinates: ({cur_x}, {cur_y}), current direction: {cur_direction} -> updated coordinates: ({new_x}, {new_y}), updated direction: {new_direction}")
13 |         cur_x, cur_y, cur_direction = new_x, new_y, new_direction
14 | 
15 |     # Step 4: Return "yes" if the final coordinates are (0, 0). Otherwise, return "no" as the final answer.
16 |     return 'yes' if cur_x == 0 and cur_y == 0 else 'no'


--------------------------------------------------------------------------------
/src/tasks/navigate/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def ends_up_at_start(input_text):
 2 |     # Step 1: Initialize coordinates and direction by setting the starting point at (0, 0) and face north.
 3 |     cur_x, cur_y = 0, 0
 4 |     cur_direction = 0
 5 | 
 6 |     # Step 2: Identify and list up instructions from the input text.
 7 |     instructions = parse_instructions(input_text)
 8 |     
 9 |     # Step 3: [Variables tracking] Process each instruction and update the current coordinates and direction.
10 |     for i, instruction in enumerate(instructions):
11 |         new_x, new_y, new_direction = process_instruction(instruction, cur_x, cur_y, cur_direction) # process instruction to calculate new position and direction
12 |         print(f"Step {i}: {instruction} - current coordinates: ({cur_x}, {cur_y}), current direction: {cur_direction} -> updated coordinates: ({new_x}, {new_y}), updated direction: {new_direction}")
13 |         cur_x, cur_y, cur_direction = new_x, new_y, new_direction
14 | 
15 |     # Step 4: Return "yes" if the final coordinates are (0, 0). Otherwise, return "no" as the final answer.
16 |     return 'yes' if cur_x == 0 and cur_y == 0 else 'no'


--------------------------------------------------------------------------------
/src/tasks/navigate/prompts/example_classification.txt:
--------------------------------------------------------------------------------
 1 | Example task instances:
 2 | Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
 3 | Options:
 4 | - Yes
 5 | - No
 6 | 
 7 | Q: If you follow these instructions, do you return to the starting point? Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.
 8 | Options:
 9 | - Yes
10 | - No
11 | 
12 | Q: If you follow these instructions, do you return to the starting point? Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.
13 | Options:
14 | - Yes
15 | - No
16 | 
17 | Reason for the decision & Answer:
18 | It is crucial to process each of the instruction and keep track of the current position and direction to solve the task. Therefore, the variables tracking is an effective way to tackle the target task. So the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/navigate/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | This task involves determining whether a series of movements brings a person back to their starting point. This requires tracking the person's position in a two-dimensional space and updating their location based on the instructions given.
 2 | 
 3 | Analyzing the Task:
 4 | 1.Directional Movement and Position Tracking: The task requires keeping track of the person's position (x and y coordinates) as they move in different directions.
 5 | 
 6 | 2.Interpreting Movement Instructions: Instructions include steps in various directions (left, right, forward, backward) and turning, which affects the direction of subsequent steps.
 7 | 
 8 | 3.Determining the Final Position: The objective is to determine if the final position after following all instructions is the same as the starting position.
 9 | 
10 | Constructing the Code Prompt:
11 | 
12 | 1. Initializing Position and Direction: The person starts at a fixed point (0,0) facing a specific direction (e.g., north). This setup requires initializing variables for x and y coordinates, and a variable for direction.
13 | 
14 | 2. Parsing Instructions: The instructions are separated and parsed to identify the type and magnitude of each movement. This involves string manipulation and possibly a mapping of textual instructions to movements and turns.
15 | 
16 | 3. [Important] Processing Each Instruction: Each instruction affects the position or the direction of movement. The intermediate step for calculating the final coordinate should be printed out using print() function. This involves:
17 | 
18 | - Translating forward, backward, left, and right steps into changes in the x and y coordinates.
19 | - Handling turns to update the direction of movement.
20 | - A function like process_instruction can be created to encapsulate the logic for each instruction.
21 | 
22 | 4. Checking the Final Position and Returning the Result: After processing all instructions, the final coordinates are compared with the starting point to determine if the person has returned to the start. Based on whether the final position matches the starting point, the function returns 'Yes' or 'No'.


--------------------------------------------------------------------------------
/src/tasks/navigate/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | This task involves determining whether a series of movements brings a person back to their starting point. This requires tracking the person's position in a two-dimensional space and updating their location based on the instructions given.
 2 | 
 3 | Analyzing the Task:
 4 | It is crucial to process each of the instruction and keep track of the current position and direction to solve the task. Therefore, the variables tracking is an effective way to tackle the target task.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 
 8 | 1. Initializing Position and Direction: The person starts at a fixed point (0,0) facing a specific direction (e.g., north). This setup requires initializing variables for x and y coordinates, and a variable for direction.
 9 | 
10 | 2. Parsing Instructions: The instructions are separated and parsed to identify the type and magnitude of each movement. This involves string manipulation and possibly a mapping of textual instructions to movements and turns.
11 | 
12 | 3. [Variables tracking] Processing Each Instruction: Process each instruction and update the current coordinates and direction by using 'for loop'. Following things should be logged with print() function for each intermediate steps:
13 | - Each instruction to process
14 | - Current coordinates and direction
15 | - Updated coordinates and direction
16 | 
17 | 4. Checking the Final Position and Returning the Result: After processing all instructions, the final coordinates are compared with the starting point to determine if the person has returned to the start. Based on whether the final position matches the starting point, the function returns 'Yes' or 'No'.


--------------------------------------------------------------------------------
/src/tasks/navigate/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | input_text = "{input_text}"
3 | final_answer = ends_up_at_start(input_text)
4 | print("Final answer:"+ final_answer)
5 | Generate the expected execution output (output from all print() functions) of the code. You don't have to actually run the code and do not care about 'not implemented error'.
6 | Exprected output:
7 | 


--------------------------------------------------------------------------------
/src/tasks/object_counting/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class ObjectCountingHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(ObjectCountingHelper, self).__init__(args)
15 |         self.function_name = "calculate_total_items"
16 |     
17 |     def evaluate_prediction(self, outputs):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |        
21 |         for oi, o in enumerate(outputs):
22 |             model_output = o
23 |             if "Final answer:" in o:
24 |                 model_output = o.split("Final answer:")[1].split('\n')[0].strip()
25 |             label = self.test_data[oi]['answer']
26 |             if model_output.isdigit():
27 |                 is_pass = True
28 |             else:
29 |                 is_pass = False
30 |             agg_pass.append(is_pass)
31 |             if label == model_output:
32 |                 agg_task_accuracy.append(True)
33 |             else:
34 |                 agg_task_accuracy.append(False)
35 |         
36 |         task_accuracy = np.mean(agg_task_accuracy).item()
37 |         pass_rate = sum(agg_pass)/len(agg_pass)
38 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i], "answer": self.test_data[i]['answer']} for i in range(len(agg_task_accuracy))]
39 |         result_score = {
40 |             "pass_rate": pass_rate,
41 |             "task_accuracy": task_accuracy
42 |         }
43 |         return result_score, individual_score
44 |     
45 |     def load_data(self):
46 |         data_name = "tasks/object_counting/data.json"
47 |         with open(data_name, "r") as f:
48 |             data = json.load(f)['examples']
49 |         train_data = [d for d in data]
50 |         test_data = [d for d in data]
51 |         if self.args.num_sample != -1:
52 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
53 | 
54 |         self.train_data = train_data
55 |         self.test_data = test_data
56 |         return train_data, test_data
57 |     
58 |     def load_and_prepare_data(self, dataset_split):
59 |         dataset = self.train_data if dataset_split == "train" else self.test_data
60 |         all_processed_data = []
61 |         for data in tqdm(dataset):
62 |             cur_data = {k:v for k,v in data.items()}
63 |             cur_data['input_text'] = cur_data['input']
64 |             cur_data['answer'] = cur_data['target']
65 |             cur_data['label'] = cur_data['target']
66 |             all_processed_data.append(cur_data)
67 |         
68 |         if dataset_split == "train":
69 |             self.train_data = all_processed_data
70 |         else:
71 |             self.test_data = all_processed_data
72 |             
73 |         return all_processed_data
74 |         


--------------------------------------------------------------------------------
/src/tasks/object_counting/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def count_objects(input_text):
 2 |     # Step 1: Identify and list all objects and their number from the input text. Output the listed objects.
 3 |     objects_list = extract_objects(input_text)
 4 |     print("All objects:", objects_list)
 5 | 
 6 |     # Step 2: Identify and show the specific question asked from the input text.
 7 |     qustion = extract_question(input_text)
 8 |     print("question:", question)
 9 | 
10 |     # Step 3: Initialize the total count by setting it to 0.
11 |     total_count = 0
12 | 
13 |     # Step 4: Go through each object in the list in steps, count the object according to the question, adds up the object's count to the total count, and show the count and intermediate count.
14 |     for i, object in enumerate(objects_list):
15 |         single_object_count = count_single_object(object)
16 |         total_count += single_object_count
17 |         print(f"Step ({i}) - {object}: {single_object_count}, Intermediate count: {total_count}")
18 | 
19 |     # Step 5: Output the final number. After going through all objects, output the total number as the answer to the question.
20 |     return total_count


--------------------------------------------------------------------------------
/src/tasks/object_counting/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def count_objects(input_text):
 2 |     # Step 1: Identify and list all objects and their number from the input text. Output the listed objects.
 3 |     objects_list = extract_objects(input_text)
 4 |     print("All objects:", objects_list)
 5 | 
 6 |     # Step 2: Identify and show the specific question asked from the input text.
 7 |     qustion = extract_question(input_text)
 8 |     print("question:", question)
 9 | 
10 |     # Step 3: Initialize the total count by setting it to 0.
11 |     total_count = 0
12 | 
13 |     # Step 4: [Variables tracking] Go through each object in the list in steps, count the object according to the question, adds up the object's count to the total count, and show the count and intermediate count.
14 |     for i, object in enumerate(objects_list):
15 |         print(f"Step ({i}) - {object}")
16 |         single_object_count = count_single_object(object)
17 |         total_count += single_object_count
18 |         print(f"Therefore, {object} is: {single_object_count}, Intermediate count: {total_count}")
19 | 
20 |     # Step 5: Output the final number. After going through all objects, output the total number as the answer to the question.
21 |     return total_count


--------------------------------------------------------------------------------
/src/tasks/object_counting/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | Explanation:
 2 | This task involves identifying and counting specific types of objects (fruits or vegetables) from a mixed list. The primary challenge is correctly categorizing each item in the list and then tallying the counts of the desired categories.
 3 | 
 4 | Analyzing the Task:
 5 | 1. Object Categorization: Determining whether each item in the list is a fruit, a vegetable, or neither. This requires knowledge of common fruits and vegetables.
 6 | 
 7 | 2. Counting Objects: Accurately counting the number of items in each category. This may involve handling singular and plural forms, as well as numerical phrases (e.g., "two peaches").
 8 | 
 9 | 3. Handling Varied Input: The input text may vary in format and content, requiring flexible parsing to extract the necessary information.
10 | 
11 | Constructing the Code Prompt:
12 | 1. Extracting Objects from Input: The first step is parsing the input text to create a list of objects. This requires string manipulation and possibly natural language processing techniques.
13 | 
14 | 2. Extracting the Question: Determine what category of items needs to be counted (e.g., fruits, vegetables) from the input text. 
15 | 
16 | 3. Initializing Count Variables: Set up a variable to keep track of the total count of the desired objects.
17 | 
18 | 4. [Important] Counting Each Object: Iterate over each object, determine if it belongs to the desired category (fruit or vegetable), and count it. Intermeditate count results shoud be logged with print() function. This involves:
19 |    - Identifying singular and plural forms.
20 |    - Recognizing and handling numerical phrases.
21 |    - Keeping a running total of the count.
22 | 
23 | 5. Returning the Total Count: After processing all objects, return the total count as the answer.


--------------------------------------------------------------------------------
/src/tasks/object_counting/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | This task involves identifying and counting specific types of objects (fruits or vegetables) from a mixed list. The primary challenge is correctly categorizing each item in the list and then tallying the counts of the desired categories.
 2 | 
 3 | Analyzing the Task:
 4 | It needs to keep track of the intermediate counts of the each item in the list, but also needs a free format reasoning to calculate the proper number of item according to the question.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 1. Extracting Objects from Input: The first step is parsing the input text to create a list of objects with their number.
 8 | 
 9 | 2. Extracting the Question: Determine what category of items needs to be counted (e.g., fruits, vegetables) from the input text. 
10 | 
11 | 3. Initializing Count Variables: Set up a variable to keep track of the total count of the desired objects.
12 | 
13 | 4. [Variables tracking] Counting Each Object: Go through each object in the list by using 'for loop', count the object according to the question, adds up the object's count to the total count. Following things should be logged with print() function for each intermediate steps:
14 | - Each item to be counted
15 | - Count of the each item and an explanation.
16 | - Intermediate total counts
17 | 
18 | 5. Returning the Total Count: After processing all objects, return the total count as the answer.


--------------------------------------------------------------------------------
/src/tasks/object_counting/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | input_text = "{input_text}"
3 | final_answer = calculate_total_items(input_text)
4 | print("Final answer:"+ final_answer)
5 | Generate the expected execution output (output from all print() functions) of the code. You don't have to actually run the code and do not care about 'not implemented error'.
6 | Exprected output:
7 | 


--------------------------------------------------------------------------------
/src/tasks/penguins_in_a_table/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class PenguinsInATableHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(PenguinsInATableHelper, self).__init__(args)
15 |         self.function_name = "penguins_in_a_table"
16 |     
17 |     def evaluate_prediction(self, outputs):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |        
21 |         for oi, o in enumerate(outputs):
22 |             model_output = o
23 |             if "Final answer:" in o:
24 |                 model_output = o.split("Final answer:")[-1].strip()
25 |             else:
26 |                 model_output = o.strip()
27 |             label = self.test_data[oi]['answer'].strip("()")
28 |             AnsSpan = ['A', 'B', 'C', 'D', 'E']
29 |             is_pass = False
30 |             for an in AnsSpan:
31 |                 if an in model_output[:4]:
32 |                     is_pass = True
33 |                     break
34 |             agg_pass.append(is_pass)
35 |             if label in model_output[:4]:
36 |                 agg_task_accuracy.append(True)
37 |             else:
38 |                 agg_task_accuracy.append(False)
39 |         
40 |         task_accuracy = np.mean(agg_task_accuracy).item()
41 |         pass_rate = sum(agg_pass)/len(agg_pass)
42 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i]} for i in range(len(agg_task_accuracy))]
43 |         result_score = {
44 |             "pass_rate": pass_rate,
45 |             "task_accuracy": task_accuracy
46 |         }
47 |         return result_score, individual_score
48 |     
49 |     def load_data(self):
50 |         data_name = "tasks/penguins_in_a_table/data.json"
51 |         with open(data_name, "r") as f:
52 |             data = json.load(f)['examples']
53 |         train_data = [d for d in data]
54 |         test_data = [d for d in data]
55 |         if self.args.num_sample != -1:
56 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
57 | 
58 |         self.train_data = train_data
59 |         self.test_data = test_data
60 |         return train_data, test_data
61 |     
62 |     def load_and_prepare_data(self, dataset_split):
63 |         dataset = self.train_data if dataset_split == "train" else self.test_data
64 |         all_processed_data = []
65 |         for data in tqdm(dataset):
66 |             cur_data = {k:v for k,v in data.items()}
67 |             cur_data['input_text'] = cur_data['input']
68 |             cur_data['answer'] = cur_data['target']
69 |             cur_data['label'] = cur_data['target']
70 |             all_processed_data.append(cur_data)
71 |         
72 |         if dataset_split == "train":
73 |             self.train_data = all_processed_data
74 |         else:
75 |             self.test_data = all_processed_data
76 |             
77 |         return all_processed_data
78 |         


--------------------------------------------------------------------------------
/src/tasks/penguins_in_a_table/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def penguins_in_a_table(input_text):
 2 |     # Step 1: Parse information from the input text and present them in a table.
 3 |     penguin_info_table = make_table(input_text)
 4 |     print("The following is a table containing information about each penguin:\n", penguin_info_table)
 5 | 
 6 |     # Step 2: Parse instruction of further modification to the table.
 7 |     required_modification_to_table = identify_required_modification(input_text)
 8 |     print(f"The instruction we must follow before answering the question is: {required_modification_to_table}")
 9 | 
10 |     # Step 3: If the required modification is exists, update the table according to it. Show the current table and updated table.
11 |     if required_modification_to_table != None:
12 |         print("Update the current table upon the further modification.")
13 |         print(f"Current table: {penguin_info_table}")
14 |         penguin_info_table = penguin_info_table.processing(required_modification_to_table)
15 |         print(f"Updated table: {required_modification_to_table}")
16 |     
17 |     # Step 4: Identify question from input text and outputs the reasoning and the proper answer to the question.
18 |     question = identify_question(input_text)
19 |     print(f"The question is: {question}")
20 |     reason, pre_answer = derive_answer_with_explanation(penguin_info_table, question)
21 |     print(f"Reason for the decision: {reason}")
22 |     print(f"Therefore, the answer is: {pre_answer}")
23 | 
24 |     # Step 5: Identify options and match the result from the Step 4 with the given options.
25 |     options = extract_options(input_text)
26 |     print("The answer options are:\n", options)
27 |     answer = None
28 |     for option in options:
29 |         if pre_answer in option:
30 |             answer = option[:3]
31 |             break
32 |     
33 |     # Step 6: Return the final answer.
34 |     return answer


--------------------------------------------------------------------------------
/src/tasks/penguins_in_a_table/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def penguins_in_a_table(input_text):
 2 |     # Step 1: Parse information from the input text and present them in a table.
 3 |     penguin_info_table = make_table(input_text)
 4 |     print("The following is a table containing information about each penguin:\n", penguin_info_table)
 5 | 
 6 |     # Step 2: Parse instruction of further modification to the table.
 7 |     required_modification_to_table = identify_required_modification(input_text)
 8 |     print(f"The instruction we must follow before answering the question is: {required_modification_to_table}")
 9 | 
10 |     # Step 3: If the required modification is exists, update the table according to it. Show the current table and updated table.
11 |     if required_modification_to_table != None:
12 |         print("Update the current table upon the further modification.")
13 |         print(f"Current table: {penguin_info_table}")
14 |         penguin_info_table = penguin_info_table.processing(required_modification_to_table)
15 |         print(f"Updated table: {required_modification_to_table}")
16 |     
17 |     # Step 4: [Free format reasoning] Identify question from input text and outputs the reasoning and the proper answer to the question.
18 |     question = identify_question(input_text)
19 |     print(f"The question is: {question}")
20 |     reason, pre_answer = derive_answer_with_explanation(penguin_info_table, question)
21 |     print(f"Reason for the decision: {reason}")
22 |     print(f"Therefore, the answer is: {pre_answer}")
23 | 
24 |     # Step 5: Identify options and match the result from the Step 4 with the given options.
25 |     options = extract_options(input_text)
26 |     print("The answer options are:\n", options)
27 |     answer = None
28 |     for option in options:
29 |         if pre_answer in option:
30 |             answer = option[:3]
31 |             break
32 |     
33 |     # Step 6: Return the final answer.
34 |     return answer


--------------------------------------------------------------------------------
/src/tasks/penguins_in_a_table/prompts/example_classification.txt:
--------------------------------------------------------------------------------
 1 | Example task instances:
 2 | Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  We now add a penguin to the table:
 3 | James, 12, 90, 12
 4 | How many penguins are less than 8 years old?
 5 | Options:
 6 | (A) 1
 7 | (B) 2
 8 | (C) 3
 9 | (D) 4
10 | (E) 5
11 | 
12 | Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  Which is the youngest penguin?
13 | Options:
14 | (A) Louis
15 | (B) Bernard
16 | (C) Vincent
17 | (D) Gwen
18 | (E) James
19 | 
20 | Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  What is the name of the second penguin sorted by alphabetic order?
21 | Options:
22 | (A) Louis
23 | (B) Bernard
24 | (C) Vincent
25 | (D) Gwen
26 | (E) James
27 | 
28 | Reason for the decision & Answer:
29 | To find target individual from the table referring to the question is hard to implement with a variables tracking method. Therefore, the free format reasoning could be a better way to solve the problem. So the answer is Free format reasoning.


--------------------------------------------------------------------------------
/src/tasks/penguins_in_a_table/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | This task revolves around interpreting and extracting specific pieces of information from structured data presented in tabular format. It focuses on a distinct query related to the attributes (age, height, weight) of individuals (penguins in this case) listed in the table.
 2 | 
 3 | Analyzing the Task:
 4 | 1. Construct initial table: Because he task gives the initial information in a format of tabluar structure, it is important to construct the initial table to understand it.
 5 | 
 6 | 2. Processing additional information: There is sometimes additional instruction or information about the table. Check if an additional information exists and update the table.
 7 | 
 8 | 3. Find Proper answer: Find the correct penguin that the question is asking referring to the updated table.
 9 | 
10 | Constructing the Code Prompt:
11 | 1. Construct initial table: Parse information from the input text and present them in a table.
12 | 
13 | 2. Identifying a modification instruction: Parse instruction of further modification to the table.
14 | 
15 | 3. Updating the info table: If the required modification is exists, update the table according to it. Show the current table and updated table.
16 | 
17 | 4. [Important] Identify question from input text and outputs the reasoning and the proper answer to the question. Use an abstract function like 'derive_answer_with_explanation' which outputs both the reasoning and the correct answer to the question.
18 | 
19 | 5. Choosing correct answer from the options: Identify options and match the result from the Step 4 with the given options.
20 | 
21 | 6. Returning the final answer: Return the final answer.


--------------------------------------------------------------------------------
/src/tasks/penguins_in_a_table/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | This task revolves around interpreting and extracting specific pieces of information from structured data presented in tabular format. It focuses on a distinct query related to the attributes (age, height, weight) of individuals (penguins in this case) listed in the table.
 2 | 
 3 | Analyzing the Task:
 4 | To find target individual from the table referring to the question is hard to implement with a variables tracking method. Therefore, the free format reasoning could be a better way to solve the problem.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 1. Construct initial table: Parse information from the input text and present them in a table.
 8 | 
 9 | 2. Identifying a modification instruction: Parse instruction of further modification to the table.
10 | 
11 | 3. Updating the info tabel: If the required modification is exists, update the table according to it. Show the current table and updated table.
12 | 
13 | 4. [Free format reasoning] Identify question from input text and outputs the reasoning and the proper answer to the question. Use an abstract function like 'derive_answer_with_explanation' which outputs both the reasoning and the correct answer to the question.
14 | 
15 | 5. Choosing correct answer from the options: Identify options and match the result from the Step 4 with the given options.
16 | 
17 | 6. Returning the final answer: Return the final answer.


--------------------------------------------------------------------------------
/src/tasks/penguins_in_a_table/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | input_text = "{input_text}"
3 | final_answer = penguins_in_a_table(input_text)
4 | print("Final answer: "+ final_answer)
5 | Generate the expected output of the code without any explanation. You don't have to actually run the code and do not care about 'not implemented error'.
6 | Expected output:
7 | 


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/generated_prompts/classification_result_gpt_3shot.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "input": [
  3 |         "Choose between the variables tracking and the free format reasoning which strategy would be more effective than the other one when tackling the last task considering the example task instances and explain why. The variables tracking is effective when the task has multiple instructions or statements to process and need to check the important variables for each of the step. For the free format reasoning, it works well when the task needs a complex or diverse logic that is difficult to be expressed in a specific method. In a free format reasoning step, you don't have to specify the logic, but you should specify an abstract fucntion like 'derive_answer_with_explantion' which outputs both reasons and proper answer for the situation. For the variables tracking step, you should use 'for loop' to process the multiple statements or instructions step by step, 'print()' to output intermediate steps to keep track of changes of variables.",
  4 |         "",
  5 |         "[Example 1]",
  6 |         "Example task instances:",
  7 |         "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [",
  8 |         "",
  9 |         "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]",
 10 |         "",
 11 |         "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }",
 12 |         "",
 13 |         "Reason for the decision & Answer:",
 14 |         "In order to find unclosed parentheses, tracking the opening and closing parentheses is really important. Therefore, the variables tracking is an effective way to solve the task. So the answer is Variables tracking.",
 15 |         "",
 16 |         "[Example 2]",
 17 |         "Example task instances:",
 18 |         "Q: This SVG path element <path d=\"M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00\"/> draws a",
 19 |         "Options:",
 20 |         "(A) circle",
 21 |         "(B) heptagon",
 22 |         "(C) hexagon",
 23 |         "(D) kite",
 24 |         "(E) line",
 25 |         "(F) octagon",
 26 |         "(G) pentagon",
 27 |         "(H) rectangle",
 28 |         "(I) sector",
 29 |         "(J) triangle",
 30 |         "",
 31 |         "Q: This SVG path element <path d=\"M 14.19,26.04 L 51.43,39.21 L 58.44,36.69 L 56.63,30.17 L 48.53,26.66 L 14.19,26.04\"/> draws a",
 32 |         "Options:",
 33 |         "(A) circle",
 34 |         "(B) heptagon",
 35 |         "(C) hexagon",
 36 |         "(D) kite",
 37 |         "(E) line",
 38 |         "(F) octagon",
 39 |         "(G) pentagon",
 40 |         "(H) rectangle",
 41 |         "(I) sector",
 42 |         "(J) triangle",
 43 |         "",
 44 |         "Q: This SVG path element <path d=\"M 41.00,43.00 L 37.00,34.00 L 41.00,33.00 L 45.00,34.00 L 41.00,43.00\"/> draws a",
 45 |         "Options:",
 46 |         "(A) circle",
 47 |         "(B) heptagon",
 48 |         "(C) hexagon",
 49 |         "(D) kite",
 50 |         "(E) line",
 51 |         "(F) octagon",
 52 |         "(G) pentagon",
 53 |         "(H) rectangle",
 54 |         "(I) sector",
 55 |         "(J) triangle",
 56 |         "",
 57 |         "Reason for the decision & Answer:",
 58 |         "After analyzing the given SVG paths, finding attributes and characteristics of the shape needs a really complicated operation that is difficult to specify. Thus, the free format reasoning is needed to solve the task. So the answer is Free format reasoning.",
 59 |         "",
 60 |         "[Example 3]",
 61 |         "Example task instances:",
 62 |         "Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.",
 63 |         "Options:",
 64 |         "- Yes",
 65 |         "- No",
 66 |         "",
 67 |         "Q: If you follow these instructions, do you return to the starting point? Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.",
 68 |         "Options:",
 69 |         "- Yes",
 70 |         "- No",
 71 |         "",
 72 |         "Q: If you follow these instructions, do you return to the starting point? Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.",
 73 |         "Options:",
 74 |         "- Yes",
 75 |         "- No",
 76 |         "",
 77 |         "Reason for the decision & Answer:",
 78 |         "It is crucial to process each of the instruction and keep track of the current position and direction to solve the task. Therefore, the variables tracking is an effective way to tackle the target task. So the answer is Variables tracking.",
 79 |         "",
 80 |         "[Example 4]",
 81 |         "Example task instances:",
 82 |         "Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  We now add a penguin to the table:",
 83 |         "James, 12, 90, 12",
 84 |         "How many penguins are less than 8 years old?",
 85 |         "Options:",
 86 |         "(A) 1",
 87 |         "(B) 2",
 88 |         "(C) 3",
 89 |         "(D) 4",
 90 |         "(E) 5",
 91 |         "",
 92 |         "Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  Which is the youngest penguin?",
 93 |         "Options:",
 94 |         "(A) Louis",
 95 |         "(B) Bernard",
 96 |         "(C) Vincent",
 97 |         "(D) Gwen",
 98 |         "(E) James",
 99 |         "",
100 |         "Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  What is the name of the second penguin sorted by alphabetic order?",
101 |         "Options:",
102 |         "(A) Louis",
103 |         "(B) Bernard",
104 |         "(C) Vincent",
105 |         "(D) Gwen",
106 |         "(E) James",
107 |         "",
108 |         "Reason for the decision & Answer:",
109 |         "To find target individual from the table referring to the question is hard to implement with a variables tracking method. Therefore, the free format reasoning could be a better way to solve the problem. So the answer is Free format reasoning.",
110 |         "",
111 |         "[Example 5]",
112 |         "Example task instances:",
113 |         "On the table, you see several things arranged in a row: a black pencil, a turquoise paperclip, and a yellow stress ball. What is the color of the thing directly to the left of the paperclip?",
114 |         "Options:",
115 |         "(A) red",
116 |         "(B) orange",
117 |         "(C) yellow",
118 |         "(D) green",
119 |         "(E) blue",
120 |         "(F) brown",
121 |         "(G) magenta",
122 |         "(H) fuchsia",
123 |         "(I) mauve",
124 |         "(J) teal",
125 |         "(K) turquoise",
126 |         "(L) burgundy",
127 |         "(M) silver",
128 |         "(N) gold",
129 |         "(O) black",
130 |         "(P) grey",
131 |         "(Q) purple",
132 |         "(R) pink",
133 |         "",
134 |         "On the desk, I see a burgundy stress ball, a red plate, a teal sheet of paper, and a fuchsia cup. Is the stress ball burgundy?",
135 |         "Options:",
136 |         "(A) yes",
137 |         "(B) no",
138 |         "",
139 |         "On the nightstand, I see one green scrunchiephone charger, one silver scrunchiephone charger, three magenta scrunchiephone chargers, three brown scrunchiephone chargers, two brown dog leashes, three magenta jugs, and three green jugs. If I remove all the scrunchiephone chargers from the nightstand, how many magenta objects remain on it?",
140 |         "Options:",
141 |         "(A) zero",
142 |         "(B) one",
143 |         "(C) two",
144 |         "(D) three",
145 |         "(E) four",
146 |         "(F) five",
147 |         "(G) six",
148 |         "(H) seven",
149 |         "(I) eight",
150 |         "(J) nine",
151 |         "(K) ten",
152 |         "(L) eleven",
153 |         "(M) twelve",
154 |         "(N) thirteen",
155 |         "(O) fourteen",
156 |         "(P) fifteen",
157 |         "(Q) sixteen",
158 |         "",
159 |         "Reason for the decision & answer:"
160 |     ],
161 |     "output": [
162 |         [
163 |             "For the tasks involving colors and objects arrangement, the free format reasoning would be more effective. This is because the logic required to determine the color or position of objects is not easily expressed in a step-by-step manner. Instead, a more abstract and flexible approach is needed to reason through the relationships between the objects. Therefore, the free format reasoning would be the better strategy to tackle these tasks. So the answer is Free format reasoning."
164 |         ]
165 |     ]
166 | }


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/generated_prompts/classification_result_gpt_3shot.txt:
--------------------------------------------------------------------------------
1 | For the tasks involving colors and objects arrangement, the free format reasoning would be more effective. This is because the logic required to determine the color or position of objects is not easily expressed in a step-by-step manner. Instead, a more abstract and flexible approach is needed to reason through the relationships between the objects. Therefore, the free format reasoning would be the better strategy to tackle these tasks. So the answer is Free format reasoning.


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/generated_prompts/code_prompt_from_explanation_planner_gpt_3shot.txt:
--------------------------------------------------------------------------------
 1 | def solve_colored_objects(input_text):
 2 |     # Step 1: Extract the descriptions of items and their colors from the input text.
 3 |     items = parse_items(input_text)
 4 |     print("Items on the surface:\n", items)
 5 | 
 6 |     # Step 2: Determine the positions of items relative to each other based on the descriptions provided.
 7 |     spatial_relationships = analyze_spatial_relationships(items)
 8 |     print("Spatial relationships between items:\n", spatial_relationships)
 9 | 
10 |     # Step 3: Derive the answer with explanation by analyzing the descriptions and spatial relationships.
11 |     question = identify_question(input_text)
12 |     print("The question is:", question)
13 |     reason, answer = derive_answer_with_explanation(items, spatial_relationships, question)
14 |     print("Reasoning for the answer:", reason)
15 | 
16 |     # Step 4: Compare the derived answer with the given options and select the correct one.
17 |     options = extract_options(input_text)
18 |     print("Answer options:\n", options)
19 |     final_answer = None
20 |     for option in options:
21 |         if answer in option:
22 |             final_answer = option[:3]
23 |             break
24 | 
25 |     # Step 5: Return the final answer.
26 |     return final_answer


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/generated_prompts/explanation_3shot_by_gpt.txt:
--------------------------------------------------------------------------------
 1 | This task involves identifying specific attributes of items based on their descriptions and positions relative to each other. The task requires careful reading of the descriptions and analyzing the spatial relationships between the items.
 2 | 
 3 | Analyzing the Task:
 4 | The task involves interpreting the descriptions of items and their spatial relationships to answer specific questions. This task does not require variable tracking but rather a free format reasoning approach to extract the necessary information and make logical deductions.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 
 8 | 1. Parse the descriptions: Extract the descriptions of items and their colors from the input text.
 9 | 
10 | 2. Identify the spatial relationships: Determine the positions of items relative to each other based on the descriptions provided.
11 | 
12 | 3. [Free format reasoning] Derive the answer with explanation: Analyze the descriptions and spatial relationships to answer the specific question posed in the task. Use a function like 'derive_answer_with_explanation' to provide both the reasoning and the correct answer.
13 | 
14 | 4. Match the answer with the options: Compare the derived answer with the given options and select the correct one.
15 | 
16 | 5. Return the final answer: Output the final answer in the specified format.
17 | 
18 | By following these steps, the task can be effectively solved by reasoning through the descriptions and spatial relationships of the items provided in the task instances.


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class ColoredObjectsHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(ColoredObjectsHelper, self).__init__(args)
15 |         self.function_name = "solve_colored_objects"
16 |     
17 |     def evaluate_prediction(self, outputs, answer_trigger="Final answer:"):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |        
21 |         for oi, o in enumerate(outputs):
22 |             o = o.replace(".", "")
23 |             if "Final answer:" in o:
24 |                 model_output = o.split(answer_trigger)[-1].strip()
25 |             else:
26 |                 model_output = o.strip()
27 |             label = self.test_data[oi]['answer']
28 |             option_dic = {option.split()[0]: option for option in self.test_data[oi]['options']}
29 |             is_pass = False
30 |             for option in option_dic.values():
31 |                 if model_output in option and model_output:
32 |                     is_pass = True
33 |                     break
34 |             agg_pass.append(is_pass)
35 |             if model_output in option_dic[label] and model_output:
36 |                 agg_task_accuracy.append(True)
37 |             else:
38 |                 agg_task_accuracy.append(False)
39 |         
40 |         task_accuracy = np.mean(agg_task_accuracy).item()
41 |         pass_rate = sum(agg_pass)/len(agg_pass)
42 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i]} for i in range(len(agg_task_accuracy))]
43 |         result_score = {
44 |             "pass_rate": pass_rate,
45 |             "task_accuracy": task_accuracy
46 |         }
47 |         return result_score, individual_score
48 |     
49 |     def load_data(self):
50 |         data_name = "tasks/reasoning_about_colored_objects/data.json"
51 |         with open(data_name, "r") as f:
52 |             data = json.load(f)['examples']
53 |         train_data = [d for d in data]
54 |         test_data = [d for d in data]
55 |         if self.args.num_sample != -1:
56 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
57 | 
58 |         self.train_data = train_data
59 |         self.test_data = test_data
60 |         return train_data, test_data
61 |     
62 |     def load_and_prepare_data(self, dataset_split):
63 |         dataset = self.train_data if dataset_split == "train" else self.test_data
64 |         all_processed_data = []
65 |         for data in tqdm(dataset):
66 |             cur_data = {k:v for k,v in data.items()}
67 |             cur_data['input_text'] = cur_data['input']
68 |             cur_data['answer'] = cur_data['target']
69 |             cur_data['label'] = cur_data['target']
70 |             cur_data['options'] = cur_data['input'].split("\nOptions:\n")[-1].split("\n")
71 |             all_processed_data.append(cur_data)
72 |         
73 |         if dataset_split == "train":
74 |             self.train_data = all_processed_data
75 |         else:
76 |             self.test_data = all_processed_data
77 |             
78 |         return all_processed_data
79 |         


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def solve_colored_objects(input_text):
 2 |     # Step 1: Start by identifying the objects along with their associated properties, such as color and spatial positioning from the input text. Show the list of objects.
 3 |     objects_list = extract_objects(input_text)
 4 |     print("Objects and their properties:", objects_list)
 5 | 
 6 |     # Step 2: Identify the specific question asked. Determine whether the question is about identifying the color of a specific object, counting objects of a certain color, or reasoning about the spatial arrangement of objects and output the question type.
 7 |     question = extract_question(input_text)
 8 |     print("Question specifics:", question)
 9 | 
10 |     # Step 3: Identify and list up available options provided in the input text.
11 |     options = input_text.split("\n")[-5:]
12 | 
13 |     # Step 4: Process according to the question type and show what the question type is:
14 |     # If the question is about identifying color, identify and ouput the target object the question is asking for the color of. Determine and output its color. 
15 |     if question['type'] == 'identify_color':
16 |         print("Question type is = identify_color")
17 |         print(f"Identifying color for: {question['details']}")
18 |         target_object = target(objects_list, question['details'])
19 |         print(f"The question is asking for the color of : {target_object}")
20 |         pre_answer = extract_color(target_object, question['details'])
21 |         print(f"Identified color: {pre_answer}")
22 | 
23 |     # If the question is about counting objects, identify and ouput the objects the question is asking for the number of. Go through each object in the list in steps and count each object. Show the counting steps. Output the final number of objects that meet the specified criteria (e.g., a specific color). 
24 |     elif question['type'] == 'count_objects':
25 |         print("Question type is = count_objects")
26 |         print(f"Counting objects for: {question['details']}")
27 |         print("Total iterations:", len(objects_list))
28 |         for i, object in enumerate(objects_list):
29 |             single_object_count = count_single_object(object, question['details'])
30 |             intermediate_count += single_object_count
31 |             print(f"Step ({i}) - {object}: {single_object_count}, Intermediate count: {intermediate_count}")
32 |         pre_answer = count_objects(objects_list, question['details'])
33 |         print(f"Objects count: {pre_answer}")
34 | 
35 |     # If the question is about spatial reasoning, identify and ouput the relative positions the question is asking for. Arrange the objects from left to right and output the order. Determine the relative positions of objects and output the result.
36 |     elif question['type'] == 'spatial_reasoning':
37 |         print("Question type is = spatial_reasoning")
38 |         print(f"Applying spatial reasoning for: {question['details']}")
39 |         arranged_object = arrange_from_left_to_right(objects_list)
40 |         print(f"Arraged objects: {arranged_object})
41 |         pre_answer = spatial_reasoning(arranged_object, question['details'])
42 |         print(f"Spatial reasoning result: {pre_answer}")
43 | 
44 |     # Step 5: Recall the identified options and match the outcome of Step 4 (the identified color, the count of objects, or the result of spatial reasoning) with the provided options to determine the correct answer.
45 |     answer = find_correct_option(pre_answer, options)
46 |     
47 |     # Step 6: Return the final answer chosen at Step 5.
48 |     return answer


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def solve_colored_objects(input_text):
 2 |     # Step 1: Start by identifying the objects along with their associated properties, such as color and spatial positioning from the input text. Show the list of objects.
 3 |     objects_list = extract_objects(input_text)
 4 |     print("Objects and their properties:", objects_list)
 5 | 
 6 |     # Step 2: Identify the specific question asked. Determine whether the question is about identifying the color of a specific object, counting objects of a certain color, or reasoning about the spatial arrangement of objects and output the question type.
 7 |     question = extract_question(input_text)
 8 |     print("Question specifics:", question)
 9 | 
10 |     # Step 3: Identify and list up available options provided in the input text.
11 |     options = extract_options(input_text)
12 | 
13 |     # Step 4: [Free format reasoning] Process according to the question type and derive answer.
14 |     reason, pre_answer = derive_answer_with_explanation(objects_list, question)
15 |     print(f"Reason for the decision: {reason}")
16 |     print(f"Thus, the pre-answer is {pre_answer}.")
17 | 
18 |     # Step 5: Recall the identified options and match the outcome of Step 4 (the identified color, the count of objects, or the result of spatial reasoning) with the provided options to determine the correct answer.
19 |     print(f"Options:\n{options}")
20 |     answer = find_correct_option(pre_answer, options)
21 |     
22 |     # Step 6: Return the final answer chosen at Step 5.
23 |     return answer


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/prompts/example_classification.txt:
--------------------------------------------------------------------------------
 1 | Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
 2 | Options:
 3 | (A) red
 4 | (B) orange
 5 | (C) yellow
 6 | (D) green
 7 | (E) blue
 8 | (F) brown
 9 | (G) magenta
10 | (H) fuchsia
11 | (I) mauve
12 | (J) teal
13 | (K) turquoise
14 | (L) burgundy
15 | (M) silver
16 | (N) gold
17 | (O) black
18 | (P) grey
19 | (Q) purple
20 | (R) pink
21 | 
22 | Q: On the table, you see a bunch of objects arranged in a row: a purple paperclip, a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve fidget spinner, and a burgundy pen. What is the color of the object directly to the right of the stress ball?
23 | Options:
24 | (A) red
25 | (B) orange
26 | (C) yellow
27 | (D) green
28 | (E) blue
29 | (F) brown
30 | (G) magenta
31 | (H) fuchsia
32 | (I) mauve
33 | (J) teal
34 | (K) turquoise
35 | (L) burgundy
36 | (M) silver
37 | (N) gold
38 | (O) black
39 | (P) grey
40 | (Q) purple
41 | (R) pink
42 | 
43 | Q: On the nightstand, you see the following items arranged in a row: a teal plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, a pink notebook, and a grey cup. How many non-orange items do you see to the left of the teal item?
44 | Options:
45 | (A) zero
46 | (B) one
47 | (C) two
48 | (D) three
49 | (E) four
50 | (F) five
51 | (G) six
52 | 
53 | Reason for the decision & Answer:
54 | The task includes several types of question(i.e. color identification, counting objects, spatial reasoning) which is difficult to deal with a specific method. Therefore, the free format reasoning approach would be effective to tackle the task. So the answer is Free format reasoning.


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | Explanation:
 2 | This task involves solving questions related to the color, count, and spatial arrangement of objects. The challenge lies in accurately parsing the input to extract relevant information and applying logical reasoning to determine the correct answer.
 3 | 
 4 | Analyzing the Task:
 5 | 1. Object and Property Identification: Identify each object mentioned in the text along with its properties (like color) and position.
 6 | 
 7 | 2. Understanding the Question: Determine the specific nature of the question – whether it's about identifying the color of an object, counting objects based on a criterion, or understanding their spatial arrangement.
 8 | 
 9 | 3. Processing According to Question Type: Based on the question's nature, apply the appropriate logical reasoning:
10 |    - For color identification, locate the specific object and determine its color.
11 |    - For counting objects, enumerate the objects that meet the specified criteria.
12 |    - For spatial reasoning, analyze the relative positions of the objects.
13 | 
14 | 4. Matching with Options: Once the analysis is complete, match the result with the given options to find the correct answer.
15 | 
16 | Constructing the Code Prompt:
17 | 1. Extracting Object Properties: Parse the input text to create a list of objects with their associated properties and positions.
18 | 
19 | 2. Identifying the Question: Extract the specific question from the input text and determine its type (color identification, counting, or spatial reasoning).
20 | 
21 | 3. Listing Options: Extract the list of possible answers provided in the input text.
22 | 
23 | 4. [Important] Processing Logic:
24 |    - For color identification, find the target object and determine its color.
25 |    - For counting, iterate over the objects, count those that meet the criteria, and keep track of the count.
26 |    - For spatial reasoning, arrange the objects as described and deduce the answer based on their positions.
27 | 
28 | 5. Determining the Answer: Match the result of the logical processing with the options to identify the correct answer.
29 | 
30 | 6. Returning the Final Answer: Output the matched option as the final answer to the question.


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | This task involves solving questions related to the color, count, and spatial arrangement of objects. The challenge lies in accurately parsing the input to extract relevant information and applying logical reasoning to determine the correct answer.
 2 | 
 3 | Analyzing the Task:
 4 | The task includes several types of question(i.e. color identification, counting objects, spatial reasoning) which is difficult to deal with a specific method. Therefore, the free format reasoning approach would be effective to tackle the task.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 1. Extracting Object Properties: Parse the input text to create a list of objects with their associated properties and positions.
 8 | 
 9 | 2. Identifying the Question: Extract the specific question from the input text and determine its type (color identification, counting, or spatial reasoning).
10 | 
11 | 3. Listing Options: Extract the list of possible answers provided in the input text.
12 | 
13 | 4. [Free format reasoning] Derive answer with its reasons: Process the appropriate logic according to the question type and derive the answer. Use abstract function(i.e. derive_answer_with_explanation) which outputs the proper reason and answer for the task.
14 | 
15 | 5. Determining the Answer: Match the result of the logical processing with the options to identify the correct answer.
16 | 
17 | 6. Returning the Final Answer: Return the matched option as the final answer to the question.


--------------------------------------------------------------------------------
/src/tasks/reasoning_about_colored_objects/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | input_text = "{input_text}"
3 | final_answer = solve_colored_objects(input_text)
4 | print("Final answer:"+ final_answer)
5 | Generate the expected execution output (output from all print() functions) of the code. You don't have to actually run the code and do not care about 'not implemented error'.
6 | Expected output:
7 | 


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/generated_prompts/classification_result_gpt_3shot.txt:
--------------------------------------------------------------------------------
1 | For the task instances where the timeline of events needs to be analyzed to determine the possible time of arrival at a specific location, the variables tracking approach would be more effective. This is because the task involves processing each event in chronological order and keeping track of the time intervals to determine the possible time of arrival at the destination. Using variables tracking, you can keep track of the time intervals and events to narrow down the possible time range for each individual. Therefore, the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/generated_prompts/explanation_from_scratch_3shot_gpt.txt:
--------------------------------------------------------------------------------
 1 | Explanation:
 2 | This task involves determining the possible time intervals during which a person could have visited a specific location based on the given schedule of events throughout the day. The goal is to identify the time frames that are not conflicting with other activities.
 3 | 
 4 | Analyzing the Task:
 5 | 1. Establishing the Timeline: Understanding the sequence of events and the times at which they occurred is crucial for determining the available time slots.
 6 | 
 7 | 2. Identifying Conflicting Activities: Recognizing the time intervals during which the person was engaged in other activities that would prevent them from visiting the specified location.
 8 | 
 9 | 3. Determining Possible Visitation Times: By eliminating conflicting time intervals, the remaining time slots represent the possible visitation times.
10 | 
11 | Constructing the Code Prompt:
12 | 1. Extracting Event Information: Parse the given schedule of events to extract the times at which each activity occurred. This may involve creating a data structure to store this information.
13 | 
14 | 2. Identifying Conflicting Time Intervals: Determine the time intervals during which the person was occupied with other activities that would prevent them from visiting the specified location.
15 | 
16 | 3. [Important] Calculating Available Time Slots: By subtracting the conflicting time intervals from the total day duration, identify the time frames during which the person could have visited the specified location.
17 | 
18 | 4. Matching with Options: Compare the possible visitation times with the provided options to determine the correct answer.
19 | 
20 | 5. Returning the Correct Option: Return the letter corresponding to the time interval that matches the calculated possible visitation times.
21 | 
22 | This task involves a logical deduction process based on the given schedule of events to determine the feasible time intervals for visiting the specified location. By systematically analyzing the timeline and eliminating conflicting activities, the correct answer can be identified. The solution should provide a clear and accurate assessment of the available time slots for the visitation.


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/generated_prompts/new_edited_code_prompt_from_explanation_planner_gpt_3shot.txt:
--------------------------------------------------------------------------------
 1 | def solve_temporal_sequences_quiz(input_text):
 2 |     # Step 1: Extract event information from the input text to understand the timeline of activities throughout the day.
 3 |     events_list = extract_events(input_text)
 4 |     print("Events and their timings:", events_list)
 5 | 
 6 |     # Step 2: Identify conflicting time intervals when the person was engaged in other activities that would prevent them from visiting the specified location.
 7 |     conflicting_intervals = find_conflicting_intervals(events_list)
 8 |     print("Conflicting time intervals:", conflicting_intervals)
 9 | 
10 |     # Step 3: Calculate the available time slots by subtracting conflicting intervals from the total day duration.
11 |     available_intervals = calculate_available_intervals(conflicting_intervals)
12 |     print("Available time intervals for visitation:", available_intervals)
13 | 
14 |     # Step 4: Match the available time slots with the provided options to determine the correct answer.
15 |     options = input_text.split("\n")[-5:]
16 | 
17 |     # Step 5: Return the correct option corresponding to the time interval that aligns with the calculated possible visitation times.
18 |     answer = find_correct_option(available_intervals, options)
19 | 
20 |     return answer


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class TemporalSequencesHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(TemporalSequencesHelper, self).__init__(args)
15 |         self.function_name = "solve_temporal_sequences_quiz"
16 |     
17 |     def evaluate_prediction(self, outputs):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |         predicted_answer = []
21 |        
22 |         for oi, o in enumerate(outputs):
23 |             o = o.replace("'", "").replace('"', "").replace(".", "").replace(",", "")
24 |             if "Final answer:" in o:
25 |                 model_output = o.split("Final answer:")[-1].strip()
26 |             else:
27 |                 model_output = o.strip()
28 |             predicted_answer.append(model_output)
29 |             label = self.test_data[oi]['answer'].strip()
30 |             dic = {option.split()[0]:option for option in self.test_data[oi]['options']}
31 |             is_pass = True
32 |             for option in self.test_data[oi]['options']:
33 |                 if model_output in option and model_output:
34 |                     is_pass = True
35 |                     break
36 |             agg_pass.append(is_pass)
37 |             if model_output in dic[label] and model_output:
38 |                 agg_task_accuracy.append(True)
39 |             else:
40 |                 agg_task_accuracy.append(False)
41 |         
42 |         task_accuracy = np.mean(agg_task_accuracy).item()
43 |         pass_rate = sum(agg_pass)/len(agg_pass)
44 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i], "answer": self.test_data[i]['answer'], "prediction": predicted_answer[i]} for i in range(len(agg_task_accuracy))]
45 |         result_score = {
46 |             "pass_rate": pass_rate,
47 |             "task_accuracy": task_accuracy
48 |         }
49 |         return result_score, individual_score
50 |     
51 |     def load_data(self):
52 |         data_name = "tasks/temporal_sequences/data.json"
53 |         with open(data_name, "r") as f:
54 |             data = json.load(f)['examples']
55 |         train_data = [d for d in data]
56 |         test_data = [d for d in data]
57 |         if self.args.num_sample != -1:
58 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
59 | 
60 |         self.train_data = train_data
61 |         self.test_data = test_data
62 |         return train_data, test_data
63 |     
64 |     def load_and_prepare_data(self, dataset_split):
65 |         dataset = self.train_data if dataset_split == "train" else self.test_data
66 |         all_processed_data = []
67 |         for data in tqdm(dataset):
68 |             cur_data = {k:v for k,v in data.items()}
69 |             cur_data['input_text'] = cur_data['input'].split("We know that:")[-1].strip()
70 |             cur_data['answer'] = cur_data['target']
71 |             cur_data['label'] = cur_data['target']
72 |             cur_data['options'] = cur_data['input'].split("Options:")[-1].strip().split("\n")
73 |             all_processed_data.append(cur_data)
74 |         
75 |         if dataset_split == "train":
76 |             self.train_data = all_processed_data
77 |         else:
78 |             self.test_data = all_processed_data
79 |             
80 |         return all_processed_data
81 |         


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def solve_temporal_sequences_quiz(input_text):
 2 |     # Step 1: Identify statements and options from the input_text and output the statements.
 3 |     statement_text, option_text = input_text.split("\nOptions:\n")
 4 |     parts = statement_text.split("\n")
 5 |     statements = parts[1:-2]
 6 |     options = option_text.split("\n")
 7 |     print("Statements:", statements)
 8 | 
 9 |     # Step 2: Check the start and end of the possible time.
10 |     print("Start of the possible time: ", parts[0])
11 |     print("End of the possible time: ", parts[-2])
12 |     
13 |     # Step 3: Initialize an available time map with the time slots in the options and output it. The time slots are marked as 'free' initially.
14 |     available_time_map = {option[4:]: "free" for option in options}
15 |     print(f"Initial available time dictionary: {available_time_map}")
16 | 
17 |     # Step 4: Sequentially go through each statement, marking the times when the individual was seen or known to be engaged in specific activities. In this step, you should generate the target time slots and the updated available time map according to the statement.
18 |     for i, statement in enumerate(statements):
19 |         event, time_span = extract_information(statement)
20 |         print(f"\nStep {i}: {statement}")
21 |         print(f"current time occupation: {available_time_map}")
22 |         print(f"Time span to be occupied: {time_span}")
23 |         available_time_map[time_span] = "not available"
24 |         print(f"updated time occupation: {available_time_map}")
25 | 
26 |     # Step 5: By checking the available time map, identify which time slot is marked as 'free'. For each time slot, output the time slot is free or not available.
27 |     for key in available_time_map:
28 |         if available_time_map[key] == "free":
29 |             print(f"{key} is free.")
30 |             free_time = key
31 |         else:
32 |             print(f"{key} is not available.")
33 |     # Step 6: Review the provided options and return the one that matches the identified free time slot in Step 5.
34 |     print(f"Options:\n{option_text}")
35 |     for option in options:
36 |         if free_time in option:
37 |             return option


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def solve_temporal_sequences_quiz(input_text):
 2 |     # Step 1: Identify statements and options from the input_text and output the statements.
 3 |     statements = extract_statements(input_text)
 4 |     options = extract_options(input_text)
 5 |     print("Statements:", statements)
 6 | 
 7 |     # Step 2: Check the start and end of the possible time.
 8 |     print("Start of the possible time: ", parts[0])
 9 |     print("End of the possible time: ", parts[-2])
10 |     
11 |     # Step 3: Initialize an available time map with the time slots in the options and output it. The time slots are marked as 'free' initially.
12 |     available_time_map = {option[4:]: "free" for option in options}
13 |     print(f"Initial available time dictionary: {available_time_map}")
14 | 
15 |     # Step 4: [Variables tracking] Sequentially go through each statement, marking the times when the individual was seen or known to be engaged in specific activities.
16 |     for i, statement in enumerate(statements):
17 |         event, time_span = extract_information(statement)
18 |         print(f"\nStep {i}: {statement}")
19 |         print(f"current time occupation: {available_time_map}")
20 |         print(f"Time span to be occupied: {time_span}")
21 |         available_time_map[time_span] = "not available"
22 |         print(f"updated time occupation: {available_time_map}")
23 | 
24 |     # Step 5: By checking the available time map, identify which time slot is marked as 'free'. For each time slot, output the time slot is free or not available.
25 |     for key in available_time_map:
26 |         if available_time_map[key] == "free":
27 |             print(f"{key} is free.")
28 |             free_time = key
29 |         else:
30 |             print(f"{key} is not available.")
31 |     
32 |     # Step 6: Review the provided options and return the one that matches the identified free time slot in Step 5.
33 |     print(f"Options:\n{options}")
34 |     for option in options:
35 |         if free_time in option:
36 |             return option


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | The task involves determining the time slots when individuals could have performed a specific activity, based on a series of observations that define where they were seen at various times throughout the day. This requires logical reasoning to deduce the only time slot where the individual was unaccounted for and thus could have performed the said activity.
 2 | 
 3 | Analyzing the Task:
 4 | 1.Sequential Events and Time Slots: Each example consists of sequential events occurring at different times. The goal is to find a time slot not occupied by any known activity.
 5 | 
 6 | 2.Identifying Time Intervals: The time intervals during which each activity occurred are crucial. We need to understand and process these intervals to figure out the free slot.
 7 | 
 8 | 3.Eliminating Occupied Time Slots: By systematically ruling out the time slots during which the person was seen engaging in other activities, we can identify the potential free slot.
 9 | 
10 | Constructing the Code Prompt:
11 | 1.Breaking Down the Input: The first step involves parsing the input text to separate the statements about activities and the time options. This involves string manipulation to extract relevant parts of the text.
12 | 
13 | 2.Identifying Time Constraints: The start and end of possible times for the unaccounted activity (e.g., the time when the person woke up and the time by which the location of interest was closed) need to be identified. This helps in setting the boundaries for our search.
14 | 
15 | 3.Initializing an Available Time Map: To keep track of which time slots are occupied and which are free, an available time map (a dictionary in Python) is initialized with time slots marked as 'free' initially. This map will be updated as more information is processed.
16 | 
17 | 4.Processing Each Statement: For each activity mentioned in the statements, the corresponding time span needs to be marked as 'occupied' in the available time map. While iterating over the statements, process each statement and update the time map. Note that time map should be printed out at each step to logging the results. This involves:
18 |  - Extracting time information from each statement (using a function like extract_information).
19 |  - Updating the available time map accordingly.
20 | 5. Determining the Free Time Slot: After all statements are processed, the time slot that remains marked as 'free' in the map is the one where the individual could have performed the activity in question.
21 | 
22 | 6. Matching with Provided Options: Finally, the free time slot is matched against the provided options to identify the correct answer.


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | The task involves determining the time slots when individuals could have performed a specific activity, based on a series of observations that define where they were seen at various times throughout the day. This requires logical reasoning to deduce the only time slot where the individual was unaccounted for and thus could have performed the said activity.
 2 | 
 3 | Analyzing the Task:
 4 | To check and update possible time slots mentioned in each of the statement, it is necessary to take a variables tracking to keep track of the changes in the time slots.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 1. Breaking Down the Input: The first step involves parsing the input text to separate the statements about activities and the time options.
 8 | 
 9 | 2. Identifying Time Constraints: The start and end of possible times for the unaccounted activity (e.g., the time when the person woke up and the time by which the location of interest was closed) need to be identified. This helps in setting the boundaries for our search.
10 | 
11 | 3. Initializing an Available Time Map: To keep track of which time slots are occupied and which are free, an available time map (a dictionary in Python) is initialized with time slots from options marked as 'free' initially.
12 | 
13 | 4. [Variables tracking] Processing Each Statement: Sequentially go through each statement by using 'for loop', marking the time span when the individual was seen or known to be engaged in specific activities as 'not available' at the time map. Following things should be logged with print() function for each intermediate steps:
14 | - Each statement to process
15 | - Current time map
16 | - Time span specified in the statement
17 | - Updated time map
18 | 
19 | 5. Determining the Free Time Slot: After all statements are processed, the time slot that remains marked as 'free' in the map is the one where the individual could have performed the activity in question.
20 | 
21 | 6. Matching with Provided Options: Finally, the free time slot is matched against the provided options to identify the correct answer.


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/prompts/nl_prompt_generation_example_prompt.txt:
--------------------------------------------------------------------------------
  1 | Generate a plan for the last task considering the example task instances. Your plan should show enough intermediate reasoning steps towards the answer. Construct the plan as much as you can and describe the logic specifically.
  2 | 
  3 | [Example 1]
  4 | Example task instances:
  5 | Sort the following words alphabetically: List: plebiscite colombo inviolate quagmire garry satanic magnanimity aeneas notoriety nevada foothold glycerine lucre tennyson type scription pompey softball spleenwort fox
  6 | 
  7 | Sort the following words alphabetically: List: stitch steelmake indomitable wigging cubby sheaf pique thymine exotica good mental brake nick rajah lineage choose bunny drone chevalier
  8 | 
  9 | Sort the following words alphabetically: List: dynastic inflammable prick tristan vitiate tackle stagnate conglomerate nebulae phosphide
 10 | 
 11 | Output Format:
 12 | A string of words seperated with a space
 13 | 
 14 | Explanation:
 15 | This task involves sorting a list of words alphabetically. It is a straightforward task that requires the application of standard sorting techniques.
 16 | 
 17 | Analyzing the Task:
 18 | 1. Word Extraction: Initially, the words need to be separated from the input text. 
 19 | 
 20 | 2. Alphabetical Sorting: The core of the task is to sort these words based on their alphabetical order.
 21 | 
 22 | 3. Returning the Sorted List: The final step is to output the list of words in the sorted order.
 23 | 
 24 | Constructing the Code Prompt:
 25 | 1. Splitting the Text into Words: First, extract each word from the input text. This can be achieved through string splitting based on spaces.
 26 | 
 27 | 2. Identifying Starting Letters: Though not strictly necessary for sorting, extracting the first letter of each word can help in understanding how the sorting will proceed.
 28 | 
 29 | 3. Sorting the Words: Use a standard sorting algorithm or function to sort the words alphabetically. In most programming languages, there are built-in functions that can handle this step efficiently.
 30 | 
 31 | 4. Outputting the Sorted List: Finally, return the list of words in their sorted order.
 32 | 
 33 | [Example 2]
 34 | Example task instances:
 35 | Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ ( < { [ ( ) ] } >
 36 | 
 37 | Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: ( < { ( ) { } } ( < > ) >
 38 | 
 39 | Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ < < { } > < > > ( )
 40 | 
 41 | Output Format:
 42 | A string of closing brakets seperated with a space.
 43 | 
 44 | Explanation:
 45 | This task involves completing a sequence of opening and closing parentheses, ensuring that they are correctly matched and the entire sequence is well-formed. This is a classic problem in computer science related to parsing and syntactic analysis, often solved using stack data structures.
 46 | 
 47 | Analyzing the Task:
 48 | 1. Parentheses Matching: Each opening parenthesis must have a corresponding closing one. The sequence must adhere to the proper nesting and order of parentheses.
 49 | 
 50 | 2. Tracking Open Parentheses: A stack is ideal for keeping track of open parentheses, as it allows us to process them in a last-in, first-out (LIFO) manner.
 51 | 
 52 | 3. Completing the Sequence: The goal is to add the appropriate closing parentheses to complete the sequence.
 53 | 
 54 | Constructing the Code Prompt:
 55 | 1. Initialize a Stack: Use a stack to keep track of opening parentheses. This is crucial for understanding which parentheses are open and need closing.
 56 | 
 57 | 2. Split and Parse Input: Split the input string into individual characters for easy processing. Identify types of parentheses and their corresponding closing counterparts.
 58 | 
 59 | 3. [Important] Iterate and Process Characters: Iterate over the input characters. To keep track of the stack, make sure printing out the stack variable using print(). For each character:
 60 |    - If it's an opening parenthesis, push it onto the stack.
 61 |    - If it's a closing parenthesis, check against the last opening parenthesis in the stack for a match, and pop the stack if it matches.
 62 | 
 63 | 4. Generate Closing Sequence: After processing all input characters, any remaining open parentheses in the stack need to be closed in reverse order (LIFO).
 64 | 
 65 | 5. Construct and Return the Completed Sequence: Formulate the sequence of closing parentheses to append to the original sequence, ensuring that the overall structure is properly formed.
 66 | 
 67 | The stack is a natural choice for this problem, as it efficiently handles the LIFO order required for matching parentheses. Iterating over each character provides a step-by-step approach to build the final sequence. This method is effective and straightforward, ensuring correct matching and order of parentheses. The loop is used for iteration, and the stack manages the open parentheses, making the solution both clear and efficient for this task.
 68 | 
 69 | [Example 3]
 70 | Example task instances:
 71 | I have an oven, a bed, a lamp, a chair, a fridge, a microwave, and a toaster. How many objects do I have?
 72 | 
 73 | I have four couches, a bed, a table, and a lamp. How many objects do I have?
 74 | 
 75 | I have a goat, a frog, five pigs, and a bear. How many animals do I have?
 76 | 
 77 | Output Format:
 78 | A positive integer
 79 | 
 80 | Explanation:
 81 | This task involves identifying and counting specific types of objects (fruits or vegetables) from a mixed list. The primary challenge is correctly categorizing each item in the list and then tallying the counts of the desired categories.
 82 | 
 83 | Analyzing the Task:
 84 | 1. Object Categorization: Determining whether each item in the list is a fruit, a vegetable, or neither. This requires knowledge of common fruits and vegetables.
 85 | 
 86 | 2. Counting Objects: Accurately counting the number of items in each category. This may involve handling singular and plural forms, as well as numerical phrases (e.g., "two peaches").
 87 | 
 88 | 3. Handling Varied Input: The input text may vary in format and content, requiring flexible parsing to extract the necessary information.
 89 | 
 90 | Constructing the Code Prompt:
 91 | 1. Extracting Objects from Input: The first step is parsing the input text to create a list of objects. This requires string manipulation and possibly natural language processing techniques.
 92 | 
 93 | 2. Extracting the Question: Determine what category of items needs to be counted (e.g., fruits, vegetables) from the input text. 
 94 | 
 95 | 3. Initializing Count Variables: Set up a variable to keep track of the total count of the desired objects.
 96 | 
 97 | 4. [Important] Counting Each Object: Iterate over each object, determine if it belongs to the desired category (fruit or vegetable), and count it. This process shoud be logged with print() function. This involves:
 98 |    - Identifying singular and plural forms.
 99 |    - Recognizing and handling numerical phrases.
100 |    - Keeping a running total of the count.
101 | 
102 | 5. Returning the Total Count: After processing all objects, return the total count as the answer.
103 | 
104 | [Example 4]
105 | Example task instances:
106 | Today, William went to the gas station. Between what times could they have gone?
107 | We know that:
108 | William woke up at 5am.
109 | Sarah saw William buying a phone at the electronics store from 7am to 11am.
110 | Thomas saw William working out at the gym from 11am to 12pm.
111 | Steven saw William waiting at the airport from 12pm to 2pm.
112 | James saw William walking towards the Statue of Liberty from 2pm to 8pm.
113 | The gas station was closed after 8pm.
114 | Between what times could William have gone to the gas station?
115 | Options:
116 | (A) 5am to 7am
117 | (B) 11am to 12pm
118 | (C) 12pm to 2pm
119 | (D) 7am to 11am
120 | 
121 | Today, James went to the basketball court. Between what times could they have gone?
122 | We know that:
123 | James woke up at 6am.
124 | Susan saw James buying a bike at the bike shop from 7am to 8am.
125 | Emily saw James buying a phone at the electronics store from 8am to 9am.
126 | Lisa saw James attending class at the school from 9am to 3pm.
127 | Sarah saw James taking photos near the Leaning Tower of Pisa from 3pm to 5pm.
128 | David saw James getting a coffee at the cafe from 5pm to 8pm.
129 | The basketball court was closed after 8pm.
130 | Between what times could James have gone to the basketball court?
131 | Options:
132 | (A) 6am to 7am
133 | (B) 9am to 3pm
134 | (C) 8am to 9am
135 | (D) 3pm to 5pm
136 | 
137 | Today, Jennifer went to the basketball court. Between what times could they have gone?
138 | We know that:
139 | Jennifer woke up at 6am.
140 | Thomas saw Jennifer buying a bike at the bike shop from 6am to 9am.
141 | Samantha saw Jennifer driving to the water park from 9am to 10am.
142 | Steven saw Jennifer fixing their computer at the electronic store from 10am to 1pm.
143 | Mark saw Jennifer stretching at a yoga studio from 3pm to 5pm.
144 | The basketball court was closed after 5pm.
145 | Between what times could Jennifer have gone to the basketball court?
146 | Options:
147 | (A) 9am to 10am
148 | (B) 6am to 9am
149 | (C) 3pm to 5pm
150 | (D) 1pm to 3pm
151 | 
152 | Output Format:
153 | '(A)', '(B)', '(C)', ...
154 | 
155 | Explanation:


--------------------------------------------------------------------------------
/src/tasks/temporal_sequences/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | input_text = "{input_text}"
3 | final_answer = solve_temporal_sequences_quiz(input_text)
4 | print("Final answer:"+ final_answer)
5 | Generate the expected output (from all print() functions) of the code. You don't have to actually run the code and do not care about 'not implemented error'.
6 | Exprected output:


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/generated_prompts/a_code_prompt_from_explanation_planner_gpt_3shot.txt:
--------------------------------------------------------------------------------
 1 | def track_swaps(input_text):
 2 |     # Step 1: Initialize Positions or Gifts
 3 |     initial_assignments = extract_initial_assignments(input_text)
 4 |     print("Initial assignments:", initial_assignments)
 5 | 
 6 |     # Step 2: Define Swap Pairs
 7 |     swap_pairs = extract_swap_pairs(input_text)
 8 |     print("Swap pairs:", swap_pairs)
 9 | 
10 |     # Step 3: Process Swaps
11 |     final_assignments = initial_assignments.copy()
12 |     for pair in swap_pairs:
13 |         if pair[0] in final_assignments and pair[1] in final_assignments:
14 |             temp = final_assignments[pair[0]]
15 |             final_assignments[pair[0]] = final_assignments[pair[1]]
16 |             final_assignments[pair[1]] = temp
17 |         print(f"After swapping {pair[0]} and {pair[1]}: {final_assignments}")
18 | 
19 |     # Step 4: Determine the Final Position or Gift
20 |     final_person = identify_final_person(input_text)
21 |     final_position = final_assignments[final_person]
22 |     print(f"The final position or gift of {final_person} is: {final_position}")
23 | 
24 |     # Step 5: Match and Output the Answer
25 |     options = extract_options(input_text)
26 |     print("Options:", options)
27 |     answer = None
28 |     for option in options:
29 |         if final_position in option:
30 |             answer = option[:3]
31 |     
32 |     return answer


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/generated_prompts/a_explanation_3shot_by_gpt.txt:
--------------------------------------------------------------------------------
 1 | This task involves tracking the positions or gifts of individuals as they swap with each other in a sequence of exchanges. The challenge is to keep track of the swaps accurately and determine the final position or gift of a specific individual at the end of the sequence.
 2 | 
 3 | Analyzing the Task:
 4 | 1. Initial Assignments: Understanding the starting positions or gifts of each individual is crucial for tracking the swaps accurately.
 5 | 
 6 | 2. Pairwise Swaps: Identifying the pairs of individuals who swap positions or gifts at each step in the sequence.
 7 | 
 8 | 3. Tracking Changes: Keeping a record of the swaps and updating the positions or gifts of individuals accordingly.
 9 | 
10 | Constructing the Code Prompt:
11 | 1. Initialize Positions or Gifts: Set up a dictionary mapping each individual to their initial position or gift.
12 | 
13 | 2. Define Swap Pairs: Create a list of tuples representing the pairs of individuals who swap positions or gifts at each step.
14 | 
15 | 3. [Important] Process Swaps: Iterate over the swap pairs, update the positions or gifts of the individuals involved in each swap. Use an if-else statement to handle different swap scenarios.
16 | 
17 | 4. Determine the Final Position or Gift: After processing all swaps, identify the final position or gift of the specified individual.
18 | 
19 | 5. Match and Output the Answer: Parse the options from the input text, find the corresponding option from the given options, and only output the label of the option as the final answer to the question.


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/generated_prompts/classification_result_gpt_3shot.txt:
--------------------------------------------------------------------------------
1 | The task involves multiple steps of partner swapping and keeping track of who is dancing with whom. In this case, the variables tracking approach would be more effective as it allows for a systematic way of keeping track of the partners and their movements throughout the dance. By using variables to represent each dancer and their current partner, it becomes easier to follow the sequence of partner swaps and determine the final pairings. Therefore, the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class TrackingHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(TrackingHelper, self).__init__(args)
15 |         self.function_name = "track_swaps"
16 |     
17 |     def evaluate_prediction(self, outputs):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |         agg_label = []
21 |         agg_prediction = []
22 |        
23 |         for oi, o in enumerate(outputs):
24 |             o = o.replace(".", "")
25 |             if "Final answer:" in o:
26 |                 model_output = o.split("Final answer:")[-1].strip()
27 |             else:
28 |                 model_output = o.strip()
29 |             label = self.test_data[oi]['answer']
30 |             is_pass = False
31 |             option_dic = {option.split()[0]: option for option in self.test_data[oi]['options']}
32 |             for option in option_dic.values():
33 |                 if model_output in option and model_output:
34 |                     is_pass = True
35 |                     break
36 |             agg_pass.append(is_pass)
37 |             if model_output in option_dic[label] and model_output:
38 |                 agg_task_accuracy.append(True)
39 |             else:
40 |                 agg_task_accuracy.append(False)
41 |             agg_label.append(label)
42 |             agg_prediction.append(model_output)
43 |         
44 |         task_accuracy = np.mean(agg_task_accuracy).item()
45 |         pass_rate = sum(agg_pass)/len(agg_pass)
46 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i], "prediction": agg_prediction[i], "answer": agg_label[i]} for i in range(len(agg_task_accuracy))]
47 |         result_score = {
48 |             "pass_rate": pass_rate,
49 |             "task_accuracy": task_accuracy
50 |         }
51 |         return result_score, individual_score
52 |     
53 |     def load_data(self):
54 |         # if self.args.task_version is None:
55 |         #     data_name = f"tasks/tracking_shuffled_objectives/data_five.json"
56 |         # else:
57 |         #     data_name = f"tasks/tracking_shuffled_objectives/data_{self.args.task_version}.json"
58 |         data_name = "tasks/tracking_shuffled_objectives/data.json"
59 |         with open(data_name, "r") as f:
60 |             data = json.load(f)['examples']
61 |         train_data = [d for d in data]
62 |         test_data = [d for d in data]
63 |         if self.args.num_sample != -1:
64 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
65 | 
66 |         self.train_data = train_data
67 |         self.test_data = test_data
68 |         return train_data, test_data
69 |     
70 |     def load_and_prepare_data(self, dataset_split):
71 |         dataset = self.train_data if dataset_split == "train" else self.test_data
72 |         all_processed_data = []
73 |         for data in tqdm(dataset):
74 |             cur_data = {k:v for k,v in data.items()}
75 |             cur_data['input_text'] = cur_data['input']
76 |             cur_data['answer'] = cur_data['target']
77 |             cur_data['label'] = cur_data['target']
78 |             cur_data['options'] = cur_data['input'].split("\nOptions:\n")[-1].split("\n")
79 |             all_processed_data.append(cur_data)
80 |         
81 |         if dataset_split == "train":
82 |             self.train_data = all_processed_data
83 |         else:
84 |             self.test_data = all_processed_data
85 |             
86 |         return all_processed_data
87 |         


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def track_swaps(input_text):
 2 |     # Step 1: Identify Initial State. Begin by identifying and outputing the initial state of all objectives (e.g., who holds which ball or who is dancing with whom) from the input text before any swaps happen.
 3 |     state_dict = find_initial_state(input_text)
 4 |     print(f"Initial state: {state_dict}")
 5 | 
 6 |     # Step 2: Identify and output the sequences of swaps from the input text. Each swap should be understood in terms of who exchanges with whom.
 7 |     swap_sequences_list = find_swap_sequences(input_text)
 8 |     print("Swap sequences: ", swap_sequences_list)
 9 |     print("Total iterations: ", len(swap_sequences_list))
10 | 
11 |     # Step 3: Carry out the swaps. For each swap in swap sequences, sequentially update and output the current status of objectives by exchanging them between the two participants involved in the swap.
12 |     for i, sequence in enumerate(swap_sequences_list):
13 |         player1, player2 = extract_player(sequence)
14 |         state_dict[player1], state_dict[player2] = state_dict[player2], state_dict[player1]
15 |         print(f"({i}) {sequence} -> {state_dict}")
16 | 
17 |     Step 4: Understand the Question. After processing all swaps, identify what the question is asking for in the input text and output the question.
18 |     question = extract_question(input_text)
19 |     print("Question:", question)
20 | 
21 |     Step 5: Analyze Options. Examine and output the provided options in the input text.
22 |     options = input_text.split("\n")[-5:]
23 |     print("Options:", options)
24 | 
25 |     Step 6: Determine the Correct Option. Using the updated state after all swaps, determine which option correctly answers the question and output the answer.
26 |     answer = find_correct_option(question, options, state_dict)
27 | 
28 |     return answer


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def track_swaps(input_text):
 2 |     # Step 1: Identify Initial State. Begin by identifying and outputing the initial state of all objectives (e.g., who holds which ball or who is dancing with whom) from the input text before any swaps happen.
 3 |     state_dict = find_initial_state(input_text)
 4 |     print(f"Initial state: {state_dict}")
 5 | 
 6 |     # Step 2: Identify and output the sequences of swaps from the input text. Each swap should be understood in terms of who exchanges with whom.
 7 |     swap_sequences_list = find_swap_sequences(input_text)
 8 |     print("Swap sequences: ", swap_sequences_list)
 9 |     print("Total iterations: ", len(swap_sequences_list))
10 | 
11 |     # Step 3: [Variables tracking] Carry out the swaps. For each swap in swap sequences, sequentially update and output the current and updated status of objectives by exchanging them between the two participants involved in the swap.
12 |     for i, sequence in enumerate(swap_sequences_list):
13 |         print(f"({i}) {sequence}")
14 |         player1, player2 = extract_player(sequence)
15 |         print(f"Before swap: {player1} matches {state_dict[player1]} and {player2} matches {state_dict[player2]}")
16 |         state_dict[player1], state_dict[player2] = state_dict[player2], state_dict[player1]
17 |         print(f"After swap: {player1} matches {state_dict[player1]} and {player2} matches {state_dict[player2]}")
18 | 
19 |     # Step 4: Understand the Question. After processing all swaps, identify what the question is asking for in the input text and output the question.
20 |     question = extract_question(input_text)
21 |     print("Question:", question)
22 | 
23 |     # Step 5: Analyze Options. Examine and output the provided options in the input text.
24 |     options = input_text.split("\n")[-5:]
25 |     print("Options:", options)
26 | 
27 |     # Step 6: Determine the Correct Option. Using the updated state after all swaps, determine which option correctly answers the question and output the answer.
28 |     answer = find_correct_option(question, options, state_dict)
29 |     
30 |     return answer


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/prompts/example_classification.txt:
--------------------------------------------------------------------------------
 1 | Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
 2 | As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 3 | Options:
 4 | (A) yellow ball
 5 | (B) blue ball
 6 | (C) pink ball
 7 | 
 8 | Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a white ball, Bob has a purple ball, and Claire has a pink ball.
 9 | As the game progresses, pairs of players trade balls. First, Bob and Alice swap balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At the end of the game, Alice has the
10 | Options:
11 | (A) white ball
12 | (B) purple ball
13 | (C) pink ball
14 | 
15 | Q: Alice, Bob, and Claire are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Lola, Bob is dancing with Rodrigo, and Claire is dancing with Patrick.
16 | Throughout the song, the dancers often trade partners. First, Alice and Bob switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice switch partners. At the end of the dance, Alice is dancing with
17 | Options:
18 | (A) Lola
19 | (B) Rodrigo
20 | (C) Patrick
21 | 
22 | Reason for the decision & Answer:
23 | For this task, it is crucial to track the pair of items or individuals while processing the each swapping sequence. Therefore, the variables tracking would work well to solve these problems. So the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | Explanation:
 2 | This task involves tracking the swapping of objects (like balls or dance partners) between individuals over a series of exchanges. The goal is to determine the final state of these objects after all swaps are completed. This requires careful tracking of each swap and updating the state of each object accordingly.
 3 | 
 4 | Analyzing the Task:
 5 | 1. Tracking Initial States: The initial state of each object (who has what or who is with whom) needs to be recorded. 
 6 | 
 7 | 2. Understanding Swap Sequences: The sequence of swaps described in the text must be clearly understood and recorded. Each swap alters the state of the objects.
 8 | 
 9 | 3. Sequentially Updating States: For each swap, the states of the involved objects must be updated to reflect the exchange.
10 | 
11 | 4. Identifying the Final Query: After all swaps, the question usually asks for the final state of a particular object or individual.
12 | 
13 | Constructing the Code Prompt:
14 | 1. Identifying Initial State: Parse the input text to determine the initial state of all objects or individuals. This step sets the foundation for the entire process.
15 | 
16 | 2. Parsing Swap Sequences: Extract the sequence of swaps from the input text. Each swap will need to be processed in the order they occur.
17 | 
18 | 3. [Important] Executing Swaps: Iterate over each swap sequence and update the states of the objects or individuals involved in each swap. This is a crucial step that ensures the current state is always up-to-date.
19 | 
20 | 4. Extracting the Final Question: After processing all swaps, identify the specific question asked in the input text.
21 | 
22 | 5. Examining Options: Review and display the options provided in the input text for the final question.
23 | 
24 | 6. Determining the Answer: Based on the final state of the objects or individuals, select the correct option that answers the question.


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | This task involves tracking the swapping of objects (like balls or dance partners) between individuals over a series of exchanges. The goal is to determine the final state of these objects after all swaps are completed. This requires careful tracking of each swap and updating the state of each object accordingly.
 2 | 
 3 | Analyzing the Task:
 4 | It is important to keep track of the state of each object's or individual's state while processing the swap seqeunces. Therefore, the variables tracking will be effective way to solve the task.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 1. Identifying Initial State: Parse the input text to determine the initial state of all objects or individuals. This step sets the foundation for the entire process.
 8 | 
 9 | 2. Parsing Swap Sequences: Extract the sequence of swaps from the input text. Each swap will need to be processed in the order they occur.
10 | 
11 | 3. [Variables tracking] Executing Swaps: Iterate over each swap sequence by using 'for loop' and update the states of the objects or individuals involved in each swap. Print out the current and updated states to track the swaps.
12 | 
13 | 4. Extracting the Final Question: After processing all swaps, identify the specific question asked in the input text.
14 | 
15 | 5. Examining Options: Review and display the options provided in the input text for the final question.
16 | 
17 | 6. Determining the Answer: Based on the final state of the objects or individuals, select the correct option that answers the question.


--------------------------------------------------------------------------------
/src/tasks/tracking_shuffled_objectives/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | input_text = "{input_text}"
3 | final_answer = track_swaps(input_text)
4 | print("Final answer:"+ final_answer)
5 | Generate the expected execution output (output from all print() functions) of the code. You don't have to actually run the code and do not care about 'not implemented error'.
6 | Exprected output:
7 | 


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/generated_prompts/classification_result_gpt_3shot.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "input": [
  3 |         "Choose between the variables tracking and the free format reasoning which strategy would be more effective than the other one when tackling the last task considering the example task instances and explain why. The variables tracking is effective when the task has multiple instructions or statements to process and need to check the important variables for each of the step. For the free format reasoning, it works well when the task needs a complex or diverse logic that is difficult to be expressed in a specific method. In a free format reasoning step, you don't have to specify the logic, but you should specify an abstract fucntion like 'derive_answer_with_explantion' which outputs both reasons and proper answer for the situation. For the variables tracking step, you should use 'for loop' to process the multiple statements or instructions step by step, 'print()' to output intermediate steps to keep track of changes of variables.",
  4 |         "",
  5 |         "[Example 1]",
  6 |         "Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?",
  7 |         "Options:",
  8 |         "(A) red",
  9 |         "(B) orange",
 10 |         "(C) yellow",
 11 |         "(D) green",
 12 |         "(E) blue",
 13 |         "(F) brown",
 14 |         "(G) magenta",
 15 |         "(H) fuchsia",
 16 |         "(I) mauve",
 17 |         "(J) teal",
 18 |         "(K) turquoise",
 19 |         "(L) burgundy",
 20 |         "(M) silver",
 21 |         "(N) gold",
 22 |         "(O) black",
 23 |         "(P) grey",
 24 |         "(Q) purple",
 25 |         "(R) pink",
 26 |         "",
 27 |         "Q: On the table, you see a bunch of objects arranged in a row: a purple paperclip, a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve fidget spinner, and a burgundy pen. What is the color of the object directly to the right of the stress ball?",
 28 |         "Options:",
 29 |         "(A) red",
 30 |         "(B) orange",
 31 |         "(C) yellow",
 32 |         "(D) green",
 33 |         "(E) blue",
 34 |         "(F) brown",
 35 |         "(G) magenta",
 36 |         "(H) fuchsia",
 37 |         "(I) mauve",
 38 |         "(J) teal",
 39 |         "(K) turquoise",
 40 |         "(L) burgundy",
 41 |         "(M) silver",
 42 |         "(N) gold",
 43 |         "(O) black",
 44 |         "(P) grey",
 45 |         "(Q) purple",
 46 |         "(R) pink",
 47 |         "",
 48 |         "Q: On the nightstand, you see the following items arranged in a row: a teal plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, a pink notebook, and a grey cup. How many non-orange items do you see to the left of the teal item?",
 49 |         "Options:",
 50 |         "(A) zero",
 51 |         "(B) one",
 52 |         "(C) two",
 53 |         "(D) three",
 54 |         "(E) four",
 55 |         "(F) five",
 56 |         "(G) six",
 57 |         "",
 58 |         "Reason for the decision & Answer:",
 59 |         "The task includes several types of question(i.e. color identification, counting objects, spatial reasoning) which is difficult to deal with a specific method. Therefore, the free format reasoning approach would be effective to tackle the task. So the answer is Free format reasoning.",
 60 |         "",
 61 |         "[Example 2]",
 62 |         "Example task instances:",
 63 |         "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [",
 64 |         "",
 65 |         "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]",
 66 |         "",
 67 |         "Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }",
 68 |         "",
 69 |         "Reason for the decision & Answer:",
 70 |         "In order to find unclosed parentheses, tracking the opening and closing parentheses is really important. Therefore, the variables tracking is an effective way to solve the task. So the answer is Variables tracking.",
 71 |         "",
 72 |         "[Example 3]",
 73 |         "Example task instances:",
 74 |         "Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  We now add a penguin to the table:",
 75 |         "James, 12, 90, 12",
 76 |         "How many penguins are less than 8 years old?",
 77 |         "Options:",
 78 |         "(A) 1",
 79 |         "(B) 2",
 80 |         "(C) 3",
 81 |         "(D) 4",
 82 |         "(E) 5",
 83 |         "",
 84 |         "Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  Which is the youngest penguin?",
 85 |         "Options:",
 86 |         "(A) Louis",
 87 |         "(B) Bernard",
 88 |         "(C) Vincent",
 89 |         "(D) Gwen",
 90 |         "(E) James",
 91 |         "",
 92 |         "Q: Here is a table where the first line is a header and each subsequent line is a penguin:  name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15  For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm.  What is the name of the second penguin sorted by alphabetic order?",
 93 |         "Options:",
 94 |         "(A) Louis",
 95 |         "(B) Bernard",
 96 |         "(C) Vincent",
 97 |         "(D) Gwen",
 98 |         "(E) James",
 99 |         "",
100 |         "Reason for the decision & Answer:",
101 |         "To find target individual from the table referring to the question is hard to implement with a variables tracking method. Therefore, the free format reasoning could be a better way to solve the problem. So the answer is Free format reasoning.",
102 |         "",
103 |         "[Example 4]",
104 |         "Example task instances:",
105 |         "Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.",
106 |         "Options:",
107 |         "- Yes",
108 |         "- No",
109 |         "",
110 |         "Q: If you follow these instructions, do you return to the starting point? Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.",
111 |         "Options:",
112 |         "- Yes",
113 |         "- No",
114 |         "",
115 |         "Q: If you follow these instructions, do you return to the starting point? Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.",
116 |         "Options:",
117 |         "- Yes",
118 |         "- No",
119 |         "",
120 |         "Reason for the decision & Answer:",
121 |         "It is crucial to process each of the instruction and keep track of the current position and direction to solve the task. Therefore, the variables tracking is an effective way to tackle the target task. So the answer is Variables tracking.",
122 |         "",
123 |         "[Example 5]",
124 |         "Example task instances:",
125 |         "Question: Sima lies. Millie says Sima tells the truth. Jerry says Millie tells the truth. Vina says Jerry lies. Maybelle says Vina tells the truth. Does Maybelle tell the truth?",
126 |         "",
127 |         "Question: Alexis tells the truth. Amberly says Alexis tells the truth. Andree says Amberly tells the truth. Vina says Andree tells the truth. Helene says Vina tells the truth. Does Helene tell the truth?",
128 |         "",
129 |         "Question: Millicent lies. Alexis says Millicent lies. Inga says Alexis tells the truth. Christie says Inga lies. Gwenn says Christie lies. Does Gwenn tell the truth?",
130 |         "",
131 |         "Reason for the decision & answer:"
132 |     ],
133 |     "output": [
134 |         [
135 |             "In these logic puzzles, it is necessary to carefully track the statements made by each person to determine if they are telling the truth or lying. This requires keeping track of multiple variables and evaluating the logic of each statement. Therefore, the variables tracking approach would be more effective in solving these tasks. So the answer is Variables tracking."
136 |         ]
137 |     ]
138 | }


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/generated_prompts/classification_result_gpt_3shot.txt:
--------------------------------------------------------------------------------
1 | In these logic puzzles, it is necessary to carefully track the statements made by each person to determine if they are telling the truth or lying. This requires keeping track of multiple variables and evaluating the logic of each statement. Therefore, the variables tracking approach would be more effective in solving these tasks. So the answer is Variables tracking.


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/generated_prompts/explanation_from_scratch_3shot_gpt.txt:
--------------------------------------------------------------------------------
 1 | Explanation:
 2 | This task involves determining the truthfulness of a statement made by one individual based on the statements made by others in a chain. The task requires understanding the relationships between truth-tellers and liars and applying logical reasoning to determine the final answer.
 3 | 
 4 | Analyzing the Task:
 5 | 1. Establishing Truth Relationships: Each person's statement about another person can be categorized as either true or false. This forms the basis of determining who tells the truth and who lies.
 6 | 
 7 | 2. Propagating Truthfulness: By analyzing the statements in a sequential manner, the truthfulness of each person can be deduced based on the statements made by others.
 8 | 
 9 | 3. Identifying the Final Question: The task usually asks whether a specific person tells the truth or not based on the chain of statements.
10 | 
11 | Constructing the Code Prompt:
12 | 1. Parsing Statements: Extract the statements made by each person from the input text. This involves identifying who is talking about whom and whether they are telling the truth or lying.
13 | 
14 | 2. Establishing Truth Relationships: Create a dictionary to store the truthfulness of each person based on the statements made by others. This dictionary will be updated as the statements are processed.
15 | 
16 | 3. [Important] Analyzing Statements: Iterate over each statement and update the truthfulness of the individuals involved based on the logic that if A says B lies, then A is telling the truth if B is a liar, and vice versa. This step involves logical reasoning and updating the truth dictionary.
17 | 
18 | 4. Extracting the Final Question: Identify the specific question asked in the input text regarding the truthfulness of a particular person.
19 | 
20 | 5. Determining the Answer: Based on the final truthfulness of the person in question as determined by the logic and the statements provided, select 'Yes' if the person tells the truth and 'No' if they do not.
21 | 
22 | By following these steps and applying logical reasoning to the statements provided, the code can accurately determine whether the specified individual tells the truth or not.


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/generated_prompts/new_edited_code_prompt_from_explanation_planner_gpt_3shot.txt:
--------------------------------------------------------------------------------
 1 | def evaluate_boolean_word_problem(input_text):
 2 |     # Step 1: Identify the statements and the question from the input text.
 3 |     statements, question = input_text.split("?")
 4 |     statements = statements.split(". ")
 5 |     print("Statements:", statements)
 6 |     print("Question:", question)
 7 | 
 8 |     # Step 2: Initialize a truth dictionary to keep track of who tells the truth and who lies.
 9 |     truth_dict = {}
10 | 
11 |     # Step 3: Process each statement to update the truth dictionary based on the logic provided.
12 |     for statement in statements:
13 |         person1, action, person2 = statement.split(" ")
14 |         if action == "lies":
15 |             truth_dict[person1] = not truth_dict[person2]
16 |         else:
17 |             truth_dict[person1] = truth_dict[person2]
18 |         print(f"{person1} says {person2} {action}. {person1} tells the truth: {truth_dict[person1]}")
19 | 
20 |     # Step 4: Determine the truthfulness of the person in question based on the truth dictionary.
21 |     person_to_check = question.split(" ")[-2]
22 |     answer = 'Yes' if truth_dict[person_to_check] else 'No'
23 |     
24 |     return answer


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/helper.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | import os
 4 | from datasets import load_dataset
 5 | from tqdm import tqdm
 6 | import json
 7 | import re
 8 | sys.path.append(os.environ.get('PROJECTPATH'))
 9 | from src.abstract_helper import AbstractHelper
10 | 
11 | 
12 | class WebOfLiesHelper(AbstractHelper):
13 |     def __init__(self, args):
14 |         super(WebOfLiesHelper, self).__init__(args)
15 |         self.function_name = "evaluate_boolean_word_problem"
16 |     
17 |     def evaluate_prediction(self, outputs):
18 |         agg_pass = []
19 |         agg_task_accuracy = []
20 |        
21 |         for oi, o in enumerate(outputs):
22 |             model_output = o
23 |             if "Final answer:" in o:
24 |                 model_output = o.split("Final answer:")[-1].strip().lower()
25 |             else:
26 |                 model_output = o.strip().lower()
27 |             label = self.test_data[oi]['answer'].lower()
28 |             is_pass = False
29 |             for ans in ['yes', 'no', 'true', 'false']:
30 |                 if ans in model_output[:5]:
31 |                     is_pass = True
32 |                     break
33 |             agg_pass.append(is_pass)
34 |             label_dic = {'yes': 'true', 'no': 'false'}
35 |             if label in model_output[:5] or label_dic[label] in model_output[:5]:
36 |                 agg_task_accuracy.append(True)
37 |             else:
38 |                 agg_task_accuracy.append(False)
39 |         
40 |         task_accuracy = np.mean(agg_task_accuracy).item()
41 |         pass_rate = sum(agg_pass)/len(agg_pass)
42 |         individual_score = [{"pass_rate": agg_pass[i], "task_accuracy": agg_task_accuracy[i]} for i in range(len(agg_task_accuracy))]
43 |         result_score = {
44 |             "pass_rate": pass_rate,
45 |             "task_accuracy": task_accuracy
46 |         }
47 |         return result_score, individual_score
48 |     
49 |     def load_data(self):
50 |         data_name = "tasks/web_of_lies/data.json"
51 |         with open(data_name, "r") as f:
52 |             data = json.load(f)['examples']
53 |         train_data = [d for d in data]
54 |         test_data = [d for d in data]
55 |         if self.args.num_sample != -1:
56 |             test_data = [test_data[i] for i in range(self.args.num_sample)]
57 | 
58 |         self.train_data = train_data
59 |         self.test_data = test_data
60 |         return train_data, test_data
61 |     
62 |     def load_and_prepare_data(self, dataset_split):
63 |         dataset = self.train_data if dataset_split == "train" else self.test_data
64 |         all_processed_data = []
65 |         for data in tqdm(dataset):
66 |             cur_data = {k:v for k,v in data.items()}
67 |             cur_data['input_text'] = cur_data['input'][10:]
68 |             cur_data['answer'] = cur_data['target']
69 |             cur_data['label'] = cur_data['target']
70 |             all_processed_data.append(cur_data)
71 |         
72 |         if dataset_split == "train":
73 |             self.train_data = all_processed_data
74 |         else:
75 |             self.test_data = all_processed_data
76 |             
77 |         return all_processed_data
78 |         


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/prompts/code_prompt_cot_edited.txt:
--------------------------------------------------------------------------------
 1 | def evaluate_boolean_word_problem(input_text):
 2 |     # Step 1: Divide the input text into individual statements and the final question. Output each statements.
 3 |     statements = input_text.split("")[:-1]
 4 |     question = input_text.split("")[-1]
 5 |     print("Parsed statements:", statements)
 6 |     
 7 |     # Step 2: Create a Truth Map to keep track of the assumed truthfulness of each person mentioned in the statements. No truth values are assigned initially.
 8 |     truth_map = {statement.split()[0]: None for statement in statements}
 9 | 
10 |     # Step 3: Analyze Each Statement. For each statement, first output the statement number and the statement. identify the subject person (who makes the statement), the object person (who the statement is about), and the expected truth value (whether the object person is said to tell the truth or lie). Output the current statement under analysis along with the object person and the expected truth value for clarity.
11 |     for i, statement in enumerate(statements):
12 |         print(f"({i}): {statement}")
13 |         speaker, target_person, expected_truth_value_of_target_person = extract_person_and_truth_value(statement) # speaker - says - target_person - expected_truth_value_of_target_person
14 | 
15 |         print(f"{speaker} says : {target_person} - {expected_truth_value_of_target_person}")
16 |         print(f"Truth value of {target_person}: {truth_map[target_person]}")
17 | 
18 |         # Step 4: Update the Truth Map based on the analysis of each statement. If the statement's claim aligns with the current assumption about the object person's truthfulness, mark the subject person as truthful. Otherwise, mark them as untruthful. After each update, print the name of the person being updated, their determined truth value, and the updated truth map to track changes.
19 |         if truth_map[target_person] == None: # if the statement does not need to be checked
20 |             print(f"{expected_truth_value_of_target_person} matches {truth_map[target_person]}")
21 |             truth_map[target_person] = expected_truth_value_of_target_person
22 |         else:
23 |             print(f"{expected_truth_value_of_target_person} does not match {truth_map[target_person]}")
24 |             if truth_map[target_person] == expected_truth_value_of_target_person: # the speaker tells the truth
25 |                 truth_map[speaker] = True
26 |             else: # the speaker lies
27 |                 truth_map[speaker] = False
28 | 
29 |         print(f"Person to update: {speaker} - {truth_map[speaker]}")
30 | 
31 |         print(f"updated truth_map: {truth_map}")
32 |         print("\n\n")
33 |     
34 |     # Step 5: Using the completed truth map, determine the truthfulness of the person in the final question. Output the question for reference before making the determination.
35 |     print("question:", question)
36 |     target_person_in_question = extract_target_person_from_question(question)
37 |     target_truth_value = truth_map[target_person_in_question]
38 |     print(f"{target_person_in_question}: {target_truth_value})
39 | 
40 |     # Step 6: Based on the evaluation, output "Yes" if the person in the final question is determined to tell the truth, otherwise output "No".
41 |     answer = "Yes" if target_truth_value else "No"
42 | 
43 |     return answer


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/prompts/code_prompt_cot_edited_2.txt:
--------------------------------------------------------------------------------
 1 | def evaluate_boolean_word_problem(input_text):
 2 |     # Step 1: Divide the input text into individual statements and the final question. Output each statements.
 3 |     statements = extract_statements(input_text)
 4 |     question = extract_qeustion(input_text)
 5 |     print("Parsed statements:", statements)
 6 |     
 7 |     # Step 2: Create a Truth Map to keep track of the assumed truthfulness of each person mentioned in the statements. No truth values are assigned initially.
 8 |     truth_map = {statement.split()[0]: None for statement in statements}
 9 |     print(f"Initial truth map: {truth_map}")
10 | 
11 |     # Step 3: [Variables tracking] Analyze Each Statement. For each statement, first output the statement number and the statement. identify the subject person (who makes the statement), the object person (who the statement is about), and the expected truth value (whether the object person is said to tell the truth or lie). Output the current statement under analysis along with the object person and the expected truth value for clarity.
12 |     for i, statement in enumerate(statements):
13 |         print(f"({i}): {statement}")
14 |         speaker, target_person, expected_truth_value_of_target_person = extract_person_and_truth_value(statement) # speaker - says - target_person - expected_truth_value_of_target_person
15 | 
16 |         print(f"{speaker} says : {target_person} - {expected_truth_value_of_target_person}")
17 |         print(f"Truth value of {target_person}: {truth_map[target_person]}")
18 | 
19 |         # Step 4: Update the Truth Map based on the analysis of each statement. If the statement's claim aligns with the current assumption about the object person's truthfulness, mark the subject person as truthful. Otherwise, mark them as untruthful. After each update, print the name of the person being updated, their determined truth value, and the updated truth map to track changes.
20 |         if truth_map[target_person] == None: # if the statement does not need to be checked
21 |             print(f"{expected_truth_value_of_target_person} matches {truth_map[target_person]}")
22 |             truth_map[target_person] = expected_truth_value_of_target_person
23 |         else:
24 |             print(f"{expected_truth_value_of_target_person} does not match {truth_map[target_person]}")
25 |             if truth_map[target_person] == expected_truth_value_of_target_person: # the speaker tells the truth
26 |                 truth_map[speaker] = True
27 |             else: # the speaker lies
28 |                 truth_map[speaker] = False
29 | 
30 |         print(f"Person to update: {speaker} - {truth_map[speaker]}")
31 | 
32 |         print(f"updated truth_map: {truth_map}")
33 |         print("\n\n")
34 |     
35 |     # Step 5: Using the completed truth map, determine the truthfulness of the person in the final question. Output the question for reference before making the determination.
36 |     print("question:", question)
37 |     target_person_in_question = extract_target_person_from_question(question)
38 |     target_truth_value = truth_map[target_person_in_question]
39 |     print(f"{target_person_in_question}: {target_truth_value})
40 | 
41 |     # Step 6: Based on the evaluation, output "Yes" if the person in the final question is determined to tell the truth, otherwise output "No".
42 |     answer = "Yes" if target_truth_value else "No"
43 | 
44 |     return answer


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/prompts/explanation.txt:
--------------------------------------------------------------------------------
 1 | Explanation:
 2 | This task involves evaluating a series of statements regarding truthfulness and deceit to determine the veracity of a final claim. It is a logical puzzle that requires tracking the assertions about each individual's honesty and then applying this information to deduce a final conclusion.
 3 | 
 4 | Analyzing the Task:
 5 | 1. Statement Analysis: Each statement provides information about one person's opinion on another's truthfulness. These need to be carefully analyzed to understand their implications.
 6 | 
 7 | 2. Truth Mapping: A method to keep track of what is known about each person's truthfulness is essential. This can be handled with a dictionary (or map) where keys are individuals' names and values are their assumed truthfulness.
 8 | 
 9 | 3. Logical Deduction: Based on the statements, logical deductions must be made to update the truth map. This involves understanding the implications of each statement in the context of the existing information.
10 | 
11 | 4. Resolving the Final Query: The final step is to use the truth map to answer the specific question asked, based on the accumulated knowledge about each individual's truthfulness.
12 | 
13 | Constructing the Code Prompt:
14 | 1. Parsing Statements and Question: First, divide the input text into individual statements and the final question. This requires string manipulation.
15 | 
16 | 2. Creating a Truth Map: Initialize a map to keep track of each person's truthfulness status. Initially, the truthfulness of each individual might be unknown.
17 | 
18 | 3. [Important] Analyzing Statements: Iterate over each statement to extract the relevant information: who is making the statement, who it is about, and whether it asserts that the second person is truthful or deceitful.
19 | 
20 | 4. [Important] Updating the Truth Map: Use the information from each statement to update the truth map. This might involve setting or revising the truth status of one or more individuals.
21 | 
22 | 5. Evaluating the Final Question: Use the information in the truth map to determine the truthfulness of the person in question.
23 | 
24 | 6. Returning the Conclusion: Based on the evaluation, return 'Yes' if the person in the final question is determined to be truthful, or 'No' otherwise.


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/prompts/explanation_2.txt:
--------------------------------------------------------------------------------
 1 | This task involves evaluating a series of statements regarding truthfulness and deceit to determine the veracity of a final claim. It is a logical puzzle that requires tracking the assertions about each individual's honesty and then applying this information to deduce a final conclusion.
 2 | 
 3 | Analyzing the Task:
 4 | To answer the last person is truthful or not, initializing a truth map and updating it while going through the statements would be a great method. To do this, the variables tracking is needed to check the updates in the truth map while processing the each statement.
 5 | 
 6 | Constructing the Code Prompt:
 7 | 1. Parsing Statements and Question: First, divide the input text into individual statements and the final question. This requires string manipulation.
 8 | 
 9 | 2. Creating a Truth Map: Initialize a map to keep track of each person's truthfulness status. Initially, the truthfulness of each individual might be unknown.
10 | 
11 | 3. [Variables tracking] Analyzing Statements: Iterate over each statement to extract the relevant information: who is making the statement, who it is about, and whether it asserts that the second person is truthful or deceitful.
12 | 
13 | 4. Updating the Truth Map: Use the information from each statement to update the truth map. This might involve setting or revising the truth status of one or more individuals.
14 | 
15 | 5. Evaluating the Final Question: Use the information in the truth map to determine the truthfulness of the person in question.
16 | 
17 | 6. Returning the Conclusion: Based on the evaluation, return 'Yes' if the person in the final question is determined to be truthful, or 'No' otherwise.


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/prompts/nl_prompt_generation_example_prompt.txt:
--------------------------------------------------------------------------------
  1 | Generate an explanation, analyzation, and plan to generate code prompt for the last task considering the example task instances. Your plan should show enough intermediate reasoning steps towards the answer. Construct the plan as much as you can and describe the logic specifically. When constructing the plan for the code prompt, actively use 'if else statement' to take different reasoning paths based on the condition, 'loop' to efficiently process the repititive instructions, 'dictionary' to keep track of connections between important variables.
  2 | 
  3 | [Example 1]
  4 | Example task instances:
  5 | I have a cat, and a fish. How many animals do I have?
  6 | 
  7 | I have a head of broccoli, a carrot, a potato, a yam, and a cauliflower. How many vegetables do I have?
  8 | 
  9 | I have a microwave, and a bed. How many objects do I have?
 10 | 
 11 | Output Format:
 12 | A positive integer
 13 | 
 14 | Explanation:
 15 | This task involves identifying and counting specific types of objects (fruits or vegetables) from a mixed list. The primary challenge is correctly categorizing each item in the list and then tallying the counts of the desired categories.
 16 | 
 17 | Analyzing the Task:
 18 | 1. Object Categorization: Determining whether each item in the list is a fruit, a vegetable, or neither. This requires knowledge of common fruits and vegetables.
 19 | 
 20 | 2. Counting Objects: Accurately counting the number of items in each category. This may involve handling singular and plural forms, as well as numerical phrases (e.g., "two peaches").
 21 | 
 22 | 3. Handling Varied Input: The input text may vary in format and content, requiring flexible parsing to extract the necessary information.
 23 | 
 24 | Constructing the Code Prompt:
 25 | 1. Extracting Objects from Input: The first step is parsing the input text to create a list of objects. This requires string manipulation and possibly natural language processing techniques.
 26 | 
 27 | 2. Extracting the Question: Determine what category of items needs to be counted (e.g., fruits, vegetables) from the input text. 
 28 | 
 29 | 3. Initializing Count Variables: Set up a variable to keep track of the total count of the desired objects.
 30 | 
 31 | 4. [Important] Counting Each Object: Iterate over each object, determine if it belongs to the desired category (fruit or vegetable), and count it. Intermeditate count results shoud be logged with print() function. This involves:
 32 |    - Identifying singular and plural forms.
 33 |    - Recognizing and handling numerical phrases.
 34 |    - Keeping a running total of the count.
 35 | 
 36 | 5. Returning the Total Count: After processing all objects, return the total count as the answer.
 37 | 
 38 | [Example 2]
 39 | Example task instances:
 40 | Sort the following words alphabetically: List: pewter gross gujarati cortland diffusible nomograph stag broadcast galvanometer transition incestuous sketchbook scout larynx
 41 | 
 42 | Sort the following words alphabetically: List: portsmouth impalpable portrait gouda brainy succumb fatuous hoof lisbon erudite malaria enigma isaacson cony stronghold servomechanism
 43 | 
 44 | Sort the following words alphabetically: List: catawba almagest transfusable milt photolysis limpid gnome archenemy sagging councilwomen decrement jungian
 45 | 
 46 | Output Format:
 47 | A string of words seperated with a space
 48 | 
 49 | Explanation:
 50 | This task involves sorting a list of words alphabetically. It is a straightforward task that requires the application of standard sorting techniques.
 51 | 
 52 | Analyzing the Task:
 53 | 1. Word Extraction: Initially, the words need to be separated from the input text. 
 54 | 
 55 | 2. Alphabetical Sorting: The core of the task is to sort these words based on their alphabetical order.
 56 | 
 57 | 3. Returning the Sorted List: The final step is to output the list of words in the sorted order.
 58 | 
 59 | Constructing the Code Prompt:
 60 | 1. Splitting the Text into Words: First, extract each word from the input text. This can be achieved through string splitting based on spaces.
 61 | 
 62 | 2. Identifying Starting Letters: Though not strictly necessary for sorting, extracting the first letter of each word can help in understanding how the sorting will proceed.
 63 | 
 64 | 3. Sorting the Words: Use a standard sorting algorithm or function to sort the words alphabetically. In most programming languages, there are built-in functions that can handle this step efficiently.
 65 | 
 66 | 4. Outputting the Sorted List: Finally, return the list of words in their sorted order.
 67 | 
 68 | [Example 3]
 69 | Example task instances:
 70 | Alice, Bob, Claire, Dave, and Eve are friends and avid readers who occasionally trade books. At the start of the semester, they each buy one new book: Alice gets The Great Gatsby, Bob gets Frankenstein, Claire gets Moby Dick, Dave gets The Fellowship of the Ring, and Eve gets Catch-22.
 71 | As the semester proceeds, they start trading around the new books. First, Alice and Bob swap books. Then, Dave and Eve swap books. Then, Eve and Alice swap books. Then, Dave and Eve swap books. Finally, Claire and Dave swap books. At the end of the semester, Alice has
 72 | Options:
 73 | (A) The Great Gatsby
 74 | (B) Frankenstein
 75 | (C) Moby Dick
 76 | (D) The Fellowship of the Ring
 77 | (E) Catch-22
 78 | 
 79 | Alice, Bob, Claire, Dave, and Eve are playing a game. At the start of the game, they are each holding a ball: Alice has a black ball, Bob has a brown ball, Claire has a blue ball, Dave has a green ball, and Eve has a yellow ball.
 80 | As the game progresses, pairs of players trade balls. First, Eve and Claire swap balls. Then, Bob and Dave swap balls. Then, Eve and Alice swap balls. Then, Claire and Bob swap balls. Finally, Alice and Eve swap balls. At the end of the game, Claire has the
 81 | Options:
 82 | (A) black ball
 83 | (B) brown ball
 84 | (C) blue ball
 85 | (D) green ball
 86 | (E) yellow ball
 87 | 
 88 | Alice, Bob, Claire, Dave, and Eve are playing a game. At the start of the game, they are each holding a ball: Alice has a brown ball, Bob has a black ball, Claire has a red ball, Dave has a pink ball, and Eve has a blue ball.
 89 | As the game progresses, pairs of players trade balls. First, Claire and Bob swap balls. Then, Claire and Eve swap balls. Then, Bob and Claire swap balls. Then, Dave and Claire swap balls. Finally, Alice and Dave swap balls. At the end of the game, Eve has the
 90 | Options:
 91 | (A) brown ball
 92 | (B) black ball
 93 | (C) red ball
 94 | (D) pink ball
 95 | (E) blue ball
 96 | 
 97 | Output Format:
 98 | '(A)', '(B)', '(C)', ...
 99 | 
100 | Explanation:
101 | This task involves tracking the swapping of objects (like balls or dance partners) between individuals over a series of exchanges. The goal is to determine the final state of these objects after all swaps are completed. This requires careful tracking of each swap and updating the state of each object accordingly.
102 | 
103 | Analyzing the Task:
104 | 1. Tracking Initial States: The initial state of each object (who has what or who is with whom) needs to be recorded. 
105 | 
106 | 2. Understanding Swap Sequences: The sequence of swaps described in the text must be clearly understood and recorded. Each swap alters the state of the objects.
107 | 
108 | 3. Sequentially Updating States: For each swap, the states of the involved objects must be updated to reflect the exchange.
109 | 
110 | 4. Identifying the Final Query: After all swaps, the question usually asks for the final state of a particular object or individual.
111 | 
112 | Constructing the Code Prompt:
113 | 1. Identifying Initial State: Parse the input text to determine the initial state of all objects or individuals. This step sets the foundation for the entire process.
114 | 
115 | 2. Parsing Swap Sequences: Extract the sequence of swaps from the input text. Each swap will need to be processed in the order they occur.
116 | 
117 | 3. [Important] Executing Swaps: Iterate over each swap sequence and update the states of the objects or individuals involved in each swap. This is a crucial step that ensures the current state is always up-to-date.
118 | 
119 | 4. Extracting the Final Question: After processing all swaps, identify the specific question asked in the input text.
120 | 
121 | 5. Examining Options: Review and display the options provided in the input text for the final question.
122 | 
123 | 6. Determining the Answer: Based on the final state of the objects or individuals, select the correct option that answers the question.
124 | 
125 | [Example 4]
126 | Example task instances:
127 | Question: Delbert lies. Shalonda says Delbert tells the truth. Tamika says Shalonda lies. Sal says Tamika lies. Helene says Sal lies. Does Helene tell the truth?
128 | 
129 | Question: Delfina lies. Leda says Delfina lies. Fidel says Leda lies. Sal says Fidel tells the truth. Andree says Sal lies. Does Andree tell the truth?
130 | 
131 | Question: Willian lies. Raymond says Willian tells the truth. Rashida says Raymond lies. Gwenn says Rashida lies. Ka says Gwenn lies. Does Ka tell the truth?
132 | 
133 | Output Format:
134 | 'Yes' or 'No'
135 | 
136 | Explanation:


--------------------------------------------------------------------------------
/src/tasks/web_of_lies/scoring_prompt_template_code.txt:
--------------------------------------------------------------------------------
1 | {prompt}
2 | 
3 | input_text = "{input_text}"
4 | final_answer = evaluate_boolean_word_problem(input_text)
5 | print("Final answer:"+ final_answer)
6 | Generate the expected execution output (output from all print() functions) of the code. You don't have to actually run the code and do not care about 'not implemented error'.
7 | Expected output:
8 | 


--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | from datasets import load_dataset
  3 | from langchain.schema import AIMessage, HumanMessage, SystemMessage
  4 | from langchain_community.chat_models import ChatOpenAI
  5 | from langchain_community.llms import OpenAI
  6 | from tqdm import tqdm
  7 | import asyncio
  8 | import json
  9 | import numpy as np
 10 | import pandas as pd
 11 | import sys
 12 | import os
 13 | from tqdm.asyncio import tqdm as tqdm_async
 14 | sys.path.append(os.environ.get('PROJECTPATH'))
 15 | from src.tasks.geometric_shapes.helper import GeometricShapesHelper
 16 | from src.tasks.temporal_sequences.helper import TemporalSequencesHelper
 17 | from src.tasks.navigate.helper import NavigateHelper
 18 | from src.tasks.web_of_lies.helper import WebOfLiesHelper
 19 | from src.tasks.tracking_shuffled_objectives.helper import TrackingHelper
 20 | from src.tasks.dyck_languages.helper import DyckLanguagesHelper
 21 | from src.tasks.reasoning_about_colored_objects.helper import ColoredObjectsHelper
 22 | 
 23 | 
 24 | async def async_generate(llm, index, model_input, args, progress_tracker):
 25 |     while True:
 26 |         try:
 27 |             if type(llm) == OpenAI:
 28 |                 response = await llm.agenerate([model_input])
 29 |                 output = response.generations[0][0].text
 30 |             else:
 31 |                 response_future = asyncio.ensure_future(llm.agenerate([[HumanMessage(content=model_input)]]))
 32 |                 call_success = False
 33 |                 time_count = 0
 34 |                 while not call_success:
 35 |                     await asyncio.sleep(1)  # Check every second
 36 |                     time_count += 1
 37 |                     if response_future.done():
 38 |                         response = response_future.result()
 39 |                         call_success = True
 40 |                     elif (time_count > 30) and (progress_tracker['completed'] >= progress_tracker['dynamic_threshold']):
 41 |                         time_count = 0
 42 |                         response_future.cancel()
 43 |                         response_future = asyncio.ensure_future(llm.agenerate([[HumanMessage(content=model_input)]]))
 44 |                 
 45 |                 output = response.generations[0][0].text
 46 |             progress_tracker['completed'] += 1
 47 |             break
 48 |         except Exception as e:
 49 |             print(f"Exception occurred: {e}")
 50 |             response = None
 51 | 
 52 |     return index, output 
 53 | 
 54 | async def async_generate_compositional(llm, index, model_input, args, progress_tracker):
 55 |     cur_output = []
 56 |     max_iteration = 15
 57 |     iter_count = 0
 58 |     while True:
 59 |         if "Final answer" in "\n".join(cur_output):
 60 |             break
 61 |         cur_input = model_input+"\n".join(cur_output)
 62 |         while True:
 63 |             try:
 64 |                 if type(llm) == OpenAI:
 65 |                     response = await llm.agenerate([cur_input])
 66 |                     output = response.generations[0][0].text
 67 |                 else:
 68 |                     response_future = asyncio.ensure_future(llm.agenerate([[HumanMessage(content=cur_input)]]))
 69 |                     call_success = False
 70 |                     time_count = 0
 71 |                     while not call_success:
 72 |                         await asyncio.sleep(1)  # Check every second
 73 |                         time_count += 1
 74 |                         if response_future.done():
 75 |                             response = response_future.result()
 76 |                             call_success = True
 77 |                         elif (time_count > 30) and (progress_tracker['completed'] >= progress_tracker['dynamic_threshold']):
 78 |                             time_count = 0
 79 |                             response_future.cancel()
 80 |                             response_future = asyncio.ensure_future(llm.agenerate([[HumanMessage(content=cur_input)]]))
 81 |                     
 82 |                     output = response.generations[0][0].text
 83 |                 progress_tracker['completed'] += 1
 84 |                 break
 85 |             except Exception as e:
 86 |                 print(f"Exception occurred: {e}")
 87 |                 response = None
 88 |         iter_count += 1
 89 |         if iter_count == max_iteration:
 90 |             cur_output.append("Final answer:")
 91 |         cur_output.append(output)
 92 | 
 93 |     return index, "\n".join(cur_output)
 94 | 
 95 | async def generate_concurrently(llm, list_of_model_inputs, args):
 96 |     progress_tracker = {'completed': 0, 'dynamic_threshold': int(len(list_of_model_inputs) * 0.8)}
 97 |     if hasattr(args, 'compositional_inference') and args.compositional_inference:
 98 |         tasks = [
 99 |             async_generate_compositional(llm, i, mi, args, progress_tracker) for i, mi in enumerate(list_of_model_inputs)
100 |         ]
101 |     else:
102 |         tasks = [
103 |             async_generate(llm, i, mi, args, progress_tracker) for i, mi in enumerate(list_of_model_inputs)
104 |         ]
105 |     
106 |     results = []
107 |     for f in tqdm_async(asyncio.as_completed(tasks), total=len(tasks)):
108 |         result = await f
109 |         results.append(result)
110 | 
111 |     sorted_results = sorted(results, key=lambda x: x[0])
112 |     outputs = [output for index, output in sorted_results]
113 | 
114 |     return outputs
115 | 
116 | def select_best_k_prompts(
117 |     optimization_history,
118 |     max_num_prompts, # number of instruction-score pairs to be used in the optimization process
119 |     min_score_threshold,
120 |     args
121 | ):
122 |     """Generate the string that includes instruction-score pairs."""
123 |     # old_instructions_and_scores_str = ""
124 |     counted_propmts = []
125 |     unique_prompt_score_pair = []
126 |     for ps in optimization_history:
127 |         if (ps['prompt'] not in counted_propmts) and (ps['score'][args.score_type]>min_score_threshold):
128 |             unique_prompt_score_pair.append(ps)
129 |             counted_propmts.append(ps['prompt'])
130 | 
131 |     selected_prompts = sorted(
132 |         unique_prompt_score_pair, key=lambda x: x['score'][args.score_type]
133 |     )[-max_num_prompts:]
134 | 
135 | 
136 |     return selected_prompts
137 | 
138 | 
139 | async def calculate_score_for_optimized_prompt(llm, data, scoring_prompt, optimized_prompt, helper):
140 |     ''' evaluate an optimized instruction using scorer model '''
141 |     # construct model inputs using instances in evaluation set
142 |     list_of_model_inputs = [scoring_prompt.format(input_text=d['input_text'], prompt=optimized_prompt, function_name=helper.function_name) for d in data]
143 |     
144 |     outputs = await generate_concurrently(llm, list_of_model_inputs, helper.args)
145 |     result_score, individual_score = helper.evaluate_prediction(outputs)
146 | 
147 |     return result_score, individual_score, outputs, list_of_model_inputs
148 | 
149 | def parse_instruction(raw_instruction):
150 |     start_token, end_token = "<PROMPT>", "</PROMPT>"
151 |     start_index = raw_instruction.find(start_token) + len(start_token) if start_token in raw_instruction else 0
152 |     end_index = raw_instruction.find(end_token) if end_token in raw_instruction else len(raw_instruction)
153 |     return raw_instruction[start_index:end_index].strip()
154 | 
155 | 
156 | 
157 | def save_avg_step_scores(path, save_path, args):
158 |     with open(path, 'r') as f:
159 |         f1 = json.load(f)
160 |     df = pd.DataFrame(f1)
161 |     df[args.score_type] = [score[args.score_type] for score in df['score']]
162 |     avg_scores = df.groupby('step')[args.score_type].mean().to_dict()
163 |     print(avg_scores)
164 |     # avg_scores.pop(0)
165 |     with open(save_path, 'w') as f:
166 |         json.dump(avg_scores, f, indent=4)
167 | 
168 | LIST_OF_TASKS = [
169 |     'temporal_sequences',
170 |     'reasoning_about_colored_objects',
171 |     'dyck_languages',
172 |     'web_of_lies',
173 |     'geometric_shapes',
174 |     'navigate',
175 |     'tracking_shuffled_objectives',
176 | ]
177 | 
178 | helper_dict = {
179 |     "navigate": NavigateHelper,
180 |     "web_of_lies": WebOfLiesHelper,
181 |     "tracking_shuffled_objectives": TrackingHelper,
182 |     "dyck_languages": DyckLanguagesHelper,
183 |     "reasoning_about_colored_objects": ColoredObjectsHelper,
184 |     "temporal_sequences": TemporalSequencesHelper,
185 |     "geometric_shapes": GeometricShapesHelper,
186 | }
187 | 


--------------------------------------------------------------------------------