├── LICENSE ├── README.md ├── classifier ├── classifier_prompts.py └── reflection_classifier_annotations.csv ├── prompts ├── bbh_adv.py ├── cruxeval_i_adv.py ├── cruxeval_o_adv.py └── gsm8k_adv.py ├── results ├── experiment_results_qwen_250328_final_public.csv ├── experiment_results_self_reflection_250328_final_public.csv ├── experiment_results_situational_reflection_250328_final_public.csv ├── plot_results.py ├── plots_qwen │ └── situational_qwen2.5_all_tasks.png ├── plots_self │ ├── self_reflect_olmo2_bbh_adv_accurate_expl_reflection.png │ ├── self_reflect_olmo2_bbh_adv_accurate_impl_reflection.png │ ├── self_reflect_olmo2_bbh_adv_correlation.png │ ├── self_reflect_olmo2_bbh_adv_exact_match_get-answer.png │ ├── self_reflect_olmo2_bbh_adv_reflection.png │ ├── self_reflect_olmo2_cruxeval_i_adv_accurate_expl_reflection.png │ ├── self_reflect_olmo2_cruxeval_i_adv_accurate_impl_reflection.png │ ├── self_reflect_olmo2_cruxeval_i_adv_correlation.png │ ├── self_reflect_olmo2_cruxeval_i_adv_pass_1_create_test.png │ ├── self_reflect_olmo2_cruxeval_i_adv_reflection.png │ ├── self_reflect_olmo2_cruxeval_o_adv_accurate_expl_reflection.png │ ├── self_reflect_olmo2_cruxeval_o_adv_accurate_impl_reflection.png │ ├── self_reflect_olmo2_cruxeval_o_adv_correlation.png │ ├── self_reflect_olmo2_cruxeval_o_adv_pass_1_create_test.png │ ├── self_reflect_olmo2_cruxeval_o_adv_reflection.png │ ├── self_reflect_olmo2_gsm8k-platinum_adv_accurate_expl_reflection.png │ ├── self_reflect_olmo2_gsm8k-platinum_adv_accurate_impl_reflection.png │ ├── self_reflect_olmo2_gsm8k-platinum_adv_correlation.png │ ├── self_reflect_olmo2_gsm8k-platinum_adv_exact_match_flexible-extract.png │ ├── self_reflect_olmo2_gsm8k-platinum_adv_reflection.png │ ├── self_reflect_olmo2_gsm8k_adv_accurate_expl_reflection.png │ ├── self_reflect_olmo2_gsm8k_adv_accurate_impl_reflection.png │ ├── self_reflect_olmo2_gsm8k_adv_correlation.png │ ├── self_reflect_olmo2_gsm8k_adv_exact_match_flexible-extract.png │ ├── self_reflect_olmo2_gsm8k_adv_reflection.png │ ├── self_reflect_olmo2_triviaqa_adv_accurate_expl_reflection.png │ ├── self_reflect_olmo2_triviaqa_adv_accurate_impl_reflection.png │ ├── self_reflect_olmo2_triviaqa_adv_answer_exists_none.png │ ├── self_reflect_olmo2_triviaqa_adv_correlation.png │ ├── self_reflect_olmo2_triviaqa_adv_exact_match_none.png │ └── self_reflect_olmo2_triviaqa_adv_reflection.png └── plots_situational │ ├── situational_olmo2_bbh_adv_accurate_expl_reflection.png │ ├── situational_olmo2_bbh_adv_accurate_impl_reflection.png │ ├── situational_olmo2_bbh_adv_correlation.png │ ├── situational_olmo2_bbh_adv_exact_match_get-answer.png │ ├── situational_olmo2_bbh_adv_reflection.png │ ├── situational_olmo2_cruxeval_i_adv_accurate_expl_reflection.png │ ├── situational_olmo2_cruxeval_i_adv_accurate_impl_reflection.png │ ├── situational_olmo2_cruxeval_i_adv_correlation.png │ ├── situational_olmo2_cruxeval_i_adv_pass_1_create_test.png │ ├── situational_olmo2_cruxeval_i_adv_reflection.png │ ├── situational_olmo2_cruxeval_o_adv_accurate_expl_reflection.png │ ├── situational_olmo2_cruxeval_o_adv_accurate_impl_reflection.png │ ├── situational_olmo2_cruxeval_o_adv_correlation.png │ ├── situational_olmo2_cruxeval_o_adv_pass_1_create_test.png │ ├── situational_olmo2_cruxeval_o_adv_reflection.png │ ├── situational_olmo2_gsm8k-platinum_adv_accurate_expl_reflection.png │ ├── situational_olmo2_gsm8k-platinum_adv_accurate_impl_reflection.png │ ├── situational_olmo2_gsm8k-platinum_adv_correlation.png │ ├── situational_olmo2_gsm8k-platinum_adv_exact_match_flexible-extract.png │ ├── situational_olmo2_gsm8k-platinum_adv_reflection.png │ ├── situational_olmo2_gsm8k_adv_accurate_expl_reflection.png │ ├── situational_olmo2_gsm8k_adv_accurate_impl_reflection.png │ ├── situational_olmo2_gsm8k_adv_correlation.png │ ├── situational_olmo2_gsm8k_adv_exact_match_flexible-extract.png │ ├── situational_olmo2_gsm8k_adv_reflection.png │ ├── situational_olmo2_triviaqa_adv_accurate_expl_reflection.png │ ├── situational_olmo2_triviaqa_adv_accurate_impl_reflection.png │ ├── situational_olmo2_triviaqa_adv_answer_exists_none.png │ ├── situational_olmo2_triviaqa_adv_correlation.png │ ├── situational_olmo2_triviaqa_adv_exact_match_none.png │ └── situational_olmo2_triviaqa_adv_reflection.png └── tasks ├── bbh_adv ├── __pycache__ │ └── _utils.cpython-310.pyc ├── _adv_template_yaml ├── _bbh_adv.yaml ├── _utils.py ├── boolean_expressions.yaml ├── causal_judgement.yaml ├── date_understanding.yaml ├── disambiguation_qa.yaml ├── dyck_languages.yaml ├── formal_fallacies.yaml ├── geometric_shapes.yaml ├── hyperbaton.yaml ├── logical_deduction_five_objects.yaml ├── logical_deduction_seven_objects.yaml ├── logical_deduction_three_objects.yaml ├── movie_recommendation.yaml ├── multistep_arithmetic_two.yaml ├── navigate.yaml ├── object_counting.yaml ├── penguins_in_a_table.yaml ├── reasoning_about_colored_objects.yaml ├── ruin_names.yaml ├── salient_translation_error_detection.yaml ├── snarks.yaml ├── sports_understanding.yaml ├── temporal_sequences.yaml ├── tracking_shuffled_objects_five_objects.yaml ├── tracking_shuffled_objects_seven_objects.yaml ├── tracking_shuffled_objects_three_objects.yaml ├── web_of_lies.yaml └── word_sorting.yaml ├── cruxeval_i_adv ├── __pycache__ │ └── utils.cpython-310.pyc ├── cruxeval_i_adv.yaml └── utils.py ├── cruxeval_o_adv ├── __pycache__ │ └── utils.cpython-310.pyc ├── cruxeval_o_adv.yaml └── utils.py ├── gsm8k-platinum_adv ├── __pycache__ │ └── preprocess_gsm8k-platinum_adv.cpython-310.pyc ├── gsm8k-platinum_adv.yaml └── preprocess_gsm8k-platinum_adv.py ├── gsm8k_adv ├── __pycache__ │ └── preprocess_gsm8k_adv.cpython-310.pyc ├── gsm8k_adv.yaml └── preprocess_gsm8k_adv.py └── triviaqa_adv ├── __pycache__ └── preprocess.cpython-310.pyc ├── preprocess.py └── triviaqa.yaml /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/README.md -------------------------------------------------------------------------------- /classifier/classifier_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/classifier/classifier_prompts.py -------------------------------------------------------------------------------- /classifier/reflection_classifier_annotations.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/classifier/reflection_classifier_annotations.csv -------------------------------------------------------------------------------- /prompts/bbh_adv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/prompts/bbh_adv.py -------------------------------------------------------------------------------- /prompts/cruxeval_i_adv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/prompts/cruxeval_i_adv.py -------------------------------------------------------------------------------- /prompts/cruxeval_o_adv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/prompts/cruxeval_o_adv.py -------------------------------------------------------------------------------- /prompts/gsm8k_adv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/prompts/gsm8k_adv.py -------------------------------------------------------------------------------- /results/experiment_results_qwen_250328_final_public.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/experiment_results_qwen_250328_final_public.csv -------------------------------------------------------------------------------- /results/experiment_results_self_reflection_250328_final_public.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/experiment_results_self_reflection_250328_final_public.csv -------------------------------------------------------------------------------- /results/experiment_results_situational_reflection_250328_final_public.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/experiment_results_situational_reflection_250328_final_public.csv -------------------------------------------------------------------------------- /results/plot_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plot_results.py -------------------------------------------------------------------------------- /results/plots_qwen/situational_qwen2.5_all_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_qwen/situational_qwen2.5_all_tasks.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_bbh_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_bbh_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_bbh_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_bbh_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_bbh_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_bbh_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_bbh_adv_exact_match_get-answer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_bbh_adv_exact_match_get-answer.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_bbh_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_bbh_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_i_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_i_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_i_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_i_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_i_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_i_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_i_adv_pass_1_create_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_i_adv_pass_1_create_test.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_i_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_i_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_o_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_o_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_o_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_o_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_o_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_o_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_o_adv_pass_1_create_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_o_adv_pass_1_create_test.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_cruxeval_o_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_cruxeval_o_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_exact_match_flexible-extract.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_exact_match_flexible-extract.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k-platinum_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k_adv_exact_match_flexible-extract.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k_adv_exact_match_flexible-extract.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_gsm8k_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_gsm8k_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_triviaqa_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_triviaqa_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_triviaqa_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_triviaqa_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_triviaqa_adv_answer_exists_none.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_triviaqa_adv_answer_exists_none.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_triviaqa_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_triviaqa_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_triviaqa_adv_exact_match_none.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_triviaqa_adv_exact_match_none.png -------------------------------------------------------------------------------- /results/plots_self/self_reflect_olmo2_triviaqa_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_self/self_reflect_olmo2_triviaqa_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_bbh_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_bbh_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_bbh_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_bbh_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_bbh_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_bbh_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_bbh_adv_exact_match_get-answer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_bbh_adv_exact_match_get-answer.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_bbh_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_bbh_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_i_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_i_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_i_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_i_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_i_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_i_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_i_adv_pass_1_create_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_i_adv_pass_1_create_test.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_i_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_i_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_o_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_o_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_o_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_o_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_o_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_o_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_o_adv_pass_1_create_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_o_adv_pass_1_create_test.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_cruxeval_o_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_cruxeval_o_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k-platinum_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k-platinum_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k-platinum_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k-platinum_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k-platinum_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k-platinum_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k-platinum_adv_exact_match_flexible-extract.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k-platinum_adv_exact_match_flexible-extract.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k-platinum_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k-platinum_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k_adv_exact_match_flexible-extract.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k_adv_exact_match_flexible-extract.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_gsm8k_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_gsm8k_adv_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_triviaqa_adv_accurate_expl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_triviaqa_adv_accurate_expl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_triviaqa_adv_accurate_impl_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_triviaqa_adv_accurate_impl_reflection.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_triviaqa_adv_answer_exists_none.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_triviaqa_adv_answer_exists_none.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_triviaqa_adv_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_triviaqa_adv_correlation.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_triviaqa_adv_exact_match_none.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_triviaqa_adv_exact_match_none.png -------------------------------------------------------------------------------- /results/plots_situational/situational_olmo2_triviaqa_adv_reflection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/results/plots_situational/situational_olmo2_triviaqa_adv_reflection.png -------------------------------------------------------------------------------- /tasks/bbh_adv/__pycache__/_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/__pycache__/_utils.cpython-310.pyc -------------------------------------------------------------------------------- /tasks/bbh_adv/_adv_template_yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/_adv_template_yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/_bbh_adv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/_bbh_adv.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/_utils.py -------------------------------------------------------------------------------- /tasks/bbh_adv/boolean_expressions.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/boolean_expressions.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/causal_judgement.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/causal_judgement.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/date_understanding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/date_understanding.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/disambiguation_qa.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/disambiguation_qa.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/dyck_languages.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/dyck_languages.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/formal_fallacies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/formal_fallacies.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/geometric_shapes.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/geometric_shapes.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/hyperbaton.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/hyperbaton.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/logical_deduction_five_objects.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/logical_deduction_five_objects.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/logical_deduction_seven_objects.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/logical_deduction_seven_objects.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/logical_deduction_three_objects.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/logical_deduction_three_objects.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/movie_recommendation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/movie_recommendation.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/multistep_arithmetic_two.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/multistep_arithmetic_two.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/navigate.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/navigate.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/object_counting.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/object_counting.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/penguins_in_a_table.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/penguins_in_a_table.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/reasoning_about_colored_objects.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/reasoning_about_colored_objects.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/ruin_names.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/ruin_names.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/salient_translation_error_detection.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/salient_translation_error_detection.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/snarks.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/snarks.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/sports_understanding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/sports_understanding.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/temporal_sequences.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/temporal_sequences.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/tracking_shuffled_objects_five_objects.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/tracking_shuffled_objects_five_objects.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/tracking_shuffled_objects_seven_objects.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/tracking_shuffled_objects_seven_objects.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/tracking_shuffled_objects_three_objects.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/tracking_shuffled_objects_three_objects.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/web_of_lies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/web_of_lies.yaml -------------------------------------------------------------------------------- /tasks/bbh_adv/word_sorting.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/bbh_adv/word_sorting.yaml -------------------------------------------------------------------------------- /tasks/cruxeval_i_adv/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/cruxeval_i_adv/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /tasks/cruxeval_i_adv/cruxeval_i_adv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/cruxeval_i_adv/cruxeval_i_adv.yaml -------------------------------------------------------------------------------- /tasks/cruxeval_i_adv/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/cruxeval_i_adv/utils.py -------------------------------------------------------------------------------- /tasks/cruxeval_o_adv/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/cruxeval_o_adv/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /tasks/cruxeval_o_adv/cruxeval_o_adv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/cruxeval_o_adv/cruxeval_o_adv.yaml -------------------------------------------------------------------------------- /tasks/cruxeval_o_adv/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/cruxeval_o_adv/utils.py -------------------------------------------------------------------------------- /tasks/gsm8k-platinum_adv/__pycache__/preprocess_gsm8k-platinum_adv.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/gsm8k-platinum_adv/__pycache__/preprocess_gsm8k-platinum_adv.cpython-310.pyc -------------------------------------------------------------------------------- /tasks/gsm8k-platinum_adv/gsm8k-platinum_adv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/gsm8k-platinum_adv/gsm8k-platinum_adv.yaml -------------------------------------------------------------------------------- /tasks/gsm8k-platinum_adv/preprocess_gsm8k-platinum_adv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/gsm8k-platinum_adv/preprocess_gsm8k-platinum_adv.py -------------------------------------------------------------------------------- /tasks/gsm8k_adv/__pycache__/preprocess_gsm8k_adv.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/gsm8k_adv/__pycache__/preprocess_gsm8k_adv.cpython-310.pyc -------------------------------------------------------------------------------- /tasks/gsm8k_adv/gsm8k_adv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/gsm8k_adv/gsm8k_adv.yaml -------------------------------------------------------------------------------- /tasks/gsm8k_adv/preprocess_gsm8k_adv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/gsm8k_adv/preprocess_gsm8k_adv.py -------------------------------------------------------------------------------- /tasks/triviaqa_adv/__pycache__/preprocess.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/triviaqa_adv/__pycache__/preprocess.cpython-310.pyc -------------------------------------------------------------------------------- /tasks/triviaqa_adv/preprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/triviaqa_adv/preprocess.py -------------------------------------------------------------------------------- /tasks/triviaqa_adv/triviaqa.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Essential-AI/reflection/HEAD/tasks/triviaqa_adv/triviaqa.yaml --------------------------------------------------------------------------------