├── .gitignore ├── LICENSE ├── README.md ├── bbh ├── README.md ├── boolean_expressions.json ├── causal_judgement.json ├── date_understanding.json ├── disambiguation_qa.json ├── dyck_languages.json ├── formal_fallacies.json ├── geometric_shapes.json ├── hyperbaton.json ├── logical_deduction_five_objects.json ├── logical_deduction_seven_objects.json ├── logical_deduction_three_objects.json ├── movie_recommendation.json ├── multistep_arithmetic_two.json ├── navigate.json ├── object_counting.json ├── penguins_in_a_table.json ├── reasoning_about_colored_objects.json ├── ruin_names.json ├── salient_translation_error_detection.json ├── snarks.json ├── sports_understanding.json ├── temporal_sequences.json ├── tracking_shuffled_objects_five_objects.json ├── tracking_shuffled_objects_seven_objects.json ├── tracking_shuffled_objects_three_objects.json ├── web_of_lies.json └── word_sorting.json ├── code-davinci-002-outputs ├── .DS_Store ├── code-davinci-002-cot │ ├── boolean_expressions_few_shot_template_0-255000.json │ ├── boolean_expressions_few_shot_template_0-255000_eval_metrics.jsonl │ ├── causal_judgement_few_shot_template_0-255000.json │ ├── causal_judgement_few_shot_template_0-255000_eval_metrics.jsonl │ ├── date_understanding_few_shot_template_0-255000.json │ ├── date_understanding_few_shot_template_0-255000_eval_metrics.jsonl │ ├── disambiguation_qa_few_shot_template_0-255000.json │ ├── disambiguation_qa_few_shot_template_0-255000_eval_metrics.jsonl │ ├── dyck_languages_few_shot_template_0-255000.json │ ├── dyck_languages_few_shot_template_0-255000_eval_metrics.jsonl │ ├── formal_fallacies_few_shot_template_0-255000.json │ ├── formal_fallacies_few_shot_template_0-255000_eval_metrics.jsonl │ ├── geometric_shapes_few_shot_template_0-255000.json │ ├── geometric_shapes_few_shot_template_0-255000_eval_metrics.jsonl │ ├── hyperbaton_few_shot_template_0-255000.json │ ├── hyperbaton_few_shot_template_0-255000_eval_metrics.jsonl │ ├── logical_deduction_five_objects_few_shot_template_0-255000.json │ ├── logical_deduction_five_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── logical_deduction_seven_objects_few_shot_template_0-255000.json │ ├── logical_deduction_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── logical_deduction_three_objects_few_shot_template_0-255000.json │ ├── logical_deduction_three_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── movie_recommendation_few_shot_template_0-255000.json │ ├── movie_recommendation_few_shot_template_0-255000_eval_metrics.jsonl │ ├── multistep_arithmetic_two_few_shot_template_0-255000.json │ ├── multistep_arithmetic_two_few_shot_template_0-255000_eval_metrics.jsonl │ ├── navigate_few_shot_template_0-255000.json │ ├── navigate_few_shot_template_0-255000_eval_metrics.jsonl │ ├── object_counting_few_shot_template_0-255000.json │ ├── object_counting_few_shot_template_0-255000_eval_metrics.jsonl │ ├── penguins_in_a_table_few_shot_template_0-255000.json │ ├── penguins_in_a_table_few_shot_template_0-255000_eval_metrics.jsonl │ ├── reasoning_about_colored_objects_few_shot_template_0-255000.json │ ├── reasoning_about_colored_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── ruin_names_few_shot_template_0-255000.json │ ├── ruin_names_few_shot_template_0-255000_eval_metrics.jsonl │ ├── salient_translation_error_detection_few_shot_template_0-255000.json │ ├── salient_translation_error_detection_few_shot_template_0-255000_eval_metrics.jsonl │ ├── snarks_few_shot_template_0-255000.json │ ├── snarks_few_shot_template_0-255000_eval_metrics.jsonl │ ├── sports_understanding_few_shot_template_0-255000.json │ ├── sports_understanding_few_shot_template_0-255000_eval_metrics.jsonl │ ├── temporal_sequences_few_shot_template_0-255000.json │ ├── temporal_sequences_few_shot_template_0-255000_eval_metrics.jsonl │ ├── tracking_shuffled_objects_five_objects_few_shot_template_0-255000.json │ ├── tracking_shuffled_objects_five_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── tracking_shuffled_objects_seven_objects_few_shot_template_0-255000.json │ ├── tracking_shuffled_objects_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── tracking_shuffled_objects_three_objects_few_shot_template_0-255000.json │ ├── tracking_shuffled_objects_three_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── web_of_lies_few_shot_template_0-255000.json │ ├── web_of_lies_few_shot_template_0-255000_eval_metrics.jsonl │ ├── word_sorting_few_shot_template_0-255000.json │ └── word_sorting_few_shot_template_0-255000_eval_metrics.jsonl └── code-davinci-002-direct │ ├── boolean_expressions_few_shot_template_0-255000.json │ ├── boolean_expressions_few_shot_template_0-255000_eval_metrics.jsonl │ ├── causal_judgement_few_shot_template_0-255000.json │ ├── causal_judgement_few_shot_template_0-255000_eval_metrics.jsonl │ ├── date_understanding_few_shot_template_0-255000.json │ ├── date_understanding_few_shot_template_0-255000_eval_metrics.jsonl │ ├── disambiguation_qa_few_shot_template_0-255000.json │ ├── disambiguation_qa_few_shot_template_0-255000_eval_metrics.jsonl │ ├── dyck_languages_few_shot_template_0-255000.json │ ├── dyck_languages_few_shot_template_0-255000_eval_metrics.jsonl │ ├── formal_fallacies_few_shot_template_0-255000.json │ ├── formal_fallacies_few_shot_template_0-255000_eval_metrics.jsonl │ ├── geometric_shapes_few_shot_template_0-255000.json │ ├── geometric_shapes_few_shot_template_0-255000_eval_metrics.jsonl │ ├── hyperbaton_few_shot_template_0-255000.json │ ├── hyperbaton_few_shot_template_0-255000_eval_metrics.jsonl │ ├── logical_deduction_five_objects_few_shot_template_0-255000.json │ ├── logical_deduction_five_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── logical_deduction_seven_objects_few_shot_template_0-255000.json │ ├── logical_deduction_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── logical_deduction_three_objects_few_shot_template_0-255000.json │ ├── logical_deduction_three_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── movie_recommendation_few_shot_template_0-255000.json │ ├── movie_recommendation_few_shot_template_0-255000_eval_metrics.jsonl │ ├── multistep_arithmetic_two_few_shot_template_0-255000.json │ ├── multistep_arithmetic_two_few_shot_template_0-255000_eval_metrics.jsonl │ ├── navigate_few_shot_template_0-255000.json │ ├── navigate_few_shot_template_0-255000_eval_metrics.jsonl │ ├── object_counting_few_shot_template_0-255000.json │ ├── object_counting_few_shot_template_0-255000_eval_metrics.jsonl │ ├── penguins_in_a_table_few_shot_template_0-255000.json │ ├── penguins_in_a_table_few_shot_template_0-255000_eval_metrics.jsonl │ ├── reasoning_about_colored_objects_few_shot_template_0-255000.json │ ├── reasoning_about_colored_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── ruin_names_few_shot_template_0-255000.json │ ├── ruin_names_few_shot_template_0-255000_eval_metrics.jsonl │ ├── salient_translation_error_detection_few_shot_template_0-255000.json │ ├── salient_translation_error_detection_few_shot_template_0-255000_eval_metrics.jsonl │ ├── snarks_few_shot_template_0-255000.json │ ├── snarks_few_shot_template_0-255000_eval_metrics.jsonl │ ├── sports_understanding_few_shot_template_0-255000.json │ ├── sports_understanding_few_shot_template_0-255000_eval_metrics.jsonl │ ├── temporal_sequences_few_shot_template_0-255000.json │ ├── temporal_sequences_few_shot_template_0-255000_eval_metrics.jsonl │ ├── tracking_shuffled_objects_five_objects_few_shot_template_0-255000.json │ ├── tracking_shuffled_objects_five_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── tracking_shuffled_objects_seven_objects_few_shot_template_0-255000.json │ ├── tracking_shuffled_objects_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── tracking_shuffled_objects_three_objects_few_shot_template_0-255000.json │ ├── tracking_shuffled_objects_three_objects_few_shot_template_0-255000_eval_metrics.jsonl │ ├── web_of_lies_few_shot_template_0-255000.json │ ├── web_of_lies_few_shot_template_0-255000_eval_metrics.jsonl │ ├── word_sorting_few_shot_template_0-255000.json │ └── word_sorting_few_shot_template_0-255000_eval_metrics.jsonl ├── cot-prompts ├── boolean_expressions.txt ├── causal_judgement.txt ├── date_understanding.txt ├── disambiguation_qa.txt ├── dyck_languages.txt ├── formal_fallacies.txt ├── geometric_shapes.txt ├── hyperbaton.txt ├── logical_deduction_five_objects.txt ├── logical_deduction_seven_objects.txt ├── logical_deduction_three_objects.txt ├── movie_recommendation.txt ├── multistep_arithmetic_two.txt ├── navigate.txt ├── object_counting.txt ├── penguins_in_a_table.txt ├── reasoning_about_colored_objects.txt ├── ruin_names.txt ├── salient_translation_error_detection.txt ├── snarks.txt ├── sports_understanding.txt ├── temporal_sequences.txt ├── tracking_shuffled_objects_five_objects.txt ├── tracking_shuffled_objects_seven_objects.txt ├── tracking_shuffled_objects_three_objects.txt ├── web_of_lies.txt └── word_sorting.txt └── figures ├── .DS_Store ├── bbh-model-outputs.png ├── bbh-results.png └── bbh-setup.png /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/README.md -------------------------------------------------------------------------------- /bbh/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/README.md -------------------------------------------------------------------------------- /bbh/boolean_expressions.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/boolean_expressions.json -------------------------------------------------------------------------------- /bbh/causal_judgement.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/causal_judgement.json -------------------------------------------------------------------------------- /bbh/date_understanding.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/date_understanding.json -------------------------------------------------------------------------------- /bbh/disambiguation_qa.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/disambiguation_qa.json -------------------------------------------------------------------------------- /bbh/dyck_languages.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/dyck_languages.json -------------------------------------------------------------------------------- /bbh/formal_fallacies.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/formal_fallacies.json -------------------------------------------------------------------------------- /bbh/geometric_shapes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/geometric_shapes.json -------------------------------------------------------------------------------- /bbh/hyperbaton.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/hyperbaton.json -------------------------------------------------------------------------------- /bbh/logical_deduction_five_objects.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/logical_deduction_five_objects.json -------------------------------------------------------------------------------- /bbh/logical_deduction_seven_objects.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/logical_deduction_seven_objects.json -------------------------------------------------------------------------------- /bbh/logical_deduction_three_objects.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/logical_deduction_three_objects.json -------------------------------------------------------------------------------- /bbh/movie_recommendation.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/movie_recommendation.json -------------------------------------------------------------------------------- /bbh/multistep_arithmetic_two.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/multistep_arithmetic_two.json -------------------------------------------------------------------------------- /bbh/navigate.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/navigate.json -------------------------------------------------------------------------------- /bbh/object_counting.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/object_counting.json -------------------------------------------------------------------------------- /bbh/penguins_in_a_table.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/penguins_in_a_table.json -------------------------------------------------------------------------------- /bbh/reasoning_about_colored_objects.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/reasoning_about_colored_objects.json -------------------------------------------------------------------------------- /bbh/ruin_names.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/ruin_names.json -------------------------------------------------------------------------------- /bbh/salient_translation_error_detection.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/salient_translation_error_detection.json -------------------------------------------------------------------------------- /bbh/snarks.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/snarks.json -------------------------------------------------------------------------------- /bbh/sports_understanding.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/sports_understanding.json -------------------------------------------------------------------------------- /bbh/temporal_sequences.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/temporal_sequences.json -------------------------------------------------------------------------------- /bbh/tracking_shuffled_objects_five_objects.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/tracking_shuffled_objects_five_objects.json -------------------------------------------------------------------------------- /bbh/tracking_shuffled_objects_seven_objects.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/tracking_shuffled_objects_seven_objects.json -------------------------------------------------------------------------------- /bbh/tracking_shuffled_objects_three_objects.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/tracking_shuffled_objects_three_objects.json -------------------------------------------------------------------------------- /bbh/web_of_lies.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/web_of_lies.json -------------------------------------------------------------------------------- /bbh/word_sorting.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/bbh/word_sorting.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/.DS_Store -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/boolean_expressions_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/boolean_expressions_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/boolean_expressions_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/boolean_expressions_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/causal_judgement_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/causal_judgement_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/causal_judgement_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/causal_judgement_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/date_understanding_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/date_understanding_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/date_understanding_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/date_understanding_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/disambiguation_qa_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/disambiguation_qa_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/disambiguation_qa_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/disambiguation_qa_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/dyck_languages_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/dyck_languages_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/dyck_languages_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/dyck_languages_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/formal_fallacies_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/formal_fallacies_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/formal_fallacies_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/formal_fallacies_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/geometric_shapes_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/geometric_shapes_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/geometric_shapes_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/geometric_shapes_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/hyperbaton_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/hyperbaton_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/hyperbaton_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/hyperbaton_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_five_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_five_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_five_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_five_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_seven_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_seven_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_three_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_three_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_three_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/logical_deduction_three_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/movie_recommendation_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/movie_recommendation_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/movie_recommendation_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/movie_recommendation_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/multistep_arithmetic_two_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/multistep_arithmetic_two_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/multistep_arithmetic_two_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/multistep_arithmetic_two_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/navigate_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/navigate_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/navigate_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/navigate_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/object_counting_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/object_counting_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/object_counting_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/object_counting_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/penguins_in_a_table_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/penguins_in_a_table_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/penguins_in_a_table_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/penguins_in_a_table_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/reasoning_about_colored_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/reasoning_about_colored_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/reasoning_about_colored_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/reasoning_about_colored_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/ruin_names_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/ruin_names_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/ruin_names_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/ruin_names_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/salient_translation_error_detection_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/salient_translation_error_detection_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/salient_translation_error_detection_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/salient_translation_error_detection_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/snarks_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/snarks_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/snarks_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/snarks_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/sports_understanding_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/sports_understanding_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/sports_understanding_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/sports_understanding_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/temporal_sequences_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/temporal_sequences_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/temporal_sequences_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/temporal_sequences_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_five_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_five_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_five_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_five_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_seven_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_seven_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_three_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_three_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_three_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/tracking_shuffled_objects_three_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/web_of_lies_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/web_of_lies_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/web_of_lies_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/web_of_lies_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/word_sorting_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/word_sorting_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-cot/word_sorting_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-cot/word_sorting_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/boolean_expressions_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/boolean_expressions_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/boolean_expressions_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/boolean_expressions_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/causal_judgement_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/causal_judgement_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/causal_judgement_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/causal_judgement_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/date_understanding_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/date_understanding_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/date_understanding_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/date_understanding_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/disambiguation_qa_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/disambiguation_qa_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/disambiguation_qa_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/disambiguation_qa_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/dyck_languages_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/dyck_languages_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/dyck_languages_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/dyck_languages_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/formal_fallacies_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/formal_fallacies_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/formal_fallacies_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/formal_fallacies_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/geometric_shapes_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/geometric_shapes_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/geometric_shapes_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/geometric_shapes_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/hyperbaton_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/hyperbaton_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/hyperbaton_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/hyperbaton_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_five_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_five_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_five_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_five_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_seven_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_seven_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_three_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_three_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_three_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/logical_deduction_three_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/movie_recommendation_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/movie_recommendation_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/movie_recommendation_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/movie_recommendation_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/multistep_arithmetic_two_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/multistep_arithmetic_two_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/multistep_arithmetic_two_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/multistep_arithmetic_two_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/navigate_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/navigate_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/navigate_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/navigate_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/object_counting_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/object_counting_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/object_counting_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/object_counting_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/penguins_in_a_table_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/penguins_in_a_table_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/penguins_in_a_table_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/penguins_in_a_table_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/reasoning_about_colored_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/reasoning_about_colored_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/reasoning_about_colored_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/reasoning_about_colored_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/ruin_names_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/ruin_names_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/ruin_names_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/ruin_names_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/salient_translation_error_detection_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/salient_translation_error_detection_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/salient_translation_error_detection_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/salient_translation_error_detection_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/snarks_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/snarks_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/snarks_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/snarks_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/sports_understanding_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/sports_understanding_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/sports_understanding_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/sports_understanding_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/temporal_sequences_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/temporal_sequences_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/temporal_sequences_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/temporal_sequences_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_five_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_five_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_five_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_five_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_seven_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_seven_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_seven_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_three_objects_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_three_objects_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_three_objects_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/tracking_shuffled_objects_three_objects_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/web_of_lies_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/web_of_lies_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/web_of_lies_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/web_of_lies_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/word_sorting_few_shot_template_0-255000.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/word_sorting_few_shot_template_0-255000.json -------------------------------------------------------------------------------- /code-davinci-002-outputs/code-davinci-002-direct/word_sorting_few_shot_template_0-255000_eval_metrics.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/code-davinci-002-outputs/code-davinci-002-direct/word_sorting_few_shot_template_0-255000_eval_metrics.jsonl -------------------------------------------------------------------------------- /cot-prompts/boolean_expressions.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/boolean_expressions.txt -------------------------------------------------------------------------------- /cot-prompts/causal_judgement.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/causal_judgement.txt -------------------------------------------------------------------------------- /cot-prompts/date_understanding.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/date_understanding.txt -------------------------------------------------------------------------------- /cot-prompts/disambiguation_qa.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/disambiguation_qa.txt -------------------------------------------------------------------------------- /cot-prompts/dyck_languages.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/dyck_languages.txt -------------------------------------------------------------------------------- /cot-prompts/formal_fallacies.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/formal_fallacies.txt -------------------------------------------------------------------------------- /cot-prompts/geometric_shapes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/geometric_shapes.txt -------------------------------------------------------------------------------- /cot-prompts/hyperbaton.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/hyperbaton.txt -------------------------------------------------------------------------------- /cot-prompts/logical_deduction_five_objects.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/logical_deduction_five_objects.txt -------------------------------------------------------------------------------- /cot-prompts/logical_deduction_seven_objects.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/logical_deduction_seven_objects.txt -------------------------------------------------------------------------------- /cot-prompts/logical_deduction_three_objects.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/logical_deduction_three_objects.txt -------------------------------------------------------------------------------- /cot-prompts/movie_recommendation.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/movie_recommendation.txt -------------------------------------------------------------------------------- /cot-prompts/multistep_arithmetic_two.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/multistep_arithmetic_two.txt -------------------------------------------------------------------------------- /cot-prompts/navigate.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/navigate.txt -------------------------------------------------------------------------------- /cot-prompts/object_counting.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/object_counting.txt -------------------------------------------------------------------------------- /cot-prompts/penguins_in_a_table.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/penguins_in_a_table.txt -------------------------------------------------------------------------------- /cot-prompts/reasoning_about_colored_objects.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/reasoning_about_colored_objects.txt -------------------------------------------------------------------------------- /cot-prompts/ruin_names.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/ruin_names.txt -------------------------------------------------------------------------------- /cot-prompts/salient_translation_error_detection.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/salient_translation_error_detection.txt -------------------------------------------------------------------------------- /cot-prompts/snarks.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/snarks.txt -------------------------------------------------------------------------------- /cot-prompts/sports_understanding.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/sports_understanding.txt -------------------------------------------------------------------------------- /cot-prompts/temporal_sequences.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/temporal_sequences.txt -------------------------------------------------------------------------------- /cot-prompts/tracking_shuffled_objects_five_objects.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/tracking_shuffled_objects_five_objects.txt -------------------------------------------------------------------------------- /cot-prompts/tracking_shuffled_objects_seven_objects.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/tracking_shuffled_objects_seven_objects.txt -------------------------------------------------------------------------------- /cot-prompts/tracking_shuffled_objects_three_objects.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/tracking_shuffled_objects_three_objects.txt -------------------------------------------------------------------------------- /cot-prompts/web_of_lies.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/web_of_lies.txt -------------------------------------------------------------------------------- /cot-prompts/word_sorting.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/cot-prompts/word_sorting.txt -------------------------------------------------------------------------------- /figures/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/figures/.DS_Store -------------------------------------------------------------------------------- /figures/bbh-model-outputs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/figures/bbh-model-outputs.png -------------------------------------------------------------------------------- /figures/bbh-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/figures/bbh-results.png -------------------------------------------------------------------------------- /figures/bbh-setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/HEAD/figures/bbh-setup.png --------------------------------------------------------------------------------