├── .gitignore ├── README.md ├── assets └── preview.png ├── eval.py ├── examples ├── bin_1.txt ├── bin_2.txt ├── bin_3.txt ├── bin_4.txt ├── bin_5.txt ├── select_swap_words.py └── word_pairs_lowbins.txt ├── logs ├── basic │ ├── claude-3 │ │ ├── basic10_bin1_temp=0.0.json │ │ ├── basic10_bin2_temp=0.0.json │ │ ├── basic10_bin3_temp=0.0.json │ │ ├── basic10_bin4_temp=0.0.json │ │ ├── basic10_bin5_temp=0.0.json │ │ ├── basic11_bin1_temp=0.0.json │ │ ├── basic11_bin2_temp=0.0.json │ │ ├── basic11_bin3_temp=0.0.json │ │ ├── basic11_bin4_temp=0.0.json │ │ ├── basic11_bin5_temp=0.0.json │ │ ├── basic12_bin1_temp=0.0.json │ │ ├── basic12_bin2_temp=0.0.json │ │ ├── basic12_bin3_temp=0.0.json │ │ ├── basic12_bin4_temp=0.0.json │ │ ├── basic12_bin5_temp=0.0.json │ │ ├── basic13_bin1_temp=0.0.json │ │ ├── basic13_bin2_temp=0.0.json │ │ ├── basic13_bin3_temp=0.0.json │ │ ├── basic13_bin4_temp=0.0.json │ │ ├── basic13_bin5_temp=0.0.json │ │ ├── basic14_bin1_temp=0.0.json │ │ ├── basic14_bin2_temp=0.0.json │ │ ├── basic14_bin3_temp=0.0.json │ │ ├── basic14_bin4_temp=0.0.json │ │ ├── basic14_bin5_temp=0.0.json │ │ ├── basic15_bin1_temp=0.0.json │ │ ├── basic15_bin2_temp=0.0.json │ │ ├── basic15_bin3_temp=0.0.json │ │ ├── basic15_bin4_temp=0.0.json │ │ ├── basic15_bin5_temp=0.0.json │ │ ├── basic16_bin1_temp=0.0.json │ │ ├── basic16_bin2_temp=0.0.json │ │ ├── basic16_bin3_temp=0.0.json │ │ ├── basic16_bin4_temp=0.0.json │ │ ├── basic16_bin5_temp=0.0.json │ │ ├── basic17_bin1_temp=0.0.json │ │ ├── basic17_bin2_temp=0.0.json │ │ ├── basic17_bin3_temp=0.0.json │ │ ├── basic17_bin4_temp=0.0.json │ │ ├── basic17_bin5_temp=0.0.json │ │ ├── basic18_bin1_temp=0.0.json │ │ ├── basic18_bin2_temp=0.0.json │ │ ├── basic18_bin3_temp=0.0.json │ │ ├── basic18_bin4_temp=0.0.json │ │ ├── basic18_bin5_temp=0.0.json │ │ ├── basic19_bin1_temp=0.0.json │ │ ├── basic19_bin2_temp=0.0.json │ │ ├── basic19_bin3_temp=0.0.json │ │ ├── basic19_bin4_temp=0.0.json │ │ ├── basic19_bin5_temp=0.0.json │ │ ├── basic1_bin1_temp=0.0.json │ │ ├── basic1_bin2_temp=0.0.json │ │ ├── basic1_bin3_temp=0.0.json │ │ ├── basic1_bin4_temp=0.0.json │ │ ├── basic1_bin5_temp=0.0.json │ │ ├── basic20_bin1_temp=0.0.json │ │ ├── basic20_bin2_temp=0.0.json │ │ ├── basic20_bin3_temp=0.0.json │ │ ├── basic20_bin4_temp=0.0.json │ │ ├── basic20_bin5_temp=0.0.json │ │ ├── basic21_bin1_temp=0.0.json │ │ ├── basic21_bin2_temp=0.0.json │ │ ├── basic21_bin3_temp=0.0.json │ │ ├── basic21_bin4_temp=0.0.json │ │ ├── basic21_bin5_temp=0.0.json │ │ ├── basic22_bin1_temp=0.0.json │ │ ├── basic22_bin2_temp=0.0.json │ │ ├── basic22_bin3_temp=0.0.json │ │ ├── basic22_bin4_temp=0.0.json │ │ ├── basic22_bin5_temp=0.0.json │ │ ├── basic23_bin1_temp=0.0.json │ │ ├── basic23_bin2_temp=0.0.json │ │ ├── basic23_bin3_temp=0.0.json │ │ ├── basic23_bin4_temp=0.0.json │ │ ├── basic23_bin5_temp=0.0.json │ │ ├── basic24_bin1_temp=0.0.json │ │ ├── basic24_bin2_temp=0.0.json │ │ ├── basic24_bin3_temp=0.0.json │ │ ├── basic24_bin4_temp=0.0.json │ │ ├── basic24_bin5_temp=0.0.json │ │ ├── basic25_bin1_temp=0.0.json │ │ ├── basic25_bin2_temp=0.0.json │ │ ├── basic25_bin3_temp=0.0.json │ │ ├── basic25_bin4_temp=0.0.json │ │ ├── basic25_bin5_temp=0.0.json │ │ ├── basic2_bin1_temp=0.0.json │ │ ├── basic2_bin2_temp=0.0.json │ │ ├── basic2_bin3_temp=0.0.json │ │ ├── basic2_bin4_temp=0.0.json │ │ ├── basic2_bin5_temp=0.0.json │ │ ├── basic3_bin1_temp=0.0.json │ │ ├── basic3_bin2_temp=0.0.json │ │ ├── basic3_bin3_temp=0.0.json │ │ ├── basic3_bin4_temp=0.0.json │ │ ├── basic3_bin5_temp=0.0.json │ │ ├── basic4_bin1_temp=0.0.json │ │ ├── basic4_bin2_temp=0.0.json │ │ ├── basic4_bin3_temp=0.0.json │ │ ├── basic4_bin4_temp=0.0.json │ │ ├── basic4_bin5_temp=0.0.json │ │ ├── basic5_bin1_temp=0.0.json │ │ ├── basic5_bin2_temp=0.0.json │ │ ├── basic5_bin3_temp=0.0.json │ │ ├── basic5_bin4_temp=0.0.json │ │ ├── basic5_bin5_temp=0.0.json │ │ ├── basic6_bin1_temp=0.0.json │ │ ├── basic6_bin2_temp=0.0.json │ │ ├── basic6_bin3_temp=0.0.json │ │ ├── basic6_bin4_temp=0.0.json │ │ ├── basic6_bin5_temp=0.0.json │ │ ├── basic7_bin1_temp=0.0.json │ │ ├── basic7_bin2_temp=0.0.json │ │ ├── basic7_bin3_temp=0.0.json │ │ ├── basic7_bin4_temp=0.0.json │ │ ├── basic7_bin5_temp=0.0.json │ │ ├── basic8_bin1_temp=0.0.json │ │ ├── basic8_bin2_temp=0.0.json │ │ ├── basic8_bin3_temp=0.0.json │ │ ├── basic8_bin4_temp=0.0.json │ │ ├── basic8_bin5_temp=0.0.json │ │ ├── basic9_bin1_temp=0.0.json │ │ ├── basic9_bin2_temp=0.0.json │ │ ├── basic9_bin3_temp=0.0.json │ │ ├── basic9_bin4_temp=0.0.json │ │ ├── basic9_bin5_temp=0.0.json │ │ └── results.jsonl │ ├── llama3.1-405b │ │ ├── basic10_bin1_temp=0.0.json │ │ ├── basic10_bin2_temp=0.0.json │ │ ├── basic10_bin3_temp=0.0.json │ │ ├── basic10_bin4_temp=0.0.json │ │ ├── basic10_bin5_temp=0.0.json │ │ ├── basic11_bin1_temp=0.0.json │ │ ├── basic11_bin2_temp=0.0.json │ │ ├── basic11_bin3_temp=0.0.json │ │ ├── basic11_bin4_temp=0.0.json │ │ ├── basic11_bin5_temp=0.0.json │ │ ├── basic12_bin1_temp=0.0.json │ │ ├── basic12_bin2_temp=0.0.json │ │ ├── basic12_bin3_temp=0.0.json │ │ ├── basic12_bin4_temp=0.0.json │ │ ├── basic12_bin5_temp=0.0.json │ │ ├── basic13_bin1_temp=0.0.json │ │ ├── basic13_bin2_temp=0.0.json │ │ ├── basic13_bin3_temp=0.0.json │ │ ├── basic13_bin4_temp=0.0.json │ │ ├── basic13_bin5_temp=0.0.json │ │ ├── basic14_bin1_temp=0.0.json │ │ ├── basic14_bin2_temp=0.0.json │ │ ├── basic14_bin3_temp=0.0.json │ │ ├── basic14_bin4_temp=0.0.json │ │ ├── basic14_bin5_temp=0.0.json │ │ ├── basic15_bin1_temp=0.0.json │ │ ├── basic15_bin2_temp=0.0.json │ │ ├── basic15_bin3_temp=0.0.json │ │ ├── basic15_bin4_temp=0.0.json │ │ ├── basic15_bin5_temp=0.0.json │ │ ├── basic16_bin1_temp=0.0.json │ │ ├── basic16_bin2_temp=0.0.json │ │ ├── basic16_bin3_temp=0.0.json │ │ ├── basic16_bin4_temp=0.0.json │ │ ├── basic16_bin5_temp=0.0.json │ │ ├── basic17_bin1_temp=0.0.json │ │ ├── basic17_bin2_temp=0.0.json │ │ ├── basic17_bin3_temp=0.0.json │ │ ├── basic17_bin4_temp=0.0.json │ │ ├── basic17_bin5_temp=0.0.json │ │ ├── basic18_bin1_temp=0.0.json │ │ ├── basic18_bin2_temp=0.0.json │ │ ├── basic18_bin3_temp=0.0.json │ │ ├── basic18_bin4_temp=0.0.json │ │ ├── basic18_bin5_temp=0.0.json │ │ ├── basic19_bin1_temp=0.0.json │ │ ├── basic19_bin2_temp=0.0.json │ │ ├── basic19_bin3_temp=0.0.json │ │ ├── basic19_bin4_temp=0.0.json │ │ ├── basic19_bin5_temp=0.0.json │ │ ├── basic1_bin1_temp=0.0.json │ │ ├── basic1_bin2_temp=0.0.json │ │ ├── basic1_bin3_temp=0.0.json │ │ ├── basic1_bin4_temp=0.0.json │ │ ├── basic1_bin5_temp=0.0.json │ │ ├── basic20_bin1_temp=0.0.json │ │ ├── basic20_bin2_temp=0.0.json │ │ ├── basic20_bin3_temp=0.0.json │ │ ├── basic20_bin4_temp=0.0.json │ │ ├── basic20_bin5_temp=0.0.json │ │ ├── basic21_bin1_temp=0.0.json │ │ ├── basic21_bin2_temp=0.0.json │ │ ├── basic21_bin3_temp=0.0.json │ │ ├── basic21_bin4_temp=0.0.json │ │ ├── basic21_bin5_temp=0.0.json │ │ ├── basic22_bin1_temp=0.0.json │ │ ├── basic22_bin2_temp=0.0.json │ │ ├── basic22_bin3_temp=0.0.json │ │ ├── basic22_bin4_temp=0.0.json │ │ ├── basic22_bin5_temp=0.0.json │ │ ├── basic23_bin1_temp=0.0.json │ │ ├── basic23_bin2_temp=0.0.json │ │ ├── basic23_bin3_temp=0.0.json │ │ ├── basic23_bin4_temp=0.0.json │ │ ├── basic23_bin5_temp=0.0.json │ │ ├── basic24_bin1_temp=0.0.json │ │ ├── basic24_bin2_temp=0.0.json │ │ ├── basic24_bin3_temp=0.0.json │ │ ├── basic24_bin4_temp=0.0.json │ │ ├── basic24_bin5_temp=0.0.json │ │ ├── basic25_bin1_temp=0.0.json │ │ ├── basic25_bin2_temp=0.0.json │ │ ├── basic25_bin3_temp=0.0.json │ │ ├── basic25_bin4_temp=0.0.json │ │ ├── basic25_bin5_temp=0.0.json │ │ ├── basic2_bin1_temp=0.0.json │ │ ├── basic2_bin2_temp=0.0.json │ │ ├── basic2_bin3_temp=0.0.json │ │ ├── basic2_bin4_temp=0.0.json │ │ ├── basic2_bin5_temp=0.0.json │ │ ├── basic3_bin1_temp=0.0.json │ │ ├── basic3_bin2_temp=0.0.json │ │ ├── basic3_bin3_temp=0.0.json │ │ ├── basic3_bin4_temp=0.0.json │ │ ├── basic3_bin5_temp=0.0.json │ │ ├── basic4_bin1_temp=0.0.json │ │ ├── basic4_bin2_temp=0.0.json │ │ ├── basic4_bin3_temp=0.0.json │ │ ├── basic4_bin4_temp=0.0.json │ │ ├── basic4_bin5_temp=0.0.json │ │ ├── basic5_bin1_temp=0.0.json │ │ ├── basic5_bin2_temp=0.0.json │ │ ├── basic5_bin3_temp=0.0.json │ │ ├── basic5_bin4_temp=0.0.json │ │ ├── basic5_bin5_temp=0.0.json │ │ ├── basic6_bin1_temp=0.0.json │ │ ├── basic6_bin2_temp=0.0.json │ │ ├── basic6_bin3_temp=0.0.json │ │ ├── basic6_bin4_temp=0.0.json │ │ ├── basic6_bin5_temp=0.0.json │ │ ├── basic7_bin1_temp=0.0.json │ │ ├── basic7_bin2_temp=0.0.json │ │ ├── basic7_bin3_temp=0.0.json │ │ ├── basic7_bin4_temp=0.0.json │ │ ├── basic7_bin5_temp=0.0.json │ │ ├── basic8_bin1_temp=0.0.json │ │ ├── basic8_bin2_temp=0.0.json │ │ ├── basic8_bin3_temp=0.0.json │ │ ├── basic8_bin4_temp=0.0.json │ │ ├── basic8_bin5_temp=0.0.json │ │ ├── basic9_bin1_temp=0.0.json │ │ ├── basic9_bin2_temp=0.0.json │ │ ├── basic9_bin3_temp=0.0.json │ │ ├── basic9_bin4_temp=0.0.json │ │ ├── basic9_bin5_temp=0.0.json │ │ ├── results.jsonl │ │ └── results1.jsonl │ └── o1 │ │ ├── basic12_bin1_temp=0.0.json │ │ ├── basic12_bin5_temp=0.0.json │ │ ├── basic13_bin1_temp=0.0.json │ │ ├── basic13_bin5_temp=0.0.json │ │ ├── basic14_bin1_temp=0.0.json │ │ └── basic14_bin5_temp=0.0.json └── text_cot │ ├── claude-3 │ ├── cot10_bin1_temp=0.0.json │ ├── cot10_bin2_temp=0.0.json │ ├── cot10_bin3_temp=0.0.json │ ├── cot10_bin4_temp=0.0.json │ ├── cot10_bin5_temp=0.0.json │ ├── cot11_bin1_temp=0.0.json │ ├── cot11_bin2_temp=0.0.json │ ├── cot11_bin3_temp=0.0.json │ ├── cot11_bin4_temp=0.0.json │ ├── cot11_bin5_temp=0.0.json │ ├── cot12_bin1_temp=0.0.json │ ├── cot12_bin2_temp=0.0.json │ ├── cot12_bin3_temp=0.0.json │ ├── cot12_bin4_temp=0.0.json │ ├── cot12_bin5_temp=0.0.json │ ├── cot13_bin1_temp=0.0.json │ ├── cot13_bin2_temp=0.0.json │ ├── cot13_bin3_temp=0.0.json │ ├── cot13_bin4_temp=0.0.json │ ├── cot13_bin5_temp=0.0.json │ ├── cot14_bin1_temp=0.0.json │ ├── cot14_bin2_temp=0.0.json │ ├── cot14_bin3_temp=0.0.json │ ├── cot14_bin4_temp=0.0.json │ ├── cot14_bin5_temp=0.0.json │ ├── cot15_bin1_temp=0.0.json │ ├── cot15_bin2_temp=0.0.json │ ├── cot15_bin3_temp=0.0.json │ ├── cot15_bin4_temp=0.0.json │ ├── cot15_bin5_temp=0.0.json │ ├── cot16_bin1_temp=0.0.json │ ├── cot16_bin2_temp=0.0.json │ ├── cot16_bin3_temp=0.0.json │ ├── cot16_bin4_temp=0.0.json │ ├── cot16_bin5_temp=0.0.json │ ├── cot17_bin1_temp=0.0.json │ ├── cot17_bin2_temp=0.0.json │ ├── cot17_bin3_temp=0.0.json │ ├── cot17_bin4_temp=0.0.json │ ├── cot17_bin5_temp=0.0.json │ ├── cot18_bin1_temp=0.0.json │ ├── cot18_bin2_temp=0.0.json │ ├── cot18_bin3_temp=0.0.json │ ├── cot18_bin4_temp=0.0.json │ ├── cot18_bin5_temp=0.0.json │ ├── cot19_bin1_temp=0.0.json │ ├── cot19_bin2_temp=0.0.json │ ├── cot19_bin3_temp=0.0.json │ ├── cot19_bin4_temp=0.0.json │ ├── cot19_bin5_temp=0.0.json │ ├── cot1_bin1_temp=0.0.json │ ├── cot1_bin2_temp=0.0.json │ ├── cot1_bin3_temp=0.0.json │ ├── cot1_bin4_temp=0.0.json │ ├── cot1_bin5_temp=0.0.json │ ├── cot20_bin1_temp=0.0.json │ ├── cot20_bin2_temp=0.0.json │ ├── cot20_bin3_temp=0.0.json │ ├── cot20_bin4_temp=0.0.json │ ├── cot20_bin5_temp=0.0.json │ ├── cot21_bin1_temp=0.0.json │ ├── cot21_bin2_temp=0.0.json │ ├── cot21_bin3_temp=0.0.json │ ├── cot21_bin4_temp=0.0.json │ ├── cot21_bin5_temp=0.0.json │ ├── cot22_bin1_temp=0.0.json │ ├── cot22_bin2_temp=0.0.json │ ├── cot22_bin3_temp=0.0.json │ ├── cot22_bin4_temp=0.0.json │ ├── cot22_bin5_temp=0.0.json │ ├── cot23_bin1_temp=0.0.json │ ├── cot23_bin2_temp=0.0.json │ ├── cot23_bin3_temp=0.0.json │ ├── cot23_bin4_temp=0.0.json │ ├── cot23_bin5_temp=0.0.json │ ├── cot24_bin1_temp=0.0.json │ ├── cot24_bin2_temp=0.0.json │ ├── cot24_bin3_temp=0.0.json │ ├── cot24_bin4_temp=0.0.json │ ├── cot24_bin5_temp=0.0.json │ ├── cot25_bin1_temp=0.0.json │ ├── cot25_bin2_temp=0.0.json │ ├── cot25_bin3_temp=0.0.json │ ├── cot25_bin4_temp=0.0.json │ ├── cot25_bin5_temp=0.0.json │ ├── cot2_bin1_temp=0.0.json │ ├── cot2_bin2_temp=0.0.json │ ├── cot2_bin3_temp=0.0.json │ ├── cot2_bin4_temp=0.0.json │ ├── cot2_bin5_temp=0.0.json │ ├── cot3_bin1_temp=0.0.json │ ├── cot3_bin2_temp=0.0.json │ ├── cot3_bin3_temp=0.0.json │ ├── cot3_bin4_temp=0.0.json │ ├── cot3_bin5_temp=0.0.json │ ├── cot4_bin1_temp=0.0.json │ ├── cot4_bin2_temp=0.0.json │ ├── cot4_bin3_temp=0.0.json │ ├── cot4_bin4_temp=0.0.json │ ├── cot4_bin5_temp=0.0.json │ ├── cot5_bin1_temp=0.0.json │ ├── cot5_bin2_temp=0.0.json │ ├── cot5_bin3_temp=0.0.json │ ├── cot5_bin4_temp=0.0.json │ ├── cot5_bin5_temp=0.0.json │ ├── cot6_bin1_temp=0.0.json │ ├── cot6_bin2_temp=0.0.json │ ├── cot6_bin3_temp=0.0.json │ ├── cot6_bin4_temp=0.0.json │ ├── cot6_bin5_temp=0.0.json │ ├── cot7_bin1_temp=0.0.json │ ├── cot7_bin2_temp=0.0.json │ ├── cot7_bin3_temp=0.0.json │ ├── cot7_bin4_temp=0.0.json │ ├── cot7_bin5_temp=0.0.json │ ├── cot8_bin1_temp=0.0.json │ ├── cot8_bin2_temp=0.0.json │ ├── cot8_bin3_temp=0.0.json │ ├── cot8_bin4_temp=0.0.json │ ├── cot8_bin5_temp=0.0.json │ ├── cot9_bin1_temp=0.0.json │ ├── cot9_bin2_temp=0.0.json │ ├── cot9_bin3_temp=0.0.json │ ├── cot9_bin4_temp=0.0.json │ ├── cot9_bin5_temp=0.0.json │ └── results.jsonl │ ├── gpt-4 │ ├── cot10_bin1_gpt-4-0613_temp=0.0.json │ ├── cot10_bin2_gpt-4-0613_temp=0.0.json │ ├── cot10_bin3_gpt-4-0613_temp=0.0.json │ ├── cot10_bin4_gpt-4-0613_temp=0.0.json │ ├── cot10_bin5_gpt-4-0613_temp=0.0.json │ ├── cot11_bin1_gpt-4-0613_temp=0.0.json │ ├── cot11_bin2_gpt-4-0613_temp=0.0.json │ ├── cot11_bin3_gpt-4-0613_temp=0.0.json │ ├── cot11_bin4_gpt-4-0613_temp=0.0.json │ ├── cot11_bin5_gpt-4-0613_temp=0.0.json │ ├── cot12_bin1_gpt-4-0613_temp=0.0.json │ ├── cot12_bin2_gpt-4-0613_temp=0.0.json │ ├── cot12_bin3_gpt-4-0613_temp=0.0.json │ ├── cot12_bin4_gpt-4-0613_temp=0.0.json │ ├── cot12_bin5_gpt-4-0613_temp=0.0.json │ ├── cot13_bin1_gpt-4-0613_temp=0.0.json │ ├── cot13_bin2_gpt-4-0613_temp=0.0.json │ ├── cot13_bin3_gpt-4-0613_temp=0.0.json │ ├── cot13_bin4_gpt-4-0613_temp=0.0.json │ ├── cot13_bin5_gpt-4-0613_temp=0.0.json │ ├── cot14_bin1_gpt-4-0613_temp=0.0.json │ ├── cot14_bin2_gpt-4-0613_temp=0.0.json │ ├── cot14_bin3_gpt-4-0613_temp=0.0.json │ ├── cot14_bin4_gpt-4-0613_temp=0.0.json │ ├── cot14_bin5_gpt-4-0613_temp=0.0.json │ ├── cot15_bin1_gpt-4-0613_temp=0.0.json │ ├── cot15_bin2_gpt-4-0613_temp=0.0.json │ ├── cot15_bin3_gpt-4-0613_temp=0.0.json │ ├── cot15_bin4_gpt-4-0613_temp=0.0.json │ ├── cot15_bin5_gpt-4-0613_temp=0.0.json │ ├── cot16_bin1_gpt-4-0613_temp=0.0.json │ ├── cot16_bin2_gpt-4-0613_temp=0.0.json │ ├── cot16_bin3_gpt-4-0613_temp=0.0.json │ ├── cot16_bin4_gpt-4-0613_temp=0.0.json │ ├── cot16_bin5_gpt-4-0613_temp=0.0.json │ ├── cot17_bin1_gpt-4-0613_temp=0.0.json │ ├── cot17_bin2_gpt-4-0613_temp=0.0.json │ ├── cot17_bin3_gpt-4-0613_temp=0.0.json │ ├── cot17_bin4_gpt-4-0613_temp=0.0.json │ ├── cot17_bin5_gpt-4-0613_temp=0.0.json │ ├── cot18_bin1_gpt-4-0613_temp=0.0.json │ ├── cot18_bin2_gpt-4-0613_temp=0.0.json │ ├── cot18_bin3_gpt-4-0613_temp=0.0.json │ ├── cot18_bin4_gpt-4-0613_temp=0.0.json │ ├── cot18_bin5_gpt-4-0613_temp=0.0.json │ ├── cot19_bin1_gpt-4-0613_temp=0.0.json │ ├── cot19_bin2_gpt-4-0613_temp=0.0.json │ ├── cot19_bin3_gpt-4-0613_temp=0.0.json │ ├── cot19_bin4_gpt-4-0613_temp=0.0.json │ ├── cot19_bin5_gpt-4-0613_temp=0.0.json │ ├── cot1_bin1_gpt-4-0613_temp=0.0.json │ ├── cot1_bin2_gpt-4-0613_temp=0.0.json │ ├── cot1_bin3_gpt-4-0613_temp=0.0.json │ ├── cot1_bin4_gpt-4-0613_temp=0.0.json │ ├── cot1_bin5_gpt-4-0613_temp=0.0.json │ ├── cot20_bin1_gpt-4-0613_temp=0.0.json │ ├── cot20_bin2_gpt-4-0613_temp=0.0.json │ ├── cot20_bin3_gpt-4-0613_temp=0.0.json │ ├── cot20_bin4_gpt-4-0613_temp=0.0.json │ ├── cot20_bin5_gpt-4-0613_temp=0.0.json │ ├── cot21_bin1_gpt-4-0613_temp=0.0.json │ ├── cot21_bin2_gpt-4-0613_temp=0.0.json │ ├── cot21_bin3_gpt-4-0613_temp=0.0.json │ ├── cot21_bin4_gpt-4-0613_temp=0.0.json │ ├── cot21_bin5_gpt-4-0613_temp=0.0.json │ ├── cot22_bin1_gpt-4-0613_temp=0.0.json │ ├── cot22_bin2_gpt-4-0613_temp=0.0.json │ ├── cot22_bin3_gpt-4-0613_temp=0.0.json │ ├── cot22_bin4_gpt-4-0613_temp=0.0.json │ ├── cot22_bin5_gpt-4-0613_temp=0.0.json │ ├── cot23_bin1_gpt-4-0613_temp=0.0.json │ ├── cot23_bin2_gpt-4-0613_temp=0.0.json │ ├── cot23_bin3_gpt-4-0613_temp=0.0.json │ ├── cot23_bin4_gpt-4-0613_temp=0.0.json │ ├── cot23_bin5_gpt-4-0613_temp=0.0.json │ ├── cot24_bin1_gpt-4-0613_temp=0.0.json │ ├── cot24_bin2_gpt-4-0613_temp=0.0.json │ ├── cot24_bin3_gpt-4-0613_temp=0.0.json │ ├── cot24_bin4_gpt-4-0613_temp=0.0.json │ ├── cot24_bin5_gpt-4-0613_temp=0.0.json │ ├── cot25_bin1_gpt-4-0613_temp=0.0.json │ ├── cot25_bin2_gpt-4-0613_temp=0.0.json │ ├── cot25_bin3_gpt-4-0613_temp=0.0.json │ ├── cot25_bin4_gpt-4-0613_temp=0.0.json │ ├── cot25_bin5_gpt-4-0613_temp=0.0.json │ ├── cot2_bin1_gpt-4-0613_temp=0.0.json │ ├── cot2_bin2_gpt-4-0613_temp=0.0.json │ ├── cot2_bin3_gpt-4-0613_temp=0.0.json │ ├── cot2_bin4_gpt-4-0613_temp=0.0.json │ ├── cot2_bin5_gpt-4-0613_temp=0.0.json │ ├── cot3_bin1_gpt-4-0613_temp=0.0.json │ ├── cot3_bin2_gpt-4-0613_temp=0.0.json │ ├── cot3_bin3_gpt-4-0613_temp=0.0.json │ ├── cot3_bin4_gpt-4-0613_temp=0.0.json │ ├── cot3_bin5_gpt-4-0613_temp=0.0.json │ ├── cot4_bin1_gpt-4-0613_temp=0.0.json │ ├── cot4_bin2_gpt-4-0613_temp=0.0.json │ ├── cot4_bin3_gpt-4-0613_temp=0.0.json │ ├── cot4_bin4_gpt-4-0613_temp=0.0.json │ ├── cot4_bin5_gpt-4-0613_temp=0.0.json │ ├── cot5_bin1_gpt-4-0613_temp=0.0.json │ ├── cot5_bin2_gpt-4-0613_temp=0.0.json │ ├── cot5_bin3_gpt-4-0613_temp=0.0.json │ ├── cot5_bin4_gpt-4-0613_temp=0.0.json │ ├── cot5_bin5_gpt-4-0613_temp=0.0.json │ ├── cot6_bin1_gpt-4-0613_temp=0.0.json │ ├── cot6_bin2_gpt-4-0613_temp=0.0.json │ ├── cot6_bin3_gpt-4-0613_temp=0.0.json │ ├── cot6_bin4_gpt-4-0613_temp=0.0.json │ ├── cot6_bin5_gpt-4-0613_temp=0.0.json │ ├── cot7_bin1_gpt-4-0613_temp=0.0.json │ ├── cot7_bin2_gpt-4-0613_temp=0.0.json │ ├── cot7_bin3_gpt-4-0613_temp=0.0.json │ ├── cot7_bin4_gpt-4-0613_temp=0.0.json │ ├── cot7_bin5_gpt-4-0613_temp=0.0.json │ ├── cot8_bin1_gpt-4-0613_temp=0.0.json │ ├── cot8_bin2_gpt-4-0613_temp=0.0.json │ ├── cot8_bin3_gpt-4-0613_temp=0.0.json │ ├── cot8_bin4_gpt-4-0613_temp=0.0.json │ ├── cot8_bin5_gpt-4-0613_temp=0.0.json │ ├── cot9_bin1_gpt-4-0613_temp=0.0.json │ ├── cot9_bin2_gpt-4-0613_temp=0.0.json │ ├── cot9_bin3_gpt-4-0613_temp=0.0.json │ ├── cot9_bin4_gpt-4-0613_temp=0.0.json │ ├── cot9_bin5_gpt-4-0613_temp=0.0.json │ └── results.jsonl │ └── llama3.1-405b │ ├── cot10_bin1_temp=0.0.json │ ├── cot10_bin2_temp=0.0.json │ ├── cot10_bin3_temp=0.0.json │ ├── cot10_bin4_temp=0.0.json │ ├── cot10_bin5_temp=0.0.json │ ├── cot11_bin1_temp=0.0.json │ ├── cot11_bin2_temp=0.0.json │ ├── cot11_bin3_temp=0.0.json │ ├── cot11_bin4_temp=0.0.json │ ├── cot11_bin5_temp=0.0.json │ ├── cot12_bin1_temp=0.0.json │ ├── cot12_bin2_temp=0.0.json │ ├── cot12_bin3_temp=0.0.json │ ├── cot12_bin4_temp=0.0.json │ ├── cot12_bin5_temp=0.0.json │ ├── cot13_bin1_temp=0.0.json │ ├── cot13_bin2_temp=0.0.json │ ├── cot13_bin3_temp=0.0.json │ ├── cot13_bin4_temp=0.0.json │ ├── cot13_bin5_temp=0.0.json │ ├── cot14_bin1_temp=0.0.json │ ├── cot14_bin2_temp=0.0.json │ ├── cot14_bin3_temp=0.0.json │ ├── cot14_bin4_temp=0.0.json │ ├── cot14_bin5_temp=0.0.json │ ├── cot15_bin1_temp=0.0.json │ ├── cot15_bin2_temp=0.0.json │ ├── cot15_bin3_temp=0.0.json │ ├── cot15_bin4_temp=0.0.json │ ├── cot15_bin5_temp=0.0.json │ ├── cot16_bin1_temp=0.0.json │ ├── cot16_bin2_temp=0.0.json │ ├── cot16_bin3_temp=0.0.json │ ├── cot16_bin4_temp=0.0.json │ ├── cot16_bin5_temp=0.0.json │ ├── cot17_bin1_temp=0.0.json │ ├── cot17_bin2_temp=0.0.json │ ├── cot17_bin3_temp=0.0.json │ ├── cot17_bin4_temp=0.0.json │ ├── cot17_bin5_temp=0.0.json │ ├── cot18_bin1_temp=0.0.json │ ├── cot18_bin2_temp=0.0.json │ ├── cot18_bin3_temp=0.0.json │ ├── cot18_bin4_temp=0.0.json │ ├── cot18_bin5_temp=0.0.json │ ├── cot19_bin1_temp=0.0.json │ ├── cot19_bin2_temp=0.0.json │ ├── cot19_bin3_temp=0.0.json │ ├── cot19_bin4_temp=0.0.json │ ├── cot19_bin5_temp=0.0.json │ ├── cot1_bin1_temp=0.0.json │ ├── cot1_bin2_temp=0.0.json │ ├── cot1_bin3_temp=0.0.json │ ├── cot1_bin4_temp=0.0.json │ ├── cot1_bin5_temp=0.0.json │ ├── cot20_bin1_temp=0.0.json │ ├── cot20_bin2_temp=0.0.json │ ├── cot20_bin3_temp=0.0.json │ ├── cot20_bin4_temp=0.0.json │ ├── cot20_bin5_temp=0.0.json │ ├── cot21_bin1_temp=0.0.json │ ├── cot21_bin2_temp=0.0.json │ ├── cot21_bin3_temp=0.0.json │ ├── cot21_bin4_temp=0.0.json │ ├── cot21_bin5_temp=0.0.json │ ├── cot22_bin1_temp=0.0.json │ ├── cot22_bin2_temp=0.0.json │ ├── cot22_bin3_temp=0.0.json │ ├── cot22_bin4_temp=0.0.json │ ├── cot22_bin5_temp=0.0.json │ ├── cot23_bin1_temp=0.0.json │ ├── cot23_bin2_temp=0.0.json │ ├── cot23_bin3_temp=0.0.json │ ├── cot23_bin4_temp=0.0.json │ ├── cot23_bin5_temp=0.0.json │ ├── cot24_bin1_temp=0.0.json │ ├── cot24_bin2_temp=0.0.json │ ├── cot24_bin3_temp=0.0.json │ ├── cot24_bin4_temp=0.0.json │ ├── cot24_bin5_temp=0.0.json │ ├── cot25_bin1_temp=0.0.json │ ├── cot25_bin2_temp=0.0.json │ ├── cot25_bin3_temp=0.0.json │ ├── cot25_bin4_temp=0.0.json │ ├── cot25_bin5_temp=0.0.json │ ├── cot2_bin1_temp=0.0.json │ ├── cot2_bin2_temp=0.0.json │ ├── cot2_bin3_temp=0.0.json │ ├── cot2_bin4_temp=0.0.json │ ├── cot2_bin5_temp=0.0.json │ ├── cot3_bin1_temp=0.0.json │ ├── cot3_bin2_temp=0.0.json │ ├── cot3_bin3_temp=0.0.json │ ├── cot3_bin4_temp=0.0.json │ ├── cot3_bin5_temp=0.0.json │ ├── cot4_bin1_temp=0.0.json │ ├── cot4_bin2_temp=0.0.json │ ├── cot4_bin3_temp=0.0.json │ ├── cot4_bin4_temp=0.0.json │ ├── cot4_bin5_temp=0.0.json │ ├── cot5_bin1_temp=0.0.json │ ├── cot5_bin2_temp=0.0.json │ ├── cot5_bin3_temp=0.0.json │ ├── cot5_bin4_temp=0.0.json │ ├── cot5_bin5_temp=0.0.json │ ├── cot6_bin1_temp=0.0.json │ ├── cot6_bin2_temp=0.0.json │ ├── cot6_bin3_temp=0.0.json │ ├── cot6_bin4_temp=0.0.json │ ├── cot6_bin5_temp=0.0.json │ ├── cot7_bin1_temp=0.0.json │ ├── cot7_bin2_temp=0.0.json │ ├── cot7_bin3_temp=0.0.json │ ├── cot7_bin4_temp=0.0.json │ ├── cot7_bin5_temp=0.0.json │ ├── cot8_bin1_temp=0.0.json │ ├── cot8_bin2_temp=0.0.json │ ├── cot8_bin3_temp=0.0.json │ ├── cot8_bin4_temp=0.0.json │ ├── cot8_bin5_temp=0.0.json │ ├── cot9_bin1_temp=0.0.json │ ├── cot9_bin2_temp=0.0.json │ ├── cot9_bin3_temp=0.0.json │ ├── cot9_bin4_temp=0.0.json │ ├── cot9_bin5_temp=0.0.json │ └── results.jsonl ├── models └── openai_help.py ├── regression ├── README.md ├── create_train_table.py ├── regression.ipynb ├── text_cot_test_results.tsv ├── text_cot_test_table.tsv ├── text_cot_train_results.tsv └── text_cot_train_table.tsv ├── run_claude3.py ├── run_llama3.py ├── run_o1.py ├── run_openai.py ├── seven_letter_words ├── README.md ├── bin1_prob.txt ├── bin2_prob.txt ├── bin3_prob.txt ├── bin4_prob.txt ├── bin5_prob.txt ├── gpt2_prob_sevenletter.py ├── input_scored.txt ├── random_token_combos.py ├── select_words.py └── words_5bins.txt ├── stimuli ├── math_cot │ ├── math_cot19_bin1.jsonl │ ├── math_cot19_bin2.jsonl │ ├── math_cot19_bin3.jsonl │ ├── math_cot19_bin4.jsonl │ ├── math_cot19_bin5.jsonl │ ├── math_cot20_bin1.jsonl │ ├── math_cot20_bin2.jsonl │ ├── math_cot20_bin3.jsonl │ ├── math_cot20_bin4.jsonl │ ├── math_cot20_bin5.jsonl │ ├── math_cot21_bin1.jsonl │ ├── math_cot21_bin2.jsonl │ ├── math_cot21_bin3.jsonl │ ├── math_cot21_bin4.jsonl │ ├── math_cot21_bin5.jsonl │ ├── math_cot22_bin1.jsonl │ ├── math_cot22_bin2.jsonl │ ├── math_cot22_bin3.jsonl │ ├── math_cot22_bin4.jsonl │ ├── math_cot22_bin5.jsonl │ ├── math_cot23_bin1.jsonl │ ├── math_cot23_bin2.jsonl │ ├── math_cot23_bin3.jsonl │ ├── math_cot23_bin4.jsonl │ └── math_cot23_bin5.jsonl ├── math_swap │ └── math_swap4_bin5.jsonl ├── number_cot │ ├── math10_bin1.jsonl │ ├── math10_bin2.jsonl │ ├── math10_bin3.jsonl │ ├── math10_bin4.jsonl │ ├── math10_bin5.jsonl │ ├── math11_bin1.jsonl │ ├── math11_bin2.jsonl │ ├── math11_bin3.jsonl │ ├── math11_bin4.jsonl │ ├── math11_bin5.jsonl │ ├── math12_bin1.jsonl │ ├── math12_bin2.jsonl │ ├── math12_bin3.jsonl │ ├── math12_bin4.jsonl │ ├── math12_bin5.jsonl │ ├── math13_bin1.jsonl │ ├── math13_bin2.jsonl │ ├── math13_bin3.jsonl │ ├── math13_bin4.jsonl │ ├── math13_bin5.jsonl │ ├── math14_bin1.jsonl │ ├── math14_bin2.jsonl │ ├── math14_bin3.jsonl │ ├── math14_bin4.jsonl │ ├── math14_bin5.jsonl │ ├── math15_bin1.jsonl │ ├── math15_bin2.jsonl │ ├── math15_bin3.jsonl │ ├── math15_bin4.jsonl │ ├── math15_bin5.jsonl │ ├── math16_bin1.jsonl │ ├── math16_bin2.jsonl │ ├── math16_bin3.jsonl │ ├── math16_bin4.jsonl │ ├── math16_bin5.jsonl │ ├── math17_bin1.jsonl │ ├── math17_bin2.jsonl │ ├── math17_bin3.jsonl │ ├── math17_bin4.jsonl │ ├── math17_bin5.jsonl │ ├── math18_bin1.jsonl │ ├── math18_bin2.jsonl │ ├── math18_bin3.jsonl │ ├── math18_bin4.jsonl │ ├── math18_bin5.jsonl │ ├── math19_bin1.jsonl │ ├── math19_bin2.jsonl │ ├── math19_bin3.jsonl │ ├── math19_bin4.jsonl │ ├── math19_bin5.jsonl │ ├── math1_bin1.jsonl │ ├── math1_bin2.jsonl │ ├── math1_bin3.jsonl │ ├── math1_bin4.jsonl │ ├── math1_bin5.jsonl │ ├── math20_bin1.jsonl │ ├── math20_bin2.jsonl │ ├── math20_bin3.jsonl │ ├── math20_bin4.jsonl │ ├── math20_bin5.jsonl │ ├── math21_bin1.jsonl │ ├── math21_bin2.jsonl │ ├── math21_bin3.jsonl │ ├── math21_bin4.jsonl │ ├── math21_bin5.jsonl │ ├── math22_bin1.jsonl │ ├── math22_bin2.jsonl │ ├── math22_bin3.jsonl │ ├── math22_bin4.jsonl │ ├── math22_bin5.jsonl │ ├── math23_bin1.jsonl │ ├── math23_bin2.jsonl │ ├── math23_bin3.jsonl │ ├── math23_bin4.jsonl │ ├── math23_bin5.jsonl │ ├── math24_bin1.jsonl │ ├── math24_bin2.jsonl │ ├── math24_bin3.jsonl │ ├── math24_bin4.jsonl │ ├── math24_bin5.jsonl │ ├── math25_bin1.jsonl │ ├── math25_bin2.jsonl │ ├── math25_bin3.jsonl │ ├── math25_bin4.jsonl │ ├── math25_bin5.jsonl │ ├── math2_bin1.jsonl │ ├── math2_bin2.jsonl │ ├── math2_bin3.jsonl │ ├── math2_bin4.jsonl │ ├── math2_bin5.jsonl │ ├── math3_bin1.jsonl │ ├── math3_bin2.jsonl │ ├── math3_bin3.jsonl │ ├── math3_bin4.jsonl │ ├── math3_bin5.jsonl │ ├── math4_bin1.jsonl │ ├── math4_bin2.jsonl │ ├── math4_bin3.jsonl │ ├── math4_bin4.jsonl │ ├── math4_bin5.jsonl │ ├── math5_bin1.jsonl │ ├── math5_bin2.jsonl │ ├── math5_bin3.jsonl │ ├── math5_bin4.jsonl │ ├── math5_bin5.jsonl │ ├── math6_bin1.jsonl │ ├── math6_bin2.jsonl │ ├── math6_bin3.jsonl │ ├── math6_bin4.jsonl │ ├── math6_bin5.jsonl │ ├── math7_bin1.jsonl │ ├── math7_bin2.jsonl │ ├── math7_bin3.jsonl │ ├── math7_bin4.jsonl │ ├── math7_bin5.jsonl │ ├── math8_bin1.jsonl │ ├── math8_bin2.jsonl │ ├── math8_bin3.jsonl │ ├── math8_bin4.jsonl │ ├── math8_bin5.jsonl │ ├── math9_bin1.jsonl │ ├── math9_bin2.jsonl │ ├── math9_bin3.jsonl │ ├── math9_bin4.jsonl │ └── math9_bin5.jsonl ├── standard │ ├── basic10_bin1.jsonl │ ├── basic10_bin2.jsonl │ ├── basic10_bin3.jsonl │ ├── basic10_bin4.jsonl │ ├── basic10_bin5.jsonl │ ├── basic11_bin1.jsonl │ ├── basic11_bin2.jsonl │ ├── basic11_bin3.jsonl │ ├── basic11_bin4.jsonl │ ├── basic11_bin5.jsonl │ ├── basic12_bin1.jsonl │ ├── basic12_bin2.jsonl │ ├── basic12_bin3.jsonl │ ├── basic12_bin4.jsonl │ ├── basic12_bin5.jsonl │ ├── basic13_bin1.jsonl │ ├── basic13_bin2.jsonl │ ├── basic13_bin3.jsonl │ ├── basic13_bin4.jsonl │ ├── basic13_bin5.jsonl │ ├── basic14_bin1.jsonl │ ├── basic14_bin2.jsonl │ ├── basic14_bin3.jsonl │ ├── basic14_bin4.jsonl │ ├── basic14_bin5.jsonl │ ├── basic15_bin1.jsonl │ ├── basic15_bin2.jsonl │ ├── basic15_bin3.jsonl │ ├── basic15_bin4.jsonl │ ├── basic15_bin5.jsonl │ ├── basic16_bin1.jsonl │ ├── basic16_bin2.jsonl │ ├── basic16_bin3.jsonl │ ├── basic16_bin4.jsonl │ ├── basic16_bin5.jsonl │ ├── basic17_bin1.jsonl │ ├── basic17_bin2.jsonl │ ├── basic17_bin3.jsonl │ ├── basic17_bin4.jsonl │ ├── basic17_bin5.jsonl │ ├── basic18_bin1.jsonl │ ├── basic18_bin2.jsonl │ ├── basic18_bin3.jsonl │ ├── basic18_bin4.jsonl │ ├── basic18_bin5.jsonl │ ├── basic19_bin1.jsonl │ ├── basic19_bin2.jsonl │ ├── basic19_bin3.jsonl │ ├── basic19_bin4.jsonl │ ├── basic19_bin5.jsonl │ ├── basic1_bin1.jsonl │ ├── basic1_bin2.jsonl │ ├── basic1_bin3.jsonl │ ├── basic1_bin4.jsonl │ ├── basic1_bin5.jsonl │ ├── basic20_bin1.jsonl │ ├── basic20_bin2.jsonl │ ├── basic20_bin3.jsonl │ ├── basic20_bin4.jsonl │ ├── basic20_bin5.jsonl │ ├── basic21_bin1.jsonl │ ├── basic21_bin2.jsonl │ ├── basic21_bin3.jsonl │ ├── basic21_bin4.jsonl │ ├── basic21_bin5.jsonl │ ├── basic22_bin1.jsonl │ ├── basic22_bin2.jsonl │ ├── basic22_bin3.jsonl │ ├── basic22_bin4.jsonl │ ├── basic22_bin5.jsonl │ ├── basic23_bin1.jsonl │ ├── basic23_bin2.jsonl │ ├── basic23_bin3.jsonl │ ├── basic23_bin4.jsonl │ ├── basic23_bin5.jsonl │ ├── basic24_bin1.jsonl │ ├── basic24_bin2.jsonl │ ├── basic24_bin3.jsonl │ ├── basic24_bin4.jsonl │ ├── basic24_bin5.jsonl │ ├── basic25_bin1.jsonl │ ├── basic25_bin2.jsonl │ ├── basic25_bin3.jsonl │ ├── basic25_bin4.jsonl │ ├── basic25_bin5.jsonl │ ├── basic2_bin1.jsonl │ ├── basic2_bin2.jsonl │ ├── basic2_bin3.jsonl │ ├── basic2_bin4.jsonl │ ├── basic2_bin5.jsonl │ ├── basic3_bin1.jsonl │ ├── basic3_bin2.jsonl │ ├── basic3_bin3.jsonl │ ├── basic3_bin4.jsonl │ ├── basic3_bin5.jsonl │ ├── basic4_bin1.jsonl │ ├── basic4_bin2.jsonl │ ├── basic4_bin3.jsonl │ ├── basic4_bin4.jsonl │ ├── basic4_bin5.jsonl │ ├── basic5_bin1.jsonl │ ├── basic5_bin2.jsonl │ ├── basic5_bin3.jsonl │ ├── basic5_bin4.jsonl │ ├── basic5_bin5.jsonl │ ├── basic6_bin1.jsonl │ ├── basic6_bin2.jsonl │ ├── basic6_bin3.jsonl │ ├── basic6_bin4.jsonl │ ├── basic6_bin5.jsonl │ ├── basic7_bin1.jsonl │ ├── basic7_bin2.jsonl │ ├── basic7_bin3.jsonl │ ├── basic7_bin4.jsonl │ ├── basic7_bin5.jsonl │ ├── basic8_bin1.jsonl │ ├── basic8_bin2.jsonl │ ├── basic8_bin3.jsonl │ ├── basic8_bin4.jsonl │ ├── basic8_bin5.jsonl │ ├── basic9_bin1.jsonl │ ├── basic9_bin2.jsonl │ ├── basic9_bin3.jsonl │ ├── basic9_bin4.jsonl │ └── basic9_bin5.jsonl ├── swap │ ├── cot13_bin5.jsonl │ ├── cot14_bin5.jsonl │ ├── cot4_bin5.jsonl │ ├── cot5_bin5.jsonl │ ├── swap13c_bin5.jsonl │ ├── swap14c_bin5.jsonl │ ├── swap4c_bin5.jsonl │ └── swap5c_bin5.jsonl └── text_cot │ ├── cot10_bin1.jsonl │ ├── cot10_bin2.jsonl │ ├── cot10_bin3.jsonl │ ├── cot10_bin4.jsonl │ ├── cot10_bin5.jsonl │ ├── cot11_bin1.jsonl │ ├── cot11_bin2.jsonl │ ├── cot11_bin3.jsonl │ ├── cot11_bin4.jsonl │ ├── cot11_bin5.jsonl │ ├── cot12_bin1.jsonl │ ├── cot12_bin2.jsonl │ ├── cot12_bin3.jsonl │ ├── cot12_bin4.jsonl │ ├── cot12_bin5.jsonl │ ├── cot13_bin1.jsonl │ ├── cot13_bin2.jsonl │ ├── cot13_bin3.jsonl │ ├── cot13_bin4.jsonl │ ├── cot13_bin5.jsonl │ ├── cot14_bin1.jsonl │ ├── cot14_bin2.jsonl │ ├── cot14_bin3.jsonl │ ├── cot14_bin4.jsonl │ ├── cot14_bin5.jsonl │ ├── cot15_bin1.jsonl │ ├── cot15_bin2.jsonl │ ├── cot15_bin3.jsonl │ ├── cot15_bin4.jsonl │ ├── cot15_bin5.jsonl │ ├── cot16_bin1.jsonl │ ├── cot16_bin2.jsonl │ ├── cot16_bin3.jsonl │ ├── cot16_bin4.jsonl │ ├── cot16_bin5.jsonl │ ├── cot17_bin1.jsonl │ ├── cot17_bin2.jsonl │ ├── cot17_bin3.jsonl │ ├── cot17_bin4.jsonl │ ├── cot17_bin5.jsonl │ ├── cot18_bin1.jsonl │ ├── cot18_bin2.jsonl │ ├── cot18_bin3.jsonl │ ├── cot18_bin4.jsonl │ ├── cot18_bin5.jsonl │ ├── cot19_bin1.jsonl │ ├── cot19_bin2.jsonl │ ├── cot19_bin3.jsonl │ ├── cot19_bin4.jsonl │ ├── cot19_bin5.jsonl │ ├── cot1_bin1.jsonl │ ├── cot1_bin2.jsonl │ ├── cot1_bin3.jsonl │ ├── cot1_bin4.jsonl │ ├── cot1_bin5.jsonl │ ├── cot20_bin1.jsonl │ ├── cot20_bin2.jsonl │ ├── cot20_bin3.jsonl │ ├── cot20_bin4.jsonl │ ├── cot20_bin5.jsonl │ ├── cot21_bin1.jsonl │ ├── cot21_bin2.jsonl │ ├── cot21_bin3.jsonl │ ├── cot21_bin4.jsonl │ ├── cot21_bin5.jsonl │ ├── cot22_bin1.jsonl │ ├── cot22_bin2.jsonl │ ├── cot22_bin3.jsonl │ ├── cot22_bin4.jsonl │ ├── cot22_bin5.jsonl │ ├── cot23_bin1.jsonl │ ├── cot23_bin2.jsonl │ ├── cot23_bin3.jsonl │ ├── cot23_bin4.jsonl │ ├── cot23_bin5.jsonl │ ├── cot24_bin1.jsonl │ ├── cot24_bin2.jsonl │ ├── cot24_bin3.jsonl │ ├── cot24_bin4.jsonl │ ├── cot24_bin5.jsonl │ ├── cot25_bin1.jsonl │ ├── cot25_bin2.jsonl │ ├── cot25_bin3.jsonl │ ├── cot25_bin4.jsonl │ ├── cot25_bin5.jsonl │ ├── cot2_bin1.jsonl │ ├── cot2_bin2.jsonl │ ├── cot2_bin3.jsonl │ ├── cot2_bin4.jsonl │ ├── cot2_bin5.jsonl │ ├── cot3_bin1.jsonl │ ├── cot3_bin2.jsonl │ ├── cot3_bin3.jsonl │ ├── cot3_bin4.jsonl │ ├── cot3_bin5.jsonl │ ├── cot4_bin1.jsonl │ ├── cot4_bin2.jsonl │ ├── cot4_bin3.jsonl │ ├── cot4_bin4.jsonl │ ├── cot4_bin5.jsonl │ ├── cot5_bin1.jsonl │ ├── cot5_bin2.jsonl │ ├── cot5_bin3.jsonl │ ├── cot5_bin4.jsonl │ ├── cot5_bin5.jsonl │ ├── cot6_bin1.jsonl │ ├── cot6_bin2.jsonl │ ├── cot6_bin3.jsonl │ ├── cot6_bin4.jsonl │ ├── cot6_bin5.jsonl │ ├── cot7_bin1.jsonl │ ├── cot7_bin2.jsonl │ ├── cot7_bin3.jsonl │ ├── cot7_bin4.jsonl │ ├── cot7_bin5.jsonl │ ├── cot8_bin1.jsonl │ ├── cot8_bin2.jsonl │ ├── cot8_bin3.jsonl │ ├── cot8_bin4.jsonl │ ├── cot8_bin5.jsonl │ ├── cot9_bin1.jsonl │ ├── cot9_bin2.jsonl │ ├── cot9_bin3.jsonl │ ├── cot9_bin4.jsonl │ └── cot9_bin5.jsonl └── stimulus_generator.py /.gitignore: -------------------------------------------------------------------------------- 1 | # custom 2 | seven_letter_words/random_pairs_lower* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 113 | .pdm.toml 114 | .pdm-python 115 | .pdm-build/ 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 165 | #.idea/ 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deciphering_cot 2 | 3 | Code implementation and data for the paper: 4 | 5 | **[Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: Probability, Memorization, and Noisy Reasoning](https://arxiv.org/abs/2407.01687)** 6 | 7 | [Akshara Prabhakar](https://aksh555.github.io/), [Thomas L. Griffiths](https://cocosci.princeton.edu/tom/index.php), [R. Thomas McCoy](https://rtmccoy.com/) 8 | 9 | 10 | 11 | ## Quickstart 12 | ### Data 13 | We construct a dataset of seven-letter words divided into 5 probability bins {bin1 to bin 5} each having around 150 words (first 100 to evaluate GPT-4 and remaining to evaluate the logistic regression model that was fitted on the first 100 words). The binning is done based on the log probability value assigned by GPT-2. 14 | 15 | The seven-letter word dataset is in [seven_letter_words](seven_letter_words): 16 | - bin1_prob.txt 17 | - bin2_prob.txt 18 | - bin3_prob.txt 19 | - bin4_prob.txt 20 | - bin5_prob.txt 21 | 22 | ### Shift cipher stimuli 23 | Using the seven-letter word dataset, we prepare stimuli -- these are shift cipher encoded versions of the words from the 5 probability bins across 25 shift levels (1 to 25). 24 | 25 | The stimuli are prepared for the different types of prompts we use: `standard`, `text_cot`, `math_cot`, `number_cot`. 26 | 27 | Can be created by running, 28 | ```bash 29 | python stimulus_generator.py --prompt_type 30 | ``` 31 | 32 | ### Evaluating LLMs on shift ciphers 33 | - GPT-4: `run_openai.py` 34 | - Llama 3.1: `run_llama.py` 35 | - Claude 3: `run_claude.py` 36 | 37 | Set appropriate OpenAI, Together, Anthropic keys in the environment before running evaluations. 38 | 39 | For example to run experiments on GPT-4 with Text-CoT for shift_level=1 across all 5 bins run, 40 | ```bash 41 | python run_openai.py --tasks text_cot1 --conditions bin1,bin2,bin3,bin4,bin5 --max_tokens 200 --prompt_type text_cot 42 | ``` 43 | 44 | To evaluate the generations, run 45 | ```bash 46 | python eval.py --prompt_type text_cot --create_stats_table 47 | ``` 48 | Run this after evaluating GPT-4 across all shift levels and bins. This will generate the evluation statistics for `text_cot` across all shift levels and the `{prompt_type}_train_table.tsv` file which is the train statistics table for fitting the logistic regression. 49 | 50 | ### Logistic regression 51 | The logistic regression is implemented in R in [regression.ipynb](regression/regression.ipynb). The predictions on the test set are saved in `regression/text_cot_test_results.tsv`. 52 | 53 | ### Outputs 54 | All model generations and outputs are stored in the `logs` directory. 55 | 56 | ## Citation 57 | If you find this repository helpful, feel free to cite our [publication](https://arxiv.org/abs/2407.01687). 58 | ``` 59 | @inproceedings{prabhakar-etal-2024-deciphering, 60 | title = "Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: Probability, Memorization, and Noisy Reasoning", 61 | author = "Prabhakar, Akshara and 62 | Griffiths, Thomas L. and 63 | McCoy, R. Thomas", 64 | editor = "Al-Onaizan, Yaser and 65 | Bansal, Mohit and 66 | Chen, Yun-Nung", 67 | booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024", 68 | month = nov, 69 | year = "2024", 70 | address = "Miami, Florida, USA", 71 | publisher = "Association for Computational Linguistics", 72 | url = "https://aclanthology.org/2024.findings-emnlp.212", 73 | pages = "3710--3724" 74 | } 75 | ``` 76 | -------------------------------------------------------------------------------- /assets/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aksh555/deciphering_cot/21783c74d88ed690d657544b4503db09a1054239/assets/preview.png -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from Levenshtein import distance 4 | import statistics 5 | import jsonlines 6 | import sys 7 | import pandas as pd 8 | import argparse 9 | 10 | end_after_strings = ["Original text: ", "message is:", "original text is:", "message is ", "we get:"] 11 | # end_after_strings = ["Therefore, the original sequence of numbers is:","Original sequence:"] 12 | delete_after_strings = ["However, this doesn't make sense", "However, this doesn't make much sense", "This sentence still doesn't make", "However, this sentence doesn't make", "This still doesn't make sense"] 13 | shift_freqs = [59,21,117,5,15,12,6,3,1,3,3,7,1225,5,2,4,2,2,1,1,4,2,17,3,7] 14 | 15 | def desc(idx,gt_chain,pred_chain,gt,res): 16 | print("#", idx) 17 | print("gt_chain", gt_chain) 18 | print("----") 19 | print("pred_chain", pred_chain) 20 | print("----") 21 | print("gt", gt, "res", res) 22 | print("**************") 23 | 24 | def main(args): 25 | data_types = ["bin1","bin2","bin3","bin4","bin5"] 26 | big_df = pd.DataFrame() 27 | prompt_type = args.prompt_type 28 | fo_directory = f"logs/{prompt_type}/" 29 | temp = 0.0 30 | corrupt = False 31 | chain_check = False 32 | chain_directory = "shift_chain/" 33 | bin_probs = {} 34 | for bin in data_types: 35 | with open(f"seven_letter_words/{bin}_prob.txt", 'r') as file: 36 | second_column_words = [line.split(' ')[1].strip() for line in file][:100] 37 | bin_probs[bin] = second_column_words 38 | 39 | for shift in range(1,26): 40 | for fi_label in data_types: 41 | pred_nchars = [] 42 | input_nchars = [] 43 | corrects = [] 44 | preds = [] 45 | gts = [] 46 | small_df = pd.DataFrame() 47 | condition = prompt_type + str(shift) + "_" + fi_label 48 | if corrupt: 49 | condition += "_nohelp2" 50 | 51 | try: 52 | file = fo_directory + condition + ".json" 53 | fi = open(file, "r") 54 | if chain_check and prompt_type == "text_cot": 55 | chain_file = chain_directory + condition + ".jsonl" 56 | fi_chain = open(chain_file, "r") 57 | print(f"Loading {file}") 58 | except: 59 | print(f"\t{file} not found, skipping {fi_label} {shift}") 60 | continue 61 | print("*"*10) 62 | data = json.load(fi) 63 | if chain_check and prompt_type == "text_cot": 64 | data_chain = [] 65 | for line in fi_chain: 66 | x = json.loads(line) 67 | data_chain.append(x["chain"]) 68 | 69 | count_correct = 0 70 | count_correct_demo = 0 71 | count_total = 0 72 | total_dist = 0 73 | chain_correct_op_incorrect = 0 74 | chain_correct_op_correct = 0 75 | chain_incorrect_op_correct = 0 76 | chain_incorrect_op_incorrect = 0 77 | distances = [] 78 | for idx,(gt,res) in enumerate(zip(data["gts"], data["res"])): 79 | orig_res = res[:] 80 | 81 | for delete_after_string in delete_after_strings: 82 | if delete_after_string in res: 83 | starts = [m.start() for m in re.finditer(delete_after_string, res)] 84 | res = res[:starts[0]].strip() 85 | 86 | for end_after_string in end_after_strings: 87 | if end_after_string in res: 88 | res = res.split(end_after_string)[1].split("\n")[0].strip() 89 | if len(res) != 0: 90 | continue 91 | 92 | if gt[0] == '"': 93 | gt = gt[1:] 94 | if gt[-1] == '"': 95 | gt = gt[:-1] 96 | 97 | # if gt1[0] == '"': 98 | # gt1 = gt1[1:] 99 | # if gt1[-1] == '"': 100 | # gt1 = gt1[:-1] 101 | 102 | if len(res) != 0: 103 | if res[0] == '"': 104 | res = res[1:] 105 | if res[-1] == '"': 106 | res = res[:-1] 107 | 108 | dist = distance(gt, res) 109 | total_dist += dist 110 | distances.append(dist) 111 | 112 | if gt == res: 113 | count_correct += 1 114 | corrects.append(1) 115 | else: 116 | corrects.append(0) 117 | 118 | if chain_check and prompt_type == "text_cot": 119 | # find counts of chain correct but not output correct 120 | gt_chain = data_chain[idx].strip() 121 | pred_chain = re.split(r'Original text:', orig_res)[0].strip() 122 | if gt_chain == pred_chain: 123 | if gt != res: 124 | # desc(idx,gt_chain,pred_chain,gt,res) 125 | chain_correct_op_incorrect += 1 126 | else: 127 | chain_correct_op_correct += 1 128 | else: 129 | if gt == res: 130 | # desc() 131 | chain_incorrect_op_correct += 1 132 | else: 133 | chain_incorrect_op_incorrect += 1 134 | # stats 135 | pred_nchars.append(len(res.strip())) 136 | input_nchars.append(len(gt.strip())) 137 | preds.append(res) 138 | gts.append(gt) 139 | 140 | count_total += 1 141 | result_dict = {"condition": condition, "accuracy": count_correct*1.0/count_total, "lev_dist": total_dist*1.0/count_total, "median_levdist": statistics.median(distances), "temp": temp} 142 | print(condition, "acc_inst", count_correct*1.0/count_total, "acc_demo", count_correct_demo*1.0/count_total, "levdist:", total_dist*1.0/count_total, "median levdist:", statistics.median(distances)) 143 | 144 | ## For fine-grained analysis of 'unfaithfulness' 145 | if chain_check: 146 | result_dict.update({"chain_correct_op_correct" : chain_correct_op_correct, "chain_correct_op_incorrect" : chain_correct_op_incorrect, "chain_incorrect_op_correct" : chain_incorrect_op_correct, "chain_incorrect_op_incorrect" : chain_incorrect_op_incorrect}) 147 | print("chain correct:") 148 | print("\toutput correct:", chain_correct_op_correct, "output incorrect:", chain_correct_op_incorrect) 149 | print("chain incorrect:") 150 | print("\toutput correct:", chain_incorrect_op_correct, "output incorrect:", chain_incorrect_op_incorrect) 151 | 152 | if args.create_stats_table: 153 | with open(f'stimuli/{prompt_type}/{condition}.jsonl', 'r') as file: 154 | input_text = [] 155 | for line in file: 156 | json_obj = json.loads(line) 157 | input_text.append(json_obj.get('input', '')) 158 | 159 | ## write to huge tsv 160 | small_df["input_nchars"] = input_nchars 161 | small_df["output_logprob"] = bin_probs[fi_label] 162 | small_df["correct"] = corrects 163 | small_df["pred"] = preds 164 | small_df["gt"] = gts 165 | small_df["shift_level"] = [shift for _ in range(len(input_nchars))] 166 | small_df["shift_freq"] = [shift_freqs[shift-1] for _ in range(len(input_nchars))] 167 | small_df["input"] = input_text 168 | 169 | assert len(input_nchars) == len(pred_nchars) == len(bin_probs[fi_label]) == len(corrects) 170 | big_df = pd.concat([big_df, small_df], ignore_index=True) 171 | 172 | if args.create_stats_table: 173 | big_df.to_csv(f"regression/{prompt_type}_train_table.tsv","\t",index_label="index") 174 | 175 | if __name__ == "__main__": 176 | args = argparse.ArgumentParser() 177 | args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot") 178 | args.add_argument("--create_stats_table", action='store_true', help='default = False', default=False) 179 | args = args.parse_args() 180 | main(args) -------------------------------------------------------------------------------- /examples/bin_1.txt: -------------------------------------------------------------------------------- 1 | choosed 2 | colbert 3 | polenta 4 | modicum 5 | autarch 6 | schisms 7 | mariner 8 | disarms 9 | rescale 10 | paywall 11 | infobox 12 | preston 13 | shrines 14 | implore 15 | alloted 16 | precast 17 | borings 18 | bacilli 19 | matrice 20 | redible 21 | absolve 22 | ourself 23 | ethetic 24 | maynard 25 | calibur 26 | enviros 27 | calzone 28 | sumatra 29 | drywall 30 | impaled 31 | manland 32 | divined 33 | conlang 34 | tablero 35 | redraft 36 | equitas 37 | ratting 38 | errancy 39 | webcast 40 | lowland 41 | boyhood 42 | actuary 43 | catlike 44 | putback 45 | galileo 46 | rivaled 47 | volonte 48 | sunspot 49 | rotunda 50 | notched 51 | taproot 52 | secures 53 | entente 54 | outflow 55 | betters 56 | rumpled 57 | burried 58 | repulse 59 | fillets 60 | relator 61 | sombody 62 | unsaved 63 | ailment 64 | nodules 65 | montero 66 | satires 67 | arcadia 68 | valerie 69 | inglish 70 | dukedom 71 | espouse 72 | bedevil 73 | reticle 74 | matinee 75 | maxwell 76 | picante 77 | baboons 78 | exciter 79 | losings 80 | newbies 81 | serried 82 | curving 83 | narrows 84 | ragging 85 | baneful 86 | pinatas 87 | divison 88 | kinfolk 89 | indiana 90 | caritas 91 | silvery 92 | inkling 93 | absense 94 | lavabit 95 | outsize 96 | rewired 97 | absalom 98 | getback 99 | accuser 100 | striven 101 | maloney 102 | escaper 103 | subtile 104 | colibri 105 | delving 106 | calving 107 | tarheel 108 | herders 109 | grooved 110 | octagon 111 | bisping 112 | alluded 113 | merlion 114 | figural 115 | debater 116 | pigtail 117 | honious 118 | pinches 119 | clojure 120 | equates 121 | refiner 122 | billets 123 | alfalfa 124 | hotshot 125 | nonagon 126 | jacuzzi 127 | vincent 128 | pollock 129 | airtime -------------------------------------------------------------------------------- /examples/bin_2.txt: -------------------------------------------------------------------------------- 1 | dupasha 2 | makrita 3 | ferisse 4 | murcers 5 | metires 6 | witmost 7 | astause 8 | sekaram 9 | vilgren 10 | belomat 11 | setnest 12 | curadal 13 | viridon 14 | denpick 15 | eraully 16 | ruborie 17 | queimer 18 | cosuits 19 | rutamen 20 | graizen 21 | sonware 22 | infocos 23 | inkwang 24 | rowbots 25 | engeden 26 | vizizen 27 | molenci 28 | indotes 29 | dapener 30 | ireasti 31 | undving 32 | traumpt 33 | redrear 34 | aryanni 35 | brovoir 36 | greised 37 | networm 38 | memwill 39 | gamplus 40 | estplay 41 | sapwhat 42 | indmong 43 | kenafil 44 | denzhou 45 | cosited 46 | perzoek 47 | balinit 48 | mayonal 49 | armemic 50 | henjury 51 | lavplay 52 | calynes 53 | remfold 54 | engdist 55 | armrich 56 | luxfast 57 | mulhatt 58 | allaton 59 | strfair 60 | monachs 61 | kerapat 62 | hergrim 63 | fidgota 64 | decigan 65 | dezella 66 | haypath 67 | resonga 68 | nosband 69 | poligen 70 | mobture 71 | flufrom 72 | willose 73 | desedge 74 | momclub 75 | clobero 76 | mapauth 77 | vitelho 78 | daykick 79 | sysmite 80 | telolon 81 | onsensa 82 | vipaddy 83 | sunrink 84 | namhero 85 | voratio 86 | niliter 87 | droones 88 | zipcord 89 | pagrete 90 | funwich 91 | negbers 92 | belwich 93 | allayah 94 | pakatak 95 | farathy 96 | betweek 97 | rutanim 98 | obsster 99 | ligigid 100 | lidcore 101 | vacassa 102 | pipiday 103 | almorum 104 | sadmore 105 | hayhorn 106 | vinango 107 | cosisty 108 | libikal 109 | dogodes 110 | camcore 111 | ashmann 112 | fibunal 113 | enciere 114 | revrika 115 | perburg 116 | camilan 117 | sumarms 118 | firigin 119 | pelatra 120 | vorvery 121 | purabra 122 | indondo 123 | dogpeak 124 | alllein 125 | actblue 126 | hasvers 127 | freifty 128 | hueving 129 | coratti 130 | saprika 131 | honcoin 132 | joycons 133 | dogoids 134 | nanians 135 | dreanon 136 | spoanna 137 | levieur 138 | jawolla 139 | cowcard 140 | thehalb 141 | lamboys 142 | disorer 143 | pigwiki 144 | embious 145 | detdden 146 | vacibel -------------------------------------------------------------------------------- /examples/bin_3.txt: -------------------------------------------------------------------------------- 1 | tasvinc 2 | dblshaw 3 | cmbodka 4 | zagbbox 5 | hedoute 6 | cmsdest 7 | leoanje 8 | sitinks 9 | oweorno 10 | advpite 11 | grpwerk 12 | aesasio 13 | atequir 14 | dryhazi 15 | styansa 16 | sunincl 17 | bowamac 18 | xyzunik 19 | awsposs 20 | ogrmode 21 | midbyss 22 | ctlmony 23 | rngmony 24 | rergett 25 | phperti 26 | bfdizzy 27 | srcstit 28 | pktubic 29 | oddourd 30 | mplnick 31 | dccergy 32 | oxyhest 33 | klepled 34 | digydro 35 | aphopez 36 | rifntag 37 | srvlope 38 | emoomez 39 | toyelry 40 | iniilen 41 | iffamma 42 | adsokin 43 | eofpike 44 | dnsavia 45 | uitlesi 46 | owluntu 47 | affesda 48 | mgrulia 49 | foxmsgs 50 | esiaram 51 | subzyst 52 | ottexpo 53 | udpcolo 54 | vakdney 55 | svmvery 56 | dspereo 57 | pngpone 58 | quiilyn 59 | tgtella 60 | ithueur 61 | wynvinc 62 | sezanch 63 | sdkjabi 64 | yaninem 65 | dbgivid 66 | adeardu 67 | paykich 68 | dspdeal 69 | cptwipe 70 | nikaign 71 | pesuell 72 | musropp 73 | ebxside 74 | dnienez 75 | dccscal 76 | cmbheck 77 | stsasks 78 | hapixer 79 | nikuild 80 | wowrapy 81 | txtajes 82 | gtkoooo 83 | sutcmds 84 | erviode 85 | bewikon 86 | hubphas 87 | ervpets 88 | ofsitem 89 | gstivec 90 | utfestr 91 | etaabic 92 | tieibur 93 | islssel 94 | iodvari 95 | zagzept 96 | ustjour 97 | dexonte 98 | bizfilt 99 | adaowns 100 | tetibri 101 | octfirm 102 | weiudos 103 | pwdtick 104 | ttlarry 105 | stuimeo 106 | sqlstre 107 | mieipeg 108 | dueafen 109 | sndurge 110 | vezcorn 111 | ilketch 112 | zugenth 113 | rngiate 114 | ottclud 115 | aprkeep 116 | urlveal 117 | msgourd 118 | xlsboom 119 | wijagma 120 | robisbn 121 | melmlin 122 | samslot 123 | nidoust 124 | begkits 125 | arrflix 126 | ditfrau 127 | aidomid 128 | cptfoto 129 | aimrede 130 | dbgabay 131 | cidlocs 132 | booiedo 133 | mplders 134 | cptpush 135 | nahcalc 136 | amyovel 137 | wonczas 138 | mplrome 139 | edxesis 140 | adcadoo 141 | oudtems 142 | ociirut 143 | balzept 144 | avgcorp 145 | himocos 146 | ignlots 147 | baztrim -------------------------------------------------------------------------------- /examples/bin_4.txt: -------------------------------------------------------------------------------- 1 | voyxfff 2 | qtyijke 3 | mmculed 4 | jmpytut 5 | vtkprit 6 | oilrxjs 7 | vfsisex 8 | eenqrst 9 | nbrlyph 10 | xmmgota 11 | jmpquiv 12 | rummqtt 13 | xhrdisp 14 | ffturaa 15 | dexocht 16 | xmmgett 17 | lvljspx 18 | zugwpdb 19 | tidmqtt 20 | lhsigua 21 | sshemsp 22 | burrgyz 23 | vtkirie 24 | vtkifar 25 | rpczano 26 | vtkinez 27 | vtkifie 28 | zugymce 29 | xcbwent 30 | watobjs 31 | doiawks 32 | cgiacyj 33 | czyands 34 | mdbgebn 35 | atejspx 36 | rndxito 37 | sdkrxjs 38 | mlxoice 39 | mlxahan 40 | auxjspx 41 | jsxirms 42 | czyrgba 43 | makrgyz 44 | nanighb 45 | jsxobil 46 | jwtgraf 47 | vtkundy 48 | jsxuden 49 | pszglfw 50 | czydamn 51 | csvylko 52 | wijincl 53 | oilrgyz 54 | mlxulan 55 | xmmepar 56 | lodxlsx 57 | uczpeon 58 | sesrgyz 59 | pciavax 60 | gpsilik 61 | lhszion 62 | slaampp 63 | uczhtag 64 | ouiqrst 65 | xhrziel 66 | pcbpiar 67 | yumxfff 68 | fedjspb 69 | xmmtega 70 | segzoek 71 | mezgrpc 72 | xcbophe 73 | ngxantz 74 | aosantd 75 | jejymax 76 | rerlsru 77 | racrgyz 78 | rndquam 79 | mlxneau 80 | rudcych 81 | lotlsru 82 | abyilog 83 | rsaueba 84 | jsxioso 85 | derjspx 86 | vfsgett 87 | vtkjure 88 | phyepar 89 | vesxfff 90 | lcdleri 91 | ifsfeas 92 | mmcubbo 93 | ircemsp 94 | pdbiesz 95 | rpciene 96 | iodpiar 97 | rmslsru 98 | rpcumno 99 | apkckpt 100 | lcdvoir 101 | rhsncia 102 | owlsetq 103 | ifsbrtc 104 | csvowej 105 | xcborgt 106 | sutmobx 107 | iovstmt 108 | nanmqtt 109 | irqphem 110 | wndncia 111 | xcbided 112 | jsxkees 113 | cpscsrf 114 | jmppeon 115 | lhsreta 116 | dezrgyz 117 | elecsrf 118 | atrlymp 119 | iodudev 120 | xhrkses 121 | ngxjspx 122 | uczpear 123 | npmhlen 124 | pcmncmp 125 | biczoek 126 | dosorrh 127 | jejmisc 128 | kenjspx 129 | idxiaux 130 | svgiesz 131 | vtkgems 132 | glmldre 133 | dexumbn 134 | kitxfff 135 | jsxajan 136 | fmtmina 137 | gtkthew 138 | czyuess 139 | iodhait 140 | cafantd 141 | xcbredo 142 | fpswpdb 143 | xcbdogs 144 | jwtlify 145 | rsaellt 146 | pkgughs 147 | jmpccak 148 | pclvais -------------------------------------------------------------------------------- /examples/bin_5.txt: -------------------------------------------------------------------------------- 1 | czyjspx 2 | xcbabwe 3 | aktjspx 4 | xcbcych 5 | xcbziej 6 | xmmeczy 7 | qeddhcp 8 | xcbilha 9 | xcbacji 10 | xcbzung 11 | xmmobre 12 | xcbquir 13 | xcbrouw 14 | ilkjspx 15 | lijglfw 16 | foxrgyz 17 | jsxrouw 18 | xcbziel 19 | xcbagua 20 | eidtopl 21 | xcbximo 22 | jwtglfw 23 | xcbnerg 24 | xcbateg 25 | befjspx 26 | xcbxlim 27 | xcbsemi 28 | ketglfw 29 | lemjspx 30 | xcbcyan 31 | xcbsequ 32 | xcbemer 33 | eoscsrf 34 | xcbphot 35 | xcbeken 36 | xcbolum 37 | xcbrodu 38 | tepjspx 39 | xcbthro 40 | xcbueue 41 | oscquiv 42 | xcbubah 43 | xcbodzi 44 | mlxquee 45 | xcbmdat 46 | xcbuell 47 | xcbobre 48 | xcbuhan 49 | tasexpl 50 | xcbueil 51 | xcbilos 52 | iodtopl 53 | suttmpl 54 | xcbhots 55 | xcbosph 56 | xcbuego 57 | xcbquam 58 | kolglfw 59 | gesglfw 60 | gccorrh 61 | mezptom 62 | xcbhecy 63 | xcbsemb 64 | yiijspx 65 | meljspx 66 | xcbunos 67 | xcbunei 68 | pisbrtc 69 | vehjspx 70 | vasrgyz 71 | lhsrgyz 72 | xcbighb 73 | phyfidf 74 | kilglfw 75 | dukvrir 76 | levjspx 77 | updrgyz 78 | xcbagas 79 | opcrgyz 80 | ilkjspb 81 | curfidf 82 | rpcighb 83 | xcbacje 84 | xcbilih 85 | zugcsrf 86 | xcbveau 87 | rpcasje 88 | xcbalsy 89 | pcmrouw 90 | xcbafil 91 | doijspx 92 | xcbhtub 93 | xcbhear 94 | xcbuele 95 | opijspx 96 | xcbazzo 97 | xcboufl 98 | akojspx 99 | ninmqtt 100 | xcbguna 101 | idxorrh 102 | xcbheit 103 | czyxfff 104 | voyglfw 105 | dynmqtt 106 | xcbcoln 107 | vezjspx 108 | xcbocre 109 | cueorrh 110 | xmmacje 111 | mlxalsy 112 | ebxorrh 113 | xcbagal 114 | xcbzept 115 | xcbucle 116 | vesjspx 117 | xcbiser 118 | xcbseau 119 | xcbekte 120 | lapmqtt 121 | abyjspx 122 | xcbueba 123 | xcbijke 124 | xcbvoie 125 | xcbudem 126 | xcbivol 127 | xcbquoi 128 | xcbupal 129 | zugjspx 130 | xcbheel 131 | typglfw 132 | rpcinqu 133 | voyorrh 134 | tieglfw 135 | hexmqtt 136 | xcbacyj 137 | aktjspb 138 | amyjspx 139 | ackrgyz 140 | xcbokus 141 | xcbhtag 142 | togjspx 143 | xcbuely 144 | xcbffic 145 | mlxasje 146 | xcbunft 147 | wieglfw 148 | xcbufig 149 | xcbueur 150 | zagmqtt -------------------------------------------------------------------------------- /examples/select_swap_words.py: -------------------------------------------------------------------------------- 1 | bins = ["bin_3", "bin_4", "bin_5"] 2 | 3 | words = [] 4 | for bin in bins: 5 | with open(f"./{bin}.txt") as file: 6 | words.extend([line.strip() for line in file]) 7 | 8 | # import tiktoken 9 | # from collections import defaultdict 10 | # gpt4_enc = tiktoken.get_encoding("cl100k_base") 11 | 12 | # score_words_dict = defaultdict(list) 13 | 14 | # for word in words: 15 | # tokens = len(gpt4_enc.encode(word)) 16 | # score_words_dict[tokens].append(word) 17 | 18 | alphabet = "abcdefghijklmnopqrstuvwxyz" 19 | index2char = {} 20 | char2index = {} 21 | for index, char in enumerate(alphabet): 22 | index2char[index] = char 23 | char2index[char] = index 24 | 25 | # similar_pairs = [] 26 | # for score, words_with_score in score_words_dict.items(): 27 | # for i in range(len(words_with_score)): 28 | # word1 = words_with_score[i] 29 | # word2 = "" 30 | # for char in word1: 31 | # word2 += index2char[(char2index[char]+25)%26] 32 | # print(word1, word2) 33 | # if word2 in words: 34 | # similar_pairs.append((word1, word2)) 35 | 36 | # print(len(similar_pairs)) 37 | # print(similar_pairs) 38 | 39 | 40 | import os 41 | os.environ['TRANSFORMERS_CACHE'] = "/n/fs/codeval/cache" 42 | os.environ['HF_DATASETS_CACHE'] = "/n/fs/codeval/cache" 43 | os.environ['HF_HOME'] = "/n/fs/codeval/cache" 44 | os.environ['HF_HUB_CACHE'] = "/n/fs/codeval/cache" 45 | 46 | import torch 47 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 48 | import tiktoken 49 | import logging 50 | import json 51 | import pandas as pd 52 | 53 | logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")]) 54 | 55 | if torch.cuda.is_available(): 56 | device = "cuda" 57 | else: 58 | device = "cpu" 59 | 60 | gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl") 61 | gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device) 62 | gpt4_enc = tiktoken.get_encoding("cl100k_base") 63 | 64 | def pad_batch(batch, pad_idx): 65 | max_length = 0 66 | for seq in batch: 67 | if len(seq) > max_length: 68 | max_length = len(seq) 69 | 70 | new_batch = [] 71 | for seq in batch: 72 | padding = [pad_idx for i in range(max_length - len(seq))] 73 | new_batch.append(seq + padding) 74 | 75 | return new_batch 76 | 77 | # Get perplexity using GPT-2 78 | def prob_gpt2(sentence_list): 79 | 80 | # Tokenize the sentences 81 | all_tokens = [] 82 | for sentence in sentence_list: 83 | tokens = gpt2_tokenizer.encode(sentence) 84 | all_tokens.append(tokens) 85 | tokens = pad_batch(all_tokens, 50256) 86 | 87 | targets = tokens[:] 88 | 89 | # Compute average log likelihood for the generation 90 | input_ids = torch.LongTensor(tokens).to(device) 91 | target_ids = torch.LongTensor(targets).to(device) 92 | 93 | with torch.no_grad(): 94 | outputs = gpt2_model(input_ids, labels=target_ids) 95 | logits = outputs[1] 96 | logits = logits.transpose(0,1)[:-1].transpose(0,1) 97 | target_ids = target_ids.transpose(0,1)[1:].transpose(0,1) 98 | loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1)) 99 | loss = loss.reshape(target_ids.shape).sum(dim=1) 100 | neg_log_likelihood = -1*loss 101 | 102 | 103 | # 13.357776641845703 = logprob('The word is"'); removing this to just get 104 | # the word prob 105 | return neg_log_likelihood + 13.357776641845703 106 | 107 | 108 | # folder_path = "/n/fs/codeval/embers-of-autoregression/extension/stimuli/word/" 109 | # file_list = sorted([os.path.join(folder_path, f) for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])[:1] 110 | file_list = [1] 111 | 112 | num_token_mis = 0 113 | for finame in file_list: 114 | # print(finame, end="**\n") 115 | # with open(finame, 'r') as f: 116 | # lines = f.readlines() 117 | # lines = [json.loads(line) for line in lines] 118 | # fi = [line['input'] for line in lines] 119 | # print("Lines", len(fi)) 120 | # fo = open("input_scored.txt", "a") 121 | word_list = words 122 | print("Lines", len(word_list)) 123 | 124 | words_with_prob = [] 125 | word_pairs = [] 126 | 127 | this_batch_sentences = [] 128 | this_batch_word1s = [] 129 | this_batch_words = [] 130 | num_tokens = [] 131 | for index, line in enumerate(word_list): 132 | if index % 10000 == 0: 133 | logging.info(str(index)) 134 | 135 | word = line.strip() 136 | check_shifts = [2] 137 | for check_shift in check_shifts: 138 | word2 = "" 139 | word1 = "" 140 | for char in word: 141 | word1 += index2char[(char2index[char]+1)%26] 142 | word2 += index2char[(char2index[char]+check_shift)%26] 143 | 144 | tokens = gpt4_enc.encode(word1) 145 | tokens_word2 = gpt4_enc.encode(word2) 146 | if len(tokens) > 4 or len(tokens) != len(tokens_word2): 147 | # print(word1, word2, len(tokens), len(tokens_word2)) 148 | num_token_mis += 1 149 | continue 150 | 151 | tokens_spaced = gpt4_enc.encode(" " + word2) 152 | 153 | this_batch_sentences.append('The word is "' + word2 + '"') 154 | this_batch_words.append(word2) 155 | num_tokens.append(len(tokens)) 156 | this_batch_word1s.append(word1) 157 | 158 | # if len(tokens) == 2 and len(tokens_spaced) == 2 and len(word) == 7: 159 | # token1 = gpt4_enc.decode([tokens[0]]).strip() 160 | # token2 = gpt4_enc.decode([tokens[1]]).strip() 161 | 162 | # tokenspaced1 = gpt4_enc.decode([tokens_spaced[0]]).strip() 163 | # tokenspaced2 = gpt4_enc.decode([tokens_spaced[1]]).strip() 164 | 165 | # if len(token1) == 3 and len(token2) == 4 and len(tokenspaced1) == 3 and len(tokenspaced2) == 4: 166 | # this_batch_sentences.append('The word is "' + word + '"') 167 | # this_batch_words.append(word) 168 | # else: 169 | # print(index, "Wrong length", word, len(token1), len(token2), len(tokenspaced1), len(tokenspaced2)) 170 | # else: 171 | # print(index, "Wrong length", word, len(tokens), len(tokens_spaced), len(word)) 172 | 173 | if len(this_batch_sentences) == 3000: 174 | logprobs = prob_gpt2(this_batch_sentences) 175 | for word1, word2, logprob in zip(this_batch_word1s, this_batch_words, logprobs): 176 | words_with_prob.append(logprob.item()) 177 | if logprob.item() >= -45 and logprob.item() < -30: 178 | word_pairs.append([word1, word2]) 179 | this_batch_sentences = [] 180 | this_batch_words = [] 181 | this_batch_word1s = [] 182 | 183 | if len(this_batch_sentences) > 0: 184 | logprobs = prob_gpt2(this_batch_sentences) 185 | for word1, word2, logprob in zip(this_batch_word1s, this_batch_words, logprobs): 186 | words_with_prob.append(logprob.item()) 187 | if logprob.item() > -45 and logprob.item() < -30: 188 | x = prob_gpt2(['The word is "' + word1 + '"'])[-1].item() 189 | if x > -45 and x < -30: 190 | word_pairs.append([word1, word2]) 191 | print("missed 2", word1, word2, x, logprob.item()) 192 | this_batch_sentences = [] 193 | this_batch_words = [] 194 | this_batch_word1s = [] 195 | 196 | print(num_token_mis) 197 | print(len(word_pairs)) 198 | f = open("./word_pairs_lowbins.txt", 'a+') 199 | for pair in word_pairs: 200 | f.write(pair[0] + "\t" + pair[1] + "\n") 201 | 202 | f.close() 203 | 204 | 205 | -------------------------------------------------------------------------------- /examples/word_pairs_lowbins.txt: -------------------------------------------------------------------------------- 1 | ubtwjod vcuxkpe 2 | ecmtibx fdnujcy 3 | ifepvuf jgfqwvg 4 | dnteftu eoufguv 5 | tjujolt ukvkpmu 6 | bewqjuf cfxrkvg 7 | hsqxfsl itrygtm 8 | bftbtjp cgucukq 9 | bufrvjs cvgswkt 10 | tuzbotb uvacpuc 11 | cpxbnbd dqycoce 12 | yzavojl zabwpkm 13 | phsnpef qitoqfg 14 | dumnpoz evnoqpa 15 | sohnpoz tpioqpa 16 | cgejaaz dhfkbba 17 | tsdtuju uteuvkv 18 | qluvcjd rmvwdke 19 | peepvse qffqwtf 20 | lmfqmfe mngrngf 21 | sjgoubh tkhpvci 22 | fnppnfa goqqogb 23 | jojjmfo kpkkngp 24 | eotbwjb fpucxkc 25 | vjumftj wkvnguk 26 | bggfteb chhgufc 27 | nhsvmjb oitwnkc 28 | ftjbsbn gukctco 29 | tvcaztu uwdbauv 30 | puufyqp qvvgzrq 31 | veqdpmp wfreqnq 32 | wbleofz xcmfpga 33 | etqfsfp furgtgq 34 | qohqpof rpirqpg 35 | uhufmmb vivgnnc 36 | juivfvs kvjwgwt 37 | telkbcj ufmlcdk 38 | echjwje fdikxkf 39 | qbzljdi rcamkej 40 | ojlbjho pkmckip 41 | qftvfmm rguwgnn 42 | nvtspqq owutqrr 43 | fcytjef gdzukfg 44 | eojfofa fpkgpgb 45 | eddtdbm feeuecn 46 | dncifdl eodjgem 47 | hulpppp ivmqqqq 48 | cfxjlpo dgykmqp 49 | vugftus wvhguvt 50 | ujfjcvs vkgkdwt 51 | jtmttfm kunuugn 52 | efypouf fgzqpvg 53 | cjagjmu dkbhknv 54 | ufujcsj vgvkdtk 55 | pdugjsn qevhkto 56 | xfjvept ygkwfqu 57 | qxeujdl ryfvkem 58 | uumbssz vvnctta 59 | tuvjnfp uvwkogq 60 | trmtusf usnuvtg 61 | evfbgfo fwgchgp 62 | toevshf upfwtig 63 | jmlfudi knmgvej 64 | sohjbuf tpikcvg 65 | ymtcppn znudqqo 66 | spcjtco tqdkudp 67 | ojepvtu pkfqwuv 68 | cfhljut dgimkvu 69 | bjepnje ckfqokf 70 | dqugpup ervhqvq 71 | echbcbz fdicdca 72 | bnzpwfm coaqxgn 73 | feyftjt gfzguku 74 | pveufnt qwfvgou 75 | ijnpdpt jkoqequ 76 | wpzyggg xqazhhh 77 | ruzjklf svaklmg 78 | nndvmfe ooewngf 79 | wulqsju xvmrtkv 80 | fforstu ggpstuv 81 | ocsmzqi pdtnarj 82 | yisejtq zjtfkur 83 | gguvsbb hhvwtcc 84 | ynnhfuu zooigvv 85 | avhxqec bwiyrfd 86 | mitjhvb njukiwc 87 | ttifntq uujgour 88 | cvsshza dwttiab 89 | wuljsjf xvmktkg 90 | sqdabop trebcpq 91 | ydcxfou zedygpv 92 | epjbxlt fqkcymu 93 | dhjbdzk eikceal 94 | dazboet ebacpfu 95 | bufktqy cvglurz 96 | telsykt ufmtzlu 97 | nmypjdf onzqkeg 98 | ktyjsnt luzktou 99 | dazshcb ebatidc 100 | nblshza ocmtiab 101 | obojhic pcpkijd 102 | wulvoez xvmwpfa 103 | nmyvmbo onzwncp 104 | vdaqfpo webrgqp 105 | qdjbwby rekcxcz 106 | hqtjmjl iruknkm 107 | vdaiubh webjvci 108 | pvjrstu qwkstuv 109 | gfektqc hgflurd 110 | ydcpqif zedqrjg 111 | ohyboua pizcpvb 112 | sfsmtsv tgtnutw 113 | svedzdi twfeaej 114 | mpumtsv nqvnutw 115 | bczjmph cdaknqi 116 | stbvfcb tucwgdc 117 | wgthfuu xhuigvv 118 | qizfqbs rjagrct 119 | wftyggg xguzhhh 120 | mdemfsj nefngtk 121 | sntmtsv tounutw 122 | sqdvnop trewopq 123 | mdewpjs nefxqkt 124 | jgtcsud khudtve 125 | tvunpcy uwvoqdz 126 | xoeodjb ypfpekc 127 | ydcjefe zedkfgf 128 | ktylfft luzmggu 129 | busmznq cvtnaor 130 | yisltft zjtmugu 131 | vdaqfbs webrgct 132 | eptpssi fquqttj 133 | wulhfnt xvmigou 134 | hmnmesf inonftg 135 | efyvnco fgzwodp 136 | ljuyggg mkvzhhh 137 | ktybkbo luzclcp 138 | huluifx ivmvjgy 139 | ydcepht zedfqiu 140 | stbfmmu tucgnnv 141 | qdmwbjt renxcku 142 | ydcbcxf zedcdyg 143 | bluktqy cmvlurz 144 | ydcdzdi zedeaej 145 | ydcspvx zedtqwy 146 | gpyshza hqztiab 147 | ydcbufh zedcvgi 148 | ydcdzbo zedeacp 149 | ydcflfo zedgmgp 150 | ydcspev zedtqfw 151 | ydcuisp zedvjtq 152 | ydcpeaj zedqfbk 153 | ydcvfmm zedwgnn 154 | ydcpcsf zedqdtg 155 | ubtfyqm vcugzrn 156 | ydcptqi zedqurj 157 | ydcvfhp zedwgiq 158 | hddpssi ieeqttj 159 | ydcifdz zedjgea 160 | qjtcsud rkudtve 161 | ljmhmgx mkninhy 162 | evlwsjs fwmxtkt 163 | vqeshza wrftiab 164 | ydcbhbt zedcicu 165 | pqdshza qretiab 166 | dvsgjeg ewthkfh 167 | qdnspvx reotqwy 168 | ydciuvc zedjvwd 169 | ydcifbs zedjgct 170 | ydcbaap zedcbbq 171 | ydcpvgm zedqwhn 172 | blpktqy cmqlurz 173 | ydcifju zedjgkv 174 | wpzhmgx xqainhy 175 | wfaktqy xgblurz 176 | ydcpdsf zedqetg 177 | ynnbdkf zoocelg 178 | ydcbhbm zedcicn 179 | ydcjtfs zedkugt 180 | mbqnruu ncrosvv 181 | ydcwpjf zedxqkg 182 | ydcrvpj zedswqk 183 | ydciffm zedjggn 184 | sqdjorv trekpsw 185 | ujfhmgx vkginhy 186 | ifynruu jgzosvv 187 | bluktqc cmvlurd 188 | ydcplvt zedqmwu 189 | ydciubh zedjvci 190 | ubtwjod vcuxkpe 191 | ecmtibx fdnujcy 192 | ifepvuf jgfqwvg 193 | dnteftu eoufguv 194 | tjujolt ukvkpmu 195 | bewqjuf cfxrkvg 196 | hsqxfsl itrygtm 197 | bftbtjp cgucukq 198 | bufrvjs cvgswkt 199 | tuzbotb uvacpuc 200 | cpxbnbd dqycoce 201 | yzavojl zabwpkm 202 | phsnpef qitoqfg 203 | dumnpoz evnoqpa 204 | sohnpoz tpioqpa 205 | cgejaaz dhfkbba 206 | tsdtuju uteuvkv 207 | qluvcjd rmvwdke 208 | peepvse qffqwtf 209 | sjgoubh tkhpvci 210 | fnppnfa goqqogb 211 | jojjmfo kpkkngp 212 | eotbwjb fpucxkc 213 | vjumftj wkvnguk 214 | bggfteb chhgufc 215 | nhsvmjb oitwnkc 216 | ftjbsbn gukctco 217 | tvcaztu uwdbauv 218 | puufyqp qvvgzrq 219 | veqdpmp wfreqnq 220 | wbleofz xcmfpga 221 | etqfsfp furgtgq 222 | qohqpof rpirqpg 223 | uhufmmb vivgnnc 224 | juivfvs kvjwgwt 225 | telkbcj ufmlcdk 226 | echjwje fdikxkf 227 | qbzljdi rcamkej 228 | ojlbjho pkmckip 229 | qftvfmm rguwgnn 230 | nvtspqq owutqrr 231 | fcytjef gdzukfg 232 | eojfofa fpkgpgb 233 | eddtdbm feeuecn 234 | dncifdl eodjgem 235 | hulpppp ivmqqqq 236 | cfxjlpo dgykmqp 237 | vugftus wvhguvt 238 | ujfjcvs vkgkdwt 239 | jtmttfm kunuugn 240 | efypouf fgzqpvg 241 | cjagjmu dkbhknv 242 | ufujcsj vgvkdtk 243 | pdugjsn qevhkto 244 | xfjvept ygkwfqu 245 | qxeujdl ryfvkem 246 | uumbssz vvnctta 247 | tuvjnfp uvwkogq 248 | trmtusf usnuvtg 249 | evfbgfo fwgchgp 250 | toevshf upfwtig 251 | jmlfudi knmgvej 252 | sohjbuf tpikcvg 253 | ymtcppn znudqqo 254 | spcjtco tqdkudp 255 | ojepvtu pkfqwuv 256 | cfhljut dgimkvu 257 | bjepnje ckfqokf 258 | dqugpup ervhqvq 259 | echbcbz fdicdca 260 | bnzpwfm coaqxgn 261 | feyftjt gfzguku 262 | pveufnt qwfvgou 263 | ijnpdpt jkoqequ 264 | wpzyggg xqazhhh 265 | ruzjklf svaklmg 266 | nndvmfe ooewngf 267 | wulqsju xvmrtkv 268 | fforstu ggpstuv 269 | ocsmzqi pdtnarj 270 | yisejtq zjtfkur 271 | gguvsbb hhvwtcc 272 | ynnhfuu zooigvv 273 | avhxqec bwiyrfd 274 | mitjhvb njukiwc 275 | ttifntq uujgour 276 | cvsshza dwttiab 277 | wuljsjf xvmktkg 278 | sqdabop trebcpq 279 | ydcxfou zedygpv 280 | epjbxlt fqkcymu 281 | dhjbdzk eikceal 282 | dazboet ebacpfu 283 | bufktqy cvglurz 284 | telsykt ufmtzlu 285 | nmypjdf onzqkeg 286 | ktyjsnt luzktou 287 | dazshcb ebatidc 288 | nblshza ocmtiab 289 | obojhic pcpkijd 290 | wulvoez xvmwpfa 291 | nmyvmbo onzwncp 292 | vdaqfpo webrgqp 293 | qdjbwby rekcxcz 294 | hqtjmjl iruknkm 295 | vdaiubh webjvci 296 | pvjrstu qwkstuv 297 | gfektqc hgflurd 298 | ohyboua pizcpvb 299 | sfsmtsv tgtnutw 300 | svedzdi twfeaej 301 | mpumtsv nqvnutw 302 | bczjmph cdaknqi 303 | stbvfcb tucwgdc 304 | wgthfuu xhuigvv 305 | qizfqbs rjagrct 306 | wftyggg xguzhhh 307 | mdemfsj nefngtk 308 | sntmtsv tounutw 309 | sqdvnop trewopq 310 | mdewpjs nefxqkt 311 | jgtcsud khudtve 312 | tvunpcy uwvoqdz 313 | xoeodjb ypfpekc 314 | ydcjefe zedkfgf 315 | ktylfft luzmggu 316 | busmznq cvtnaor 317 | yisltft zjtmugu 318 | vdaqfbs webrgct 319 | eptpssi fquqttj 320 | wulhfnt xvmigou 321 | hmnmesf inonftg 322 | efyvnco fgzwodp 323 | ljuyggg mkvzhhh 324 | ktybkbo luzclcp 325 | huluifx ivmvjgy 326 | ydcepht zedfqiu 327 | stbfmmu tucgnnv 328 | qdmwbjt renxcku 329 | ydcbcxf zedcdyg 330 | bluktqy cmvlurz 331 | ydcdzdi zedeaej 332 | ydcspvx zedtqwy 333 | gpyshza hqztiab 334 | ydcbufh zedcvgi 335 | ydcdzbo zedeacp 336 | ydcflfo zedgmgp 337 | ydcspev zedtqfw 338 | ydcpeaj zedqfbk 339 | ydcvfmm zedwgnn 340 | ydcpcsf zedqdtg 341 | ubtfyqm vcugzrn 342 | ydcptqi zedqurj 343 | ydcvfhp zedwgiq 344 | hddpssi ieeqttj 345 | ydcifdz zedjgea 346 | qjtcsud rkudtve 347 | ljmhmgx mkninhy 348 | evlwsjs fwmxtkt 349 | vqeshza wrftiab 350 | ydcbhbt zedcicu 351 | pqdshza qretiab 352 | dvsgjeg ewthkfh 353 | qdnspvx reotqwy 354 | ydciuvc zedjvwd 355 | ydcifbs zedjgct 356 | ydcbaap zedcbbq 357 | ydcpvgm zedqwhn 358 | blpktqy cmqlurz 359 | ydcifju zedjgkv 360 | wpzhmgx xqainhy 361 | wfaktqy xgblurz 362 | ydcpdsf zedqetg 363 | ynnbdkf zoocelg 364 | ydcbhbm zedcicn 365 | ydcjtfs zedkugt 366 | mbqnruu ncrosvv 367 | ydcwpjf zedxqkg 368 | ydcrvpj zedswqk 369 | ydciffm zedjggn 370 | sqdjorv trekpsw 371 | ujfhmgx vkginhy 372 | ifynruu jgzosvv 373 | bluktqc cmvlurd 374 | ydcplvt zedqmwu 375 | ydciubh zedjvci 376 | -------------------------------------------------------------------------------- /logs/basic/claude-3/results.jsonl: -------------------------------------------------------------------------------- 1 | {"condition": "basic1_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 2 | {"condition": "basic1_bin2", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.25, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 3 | {"condition": "basic1_bin3", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 2.2, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 4 | {"condition": "basic1_bin4", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 2.34, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 5 | {"condition": "basic1_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 2.21, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 6 | {"condition": "basic2_bin1", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.36, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 7 | {"condition": "basic2_bin2", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.18, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 8 | {"condition": "basic2_bin3", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.26, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 9 | {"condition": "basic2_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.01, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 10 | {"condition": "basic2_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.76, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 11 | {"condition": "basic3_bin1", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.84, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 12 | {"condition": "basic3_bin2", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 1.56, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.63, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0} 14 | {"condition": "basic3_bin4", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 3.92, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 15 | {"condition": "basic3_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.75, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 16 | {"condition": "basic4_bin1", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 3.48, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 17 | {"condition": "basic4_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.17, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 18 | {"condition": "basic4_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.28, "median_levdist": 4.5, "model": "claude-3", "temp": 0.0} 19 | {"condition": "basic4_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.65, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 20 | {"condition": "basic4_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 21 | {"condition": "basic5_bin1", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.12, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 22 | {"condition": "basic5_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.2, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 23 | {"condition": "basic5_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.85, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 24 | {"condition": "basic5_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.16, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.7, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 26 | {"condition": "basic6_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.54, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.21, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 28 | {"condition": "basic6_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.16, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 29 | {"condition": "basic6_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.3, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 30 | {"condition": "basic6_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.81, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 31 | {"condition": "basic7_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.11, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 32 | {"condition": "basic7_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.55, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.51, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 34 | {"condition": "basic7_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.65, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.31, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 36 | {"condition": "basic8_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 37 | {"condition": "basic8_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.45, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 38 | {"condition": "basic8_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.49, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.68, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 41 | {"condition": "basic9_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.6, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.08, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.96, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.03, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.66, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 46 | {"condition": "basic10_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.75, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 49 | {"condition": "basic10_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.93, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.42, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.79, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.04, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.08, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.4, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.59, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.66, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.67, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.86, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.39, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 61 | {"condition": "basic13_bin1", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 2.58, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 62 | {"condition": "basic13_bin2", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.58, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 63 | {"condition": "basic13_bin3", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 4.53, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 64 | {"condition": "basic13_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 4.28, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 65 | {"condition": "basic13_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.95, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 66 | {"condition": "basic14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.78, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.97, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.26, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.67, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.07, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.95, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.13, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.32, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.68, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.78, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.2, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.65, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 81 | {"condition": "basic17_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.85, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.13, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.39, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.55, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.09, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.64, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.8, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.87, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.98, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.1, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.58, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.57, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.34, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.27, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.47, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.58, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.49, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.0, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.18, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.33, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.17, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.24, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.66, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.69, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 111 | {"condition": "basic23_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.05, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 112 | {"condition": "basic23_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.48, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 113 | {"condition": "basic23_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.7, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 114 | {"condition": "basic23_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.48, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 115 | {"condition": "basic23_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.65, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 116 | {"condition": "basic24_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 117 | {"condition": "basic24_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.31, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 118 | {"condition": "basic24_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.68, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 119 | {"condition": "basic24_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 120 | {"condition": "basic24_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.41, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 121 | {"condition": "basic25_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 2.98, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 122 | {"condition": "basic25_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.81, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 123 | {"condition": "basic25_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.36, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 124 | {"condition": "basic25_bin4", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.83, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 125 | {"condition": "basic25_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.64, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 126 | -------------------------------------------------------------------------------- /logs/basic/llama3.1-405b/results.jsonl: -------------------------------------------------------------------------------- 1 | {"condition": "basic1_bin1", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 13.15, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 2 | {"condition": "basic1_bin2", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 8.68, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 3 | {"condition": "basic1_bin3", "acc_inst": 0.24, "acc_demo": 0.0, "levdist": 15.0, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 4 | {"condition": "basic1_bin4", "acc_inst": 0.38, "acc_demo": 0.0, "levdist": 34.17, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 5 | {"condition": "basic1_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 13.83, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 6 | {"condition": "basic2_bin1", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 106.78, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 7 | {"condition": "basic2_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 139.94, "median_levdist": 68.0, "model": "claude-3", "temp": 0.0} 8 | {"condition": "basic2_bin3", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 146.08, "median_levdist": 186.5, "model": "claude-3", "temp": 0.0} 9 | {"condition": "basic2_bin4", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 191.0, "model": "claude-3", "temp": 0.0} 10 | {"condition": "basic2_bin5", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 199.82, "median_levdist": 188.0, "model": "claude-3", "temp": 0.0} 11 | {"condition": "basic3_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 107.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 12 | {"condition": "basic3_bin2", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 148.21, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 228.4, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0} 14 | {"condition": "basic3_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 233.82, "median_levdist": 199.5, "model": "claude-3", "temp": 0.0} 15 | {"condition": "basic3_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 223.57, "median_levdist": 192.5, "model": "claude-3", "temp": 0.0} 16 | {"condition": "basic4_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 135.45, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0} 17 | {"condition": "basic4_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 118.88, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0} 18 | {"condition": "basic4_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 129.08, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 19 | {"condition": "basic4_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 218.76, "median_levdist": 38.0, "model": "claude-3", "temp": 0.0} 20 | {"condition": "basic4_bin5", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 418.91, "median_levdist": 561.5, "model": "claude-3", "temp": 0.0} 21 | {"condition": "basic5_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 266.24, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0} 22 | {"condition": "basic5_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 262.9, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0} 23 | {"condition": "basic5_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 297.63, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0} 24 | {"condition": "basic5_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.22, "median_levdist": 299.5, "model": "claude-3", "temp": 0.0} 25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.26, "median_levdist": 301.0, "model": "claude-3", "temp": 0.0} 26 | {"condition": "basic6_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 256.99, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0} 27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 234.49, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0} 28 | {"condition": "basic6_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 203.78, "median_levdist": 196.5, "model": "claude-3", "temp": 0.0} 29 | {"condition": "basic6_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 277.69, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0} 30 | {"condition": "basic6_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 354.46, "median_levdist": 423.0, "model": "claude-3", "temp": 0.0} 31 | {"condition": "basic7_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 165.28, "median_levdist": 191.5, "model": "claude-3", "temp": 0.0} 32 | {"condition": "basic7_bin2", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 187.12, "median_levdist": 192.0, "model": "claude-3", "temp": 0.0} 33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 218.5, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0} 34 | {"condition": "basic7_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 233.28, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0} 35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 251.88, "median_levdist": 200.5, "model": "claude-3", "temp": 0.0} 36 | {"condition": "basic8_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 219.92, "median_levdist": 194.0, "model": "claude-3", "temp": 0.0} 37 | {"condition": "basic8_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 213.79, "median_levdist": 201.5, "model": "claude-3", "temp": 0.0} 38 | {"condition": "basic8_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 241.55, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0} 39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 270.14, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0} 40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 247.3, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0} 41 | {"condition": "basic9_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.68, "median_levdist": 277.5, "model": "claude-3", "temp": 0.0} 42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.41, "median_levdist": 276.0, "model": "claude-3", "temp": 0.0} 43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 335.97, "median_levdist": 298.5, "model": "claude-3", "temp": 0.0} 44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 381.66, "median_levdist": 390.0, "model": "claude-3", "temp": 0.0} 45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 320.68, "median_levdist": 310.5, "model": "claude-3", "temp": 0.0} 46 | {"condition": "basic10_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0} 47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 250.25, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0} 48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 223.85, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0} 49 | {"condition": "basic10_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 282.61, "median_levdist": 277.0, "model": "claude-3", "temp": 0.0} 50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 310.79, "median_levdist": 291.0, "model": "claude-3", "temp": 0.0} 51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.69, "median_levdist": 244.5, "model": "claude-3", "temp": 0.0} 52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 336.13, "median_levdist": 281.5, "model": "claude-3", "temp": 0.0} 53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.86, "median_levdist": 229.0, "model": "claude-3", "temp": 0.0} 54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 344.3, "median_levdist": 216.0, "model": "claude-3", "temp": 0.0} 55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 505.99, "median_levdist": 632.0, "model": "claude-3", "temp": 0.0} 56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.18, "median_levdist": 220.5, "model": "claude-3", "temp": 0.0} 57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 299.78, "median_levdist": 275.0, "model": "claude-3", "temp": 0.0} 58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 292.63, "median_levdist": 200.0, "model": "claude-3", "temp": 0.0} 59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.36, "median_levdist": 294.5, "model": "claude-3", "temp": 0.0} 60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 385.7, "median_levdist": 325.0, "model": "claude-3", "temp": 0.0} 61 | {"condition": "basic13_bin1", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 22.45, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 62 | {"condition": "basic13_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 42.08, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 63 | {"condition": "basic13_bin3", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 41.13, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 64 | {"condition": "basic13_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 41.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 65 | {"condition": "basic13_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 105.25, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 66 | {"condition": "basic14_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 318.19, "median_levdist": 208.5, "model": "claude-3", "temp": 0.0} 67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 257.47, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.1, "median_levdist": 270.5, "model": "claude-3", "temp": 0.0} 69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 317.07, "median_levdist": 280.5, "model": "claude-3", "temp": 0.0} 70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.46, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0} 71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.07, "median_levdist": 286.0, "model": "claude-3", "temp": 0.0} 72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.85, "median_levdist": 287.5, "model": "claude-3", "temp": 0.0} 73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.29, "median_levdist": 304.0, "model": "claude-3", "temp": 0.0} 74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 357.54, "median_levdist": 306.5, "model": "claude-3", "temp": 0.0} 75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 328.12, "median_levdist": 290.0, "model": "claude-3", "temp": 0.0} 76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 402.07, "median_levdist": 340.0, "model": "claude-3", "temp": 0.0} 77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 370.82, "median_levdist": 317.0, "model": "claude-3", "temp": 0.0} 78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 366.68, "median_levdist": 324.5, "model": "claude-3", "temp": 0.0} 79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 419.24, "median_levdist": 445.0, "model": "claude-3", "temp": 0.0} 80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.26, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0} 81 | {"condition": "basic17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 296.78, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0} 82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 318.12, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.44, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.41, "median_levdist": 198.5, "model": "claude-3", "temp": 0.0} 85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 355.59, "median_levdist": 229.5, "model": "claude-3", "temp": 0.0} 86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.81, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0} 87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.19, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0} 88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.76, "median_levdist": 310.0, "model": "claude-3", "temp": 0.0} 89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.78, "median_levdist": 331.0, "model": "claude-3", "temp": 0.0} 90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.42, "median_levdist": 401.0, "model": "claude-3", "temp": 0.0} 91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.4, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 312.73, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 311.94, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 291.75, "median_levdist": 198.0, "model": "claude-3", "temp": 0.0} 95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 258.78, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 354.28, "median_levdist": 360.0, "model": "claude-3", "temp": 0.0} 97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 308.07, "median_levdist": 299.0, "model": "claude-3", "temp": 0.0} 98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 300.86, "median_levdist": 311.5, "model": "claude-3", "temp": 0.0} 99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 254.9, "median_levdist": 194.5, "model": "claude-3", "temp": 0.0} 100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 217.92, "median_levdist": 8.5, "model": "claude-3", "temp": 0.0} 101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.48, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0} 102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 349.05, "median_levdist": 235.0, "model": "claude-3", "temp": 0.0} 103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.02, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0} 104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 386.56, "median_levdist": 314.0, "model": "claude-3", "temp": 0.0} 105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.54, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0} 106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 426.68, "median_levdist": 339.5, "model": "claude-3", "temp": 0.0} 107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 410.56, "median_levdist": 348.5, "model": "claude-3", "temp": 0.0} 108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 440.15, "median_levdist": 492.5, "model": "claude-3", "temp": 0.0} 109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 430.12, "median_levdist": 435.0, "model": "claude-3", "temp": 0.0} 110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.37, "median_levdist": 372.5, "model": "claude-3", "temp": 0.0} 111 | -------------------------------------------------------------------------------- /logs/basic/llama3.1-405b/results1.jsonl: -------------------------------------------------------------------------------- 1 | {"condition": "basic1_bin1", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 13.15, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 2 | {"condition": "basic1_bin2", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 8.68, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 3 | {"condition": "basic1_bin3", "acc_inst": 0.24, "acc_demo": 0.0, "levdist": 15.0, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 4 | {"condition": "basic1_bin4", "acc_inst": 0.38, "acc_demo": 0.0, "levdist": 34.17, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 5 | {"condition": "basic1_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 13.83, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 6 | {"condition": "basic2_bin1", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 106.78, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 7 | {"condition": "basic2_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 139.94, "median_levdist": 68.0, "model": "claude-3", "temp": 0.0} 8 | {"condition": "basic2_bin3", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 146.08, "median_levdist": 186.5, "model": "claude-3", "temp": 0.0} 9 | {"condition": "basic2_bin4", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 191.0, "model": "claude-3", "temp": 0.0} 10 | {"condition": "basic2_bin5", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 199.82, "median_levdist": 188.0, "model": "claude-3", "temp": 0.0} 11 | {"condition": "basic3_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 107.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 12 | {"condition": "basic3_bin2", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 148.21, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 228.4, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0} 14 | {"condition": "basic3_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 233.82, "median_levdist": 199.5, "model": "claude-3", "temp": 0.0} 15 | {"condition": "basic3_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 223.57, "median_levdist": 192.5, "model": "claude-3", "temp": 0.0} 16 | {"condition": "basic4_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 135.45, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0} 17 | {"condition": "basic4_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 118.88, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0} 18 | {"condition": "basic4_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 129.08, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 19 | {"condition": "basic4_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 218.76, "median_levdist": 38.0, "model": "claude-3", "temp": 0.0} 20 | {"condition": "basic4_bin5", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 418.91, "median_levdist": 561.5, "model": "claude-3", "temp": 0.0} 21 | {"condition": "basic5_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 266.24, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0} 22 | {"condition": "basic5_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 262.9, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0} 23 | {"condition": "basic5_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 297.63, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0} 24 | {"condition": "basic5_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.22, "median_levdist": 299.5, "model": "claude-3", "temp": 0.0} 25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.26, "median_levdist": 301.0, "model": "claude-3", "temp": 0.0} 26 | {"condition": "basic6_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 256.99, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0} 27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 234.49, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0} 28 | {"condition": "basic6_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 203.78, "median_levdist": 196.5, "model": "claude-3", "temp": 0.0} 29 | {"condition": "basic6_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 277.69, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0} 30 | {"condition": "basic6_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 354.46, "median_levdist": 423.0, "model": "claude-3", "temp": 0.0} 31 | {"condition": "basic7_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 165.28, "median_levdist": 191.5, "model": "claude-3", "temp": 0.0} 32 | {"condition": "basic7_bin2", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 187.12, "median_levdist": 192.0, "model": "claude-3", "temp": 0.0} 33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 218.5, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0} 34 | {"condition": "basic7_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 233.28, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0} 35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 251.88, "median_levdist": 200.5, "model": "claude-3", "temp": 0.0} 36 | {"condition": "basic8_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 219.92, "median_levdist": 194.0, "model": "claude-3", "temp": 0.0} 37 | {"condition": "basic8_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 213.79, "median_levdist": 201.5, "model": "claude-3", "temp": 0.0} 38 | {"condition": "basic8_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 241.55, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0} 39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 270.14, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0} 40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 247.3, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0} 41 | {"condition": "basic9_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.68, "median_levdist": 277.5, "model": "claude-3", "temp": 0.0} 42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.41, "median_levdist": 276.0, "model": "claude-3", "temp": 0.0} 43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 335.97, "median_levdist": 298.5, "model": "claude-3", "temp": 0.0} 44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 381.66, "median_levdist": 390.0, "model": "claude-3", "temp": 0.0} 45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 320.68, "median_levdist": 310.5, "model": "claude-3", "temp": 0.0} 46 | {"condition": "basic10_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0} 47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 250.25, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0} 48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 223.85, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0} 49 | {"condition": "basic10_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 282.61, "median_levdist": 277.0, "model": "claude-3", "temp": 0.0} 50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 310.79, "median_levdist": 291.0, "model": "claude-3", "temp": 0.0} 51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.69, "median_levdist": 244.5, "model": "claude-3", "temp": 0.0} 52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 336.13, "median_levdist": 281.5, "model": "claude-3", "temp": 0.0} 53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.86, "median_levdist": 229.0, "model": "claude-3", "temp": 0.0} 54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 344.3, "median_levdist": 216.0, "model": "claude-3", "temp": 0.0} 55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 505.99, "median_levdist": 632.0, "model": "claude-3", "temp": 0.0} 56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.18, "median_levdist": 220.5, "model": "claude-3", "temp": 0.0} 57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 299.78, "median_levdist": 275.0, "model": "claude-3", "temp": 0.0} 58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 292.63, "median_levdist": 200.0, "model": "claude-3", "temp": 0.0} 59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.36, "median_levdist": 294.5, "model": "claude-3", "temp": 0.0} 60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 385.7, "median_levdist": 325.0, "model": "claude-3", "temp": 0.0} 61 | {"condition": "basic13_bin1", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 22.45, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 62 | {"condition": "basic13_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 42.08, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 63 | {"condition": "basic13_bin3", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 41.13, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 64 | {"condition": "basic13_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 41.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 65 | {"condition": "basic13_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 105.25, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 66 | {"condition": "basic14_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 318.19, "median_levdist": 208.5, "model": "claude-3", "temp": 0.0} 67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 257.47, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.1, "median_levdist": 270.5, "model": "claude-3", "temp": 0.0} 69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 317.07, "median_levdist": 280.5, "model": "claude-3", "temp": 0.0} 70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.46, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0} 71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.07, "median_levdist": 286.0, "model": "claude-3", "temp": 0.0} 72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.85, "median_levdist": 287.5, "model": "claude-3", "temp": 0.0} 73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.29, "median_levdist": 304.0, "model": "claude-3", "temp": 0.0} 74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 357.54, "median_levdist": 306.5, "model": "claude-3", "temp": 0.0} 75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 328.12, "median_levdist": 290.0, "model": "claude-3", "temp": 0.0} 76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 402.07, "median_levdist": 340.0, "model": "claude-3", "temp": 0.0} 77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 370.82, "median_levdist": 317.0, "model": "claude-3", "temp": 0.0} 78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 366.68, "median_levdist": 324.5, "model": "claude-3", "temp": 0.0} 79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 419.24, "median_levdist": 445.0, "model": "claude-3", "temp": 0.0} 80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.26, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0} 81 | {"condition": "basic17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 296.78, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0} 82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 318.12, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.44, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.41, "median_levdist": 198.5, "model": "claude-3", "temp": 0.0} 85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 355.59, "median_levdist": 229.5, "model": "claude-3", "temp": 0.0} 86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.81, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0} 87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.19, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0} 88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.76, "median_levdist": 310.0, "model": "claude-3", "temp": 0.0} 89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.78, "median_levdist": 331.0, "model": "claude-3", "temp": 0.0} 90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.42, "median_levdist": 401.0, "model": "claude-3", "temp": 0.0} 91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.4, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 312.73, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 311.94, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 291.75, "median_levdist": 198.0, "model": "claude-3", "temp": 0.0} 95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 258.78, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 354.28, "median_levdist": 360.0, "model": "claude-3", "temp": 0.0} 97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 308.07, "median_levdist": 299.0, "model": "claude-3", "temp": 0.0} 98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 300.86, "median_levdist": 311.5, "model": "claude-3", "temp": 0.0} 99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 254.9, "median_levdist": 194.5, "model": "claude-3", "temp": 0.0} 100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 217.92, "median_levdist": 8.5, "model": "claude-3", "temp": 0.0} 101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.48, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0} 102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 349.05, "median_levdist": 235.0, "model": "claude-3", "temp": 0.0} 103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.02, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0} 104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 386.56, "median_levdist": 314.0, "model": "claude-3", "temp": 0.0} 105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.54, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0} 106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 426.68, "median_levdist": 339.5, "model": "claude-3", "temp": 0.0} 107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 410.56, "median_levdist": 348.5, "model": "claude-3", "temp": 0.0} 108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 440.15, "median_levdist": 492.5, "model": "claude-3", "temp": 0.0} 109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 430.12, "median_levdist": 435.0, "model": "claude-3", "temp": 0.0} 110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.37, "median_levdist": 372.5, "model": "claude-3", "temp": 0.0} 111 | {"condition": "basic23_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 278.38, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0} 112 | {"condition": "basic23_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 331.5, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0} 113 | {"condition": "basic23_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 339.51, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0} 114 | {"condition": "basic23_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 363.82, "median_levdist": 202.0, "model": "claude-3", "temp": 0.0} 115 | {"condition": "basic23_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 403.8, "median_levdist": 358.0, "model": "claude-3", "temp": 0.0} 116 | {"condition": "basic24_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 375.29, "median_levdist": 312.0, "model": "claude-3", "temp": 0.0} 117 | {"condition": "basic24_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 415.72, "median_levdist": 354.0, "model": "claude-3", "temp": 0.0} 118 | {"condition": "basic24_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 395.21, "median_levdist": 316.0, "model": "claude-3", "temp": 0.0} 119 | {"condition": "basic24_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 418.06, "median_levdist": 374.5, "model": "claude-3", "temp": 0.0} 120 | {"condition": "basic24_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 402.51, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0} 121 | {"condition": "basic25_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 246.23, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0} 122 | {"condition": "basic25_bin2", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 352.15, "median_levdist": 303.0, "model": "claude-3", "temp": 0.0} 123 | {"condition": "basic25_bin3", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 309.63, "median_levdist": 195.0, "model": "claude-3", "temp": 0.0} 124 | {"condition": "basic25_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 437.08, "median_levdist": 559.0, "model": "claude-3", "temp": 0.0} 125 | {"condition": "basic25_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 421.89, "median_levdist": 541.5, "model": "claude-3", "temp": 0.0} 126 | -------------------------------------------------------------------------------- /logs/text_cot/claude-3/results.jsonl: -------------------------------------------------------------------------------- 1 | {"condition": "cot1_bin1", "acc_inst": 0.87, "acc_demo": 0.0, "levdist": 0.38, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 2 | {"condition": "cot1_bin2", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.38, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 3 | {"condition": "cot1_bin3", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.61, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 4 | {"condition": "cot1_bin4", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.62, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 5 | {"condition": "cot1_bin5", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.41, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 6 | {"condition": "cot2_bin1", "acc_inst": 0.9, "acc_demo": 0.0, "levdist": 0.21, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 7 | {"condition": "cot2_bin2", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 8 | {"condition": "cot2_bin3", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.56, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 9 | {"condition": "cot2_bin4", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 0.56, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 10 | {"condition": "cot2_bin5", "acc_inst": 0.78, "acc_demo": 0.0, "levdist": 0.32, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 11 | {"condition": "cot3_bin1", "acc_inst": 0.96, "acc_demo": 0.0, "levdist": 0.05, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 12 | {"condition": "cot3_bin2", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.31, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 13 | {"condition": "cot3_bin3", "acc_inst": 0.68, "acc_demo": 0.0, "levdist": 0.6, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 14 | {"condition": "cot3_bin4", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.98, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 15 | {"condition": "cot3_bin5", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 2.18, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 16 | {"condition": "cot4_bin1", "acc_inst": 0.93, "acc_demo": 0.0, "levdist": 0.07, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 17 | {"condition": "cot4_bin2", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.22, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 18 | {"condition": "cot4_bin3", "acc_inst": 0.66, "acc_demo": 0.0, "levdist": 0.54, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 19 | {"condition": "cot4_bin4", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.35, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 20 | {"condition": "cot4_bin5", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 0.72, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 21 | {"condition": "cot5_bin1", "acc_inst": 0.88, "acc_demo": 0.0, "levdist": 0.29, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 22 | {"condition": "cot5_bin2", "acc_inst": 0.8, "acc_demo": 0.0, "levdist": 0.36, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 23 | {"condition": "cot5_bin3", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.48, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 24 | {"condition": "cot5_bin4", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.34, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 25 | {"condition": "cot5_bin5", "acc_inst": 0.86, "acc_demo": 0.0, "levdist": 0.19, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 26 | {"condition": "cot6_bin1", "acc_inst": 0.9, "acc_demo": 0.0, "levdist": 0.17, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 27 | {"condition": "cot6_bin2", "acc_inst": 0.75, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 28 | {"condition": "cot6_bin3", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.7, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 29 | {"condition": "cot6_bin4", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 1.28, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 30 | {"condition": "cot6_bin5", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 1.79, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 31 | {"condition": "cot7_bin1", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 1.18, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 32 | {"condition": "cot7_bin2", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 1.44, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 33 | {"condition": "cot7_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.77, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 34 | {"condition": "cot7_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.93, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 35 | {"condition": "cot7_bin5", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 36 | {"condition": "cot8_bin1", "acc_inst": 0.8, "acc_demo": 0.0, "levdist": 0.68, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 37 | {"condition": "cot8_bin2", "acc_inst": 0.69, "acc_demo": 0.0, "levdist": 0.77, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 38 | {"condition": "cot8_bin3", "acc_inst": 0.57, "acc_demo": 0.0, "levdist": 1.41, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 39 | {"condition": "cot8_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 1.03, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 40 | {"condition": "cot8_bin5", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 2.95, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 41 | {"condition": "cot9_bin1", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.99, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 42 | {"condition": "cot9_bin2", "acc_inst": 0.42, "acc_demo": 0.0, "levdist": 1.6, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 43 | {"condition": "cot9_bin3", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 2.02, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 44 | {"condition": "cot9_bin4", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.33, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 45 | {"condition": "cot9_bin5", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 1.85, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 46 | {"condition": "cot10_bin1", "acc_inst": 0.51, "acc_demo": 0.0, "levdist": 1.46, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 47 | {"condition": "cot10_bin2", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 2.01, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 48 | {"condition": "cot10_bin3", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 3.44, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 49 | {"condition": "cot10_bin4", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.39, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 50 | {"condition": "cot10_bin5", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 3.01, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 51 | {"condition": "cot11_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.13, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 52 | {"condition": "cot11_bin2", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 3.07, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 53 | {"condition": "cot11_bin3", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 3.03, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 54 | {"condition": "cot11_bin4", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 3.52, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 55 | {"condition": "cot11_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.65, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 56 | {"condition": "cot12_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.04, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 57 | {"condition": "cot12_bin2", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 2.38, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 58 | {"condition": "cot12_bin3", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.94, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 59 | {"condition": "cot12_bin4", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.19, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 61 | {"condition": "cot13_bin1", "acc_inst": 0.88, "acc_demo": 0.0, "levdist": 0.2, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 62 | {"condition": "cot13_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.4, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 63 | {"condition": "cot13_bin3", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.61, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 64 | {"condition": "cot13_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.74, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 65 | {"condition": "cot13_bin5", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 66 | {"condition": "cot14_bin1", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 2.92, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 67 | {"condition": "cot14_bin2", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 3.29, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 68 | {"condition": "cot14_bin3", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.78, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 69 | {"condition": "cot14_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 3.85, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.22, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 71 | {"condition": "cot15_bin1", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.7, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.77, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 73 | {"condition": "cot15_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.04, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.84, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.5, "model": "claude-3", "temp": 0.0} 76 | {"condition": "cot16_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.64, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 77 | {"condition": "cot16_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.98, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 78 | {"condition": "cot16_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.2, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 79 | {"condition": "cot16_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.47, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 81 | {"condition": "cot17_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.52, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.82, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.33, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.51, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 86 | {"condition": "cot18_bin1", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 3.62, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 87 | {"condition": "cot18_bin2", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.14, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 88 | {"condition": "cot18_bin3", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.02, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 89 | {"condition": "cot18_bin4", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.25, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 90 | {"condition": "cot18_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 5.58, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 91 | {"condition": "cot19_bin1", "acc_inst": 0.12, "acc_demo": 0.0, "levdist": 3.78, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 92 | {"condition": "cot19_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.03, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 93 | {"condition": "cot19_bin3", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.57, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 94 | {"condition": "cot19_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.04, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0} 95 | {"condition": "cot19_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 96 | {"condition": "cot20_bin1", "acc_inst": 0.18, "acc_demo": 0.0, "levdist": 3.41, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 97 | {"condition": "cot20_bin2", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.18, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 98 | {"condition": "cot20_bin3", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.17, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 99 | {"condition": "cot20_bin4", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 6.57, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 100 | {"condition": "cot20_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.17, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 101 | {"condition": "cot21_bin1", "acc_inst": 0.3, "acc_demo": 0.0, "levdist": 2.9, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 102 | {"condition": "cot21_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.89, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 103 | {"condition": "cot21_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 2.84, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 104 | {"condition": "cot21_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 3.14, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 105 | {"condition": "cot21_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 3.62, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 106 | {"condition": "cot22_bin1", "acc_inst": 0.31, "acc_demo": 0.0, "levdist": 2.7, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0} 107 | {"condition": "cot22_bin2", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.98, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0} 108 | {"condition": "cot22_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 3.64, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 109 | {"condition": "cot22_bin4", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 4.32, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 110 | {"condition": "cot22_bin5", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.17, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 111 | {"condition": "cot23_bin1", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 2.8, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 112 | {"condition": "cot23_bin2", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 3.29, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 113 | {"condition": "cot23_bin3", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.83, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 114 | {"condition": "cot23_bin4", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 3.77, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 115 | {"condition": "cot23_bin5", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 3.75, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 116 | {"condition": "cot24_bin1", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 2.6, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 117 | {"condition": "cot24_bin2", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 3.19, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 118 | {"condition": "cot24_bin3", "acc_inst": 0.39, "acc_demo": 0.0, "levdist": 2.63, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 119 | {"condition": "cot24_bin4", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 120 | {"condition": "cot24_bin5", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 4.04, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 121 | {"condition": "cot25_bin1", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 1.89, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 122 | {"condition": "cot25_bin2", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 1.77, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 123 | {"condition": "cot25_bin3", "acc_inst": 0.42, "acc_demo": 0.0, "levdist": 2.74, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 124 | {"condition": "cot25_bin4", "acc_inst": 0.56, "acc_demo": 0.0, "levdist": 1.88, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 125 | {"condition": "cot25_bin5", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 126 | -------------------------------------------------------------------------------- /logs/text_cot/gpt-4/results.jsonl: -------------------------------------------------------------------------------- 1 | {"condition": "cot1_bin1", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.4, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 2 | {"condition": "cot1_bin2", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.72, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 3 | {"condition": "cot1_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 4 | {"condition": "cot1_bin4", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.07, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 5 | {"condition": "cot1_bin5", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 1.91, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 6 | {"condition": "cot2_bin1", "acc_inst": 0.83, "acc_demo": 0.0, "levdist": 0.26, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 7 | {"condition": "cot2_bin2", "acc_inst": 0.71, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 8 | {"condition": "cot2_bin3", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.06, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 9 | {"condition": "cot2_bin4", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.0, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 10 | {"condition": "cot2_bin5", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 1.65, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 11 | {"condition": "cot3_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 12 | {"condition": "cot3_bin2", "acc_inst": 0.71, "acc_demo": 0.0, "levdist": 0.54, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 13 | {"condition": "cot3_bin3", "acc_inst": 0.48, "acc_demo": 0.0, "levdist": 1.06, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 14 | {"condition": "cot3_bin4", "acc_inst": 0.4, "acc_demo": 0.0, "levdist": 1.14, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 15 | {"condition": "cot3_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.58, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 16 | {"condition": "cot4_bin1", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.44, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 17 | {"condition": "cot4_bin2", "acc_inst": 0.66, "acc_demo": 0.0, "levdist": 0.74, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 18 | {"condition": "cot4_bin3", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 19 | {"condition": "cot4_bin4", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.0, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 20 | {"condition": "cot4_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.26, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 21 | {"condition": "cot5_bin1", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.5, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 22 | {"condition": "cot5_bin2", "acc_inst": 0.68, "acc_demo": 0.0, "levdist": 0.66, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 23 | {"condition": "cot5_bin3", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.32, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 24 | {"condition": "cot5_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.15, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 25 | {"condition": "cot5_bin5", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 1.58, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 26 | {"condition": "cot6_bin1", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.49, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 27 | {"condition": "cot6_bin2", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 0.46, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 28 | {"condition": "cot6_bin3", "acc_inst": 0.45, "acc_demo": 0.0, "levdist": 1.18, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 29 | {"condition": "cot6_bin4", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 30 | {"condition": "cot6_bin5", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 1.59, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 31 | {"condition": "cot7_bin1", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 32 | {"condition": "cot7_bin2", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 0.89, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 33 | {"condition": "cot7_bin3", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.61, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 34 | {"condition": "cot7_bin4", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 1.5, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 35 | {"condition": "cot7_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 1.91, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 36 | {"condition": "cot8_bin1", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 37 | {"condition": "cot8_bin2", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.63, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 38 | {"condition": "cot8_bin3", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 39 | {"condition": "cot8_bin4", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 0.98, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 40 | {"condition": "cot8_bin5", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 1.56, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 41 | {"condition": "cot9_bin1", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 42 | {"condition": "cot9_bin2", "acc_inst": 0.51, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 43 | {"condition": "cot9_bin3", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 1.68, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 44 | {"condition": "cot9_bin4", "acc_inst": 0.3, "acc_demo": 0.0, "levdist": 1.48, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 45 | {"condition": "cot9_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.68, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 46 | {"condition": "cot10_bin1", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.3, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 47 | {"condition": "cot10_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.25, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 48 | {"condition": "cot10_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 2.4, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 49 | {"condition": "cot10_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 2.7, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 50 | {"condition": "cot10_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 2.24, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 51 | {"condition": "cot11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.23, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 52 | {"condition": "cot11_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.3, "median_levdist": 4.5, "model": "claude-3", "temp": 0.0} 53 | {"condition": "cot11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.24, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 54 | {"condition": "cot11_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.42, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 55 | {"condition": "cot11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.08, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 56 | {"condition": "cot12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.81, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 57 | {"condition": "cot12_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.86, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 58 | {"condition": "cot12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.8, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 59 | {"condition": "cot12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.86, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.93, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 61 | {"condition": "cot13_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.45, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 62 | {"condition": "cot13_bin2", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.66, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 63 | {"condition": "cot13_bin3", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.79, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 64 | {"condition": "cot13_bin4", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 0.75, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 65 | {"condition": "cot13_bin5", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 1.31, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 66 | {"condition": "cot14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.24, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0} 67 | {"condition": "cot14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.46, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 68 | {"condition": "cot14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.51, "median_levdist": 4.5, "model": "claude-3", "temp": 0.0} 69 | {"condition": "cot14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.63, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.76, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 71 | {"condition": "cot15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.35, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0} 72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.28, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 73 | {"condition": "cot15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.47, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.59, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 76 | {"condition": "cot16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.39, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0} 77 | {"condition": "cot16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.33, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0} 78 | {"condition": "cot16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.3, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 79 | {"condition": "cot16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.49, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.96, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 81 | {"condition": "cot17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.43, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0} 83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.65, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.87, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.31, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 86 | {"condition": "cot18_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.03, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 87 | {"condition": "cot18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.0, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0} 88 | {"condition": "cot18_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.39, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 89 | {"condition": "cot18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 90 | {"condition": "cot18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 91 | {"condition": "cot19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 92 | {"condition": "cot19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 93 | {"condition": "cot19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 94 | {"condition": "cot19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.88, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 95 | {"condition": "cot19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.3, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 96 | {"condition": "cot20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.93, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 97 | {"condition": "cot20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.82, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 98 | {"condition": "cot20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.03, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0} 99 | {"condition": "cot20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.28, "median_levdist": 6.5, "model": "claude-3", "temp": 0.0} 100 | {"condition": "cot20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.47, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0} 101 | {"condition": "cot21_bin1", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 2.84, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 102 | {"condition": "cot21_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.93, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 103 | {"condition": "cot21_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 104 | {"condition": "cot21_bin4", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 2.39, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 105 | {"condition": "cot21_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 2.28, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 106 | {"condition": "cot22_bin1", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 3.31, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 107 | {"condition": "cot22_bin2", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 2.86, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 108 | {"condition": "cot22_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 3.57, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 109 | {"condition": "cot22_bin4", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 110 | {"condition": "cot22_bin5", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 3.25, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0} 111 | {"condition": "cot23_bin1", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.39, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 112 | {"condition": "cot23_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.73, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 113 | {"condition": "cot23_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.36, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 114 | {"condition": "cot23_bin4", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 0.9, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 115 | {"condition": "cot23_bin5", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 1.38, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 116 | {"condition": "cot24_bin1", "acc_inst": 0.84, "acc_demo": 0.0, "levdist": 0.45, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 117 | {"condition": "cot24_bin2", "acc_inst": 0.75, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 118 | {"condition": "cot24_bin3", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 1.28, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 119 | {"condition": "cot24_bin4", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.13, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 120 | {"condition": "cot24_bin5", "acc_inst": 0.18, "acc_demo": 0.0, "levdist": 1.7, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 121 | {"condition": "cot25_bin1", "acc_inst": 0.81, "acc_demo": 0.0, "levdist": 0.37, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 122 | {"condition": "cot25_bin2", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.69, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0} 123 | {"condition": "cot25_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.22, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 124 | {"condition": "cot25_bin4", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.05, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0} 125 | {"condition": "cot25_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.54, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0} 126 | -------------------------------------------------------------------------------- /logs/text_cot/llama3.1-405b/results.jsonl: -------------------------------------------------------------------------------- 1 | {"condition": "cot1_bin1", "acc_inst": 0.81, "acc_demo": 0.0, "levdist": 8.96, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 2 | {"condition": "cot1_bin2", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 15.86, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 3 | {"condition": "cot1_bin3", "acc_inst": 0.54, "acc_demo": 0.0, "levdist": 35.77, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 4 | {"condition": "cot1_bin4", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 54.97, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 5 | {"condition": "cot1_bin5", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 35.06, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 6 | {"condition": "cot2_bin1", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 7 | {"condition": "cot2_bin2", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 8.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 8 | {"condition": "cot2_bin3", "acc_inst": 0.61, "acc_demo": 0.0, "levdist": 8.78, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 9 | {"condition": "cot2_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 44.3, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 10 | {"condition": "cot2_bin5", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 31.37, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 11 | {"condition": "cot3_bin1", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.94, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 12 | {"condition": "cot3_bin2", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 2.64, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 13 | {"condition": "cot3_bin3", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 11.29, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 14 | {"condition": "cot3_bin4", "acc_inst": 0.41, "acc_demo": 0.0, "levdist": 85.19, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 15 | {"condition": "cot3_bin5", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 220.52, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 16 | {"condition": "cot4_bin1", "acc_inst": 0.48, "acc_demo": 0.0, "levdist": 14.4, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 17 | {"condition": "cot4_bin2", "acc_inst": 0.39, "acc_demo": 0.0, "levdist": 8.15, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 18 | {"condition": "cot4_bin3", "acc_inst": 0.31, "acc_demo": 0.0, "levdist": 37.04, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 19 | {"condition": "cot4_bin4", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 131.24, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 20 | {"condition": "cot4_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 350.72, "median_levdist": 475.5, "model": "llama3.1-405b", "temp": 0.0} 21 | {"condition": "cot5_bin1", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 8.51, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 22 | {"condition": "cot5_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 9.12, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 23 | {"condition": "cot5_bin3", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 45.34, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 24 | {"condition": "cot5_bin4", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 92.35, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 25 | {"condition": "cot5_bin5", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 316.79, "median_levdist": 448.5, "model": "llama3.1-405b", "temp": 0.0} 26 | {"condition": "cot6_bin1", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 27.74, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 27 | {"condition": "cot6_bin2", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 24.1, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 28 | {"condition": "cot6_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 54.37, "median_levdist": 2.5, "model": "llama3.1-405b", "temp": 0.0} 29 | {"condition": "cot6_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 120.19, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 30 | {"condition": "cot6_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 372.41, "median_levdist": 595.5, "model": "llama3.1-405b", "temp": 0.0} 31 | {"condition": "cot7_bin1", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 7.91, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 32 | {"condition": "cot7_bin2", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 29.64, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 33 | {"condition": "cot7_bin3", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 2.75, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 34 | {"condition": "cot7_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 28.98, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 35 | {"condition": "cot7_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 13.81, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 36 | {"condition": "cot8_bin1", "acc_inst": 0.12, "acc_demo": 0.0, "levdist": 2.44, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 37 | {"condition": "cot8_bin2", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 2.62, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 38 | {"condition": "cot8_bin3", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 2.83, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 39 | {"condition": "cot8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.69, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 40 | {"condition": "cot8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.82, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0} 41 | {"condition": "cot9_bin1", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 10.9, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0} 42 | {"condition": "cot9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.74, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 43 | {"condition": "cot9_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 17.15, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0} 44 | {"condition": "cot9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.09, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 45 | {"condition": "cot9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.38, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0} 46 | {"condition": "cot10_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.52, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0} 47 | {"condition": "cot10_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.79, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 48 | {"condition": "cot10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.97, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 49 | {"condition": "cot10_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 17.13, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 50 | {"condition": "cot10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 51 | {"condition": "cot11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.68, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 52 | {"condition": "cot11_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 20.52, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 53 | {"condition": "cot11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 18.2, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 54 | {"condition": "cot11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 55 | {"condition": "cot11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.82, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 56 | {"condition": "cot12_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 9.84, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 57 | {"condition": "cot12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 23.49, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 58 | {"condition": "cot12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 22.01, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 59 | {"condition": "cot12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 43.66, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 19.28, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 61 | {"condition": "cot13_bin1", "acc_inst": 0.86, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 62 | {"condition": "cot13_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 63 | {"condition": "cot13_bin3", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 64 | {"condition": "cot13_bin4", "acc_inst": 0.61, "acc_demo": 0.0, "levdist": 0.9, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 65 | {"condition": "cot13_bin5", "acc_inst": 0.53, "acc_demo": 0.0, "levdist": 0.92, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 66 | {"condition": "cot14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.59, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 67 | {"condition": "cot14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.8, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 68 | {"condition": "cot14_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.79, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 69 | {"condition": "cot14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 17.9, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.55, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 71 | {"condition": "cot15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 7.26, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 19.1, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 73 | {"condition": "cot15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.02, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.36, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0} 76 | {"condition": "cot16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.69, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 77 | {"condition": "cot16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.13, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 78 | {"condition": "cot16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.48, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 79 | {"condition": "cot16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.61, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.79, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 81 | {"condition": "cot17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.03, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.63, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.56, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.25, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 86 | {"condition": "cot18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.63, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 87 | {"condition": "cot18_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.78, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 88 | {"condition": "cot18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.92, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 89 | {"condition": "cot18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.17, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 90 | {"condition": "cot18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.73, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 91 | {"condition": "cot19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 15.24, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 92 | {"condition": "cot19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.81, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 93 | {"condition": "cot19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.89, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 94 | {"condition": "cot19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.31, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 95 | {"condition": "cot19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 23.55, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 96 | {"condition": "cot20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 13.5, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 97 | {"condition": "cot20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.76, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 98 | {"condition": "cot20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.66, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 99 | {"condition": "cot20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.41, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 100 | {"condition": "cot20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.48, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0} 101 | {"condition": "cot21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.68, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 102 | {"condition": "cot21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.91, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 103 | {"condition": "cot21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.87, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 104 | {"condition": "cot21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.84, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 105 | {"condition": "cot21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 13.04, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0} 106 | {"condition": "cot22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.33, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0} 107 | {"condition": "cot22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.48, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 108 | {"condition": "cot22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.38, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 109 | {"condition": "cot22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 110 | {"condition": "cot22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.24, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0} 111 | {"condition": "cot23_bin1", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.68, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 112 | {"condition": "cot23_bin2", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 2.55, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 113 | {"condition": "cot23_bin3", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 1.09, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 114 | {"condition": "cot23_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 115 | {"condition": "cot23_bin5", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 116 | {"condition": "cot24_bin1", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 1.5, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 117 | {"condition": "cot24_bin2", "acc_inst": 0.26, "acc_demo": 0.0, "levdist": 2.35, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 118 | {"condition": "cot24_bin3", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.98, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 119 | {"condition": "cot24_bin4", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 2.31, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0} 120 | {"condition": "cot24_bin5", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.45, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0} 121 | {"condition": "cot25_bin1", "acc_inst": 0.83, "acc_demo": 0.0, "levdist": 0.5, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 122 | {"condition": "cot25_bin2", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 123 | {"condition": "cot25_bin3", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 124 | {"condition": "cot25_bin4", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.63, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 125 | {"condition": "cot25_bin5", "acc_inst": 0.56, "acc_demo": 0.0, "levdist": 2.45, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0} 126 | -------------------------------------------------------------------------------- /models/openai_help.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | import random 4 | import aiolimiter 5 | from aiohttp import ClientSession 6 | import asyncio 7 | import logging 8 | from typing import Any, List, Dict, Union 9 | from tqdm.asyncio import tqdm_asyncio 10 | 11 | completion_tokens = {"gpt-4": 0, "gpt-3.5-turbo": 0, "gpt-4-0613": 0, "gpt-3.5-turbo-0613": 0} 12 | prompt_tokens = {"gpt-4": 0, "gpt-3.5-turbo": 0, "gpt-4-0613": 0, "gpt-3.5-turbo-0613": 0} 13 | 14 | async def _throttled_openai_chat_completion_acreate( 15 | model: str, 16 | messages: List[Dict[str, str]], 17 | temperature: float, 18 | max_tokens: int, 19 | top_p: float, 20 | stop: Union[str, List[str]], 21 | limiter: aiolimiter.AsyncLimiter, 22 | ) -> Dict[str, Any]: 23 | async with limiter: 24 | for _ in range(10000000000): 25 | try: 26 | return await openai.ChatCompletion.acreate( 27 | model=model, 28 | messages=messages, 29 | temperature=temperature, 30 | max_tokens=max_tokens, 31 | top_p=top_p, 32 | stop=stop, 33 | ) 34 | except openai.error.OpenAIError: 35 | logging.warning( 36 | "OpenAI API rate limit exceeded. Sleeping for 10 seconds." 37 | ) 38 | await asyncio.sleep(20) 39 | except asyncio.exceptions.TimeoutError: 40 | logging.warning("OpenAI API timeout. Sleeping for 10 seconds.") 41 | await asyncio.sleep(20) 42 | return {"choices": [{"message": {"content": ""}}]} 43 | 44 | 45 | async def generate_from_openai_chat_completion( 46 | messages_list: List[Dict[str, str]], 47 | model: str, 48 | temperature: float, 49 | max_tokens: int, 50 | top_p: float, 51 | stop: Union[str, List[str]], 52 | requests_per_minute: int = 300, 53 | ) -> List[str]: 54 | if model == "gpt-4": 55 | requests_per_minute = 200 56 | if "OPENAI_API_KEY" not in os.environ: 57 | raise ValueError( 58 | "OPENAI_API_KEY environment variable must be set when using OpenAI API." 59 | ) 60 | print(os.environ["OPENAI_API_KEY"]) 61 | openai.api_key = os.environ["OPENAI_API_KEY"] 62 | session = ClientSession() 63 | openai.aiosession.set(session) 64 | limiter = aiolimiter.AsyncLimiter(requests_per_minute) 65 | async_responses = [ 66 | _throttled_openai_chat_completion_acreate( 67 | model=model, 68 | messages=messages, 69 | temperature=temperature, 70 | max_tokens=max_tokens, 71 | top_p=top_p, 72 | stop=stop, 73 | limiter=limiter, 74 | ) 75 | for messages in messages_list 76 | ] 77 | responses = await tqdm_asyncio.gather(*async_responses) 78 | await session.close() 79 | # return [x["choices"][0]["message"]["content"] for x in responses] 80 | return responses 81 | 82 | 83 | def gpt(prompt, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop=None) -> list: 84 | return gpts([prompt] * n, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop) 85 | 86 | def gpts(prompts, model="gpt-4", temperature=0.7, max_tokens=1000, stop=None) -> list: 87 | print(f"Model: {model}, temperature: {temperature}, max_tokens: {max_tokens}") 88 | messages_list = [[{"role": "user", "content": prompt}] for prompt in prompts] 89 | return chatgpts(messages_list, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop) 90 | 91 | def chatgpt(messages, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop=None) -> list: 92 | return chatgpts([messages] * n, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop) 93 | 94 | def chatgpts(messages_list, model="gpt-4", temperature=0.7, max_tokens=1000, stop=None) -> list: 95 | responses = asyncio.run(generate_from_openai_chat_completion(model=model, messages_list=messages_list, temperature=temperature, max_tokens=max_tokens, top_p=1, stop=stop)) 96 | texts = [x["choices"][0]["message"]["content"] for x in responses] 97 | # print(responses) 98 | global completion_tokens, prompt_tokens 99 | completion_tokens[model] += sum(x["usage"]["completion_tokens"] for x in responses if "usage" in x and "completion_tokens" in x["usage"]) 100 | prompt_tokens[model] += sum(x["usage"]["prompt_tokens"] for x in responses if "usage" in x and "prompt_tokens" in x["usage"]) 101 | return texts 102 | 103 | def gpt_usage(): 104 | global completion_tokens, prompt_tokens 105 | cost = completion_tokens["gpt-4"] / 1000 * 0.06 + prompt_tokens["gpt-4"] / 1000 * 0.03 106 | cost += (completion_tokens["gpt-3.5-turbo"] + prompt_tokens["gpt-3.5-turbo"]) / 1000 * 0.0002 107 | return {"completion_tokens": completion_tokens, "prompt_tokens": prompt_tokens, "cost": cost} 108 | -------------------------------------------------------------------------------- /regression/README.md: -------------------------------------------------------------------------------- 1 | # Logistic Regression 2 | 3 | - `text_cot_train_table.tsv` - train table statistics where `correct` indicates whether GPT-4 solved the example correctly. Logistic rgeression model is fitted on this data in `regression.ipynb`. Obtained by running [eval.py](https://github.com/aksh555/deciphering_cot/eval.py) and `create_train_table.py` 4 | - `text_cot_test_table.tsv` - test table statistics 5 | - `text_cot_test_table_results.tsv` - test table statistics with predictions from the LR model. -------------------------------------------------------------------------------- /regression/create_train_table.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 4 | import tiktoken 5 | import logging 6 | import json 7 | import pandas as pd 8 | 9 | logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")]) 10 | 11 | if torch.cuda.is_available(): 12 | device = "cuda" 13 | else: 14 | device = "cpu" 15 | 16 | gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl") 17 | gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device) 18 | gpt4_enc = tiktoken.get_encoding("cl100k_base") 19 | 20 | def pad_batch(batch, pad_idx): 21 | max_length = 0 22 | for seq in batch: 23 | if len(seq) > max_length: 24 | max_length = len(seq) 25 | 26 | new_batch = [] 27 | for seq in batch: 28 | padding = [pad_idx for i in range(max_length - len(seq))] 29 | new_batch.append(seq + padding) 30 | 31 | return new_batch 32 | 33 | # Get perplexity using GPT-2 34 | def prob_gpt2(sentence_list): 35 | 36 | # Tokenize the sentences 37 | all_tokens = [] 38 | for sentence in sentence_list: 39 | tokens = gpt2_tokenizer.encode(sentence) 40 | all_tokens.append(tokens) 41 | tokens = pad_batch(all_tokens, 50256) 42 | 43 | targets = tokens[:] 44 | 45 | # Compute average log likelihood for the generation 46 | input_ids = torch.LongTensor(tokens).to(device) 47 | target_ids = torch.LongTensor(targets).to(device) 48 | 49 | with torch.no_grad(): 50 | outputs = gpt2_model(input_ids, labels=target_ids) 51 | logits = outputs[1] 52 | logits = logits.transpose(0,1)[:-1].transpose(0,1) 53 | target_ids = target_ids.transpose(0,1)[1:].transpose(0,1) 54 | loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1)) 55 | loss = loss.reshape(target_ids.shape).sum(dim=1) 56 | neg_log_likelihood = -1*loss 57 | 58 | # 13.357776641845703 = logprob('The word is"'); removing this to just get 59 | # the word prob 60 | return neg_log_likelihood + 13.357776641845703 61 | 62 | df = pd.read_csv("text_cot_train_table.tsv",sep="\t") 63 | word_list = df["input"].to_list() 64 | print("Rows", len(word_list)) 65 | 66 | words_with_prob = [] 67 | this_batch_sentences = [] 68 | this_batch_words = [] 69 | num_tokens = [] 70 | for index, line in enumerate(word_list): 71 | if index % 10000 == 0: 72 | logging.info(str(index)) 73 | 74 | word = line.strip() 75 | 76 | tokens = gpt4_enc.encode(word) 77 | tokens_spaced = gpt4_enc.encode(" " + word) 78 | 79 | this_batch_sentences.append('The word is "' + word + '"') 80 | this_batch_words.append(word) 81 | num_tokens.append(len(tokens)) 82 | 83 | if len(this_batch_sentences) == 3000: 84 | logprobs = prob_gpt2(this_batch_sentences) 85 | for word, logprob in zip(this_batch_words, logprobs): 86 | words_with_prob.append(logprob.item()) 87 | this_batch_sentences = [] 88 | this_batch_words = [] 89 | 90 | if len(this_batch_sentences) > 0: 91 | logprobs = prob_gpt2(this_batch_sentences) 92 | for word, logprob in zip(this_batch_words, logprobs): 93 | words_with_prob.append(logprob.item()) 94 | this_batch_sentences = [] 95 | this_batch_words = [] 96 | 97 | df["input_logprob"] = words_with_prob 98 | df["input_ntokens"] = num_tokens 99 | 100 | df.drop(["pred","gt","input"], axis=1, inplace=True) 101 | df = df[['input_ntokens', 'input_logprob', 'output_logprob', 'shift_level', 'shift_freq', 'bin']] 102 | df.to_csv("./text_cot_train_table.tsv", "\t",index_label="index") 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /run_claude3.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import argparse 4 | from tqdm import tqdm 5 | import os 6 | import anthropic 7 | import time 8 | logging.getLogger().setLevel(logging.ERROR) 9 | 10 | client = anthropic.Anthropic() 11 | 12 | 13 | def claude_responses(prompt_list, model="claude-3-opus-20240229", max_tokens=1000, temperature=0.0): 14 | responses = [] 15 | for prompt in tqdm(prompt_list): 16 | output = None 17 | for _ in range(10): 18 | try: 19 | completion = client.messages.create( 20 | model=model, 21 | max_tokens=max_tokens, 22 | temperature=temperature, 23 | system="Provide only your answer, without any explanation.", 24 | messages=[{"role":"user", "content": prompt}] 25 | ) 26 | 27 | output = completion.content[0].text 28 | if output is None: 29 | output = "" 30 | except: 31 | time.sleep(60) 32 | 33 | if not (output is None): 34 | break 35 | 36 | if output is None: 37 | responses.append("") 38 | else: 39 | responses.append(output) 40 | return responses 41 | 42 | 43 | 44 | 45 | def solve_file(name, model, temperature, max_tokens, prompt_type): 46 | file = f'stimuli/{prompt_type}/{name}.jsonl' 47 | if not os.path.exists(file): 48 | print(f'File {file} does not exist') 49 | return None 50 | with open(file, 'r') as f: 51 | lines = f.readlines() 52 | lines = [json.loads(line) for line in lines] 53 | prompts = [line['instruction_plus_input'] for line in lines] 54 | gts = [line['correct_output'] for line in lines] 55 | res = claude_responses(prompts, model=model, temperature=0.0, max_tokens=max_tokens) 56 | 57 | # These accs are not what we use in the paper - they're just for quick estimates. 58 | # The stats used in the paper are computed in the evaluation/ folder 59 | accs = [(gt.replace('"', "") in r.replace('"', "")) for r, gt in zip(res, gts)] 60 | acc = sum(accs) / len(accs) 61 | print(f'Accuracy: {acc}') 62 | 63 | d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc} 64 | 65 | fo_directory = f'logs/{prompt_type}/{model}' 66 | if not os.path.exists(fo_directory): 67 | os.makedirs(fo_directory, exist_ok=True) 68 | 69 | output_file = f'{fo_directory}/{name}.json' 70 | with open(output_file, 'w') as f: 71 | json.dump(d, f) 72 | 73 | return d 74 | 75 | 76 | def parse_args(): 77 | args = argparse.ArgumentParser() 78 | args.add_argument('--tasks', type=str, required=True, help='split by comma') 79 | args.add_argument('--conditions', type=str, required=True, help='split by comma') 80 | args.add_argument('--model', type=str, required=True, choices=['claude-3']) 81 | args.add_argument('--max_tokens', type=int, help='default = 1000', default=1000) 82 | args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot") 83 | args = args.parse_args() 84 | return args 85 | 86 | if __name__ == '__main__': 87 | args = parse_args() 88 | tasks = args.tasks.split(',') 89 | conditions = args.conditions.split(',') 90 | model = args.model 91 | prompt_type = args.prompt_type 92 | 93 | if model == "claude-3": 94 | model = "claude-3-opus-20240229" 95 | max_tokens = args.max_tokens 96 | 97 | for task in tasks: 98 | for condition in conditions: 99 | name = f'{task}_{condition}' 100 | d = solve_file(name, model=model, temperature=0.0, max_tokens=max_tokens, prompt_type=prompt_type) 101 | if d is not None: 102 | print(f'{name}, {model}: {d["acc"]:.2f}') 103 | 104 | -------------------------------------------------------------------------------- /run_llama3.py: -------------------------------------------------------------------------------- 1 | # python run_llama3.py --tasks cot1 --conditions bin1 --max_tokens 200 --model llama-3.1-405b 2 | 3 | import logging 4 | import json 5 | import argparse 6 | from tqdm import tqdm 7 | import os 8 | import together 9 | import time 10 | logging.getLogger().setLevel(logging.ERROR) 11 | 12 | client = together.Together() 13 | 14 | def process_prompt(prompt): 15 | prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n" + prompt + "\n<|start_header_id|>assistant<|end_header_id|>" 16 | return prompt 17 | 18 | 19 | def llama_responses(prompt_list, model="llama-3-70b-chat-hf", max_tokens=1000, temperature=0.0): 20 | responses = [] 21 | for prompt in tqdm(prompt_list): 22 | prompt = process_prompt(prompt) 23 | output = None 24 | for _ in range(10): 25 | try: 26 | if "chat" in model: 27 | output = client.chat.completions.create( 28 | messages = [{"role": "user", "content": prompt}], 29 | model = "meta-llama/" + model, 30 | max_tokens = max_tokens, 31 | temperature = temperature, 32 | ) 33 | else: 34 | output = client.completions.create( 35 | prompt=prompt, 36 | model = "meta-llama/" + model, 37 | max_tokens = max_tokens, 38 | temperature = temperature, 39 | ) 40 | except: 41 | time.sleep(1) 42 | 43 | if not (output is None): 44 | break 45 | if "chat" in model: 46 | responses.append(output.choices[0].message.content) 47 | else: 48 | responses.append(output.choices[0].text) 49 | return responses 50 | 51 | 52 | def solve_file(name, model, temperature, max_tokens, prompt_type): 53 | file = f'stimuli/{prompt_type}/{name}.jsonl' 54 | print(f"Loading {file}") 55 | if not os.path.exists(file): 56 | print(f'File {file} does not exist') 57 | return None 58 | with open(file, 'r') as f: 59 | lines = f.readlines() 60 | lines = [json.loads(line) for line in lines] 61 | prompts = [line['instruction_plus_input'] for line in lines] 62 | gts = [line['correct_output'] for line in lines] 63 | res = llama_responses(prompts, model=model, temperature=0.0, max_tokens=max_tokens) 64 | 65 | # These accs are not what we use in the paper - they're just for quick estimates. 66 | # The stats used in the paper are computed in the evaluation/ folder 67 | accs = [(gt.replace('"', '') in r.replace('"', '')) for r, gt in zip(res, gts)] 68 | acc = sum(accs) / len(accs) 69 | print(f"Done {name}") 70 | print(f'Accuracy: {acc}') 71 | 72 | d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc} 73 | 74 | fo_directory = f'logs/{prompt_type}/{model}' 75 | if not os.path.exists(fo_directory): 76 | os.makedirs(fo_directory, exist_ok=True) 77 | 78 | output_file = f'{fo_directory}/{name}.json' 79 | with open(output_file, 'w') as f: 80 | json.dump(d, f) 81 | return d 82 | 83 | 84 | def parse_args(): 85 | args = argparse.ArgumentParser() 86 | args.add_argument('--tasks', type=str, required=True, help='split by comma') 87 | args.add_argument('--conditions', type=str, required=True, help='split by comma') 88 | args.add_argument('--model', type=str, required=True, choices=['llama-3-70b-chat', 'llama-3-70b', 'llama3-405b', 'llama3.1-70b'], default='llama3.1-405b') 89 | args.add_argument('--max_tokens', type=int, help='default = 1000', default=1000) 90 | args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot") 91 | args = args.parse_args() 92 | return args 93 | 94 | if __name__ == '__main__': 95 | args = parse_args() 96 | tasks = args.tasks.split(',') 97 | conditions = args.conditions.split(',') 98 | model = args.model 99 | prompt_type = args.prompt_type 100 | if model == 'llama-3-70b-chat': 101 | model = 'llama-3-70b-chat-hf' 102 | elif model == 'llama-3-70b': 103 | model = 'meta-llama-3-70b' 104 | elif model == 'llama3.1-405b': 105 | model = 'Meta-Llama-3.1-405B-Instruct-Turbo' 106 | elif model == 'llama3.1-70b': 107 | model = 'Meta-Llama-3.1-70B-Instruct-Turbo' 108 | max_tokens = args.max_tokens 109 | 110 | for task in tasks: 111 | for condition in conditions: 112 | name = f'{task}_{condition}' 113 | d = solve_file(name, model=model, temperature=0.0, max_tokens=max_tokens, prompt_type=prompt_type) 114 | if d is not None: 115 | print(f'{name}, {model}: {d["acc"]:.2f}') 116 | 117 | -------------------------------------------------------------------------------- /run_o1.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import argparse 4 | from tqdm import tqdm 5 | import os 6 | logging.getLogger().setLevel(logging.INFO) 7 | from openai import OpenAI,BadRequestError 8 | client = OpenAI() 9 | 10 | def o1_responses(prompt_list): 11 | responses = [] 12 | completion_tokens = [] 13 | for prompt in tqdm(prompt_list): 14 | try: 15 | response = client.chat.completions.create( 16 | model="o1-preview", 17 | messages=[ 18 | { 19 | "role": "user", 20 | "content": prompt 21 | } 22 | ] 23 | ) 24 | responses.append(response.choices[0].message.content) 25 | completion_tokens.append(response.usage.completion_tokens_details["reasoning_tokens"]) 26 | except BadRequestError: 27 | response = "BLOCKED_BY_OPENAI" 28 | responses.append(response) 29 | completion_tokens.append(0) 30 | except Exception as e: 31 | print(e) 32 | response = "ERROR" 33 | 34 | return responses, completion_tokens 35 | 36 | def solve_file(name, model): 37 | # o1 does not require CoT prompts 38 | file = f'stimuli/standard/{name}.jsonl' 39 | if not os.path.exists(file): 40 | print(f'File {file} does not exist') 41 | return None 42 | with open(file, 'r') as f: 43 | lines = f.readlines() 44 | lines = [json.loads(line) for line in lines] 45 | print(file) 46 | prompts = [line['instruction_plus_input'] for line in lines][:50] 47 | gts = [line['correct_output'] for line in lines][:50] 48 | 49 | res, completion_tokens = o1_responses(prompts) 50 | mean_tokens = sum(completion_tokens)/len(completion_tokens) 51 | 52 | # These accs are not what we use in the paper - they're just for quick estimates. 53 | # The stats used in the paper are computed in the evaluation/ folder 54 | accs = [(gt.replace('"', "") in r.replace('"', "")) for r, gt in zip(res, gts)] 55 | acc = sum(accs) / len(accs) 56 | print("Completion tokens", mean_tokens) 57 | 58 | d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc, 'mean_completion_tokens':mean_tokens} 59 | 60 | output_file = f'logs/standard/{model}' 61 | with open(output_file, 'w') as f: 62 | json.dump(d, f) 63 | 64 | return d 65 | 66 | 67 | def parse_args(): 68 | args = argparse.ArgumentParser() 69 | args.add_argument('--tasks', type=str, required=True, help='split by comma') 70 | args.add_argument('--conditions', type=str, required=True, help='split by comma') 71 | args.add_argument('--model', type=str, default='o1-preview-2024-09-12') 72 | 73 | args = args.parse_args() 74 | return args 75 | 76 | if __name__ == '__main__': 77 | args = parse_args() 78 | tasks = args.tasks.split(',') 79 | conditions = args.conditions.split(',') 80 | model = args.model 81 | 82 | for task in tasks: 83 | for condition in conditions: 84 | name = f'{task}_{condition}' 85 | d = solve_file(name, model=model) 86 | if d is not None: 87 | print(f'{name}, {model}: {d["acc"]:.2f}') 88 | print("Completion tokens", d["mean_completion_tokens"]) 89 | 90 | -------------------------------------------------------------------------------- /run_openai.py: -------------------------------------------------------------------------------- 1 | from models.openai_help import gpts 2 | import logging 3 | import json 4 | import argparse 5 | import os 6 | 7 | logging.getLogger().setLevel(logging.WARNING) 8 | 9 | def edit_distance(s1: str, s2: str) -> int: 10 | """Compute the Levenshtein distance between two strings.""" 11 | if len(s1) < len(s2): 12 | return edit_distance(s2, s1) 13 | if len(s2) == 0: 14 | return len(s1) 15 | previous_row = range(len(s2) + 1) 16 | for i, c1 in enumerate(s1): 17 | current_row = [i + 1] 18 | for j, c2 in enumerate(s2): 19 | insertions = previous_row[j + 1] + 1 20 | deletions = current_row[j] + 1 21 | substitutions = previous_row[j] + (c1 != c2) 22 | current_row.append(min(insertions, deletions, substitutions)) 23 | previous_row = current_row 24 | return previous_row[-1] 25 | 26 | 27 | def solve_file(name, model, temperature, max_tokens, prompt_type): 28 | file = f'stimuli/{prompt_type}/{name}.jsonl' 29 | print(f"Loading {file}") 30 | if not os.path.exists(file): 31 | print(f'File {file} does not exist') 32 | return None 33 | with open(file, 'r') as f: 34 | lines = f.readlines() 35 | lines = [json.loads(line) for line in lines] 36 | prompts = [line['instruction_plus_input'] for line in lines] 37 | gts = ['"' + line['correct_output'] + '"' for line in lines] 38 | res = gpts(prompts, model=model, temperature=temperature, max_tokens=max_tokens) 39 | accs = [(r == gt) for r, gt in zip(res, gts)] 40 | eds = [edit_distance(r, gt) for r, gt in zip(res, gts)] 41 | acc = sum(accs) / len(accs) 42 | ed = sum(eds) / len(eds) 43 | print(f"Done {name}") 44 | d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc, 'eds': eds, 'ed': ed} 45 | 46 | fo_directory = f'logs/{prompt_type}/{model}' 47 | if not os.path.exists(fo_directory): 48 | os.makedirs(fo_directory, exist_ok=True) 49 | 50 | output_file = f'{fo_directory}/{name}.json' 51 | with open(output_file, 'w') as f: 52 | json.dump(d, f) 53 | 54 | return d 55 | 56 | 57 | def parse_args(): 58 | args = argparse.ArgumentParser() 59 | args.add_argument('--tasks', type=str, required=True, help='split by comma') 60 | args.add_argument('--conditions', type=str, required=True, help='split by comma') 61 | args.add_argument('--model', type=str, default='gpt-4-0613') 62 | args.add_argument('--max_tokens', type=int, help='default = 200', default=200) 63 | args.add_argument('--temperature', type=float, help='default = 0.0', default=0.0) 64 | args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot") 65 | args = args.parse_args() 66 | return args 67 | 68 | if __name__ == '__main__': 69 | args = parse_args() 70 | tasks = args.tasks.split(',') 71 | conditions = args.conditions.split(',') 72 | model = args.model 73 | max_tokens = args.max_tokens 74 | temperature = args.temperature 75 | prompt_type = args.prompt_type 76 | 77 | for task in tasks: 78 | for condition in conditions: 79 | name = f'{task}_{condition}' 80 | d = solve_file(name, model=model, temperature=temperature, max_tokens=max_tokens, prompt_type=prompt_type) 81 | if d is not None: 82 | print(f'{name}, {model}: {d["acc"]:.2f} ({d["ed"]:.2f})') 83 | 84 | -------------------------------------------------------------------------------- /seven_letter_words/README.md: -------------------------------------------------------------------------------- 1 | ## Dataset 2 | 1. First, run `python random_token_combos.py`. This generates `random_pairs_lower.txt`, which lists all words that fulfill the following criteria: 3 | - 7 letters long 4 | - 2 subword tokens long (using the tokenizer that both GPT-3.5 and GPT-4 use; it needs to be 2 tokens long whether the word follows a space or not) 5 | - The first subword token is 3 letters long, and the second is 4 letters long (again, these lengths need to be identical whether the word follows a space or not). 6 | 2. Then, sort these words by the probability assigned to them by GPT-2 by running `python gpt2_prob_sevenletter.py`. This generates `random_pairs_lower_scored.txt`, which lists each word along with a log probability. The log probability is computed as the log probability that GPT-2 assigns to the sentence `The word is "WORD"`, minus the log probability that it assigns to `The word is "'; thus, this yields the log probability assigned to just the word and the following quotation mark in the context of `The word is "`. The closing quotation mark is included because it serves to indicate the end of the word. 7 | 3. Then, bin the words by running `python select_words.py` to create `words_5bins.txt`. 8 | 4. The final list of words can be found in `bin1_prob.txt`, `bin2_prob.txt`, `bin3_prob.txt`, `bin4_prob.txt`, and `bin5_prob.txt`. 9 | -------------------------------------------------------------------------------- /seven_letter_words/bin1_prob.txt: -------------------------------------------------------------------------------- 1 | choosed -14.997272491455078 2 | colbert -14.996980667114258 3 | polenta -14.99655532836914 4 | modicum -15.007698059082031 5 | autarch -14.99172592163086 6 | schisms -14.989496231079102 7 | mariner -15.0106201171875 8 | disarms -15.0106201171875 9 | rescale -14.989356994628906 10 | paywall -14.986217498779297 11 | infobox -14.98541259765625 12 | preston -15.015327453613281 13 | shrines -15.016551971435547 14 | implore -14.982894897460938 15 | alloted -15.01815414428711 16 | precast -15.020370483398438 17 | borings -14.978897094726562 18 | bacilli -15.022220611572266 19 | matrice -15.022846221923828 20 | redible -14.974870681762695 21 | absolve -15.026111602783203 22 | ourself -14.973335266113281 23 | ethetic -15.026788711547852 24 | maynard -15.027372360229492 25 | calibur -15.027730941772461 26 | enviros -15.02823257446289 27 | calzone -14.970394134521484 28 | sumatra -14.96739387512207 29 | drywall -15.033981323242188 30 | impaled -14.965522766113281 31 | manland -15.03862190246582 32 | divined -14.960699081420898 33 | conlang -14.959224700927734 34 | tablero -14.95616340637207 35 | redraft -14.955455780029297 36 | equitas -15.044797897338867 37 | ratting -14.953641891479492 38 | errancy -15.04793930053711 39 | webcast -14.94735336303711 40 | lowland -15.053237915039062 41 | boyhood -15.053678512573242 42 | actuary -14.945014953613281 43 | catlike -15.055164337158203 44 | putback -15.056617736816406 45 | galileo -14.942996978759766 46 | rivaled -15.057003021240234 47 | volonte -14.942134857177734 48 | sunspot -15.059274673461914 49 | rotunda -14.940404891967773 50 | notched -15.06007194519043 51 | taproot -14.935928344726562 52 | secures -15.066566467285156 53 | entente -14.93320083618164 54 | outflow -15.066858291625977 55 | betters -15.067663192749023 56 | rumpled -14.930889129638672 57 | burried -15.070535659790039 58 | repulse -14.92904281616211 59 | fillets -14.926876068115234 60 | relator -14.92681884765625 61 | sombody -15.074382781982422 62 | unsaved -15.074520111083984 63 | ailment -15.075027465820312 64 | nodules -15.075050354003906 65 | montero -14.922632217407227 66 | satires -15.080968856811523 67 | arcadia -14.916393280029297 68 | valerie -14.915924072265625 69 | inglish -15.085016250610352 70 | dukedom -15.086551666259766 71 | espouse -14.913402557373047 72 | bedevil -14.911296844482422 73 | reticle -15.089393615722656 74 | matinee -15.089693069458008 75 | maxwell -14.909908294677734 76 | picante -14.90963363647461 77 | baboons -14.908744812011719 78 | exciter -15.092048645019531 79 | losings -14.907678604125977 80 | newbies -14.906318664550781 81 | serried -14.90548324584961 82 | curving -14.904655456542969 83 | narrows -15.09649658203125 84 | ragging -14.901836395263672 85 | baneful -15.099411010742188 86 | pinatas -14.89979362487793 87 | divison -15.100841522216797 88 | kinfolk -14.898719787597656 89 | indiana -14.898597717285156 90 | caritas -14.8953857421875 91 | silvery -14.893852233886719 92 | inkling -14.893333435058594 93 | absense -15.10746955871582 94 | lavabit -14.890359878540039 95 | outsize -14.88975715637207 96 | rewired -15.111268997192383 97 | absalom -15.113567352294922 98 | getback -15.114919662475586 99 | accuser -14.884925842285156 100 | striven -15.115121841430664 101 | maloney -15.116886138916016 102 | escaper -14.882984161376953 103 | subtile -15.119136810302734 104 | colibri -14.879827499389648 105 | delving -14.87982177734375 106 | calving -14.879753112792969 107 | tarheel -14.878677368164062 108 | herders -14.876302719116211 109 | grooved -14.875177383422852 110 | octagon -15.125707626342773 111 | bisping -15.126806259155273 112 | alluded -14.872251510620117 113 | merlion -15.128215789794922 114 | figural -15.129623413085938 115 | debater -14.869804382324219 116 | pigtail -14.867530822753906 117 | honious -15.13395881652832 118 | pinches -15.135322570800781 119 | clojure -14.863956451416016 120 | equates -14.861526489257812 121 | refiner -15.138694763183594 122 | billets -15.140663146972656 123 | alfalfa -15.141242980957031 124 | hotshot -14.858383178710938 125 | nonagon -15.142745971679688 126 | jacuzzi -14.857048034667969 127 | vincent -15.143632888793945 128 | pollock -14.855628967285156 129 | airtime -14.85552978515625 -------------------------------------------------------------------------------- /seven_letter_words/bin2_prob.txt: -------------------------------------------------------------------------------- 1 | dupasha -22.5 2 | makrita -22.499996185302734 3 | ferisse -22.499996185302734 4 | murcers -22.49999237060547 5 | metires -22.49999237060547 6 | witmost -22.50000762939453 7 | astause -22.50000762939453 8 | sekaram -22.500011444091797 9 | vilgren -22.500015258789062 10 | belomat -22.500019073486328 11 | setnest -22.499977111816406 12 | curadal -22.49997329711914 13 | viridon -22.50002670288086 14 | denpick -22.50002670288086 15 | eraully -22.50003433227539 16 | ruborie -22.500041961669922 17 | queimer -22.499950408935547 18 | cosuits -22.499950408935547 19 | rutamen -22.499942779541016 20 | graizen -22.499942779541016 21 | sonware -22.500057220458984 22 | infocos -22.500057220458984 23 | inkwang -22.49993896484375 24 | rowbots -22.499935150146484 25 | engeden -22.500064849853516 26 | vizizen -22.50006866455078 27 | molenci -22.499927520751953 28 | indotes -22.499927520751953 29 | dapener -22.500076293945312 30 | ireasti -22.50008773803711 31 | undving -22.499900817871094 32 | traumpt -22.499900817871094 33 | redrear -22.500099182128906 34 | aryanni -22.499897003173828 35 | brovoir -22.500102996826172 36 | greised -22.499893188476562 37 | networm -22.499889373779297 38 | memwill -22.500110626220703 39 | gamplus -22.499881744384766 40 | estplay -22.499881744384766 41 | sapwhat -22.500118255615234 42 | indmong -22.500118255615234 43 | kenafil -22.5001220703125 44 | denzhou -22.5001220703125 45 | cosited -22.5001220703125 46 | perzoek -22.500125885009766 47 | balinit -22.500125885009766 48 | mayonal -22.499866485595703 49 | armemic -22.499866485595703 50 | henjury -22.500133514404297 51 | lavplay -22.500141143798828 52 | calynes -22.49985122680664 53 | remfold -22.50014877319336 54 | engdist -22.50014877319336 55 | armrich -22.50014877319336 56 | luxfast -22.499847412109375 57 | mulhatt -22.49984359741211 58 | allaton -22.49984359741211 59 | strfair -22.50015640258789 60 | monachs -22.50015640258789 61 | kerapat -22.50015640258789 62 | hergrim -22.50015640258789 63 | fidgota -22.50015640258789 64 | decigan -22.500160217285156 65 | dezella -22.499835968017578 66 | haypath -22.500164031982422 67 | resonga -22.499820709228516 68 | nosband -22.499820709228516 69 | poligen -22.500179290771484 70 | mobture -22.49981689453125 71 | flufrom -22.50018310546875 72 | willose -22.49980926513672 73 | desedge -22.50019073486328 74 | momclub -22.499805450439453 75 | clobero -22.499801635742188 76 | mapauth -22.499797821044922 77 | vitelho -22.500205993652344 78 | daykick -22.500205993652344 79 | sysmite -22.500213623046875 80 | telolon -22.50021743774414 81 | onsensa -22.50021743774414 82 | vipaddy -22.500225067138672 83 | sunrink -22.500225067138672 84 | namhero -22.500225067138672 85 | voratio -22.499771118164062 86 | niliter -22.499771118164062 87 | droones -22.499767303466797 88 | zipcord -22.500232696533203 89 | pagrete -22.500232696533203 90 | funwich -22.500232696533203 91 | negbers -22.499759674072266 92 | belwich -22.499759674072266 93 | allayah -22.499759674072266 94 | pakatak -22.500240325927734 95 | farathy -22.500240325927734 96 | betweek -22.500244140625 97 | rutanim -22.500247955322266 98 | obsster -22.500255584716797 99 | ligigid -22.500255584716797 100 | lidcore -22.500255584716797 101 | vacassa -22.499740600585938 102 | pipiday -22.499736785888672 103 | almorum -22.499736785888672 104 | sadmore -22.500263214111328 105 | hayhorn -22.49972915649414 106 | vinango -22.49972152709961 107 | cosisty -22.50027847290039 108 | libikal -22.499713897705078 109 | dogodes -22.500286102294922 110 | camcore -22.500286102294922 111 | ashmann -22.500286102294922 112 | fibunal -22.500289916992188 113 | enciere -22.499706268310547 114 | revrika -22.49969482421875 115 | perburg -22.500308990478516 116 | camilan -22.500308990478516 117 | sumarms -22.50031280517578 118 | firigin -22.500316619873047 119 | pelatra -22.499675750732422 120 | vorvery -22.500328063964844 121 | purabra -22.500328063964844 122 | indondo -22.50033187866211 123 | dogpeak -22.50033187866211 124 | alllein -22.50033187866211 125 | actblue -22.49966049194336 126 | hasvers -22.50033950805664 127 | freifty -22.499652862548828 128 | hueving -22.500347137451172 129 | coratti -22.499649047851562 130 | saprika -22.499645233154297 131 | honcoin -22.499645233154297 132 | joycons -22.50035858154297 133 | dogoids -22.50035858154297 134 | nanians -22.499637603759766 135 | dreanon -22.499637603759766 136 | spoanna -22.4996337890625 137 | levieur -22.4996337890625 138 | jawolla -22.5003662109375 139 | cowcard -22.5003662109375 140 | thehalb -22.499629974365234 141 | lamboys -22.499629974365234 142 | disorer -22.499629974365234 143 | pigwiki -22.500370025634766 144 | embious -22.500370025634766 145 | detdden -22.500370025634766 146 | vacibel -22.499622344970703 -------------------------------------------------------------------------------- /seven_letter_words/bin3_prob.txt: -------------------------------------------------------------------------------- 1 | tasvinc -30.0 2 | dblshaw -29.999996185302734 3 | cmbodka -29.999996185302734 4 | zagbbox -30.000003814697266 5 | hedoute -30.000003814697266 6 | cmsdest -30.00000762939453 7 | leoanje -29.999988555908203 8 | sitinks -29.999984741210938 9 | oweorno -29.999984741210938 10 | advpite -29.999984741210938 11 | grpwerk -30.000015258789062 12 | aesasio -29.999980926513672 13 | atequir -30.000019073486328 14 | dryhazi -30.000022888183594 15 | styansa -29.99997329711914 16 | sunincl -30.00002670288086 17 | bowamac -30.00002670288086 18 | xyzunik -29.999969482421875 19 | awsposs -30.000030517578125 20 | ogrmode -29.99996566772461 21 | midbyss -29.99996566772461 22 | ctlmony -29.99996566772461 23 | rngmony -30.00003433227539 24 | rergett -29.999961853027344 25 | phperti -29.999961853027344 26 | bfdizzy -30.000041961669922 27 | srcstit -29.999950408935547 28 | pktubic -29.999950408935547 29 | oddourd -29.999950408935547 30 | mplnick -29.999950408935547 31 | dccergy -29.999942779541016 32 | oxyhest -30.000057220458984 33 | klepled -29.99993896484375 34 | digydro -29.99993896484375 35 | aphopez -29.99993896484375 36 | rifntag -30.00006103515625 37 | srvlope -29.999935150146484 38 | emoomez -29.999935150146484 39 | toyelry -30.000064849853516 40 | iniilen -30.000064849853516 41 | iffamma -30.000064849853516 42 | adsokin -29.99993133544922 43 | eofpike -30.00006866455078 44 | dnsavia -30.00006866455078 45 | uitlesi -30.000072479248047 46 | owluntu -30.000072479248047 47 | affesda -29.999923706054688 48 | mgrulia -30.000080108642578 49 | foxmsgs -30.000080108642578 50 | esiaram -30.000080108642578 51 | subzyst -29.999916076660156 52 | ottexpo -30.000083923339844 53 | udpcolo -29.999908447265625 54 | vakdney -29.99990463256836 55 | svmvery -29.99990463256836 56 | dspereo -29.99990463256836 57 | pngpone -30.00009536743164 58 | quiilyn -29.999900817871094 59 | tgtella -30.000102996826172 60 | ithueur -30.000102996826172 61 | wynvinc -30.000106811523438 62 | sezanch -30.000106811523438 63 | sdkjabi -30.000106811523438 64 | yaninem -29.999889373779297 65 | dbgivid -29.999889373779297 66 | adeardu -29.999889373779297 67 | paykich -30.000110626220703 68 | dspdeal -30.000110626220703 69 | cptwipe -30.000110626220703 70 | nikaign -29.99988555908203 71 | pesuell -30.00011444091797 72 | musropp -30.00011444091797 73 | ebxside -30.00011444091797 74 | dnienez -30.000118255615234 75 | dccscal -30.000118255615234 76 | cmbheck -30.000118255615234 77 | stsasks -29.999874114990234 78 | hapixer -29.99987030029297 79 | nikuild -30.00012969970703 80 | wowrapy -30.000133514404297 81 | txtajes -30.000133514404297 82 | gtkoooo -30.000133514404297 83 | sutcmds -30.000137329101562 84 | erviode -29.999858856201172 85 | bewikon -30.000141143798828 86 | hubphas -29.99985122680664 87 | ervpets -29.99985122680664 88 | ofsitem -29.99984359741211 89 | gstivec -29.99984359741211 90 | utfestr -30.00015640258789 91 | etaabic -30.00015640258789 92 | tieibur -29.999839782714844 93 | islssel -30.000160217285156 94 | iodvari -30.000160217285156 95 | zagzept -29.999835968017578 96 | ustjour -29.999835968017578 97 | dexonte -29.999835968017578 98 | bizfilt -29.999835968017578 99 | adaowns -29.999835968017578 100 | tetibri -30.000164031982422 101 | octfirm -29.999828338623047 102 | weiudos -30.000171661376953 103 | pwdtick -30.000171661376953 104 | ttlarry -29.99981689453125 105 | stuimeo -29.999813079833984 106 | sqlstre -29.999813079833984 107 | mieipeg -29.999813079833984 108 | dueafen -29.999813079833984 109 | sndurge -29.99980926513672 110 | vezcorn -30.00019073486328 111 | ilketch -29.999805450439453 112 | zugenth -30.000194549560547 113 | rngiate -30.000194549560547 114 | ottclud -30.000194549560547 115 | aprkeep -30.000194549560547 116 | urlveal -30.000198364257812 117 | msgourd -30.000198364257812 118 | xlsboom -29.999797821044922 119 | wijagma -29.999797821044922 120 | robisbn -29.999797821044922 121 | melmlin -29.999797821044922 122 | samslot -30.000202178955078 123 | nidoust -29.999794006347656 124 | begkits -29.999794006347656 125 | arrflix -29.999794006347656 126 | ditfrau -30.000205993652344 127 | aidomid -30.000205993652344 128 | cptfoto -29.99979019165039 129 | aimrede -29.99979019165039 130 | dbgabay -30.00020980834961 131 | cidlocs -30.00020980834961 132 | booiedo -30.000221252441406 133 | mplders -29.999774932861328 134 | cptpush -30.000225067138672 135 | nahcalc -29.999767303466797 136 | amyovel -29.999767303466797 137 | wonczas -30.00023651123047 138 | mplrome -30.00023651123047 139 | edxesis -30.00023651123047 140 | adcadoo -30.00023651123047 141 | oudtems -29.999759674072266 142 | ociirut -29.999759674072266 143 | balzept -29.999759674072266 144 | avgcorp -29.999759674072266 145 | himocos -30.000240325927734 146 | ignlots -29.999755859375 147 | baztrim -29.999755859375 -------------------------------------------------------------------------------- /seven_letter_words/bin4_prob.txt: -------------------------------------------------------------------------------- 1 | voyxfff -37.500118255615234 2 | qtyijke -37.50014877319336 3 | mmculed -37.50022888183594 4 | jmpytut -37.500362396240234 5 | vtkprit -37.500396728515625 6 | oilrxjs -37.50044631958008 7 | vfsisex -37.499473571777344 8 | eenqrst -37.49935531616211 9 | nbrlyph -37.50071334838867 10 | xmmgota -37.49924850463867 11 | jmpquiv -37.49921798706055 12 | rummqtt -37.50099182128906 13 | xhrdisp -37.49892044067383 14 | ffturaa -37.498897552490234 15 | dexocht -37.50111770629883 16 | xmmgett -37.501121520996094 17 | lvljspx -37.49882125854492 18 | zugwpdb -37.501182556152344 19 | tidmqtt -37.49877166748047 20 | lhsigua -37.498714447021484 21 | sshemsp -37.50141525268555 22 | burrgyz -37.49848556518555 23 | vtkirie -37.498477935791016 24 | vtkifar -37.501522064208984 25 | rpczano -37.50154495239258 26 | vtkinez -37.501609802246094 27 | vtkifie -37.49838638305664 28 | zugymce -37.50162124633789 29 | xcbwent -37.49831008911133 30 | watobjs -37.49827194213867 31 | doiawks -37.49827194213867 32 | cgiacyj -37.498165130615234 33 | czyands -37.501853942871094 34 | mdbgebn -37.49811553955078 35 | atejspx -37.50190353393555 36 | rndxito -37.49806594848633 37 | sdkrxjs -37.501953125 38 | mlxoice -37.501956939697266 39 | mlxahan -37.50198745727539 40 | auxjspx -37.5020751953125 41 | jsxirms -37.50211715698242 42 | czyrgba -37.49782943725586 43 | makrgyz -37.5021858215332 44 | nanighb -37.49776840209961 45 | jsxobil -37.502262115478516 46 | jwtgraf -37.49773406982422 47 | vtkundy -37.49770736694336 48 | jsxuden -37.49759292602539 49 | pszglfw -37.50242233276367 50 | czydamn -37.49753952026367 51 | csvylko -37.502559661865234 52 | wijincl -37.497379302978516 53 | oilrgyz -37.49725341796875 54 | mlxulan -37.497215270996094 55 | xmmepar -37.50278854370117 56 | lodxlsx -37.502803802490234 57 | uczpeon -37.502864837646484 58 | sesrgyz -37.49709701538086 59 | pciavax -37.497066497802734 60 | gpsilik -37.497066497802734 61 | lhszion -37.49706268310547 62 | slaampp -37.49705505371094 63 | uczhtag -37.502952575683594 64 | ouiqrst -37.50295639038086 65 | xhrziel -37.49697494506836 66 | pcbpiar -37.49697494506836 67 | yumxfff -37.49691390991211 68 | fedjspb -37.50309371948242 69 | xmmtega -37.49677658081055 70 | segzoek -37.50347137451172 71 | mezgrpc -37.503543853759766 72 | xcbophe -37.503658294677734 73 | ngxantz -37.49628829956055 74 | aosantd -37.49628829956055 75 | jejymax -37.50380325317383 76 | rerlsru -37.50386428833008 77 | racrgyz -37.50387954711914 78 | rndquam -37.4961051940918 79 | mlxneau -37.50391387939453 80 | rudcych -37.503944396972656 81 | lotlsru -37.50399398803711 82 | abyilog -37.496002197265625 83 | rsaueba -37.504032135009766 84 | jsxioso -37.49593734741211 85 | derjspx -37.50411605834961 86 | vfsgett -37.49586486816406 87 | vtkjure -37.495849609375 88 | phyepar -37.4958381652832 89 | vesxfff -37.5041618347168 90 | lcdleri -37.50421142578125 91 | ifsfeas -37.49577713012695 92 | mmcubbo -37.50423812866211 93 | ircemsp -37.49563217163086 94 | pdbiesz -37.495601654052734 95 | rpciene -37.49557876586914 96 | iodpiar -37.50454330444336 97 | rmslsru -37.504615783691406 98 | rpcumno -37.50465774536133 99 | apkckpt -37.50466537475586 100 | lcdvoir -37.495269775390625 101 | rhsncia -37.50473403930664 102 | owlsetq -37.4952278137207 103 | ifsbrtc -37.50477600097656 104 | csvowej -37.495140075683594 105 | xcborgt -37.495121002197266 106 | sutmobx -37.495079040527344 107 | iovstmt -37.50493240356445 108 | nanmqtt -37.504947662353516 109 | irqphem -37.504947662353516 110 | wndncia -37.494964599609375 111 | xcbided -37.49495315551758 112 | jsxkees -37.49488067626953 113 | cpscsrf -37.494773864746094 114 | jmppeon -37.49476623535156 115 | lhsreta -37.5052375793457 116 | dezrgyz -37.50527572631836 117 | elecsrf -37.50535202026367 118 | atrlymp -37.505374908447266 119 | iodudev -37.494544982910156 120 | xhrkses -37.505516052246094 121 | ngxjspx -37.49443435668945 122 | uczpear -37.49442672729492 123 | npmhlen -37.49440002441406 124 | pcmncmp -37.505611419677734 125 | biczoek -37.49436569213867 126 | dosorrh -37.50564956665039 127 | jejmisc -37.49434280395508 128 | kenjspx -37.494293212890625 129 | idxiaux -37.505767822265625 130 | svgiesz -37.494205474853516 131 | vtkgems -37.49415969848633 132 | glmldre -37.49413299560547 133 | dexumbn -37.50587844848633 134 | kitxfff -37.49406814575195 135 | jsxajan -37.4940071105957 136 | fmtmina -37.49399185180664 137 | gtkthew -37.49397659301758 138 | czyuess -37.50605010986328 139 | iodhait -37.49386978149414 140 | cafantd -37.506141662597656 141 | xcbredo -37.49382400512695 142 | fpswpdb -37.50624465942383 143 | xcbdogs -37.50633239746094 144 | jwtlify -37.493656158447266 145 | rsaellt -37.493629455566406 146 | pkgughs -37.50637435913086 147 | jmpccak -37.49350357055664 148 | pclvais -37.49347686767578 -------------------------------------------------------------------------------- /seven_letter_words/bin5_prob.txt: -------------------------------------------------------------------------------- 1 | czyjspx -44.995792388916016 2 | xcbabwe -45.006473541259766 3 | aktjspx -44.99137878417969 4 | xcbcych -44.979515075683594 5 | xcbziej -45.07548141479492 6 | xmmeczy -44.91748046875 7 | qeddhcp -45.09950637817383 8 | xcbilha -44.897335052490234 9 | xcbacji -44.8853874206543 10 | xcbzung -45.1260871887207 11 | xmmobre -44.83869552612305 12 | xcbquir -45.17741775512695 13 | xcbrouw -45.2041015625 14 | ilkjspx -45.20814895629883 15 | lijglfw -44.79149627685547 16 | foxrgyz -45.21918869018555 17 | jsxrouw -44.767459869384766 18 | xcbziel -45.23471450805664 19 | xcbagua -44.763145446777344 20 | eidtopl -45.24649429321289 21 | xcbximo -44.73112106323242 22 | jwtglfw -44.719486236572266 23 | xcbnerg -44.71344757080078 24 | xcbateg -44.693031311035156 25 | befjspx -44.69113540649414 26 | xcbxlim -44.65083694458008 27 | xcbsemi -44.63022994995117 28 | ketglfw -45.387977600097656 29 | lemjspx -44.60933303833008 30 | xcbcyan -44.60453414916992 31 | xcbsequ -45.410953521728516 32 | xcbemer -45.411563873291016 33 | eoscsrf -44.56328201293945 34 | xcbphot -44.541378021240234 35 | xcbeken -44.509586334228516 36 | xcbolum -44.500850677490234 37 | xcbrodu -45.50664520263672 38 | tepjspx -44.49314880371094 39 | xcbthro -44.48517990112305 40 | xcbueue -44.48493957519531 41 | oscquiv -44.44233322143555 42 | xcbubah -45.56185531616211 43 | xcbodzi -44.43584060668945 44 | mlxquee -45.57368850708008 45 | xcbmdat -45.59005355834961 46 | xcbuell -44.409183502197266 47 | xcbobre -44.40824890136719 48 | xcbuhan -44.403106689453125 49 | tasexpl -45.62323760986328 50 | xcbueil -44.36052322387695 51 | xcbilos -45.64400100708008 52 | iodtopl -45.644203186035156 53 | suttmpl -44.34950637817383 54 | xcbhots -44.319889068603516 55 | xcbosph -44.319034576416016 56 | xcbuego -44.309486389160156 57 | xcbquam -44.30044174194336 58 | kolglfw -44.29965591430664 59 | gesglfw -44.296722412109375 60 | gccorrh -44.29584503173828 61 | mezptom -44.289695739746094 62 | xcbhecy -45.71607971191406 63 | xcbsemb -44.264095306396484 64 | yiijspx -44.26384353637695 65 | meljspx -44.260704040527344 66 | xcbunos -45.74428939819336 67 | xcbunei -44.22948455810547 68 | pisbrtc -44.21781539916992 69 | vehjspx -44.210479736328125 70 | vasrgyz -44.190887451171875 71 | lhsrgyz -44.180213928222656 72 | xcbighb -45.82477951049805 73 | phyfidf -44.17029571533203 74 | kilglfw -45.8333625793457 75 | dukvrir -44.16157150268555 76 | levjspx -44.15993881225586 77 | updrgyz -44.14170837402344 78 | xcbagas -44.1334228515625 79 | opcrgyz -44.13212585449219 80 | ilkjspb -44.12828063964844 81 | curfidf -44.114540100097656 82 | rpcighb -45.8897590637207 83 | xcbacje -44.10778045654297 84 | xcbilih -45.9096794128418 85 | zugcsrf -44.060035705566406 86 | xcbveau -44.05826187133789 87 | rpcasje -44.04568862915039 88 | xcbalsy -44.04135513305664 89 | pcmrouw -44.037845611572266 90 | xcbafil -44.035858154296875 91 | doijspx -44.03323745727539 92 | xcbhtub -44.029544830322266 93 | xcbhear -45.983673095703125 94 | xcbuele -45.988529205322266 95 | opijspx -43.99332809448242 96 | xcbazzo -43.992305755615234 97 | xcboufl -46.008460998535156 98 | akojspx -43.9888801574707 99 | ninmqtt -43.98078536987305 100 | xcbguna -43.96329879760742 101 | idxorrh -43.9370002746582 102 | xcbheit -43.93656921386719 103 | czyxfff -43.92329406738281 104 | voyglfw -43.90713882446289 105 | dynmqtt -43.902496337890625 106 | xcbcoln -46.09786605834961 107 | vezjspx -43.87360763549805 108 | xcbocre -46.13079071044922 109 | cueorrh -43.85930633544922 110 | xmmacje -43.854305267333984 111 | mlxalsy -43.84138870239258 112 | ebxorrh -43.837650299072266 113 | xcbagal -43.82956314086914 114 | xcbzept -43.82637405395508 115 | xcbucle -43.81629180908203 116 | vesjspx -43.8125 117 | xcbiser -43.809242248535156 118 | xcbseau -43.80495834350586 119 | xcbekte -43.8006477355957 120 | lapmqtt -43.79780960083008 121 | abyjspx -43.78347396850586 122 | xcbueba -46.222286224365234 123 | xcbijke -43.77728271484375 124 | xcbvoie -43.76816940307617 125 | xcbudem -43.76424026489258 126 | xcbivol -46.23701095581055 127 | xcbquoi -43.75960159301758 128 | xcbupal -43.75864791870117 129 | zugjspx -43.75846481323242 130 | xcbheel -46.244380950927734 131 | typglfw -43.74939727783203 132 | rpcinqu -43.74385452270508 133 | voyorrh -43.73942947387695 134 | tieglfw -43.73161315917969 135 | hexmqtt -43.7115592956543 136 | xcbacyj -43.708465576171875 137 | aktjspb -43.69775390625 138 | amyjspx -43.6917610168457 139 | ackrgyz -43.690940856933594 140 | xcbokus -43.688011169433594 141 | xcbhtag -43.65958023071289 142 | togjspx -43.652225494384766 143 | xcbuely -43.64830780029297 144 | xcbffic -43.64610290527344 145 | mlxasje -43.64008331298828 146 | xcbunft -43.63233184814453 147 | wieglfw -43.62156677246094 148 | xcbufig -43.615196228027344 149 | xcbueur -43.613521575927734 150 | zagmqtt -43.60862350463867 -------------------------------------------------------------------------------- /seven_letter_words/gpt2_prob_sevenletter.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 4 | import tiktoken 5 | import logging 6 | 7 | logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")]) 8 | 9 | if torch.cuda.is_available(): 10 | device = "cuda" 11 | else: 12 | device = "cpu" 13 | 14 | gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl") 15 | gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device) 16 | gpt4_enc = tiktoken.get_encoding("cl100k_base") 17 | 18 | def pad_batch(batch, pad_idx): 19 | max_length = 0 20 | for seq in batch: 21 | if len(seq) > max_length: 22 | max_length = len(seq) 23 | 24 | new_batch = [] 25 | for seq in batch: 26 | padding = [pad_idx for i in range(max_length - len(seq))] 27 | new_batch.append(seq + padding) 28 | 29 | return new_batch 30 | 31 | # Get perplexity using GPT-2 32 | def prob_gpt2(sentence_list): 33 | 34 | # Tokenize the sentences 35 | all_tokens = [] 36 | for sentence in sentence_list: 37 | tokens = gpt2_tokenizer.encode(sentence) 38 | all_tokens.append(tokens) 39 | tokens = pad_batch(all_tokens, 50256) 40 | 41 | targets = tokens[:] 42 | 43 | # Compute average log likelihood for the generation 44 | input_ids = torch.LongTensor(tokens).to(device) 45 | target_ids = torch.LongTensor(targets).to(device) 46 | 47 | with torch.no_grad(): 48 | outputs = gpt2_model(input_ids, labels=target_ids) 49 | logits = outputs[1] 50 | logits = logits.transpose(0,1)[:-1].transpose(0,1) 51 | target_ids = target_ids.transpose(0,1)[1:].transpose(0,1) 52 | loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1)) 53 | loss = loss.reshape(target_ids.shape).sum(dim=1) 54 | neg_log_likelihood = -1*loss 55 | 56 | 57 | # 13.357776641845703 = logprob('The word is"'); removing this to just get 58 | # the word prob 59 | return neg_log_likelihood + 13.357776641845703 60 | 61 | 62 | for finame in ["random_pairs_lower"]: 63 | fi = open(finame + ".txt", "r") 64 | fo = open(finame + "_scored.txt", "w") 65 | 66 | words_with_prob = [] 67 | 68 | this_batch_sentences = [] 69 | this_batch_words = [] 70 | for index, line in enumerate(fi): 71 | if index % 10000 == 0: 72 | logging.info(str(index)) 73 | 74 | word = line.strip() 75 | 76 | tokens = gpt4_enc.encode(word) 77 | tokens_spaced = gpt4_enc.encode(" " + word) 78 | 79 | if len(tokens) == 2 and len(tokens_spaced) == 2 and len(word) == 7: 80 | token1 = gpt4_enc.decode([tokens[0]]).strip() 81 | token2 = gpt4_enc.decode([tokens[1]]).strip() 82 | 83 | tokenspaced1 = gpt4_enc.decode([tokens_spaced[0]]).strip() 84 | tokenspaced2 = gpt4_enc.decode([tokens_spaced[1]]).strip() 85 | 86 | if len(token1) == 3 and len(token2) == 4 and len(tokenspaced1) == 3 and len(tokenspaced2) == 4: 87 | this_batch_sentences.append('The word is "' + word + '"') 88 | this_batch_words.append(word) 89 | else: 90 | print(index, "Wrong length", word, len(token1), len(token2), len(tokenspaced1), len(tokenspaced2)) 91 | else: 92 | print(index, "Wrong length", word, len(tokens), len(tokens_spaced), len(word)) 93 | 94 | if len(this_batch_sentences) == 3000: 95 | logprobs = prob_gpt2(this_batch_sentences) 96 | for word, logprob in zip(this_batch_words, logprobs): 97 | words_with_prob.append([logprob.item(), word]) 98 | this_batch_sentences = [] 99 | this_batch_words = [] 100 | 101 | if len(this_batch_sentences) > 0: 102 | logprobs = prob_gpt2(this_batch_sentences) 103 | for word, logprob in zip(this_batch_words, logprobs): 104 | words_with_prob.append([logprob.item(), word]) 105 | this_batch_sentences = [] 106 | this_batch_words = [] 107 | 108 | for prob, word in sorted(words_with_prob)[::-1]: 109 | fo.write(str(prob) + "\t" + word + "\n") 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /seven_letter_words/input_scored.txt: -------------------------------------------------------------------------------- 1 | mryycon -33.4009895324707 2 | myvlobd -33.97520446777344 3 | zyvoxdk -42.584041595458984 4 | wynsmew -30.42465591430664 5 | kedkbmr -34.5813102722168 6 | cmrscwc -35.037437438964844 7 | wkbsxob -39.7088508605957 8 | nsckbwc -36.846317291259766 9 | bocmkvo -39.4564094543457 10 | zkigkvv -42.60762405395508 11 | sxpylyh -35.01577377319336 12 | zbocdyx -42.305076599121094 13 | crbsxoc -38.06560134887695 14 | swzvybo -37.10409164428711 15 | kvvydon -34.48329162597656 16 | zbomkcd -39.263160705566406 17 | lybsxqc -42.85857391357422 18 | lkmsvvs -36.32170867919922 19 | wkdbsmo -36.486995697021484 20 | bonslvo -34.44765090942383 21 | klcyvfo -40.78521728515625 22 | yebcovp -34.895774841308594 23 | odrodsm -34.4904670715332 24 | wkixkbn -40.274925231933594 25 | mkvsleb -35.38396072387695 26 | oxfsbyc -39.76297378540039 27 | mkvjyxo -41.52894592285156 28 | cewkdbk -40.9130973815918 29 | nbigkvv -39.965938568115234 30 | swzkvon -37.17292404174805 31 | wkxvkxn -40.5903205871582 32 | nsfsxon -38.56913757324219 33 | myxvkxq -43.61052703857422 34 | dklvoby -37.12471389770508 35 | bonbkpd -34.73928451538086 36 | oaesdkc -42.28152084350586 37 | bkddsxq -40.66632843017578 38 | obbkxmi -41.12004470825195 39 | golmkcd -35.0498161315918 40 | vygvkxn -44.4805908203125 41 | lyiryyn -34.94938659667969 42 | kmdekbi -37.709075927734375 43 | mkdvsuo -38.730655670166016 44 | zedlkmu -36.18986511230469 45 | qkvsvoy -42.10275650024414 46 | bsfkvon -35.44425582885742 47 | fyvyxdo -42.35395812988281 48 | cexczyd -39.47360610961914 49 | bydexnk -37.63880157470703 50 | xydmron -34.41005325317383 51 | dkzbyyd -40.756038665771484 52 | comeboc -25.816699981689453 53 | oxdoxdo -36.08238220214844 54 | yedpvyg -38.156593322753906 55 | loddobc -34.54991912841797 56 | bewzvon -36.644874572753906 57 | lebbson -29.779876708984375 58 | bozevco -32.21761703491211 59 | psvvodc -39.616676330566406 60 | bovkdyb -44.093204498291016 61 | cywlyni -31.04549789428711 62 | exckfon -32.31464767456055 63 | ksvwoxd -38.20396423339844 64 | xynevoc -36.27880096435547 65 | wyxdoby -34.911991119384766 66 | ckdsboc -34.813270568847656 67 | kbmknsk -35.1013069152832 68 | fkvobso -37.41843032836914 69 | sxqvscr -38.215450286865234 70 | neuonyw -33.95500946044922 71 | oczyeco -34.99745178222656 72 | lonofsv -34.87450408935547 73 | bodsmvo -37.443511962890625 74 | wkdsxoo -35.750003814697266 75 | wkhgovv -39.59453201293945 76 | zsmkxdo -41.09931564331055 77 | lklyyxc -38.89170455932617 78 | ohmsdob -35.976707458496094 79 | vycsxqc -43.72141647338867 80 | xoglsoc -33.673892974853516 81 | cobbson -27.039085388183594 82 | mebfsxq -43.0044059753418 83 | xkbbygc -36.12342071533203 84 | bkqqsxq -38.069557189941406 85 | lkxopev -37.932037353515625 86 | zsxkdkc -42.61277389526367 87 | nsfscyx -34.74595642089844 88 | usxpyvu -37.7536735534668 89 | sxnskxk -36.24659729003906 90 | mkbsdkc -36.59408950805664 91 | csvfobi -32.64740753173828 92 | sxuvsxq -40.83545684814453 93 | klcoxco -35.745391845703125 94 | vkfklsd -37.36144256591797 95 | yedcsjo -37.90355682373047 96 | bogsbon -28.6097412109375 97 | klckvyw -38.35791015625 98 | qodlkmu -39.189537048339844 99 | kmmecob -35.23679733276367 100 | cdbsfox -33.410850524902344 101 | -------------------------------------------------------------------------------- /seven_letter_words/random_token_combos.py: -------------------------------------------------------------------------------- 1 | 2 | import tiktoken 3 | 4 | enc = tiktoken.get_encoding("cl100k_base") 5 | 6 | alphabet = "abcdefghijklmnopqrstuvwxyz" 7 | alphabet_dict = {} 8 | for char in alphabet: 9 | alphabet_dict[char] = 1 10 | 11 | def is_roman_lower(string): 12 | for char in string: 13 | if char not in alphabet_dict: 14 | return False 15 | return True 16 | 17 | all_threes_lower = [] 18 | all_fours_lower = [] 19 | 20 | 21 | for i in range(100256): 22 | token = enc.decode([i]) 23 | if len(token) == 4: 24 | if token[0] == " " and is_roman_lower(token[1:]): 25 | all_threes_lower.append(token) 26 | elif is_roman_lower(token): 27 | all_fours_lower.append(token) 28 | 29 | print(len(all_threes_lower), len(all_fours_lower), len(all_threes_lower)*len(all_fours_lower)) 30 | print(all_threes_lower[:10]) 31 | print(all_fours_lower[:10]) 32 | print("") 33 | 34 | fo_lower = open("random_pairs_lower.txt", "w") 35 | 36 | for start in all_threes_lower: 37 | for end in all_fours_lower: 38 | candidate = start.strip() + end.strip() 39 | tokens_unspaced = enc.encode(candidate) 40 | tokens_spaced = enc.encode(" " + candidate) 41 | 42 | if len(tokens_unspaced) == 2 and len(tokens_spaced) == 2: 43 | if len(enc.decode([tokens_unspaced[0]]).strip()) == 3 and len(enc.decode([tokens_unspaced[1]]).strip()) == 4 and len(enc.decode([tokens_spaced[0]]).strip()) == 3 and len(enc.decode([tokens_spaced[1]]).strip()) == 4: 44 | fo_lower.write(start.strip() + end.strip() + "\n") 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /seven_letter_words/select_words.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | all_scores, all_words = [], [] 4 | 5 | with open("seven_letter_words/random_pairs_lower_scored.txt", "r") as f: 6 | lines = f.readlines() 7 | for line in lines: 8 | score, word = line.split() 9 | all_scores.append(float(score)) 10 | all_words.append(word) 11 | 12 | # Function to select 100 words closest to a given score 13 | def select_closest_words(score, num_words=150): 14 | 15 | # Sort the scores based on proximity to the target score 16 | sorted_indices = sorted(range(len(all_scores)), key=lambda i: abs(all_scores[i] - score)) 17 | 18 | # Select the 100 closest words 19 | selected_indices = sorted_indices[:num_words] 20 | selected_words = [all_words[i] for i in selected_indices] 21 | scores = [all_scores[i] for i in selected_indices] 22 | 23 | return [selected_words, scores] 24 | 25 | # Select 100 words closest to each specified score level 26 | selected_words_closest_to_levels = {} 27 | selected_words = [] 28 | for score_level in [-15, -22.5, -30, -37.5, -45]: 29 | selected_words_closest_to_levels[score_level] = select_closest_words(score_level) 30 | selected_words += selected_words_closest_to_levels[score_level][0] 31 | 32 | selected_words = set(selected_words) 33 | print("Number of selected words: " + str(len(selected_words))) 34 | 35 | with open("seven_letter_words/words_5bins.txt", "w") as f: 36 | for score in [-15, -22.5, -30, -37.5, -45]: 37 | for word,sc in zip(selected_words_closest_to_levels[score][0], selected_words_closest_to_levels[score][1]): 38 | f.write(word + " " + str(sc) + "\n") 39 | -------------------------------------------------------------------------------- /stimulus_generator.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import os 3 | import random 4 | import argparse 5 | 6 | # Functions for encoding in rot-1 or rot-3 7 | alphabet = "abcdefghijklmnopqrstuvwxyz" 8 | index2char = {} 9 | char2index = {} 10 | for index, char in enumerate(alphabet): 11 | index2char[index] = char 12 | char2index[char] = index 13 | 14 | 15 | def rot_encode(sequence, n): 16 | new_sequence = [] 17 | for char in sequence: 18 | if not char.isalpha(): 19 | new_sequence.append(char) 20 | elif char.isupper(): 21 | index = char2index[char.lower()] 22 | new_char = index2char[(index+n) % 26] 23 | new_sequence.append(new_char.upper()) 24 | else: 25 | index = char2index[char] 26 | new_char = index2char[(index+n) % 26] 27 | new_sequence.append(new_char) 28 | return "".join(new_sequence) 29 | 30 | 31 | def create_chain(sequence, n): 32 | chain = [] 33 | for index, char in enumerate(sequence): 34 | new_char = rot_encode(char, 26-n) 35 | chain.append(str(index+1) + ". " + char + " -> " + new_char + "\n") 36 | return "".join(chain) 37 | 38 | 39 | def create_math_cot_chain(sequence, n): 40 | s = f'''Let’s start by writing the letter-position mapping for the alphabet: 41 | a -> 0 42 | b -> 1 43 | c -> 2 44 | d -> 3 45 | e -> 4 46 | f -> 5 47 | g -> 6 48 | h -> 7 49 | i -> 8 50 | j -> 9 51 | k -> 10 52 | l -> 11 53 | m -> 12 54 | n -> 13 55 | o -> 14 56 | p -> 15 57 | q -> 16 58 | r -> 17 59 | s -> 18 60 | t -> 19 61 | u -> 20 62 | v -> 21 63 | w -> 22 64 | x -> 23 65 | y -> 24 66 | z -> 25 67 | 68 | Next, we find the encoded letter as follows: 69 | Position of original letter = (Position of given letter − {n}) mod 26 70 | 71 | Then map the found position to the corresponding letter using the letter-position mapping. 72 | 73 | Using this,\n 74 | ''' 75 | chain = [] 76 | for index, char in enumerate(sequence): 77 | new_char = rot_encode(char, 26-n) 78 | chain.append(str(index+1) + ". " + char + " -> " + 79 | f"({char2index[char]} - {n}) mod 26" " -> " + new_char + "\n") 80 | return s + "".join(chain) 81 | 82 | 83 | def create_number_cot_chain(sequence, n): 84 | s = f''' 85 | New position = (Given position − {n}) mod 26 86 | Using this,\n 87 | ''' 88 | chain = [] 89 | for index, char in enumerate(sequence): 90 | new_char = rot_encode(char, 26-n) 91 | chain.append(str(index+1) + ". " + str(char2index[char]) + " -> " + 92 | f"({char2index[char]} - {n}) mod 26" " -> " + str(char2index[new_char]) + "\n") 93 | return s + "".join(chain) 94 | 95 | 96 | def create_step_chain_forward(sequence, n): 97 | chain = [] 98 | for index, char in enumerate(sequence): 99 | new_char = rot_encode(char, 26-n) 100 | start_ord, end_ord = ord(char), ord(new_char) 101 | part_chain = "" 102 | if char == new_char: 103 | part_chain = new_char + " -> " + new_char 104 | else: 105 | if start_ord > end_ord: 106 | if char.isupper(): 107 | end_ord = ord("Z") 108 | else: 109 | end_ord = ord("z") 110 | for char_ord in range(start_ord, end_ord+1, 1): 111 | part_chain += chr(char_ord) 112 | if char_ord != end_ord: 113 | part_chain += " -> " 114 | if char_ord != ord(new_char): 115 | part_chain += " -> " 116 | if char.isupper(): 117 | start_ord = ord("A") 118 | else: 119 | start_ord = ord("a") 120 | for char_ord in range(start_ord, ord(new_char)+1, 1): 121 | part_chain += chr(char_ord) 122 | if char_ord != ord(new_char): 123 | part_chain += " -> " 124 | 125 | chain.append(str(index+1) + ". " + part_chain + "\n") 126 | return "".join(chain) 127 | 128 | 129 | def create_math_corrupt_chain(sequence, n): 130 | chain = [] 131 | s = f'''Let’s start by writing the letter-position mapping for the alphabet: 132 | a -> 0 133 | b -> 1 134 | c -> 2 135 | d -> 3 136 | e -> 4 137 | f -> 5 138 | g -> 6 139 | h -> 7 140 | i -> 8 141 | j -> 9 142 | k -> 10 143 | l -> 11 144 | m -> 12 145 | n -> 13 146 | o -> 14 147 | p -> 15 148 | q -> 16 149 | r -> 17 150 | s -> 18 151 | t -> 19 152 | u -> 20 153 | v -> 21 154 | w -> 22 155 | x -> 23 156 | y -> 24 157 | z -> 25 158 | 159 | Next, we find the encoded letter as follows: 160 | Position of original letter = (Position of given letter − {n}) mod 26 161 | 162 | Then map the found position to the corresponding letter using the letter-position mapping. 163 | 164 | Using this,\n 165 | ''' 166 | chain = [] 167 | for index, char in enumerate(sequence): 168 | new_char = '*' 169 | chain.append(str(index+1) + ". " + char + " -> " + 170 | f"({char2index[char]} - {n}) mod 26" " -> " + new_char + "\n") 171 | return s + "".join(chain) 172 | 173 | 174 | def create_corrupt_chain(sequence, n): 175 | chain = [] 176 | for index, char in enumerate(sequence): 177 | # random character, letter, or number, punctuation 178 | # candidates = list(alphabet) + [x.upper() for x in list(alphabet)] + list("0123456789") + list(".,?!:;\"'()[]{}") 179 | # replace 50% of the time 180 | # if random.random() < 0.5: 181 | # new_char = random.choice(candidates) 182 | # else: 183 | # new_char = rot_encode(char, 26-n) 184 | # if not char.isalpha(): 185 | # new_char = char 186 | # else: 187 | new_char = "*" 188 | chain.append(str(index+1) + ". " + char + " -> " + new_char + "\n") 189 | return "".join(chain) 190 | 191 | 192 | # print(rot_encode("stay", 1)) 193 | # print(rot_encode("stay", 3)) 194 | 195 | 196 | def create_swap_chain(sequence, n): 197 | chain = [] 198 | for index, char in enumerate(sequence): 199 | new_char = rot_encode(char, 26-n) 200 | chain.append(str(index+1) + ". " + char + " -> " + new_char + "\n") 201 | return "".join(chain) 202 | 203 | 204 | def string_to_seq(msg): 205 | seq = "" 206 | for char in msg: 207 | seq += str(char2index[char]) + "," 208 | return seq[:-1] 209 | 210 | def main(args): 211 | data = [ 212 | ("examples/bin_1.txt", "bin1"), 213 | ("examples/bin_2.txt", "bin2"), 214 | ("examples/bin_3.txt", "bin3"), 215 | ("examples/bin_4.txt", "bin4"), 216 | ("examples/bin_5.txt", "bin5") 217 | ] 218 | prompt_type = args.prompt_type 219 | fo_directory = f"stimuli/{prompt_type}" 220 | 221 | if not os.path.exists(fo_directory): 222 | os.makedirs(fo_directory, exist_ok=True) 223 | 224 | for shift in range(1, 26): 225 | for task in ["dec"]: 226 | for fi_name, fi_label in data: 227 | fo_name = f"{fo_directory}/{prompt_type+str(shift)}_{fi_label}.jsonl" 228 | 229 | fi = open(fi_name, "r") 230 | fo = open(fo_name, "w") 231 | jsl = jsonlines.Writer(fo) 232 | 233 | count_encoded = 0 234 | for line_num, line in enumerate(fi): 235 | example = {} 236 | 237 | # Task 238 | example["task_name"] = "rot-" + str(shift) 239 | 240 | # Condition 241 | example_type = fo_name.split("_")[1].split(".")[0] 242 | example["example_type"] = example_type 243 | 244 | word = line.strip().split("\t")[0] 245 | sentence = word 246 | # sentence1 = line.strip().split("\t")[0] 247 | encoded = rot_encode(word, shift) 248 | 249 | # Instruction 250 | if task == "dec": 251 | if shift == 1: 252 | if prompt_type == "standard": 253 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ' along with the original text that it was created from:\nRot-' + str( 254 | shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\nOriginal text: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message to produce the original text:\nRot-' + str(shift) + ' text: "%s"\nOriginal text:' 255 | elif prompt_type == "text_cot": 256 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str( 257 | shift) + ' position backward.' + create_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "Stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"' 258 | elif prompt_type == "math_cot": 259 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we need to shift each letter ' + str( 260 | shift) + ' position backward. ' + create_math_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"' 261 | elif prompt_type == "number_cot": 262 | example["task_instruction"] = 'Shift-' + str(shift) + ' is a process in which each number is shifted ' + str(shift) + ' position forward until it reaches 26 and subsequently circles back to 1. For example, here is a sequence of numbers written in shift-' + str(shift) + ':\shift-' + str(shift) + ' sequence: "' + string_to_seq(rot_encode("stay", shift)) + '"\n\nTo decode this sequence, we need to shift each number ' + str( 263 | shift) + ' position backward. ' + create_number_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original sequence of numbers is: ' + f'"{string_to_seq("stay")}"' + '\n\nHere is another sequence of numbers in shift-' + str(shift) + '. Decode this sequence one number at a time. On the last line, write the words "Original sequence:" followed by the decoded sequence:\nshift-' + str(shift) + ' sequence: "%s"' 264 | else: 265 | if prompt_type == "standard": 266 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ' along with the original text that it was created from:\nRot-' + str( 267 | shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\nOriginal text: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message to produce the original text:\nRot-' + str(shift) + ' text: "%s"\nOriginal text:' 268 | elif prompt_type == "text_cot": 269 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str( 270 | shift) + ' positions backward:\n' + create_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"' 271 | elif prompt_type == "cot_hidden_1": 272 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\n\nTo decode this message, we shift each letter ' + str(shift) + " positions backward; but instead of revealing what each letter becomes, we will replace it with a '*' until we write the final answer:\n" + create_corrupt_chain( 273 | rot_encode("Stay here!", shift), shift) + """\nIf we put together the letters that were hidden behind each '*', we get that the original text is: "Stay here!"\n\nHere is another message in rot-""" + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"' 274 | elif prompt_type == "math_cot": 275 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we need to shift each letter ' + str( 276 | shift) + ' positions backward. ' + create_math_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"' 277 | elif prompt_type == "number_cot": 278 | example["task_instruction"] = 'Shift-' + str(shift) + ' is a process in which each number is shifted ' + str(shift) + ' positions forward until it reaches 26 and subsequently circles back to 1. For example, here is a sequence of numbers written in shift-' + str(shift) + ':\shift-' + str(shift) + ' sequence: "' + string_to_seq(rot_encode("stay", shift)) + '"\n\nTo decode this sequence, we need to shift each number ' + str( 279 | shift) + ' positions backward. ' + create_number_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original sequence of numbers is:' + f'"{string_to_seq("stay")}"' + '\n\nHere is another sequence of numbers in shift-' + str(shift) + '. Decode this sequence one number at a time. On the last line, write the words "Original sequence:" followed by the decoded sequence:\nshift-' + str(shift) + ' sequence: "%s"' 280 | elif prompt_type == "one-step-fwd": 281 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\n\nTo decode this message, we shift each letter ' + str( 282 | 26-shift) + ' positions forward one step at a time:\n' + create_step_chain_forward(rot_encode("Stay here!", shift), shift) + '\nTherefore, the original text is: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"' 283 | elif prompt_type == "math_swap": 284 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift+1) + '"\n\nTo decode this message, we shift each letter ' + str( 285 | shift) + ' positions backward:\n' + create_math_cot_chain(rot_encode("stay", shift+1), shift+1) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"' 286 | elif prompt_type == "math_corrupt": 287 | example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str(shift) + " positions backward; but instead of revealing what each letter becomes, we will replace it with a '*' until we write the final answer:\n" + create_math_corrupt_chain( 288 | rot_encode("stay", shift), shift) + """\nIf we put together the letters that were hidden behind each '*', we get that the original text is: "stay"\n\nHere is another message in rot-""" + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"' 289 | 290 | # Input and correct output 291 | if task == "dec": 292 | example["input"] = encoded 293 | example["correct_output"] = sentence 294 | else: 295 | example["input"] = sentence 296 | example["correct_output"] = encoded 297 | 298 | # Combining the instruction and input (this is the string that should be given to the model) 299 | example["instruction_plus_input"] = example["task_instruction"] % example["input"] 300 | 301 | jsl.write(example) 302 | 303 | count_encoded += 1 304 | if count_encoded == 100: 305 | break 306 | 307 | if __name__ == "__main__": 308 | args = argparse.ArgumentParser() 309 | args.add_argument("--prompt_type", type=str, help="Prompt type to use", default="text_cot", choices=["standard", "text_cot", "math_cot", "number_cot"]) 310 | args = args.parse_args() 311 | main(args) --------------------------------------------------------------------------------