├── .gitignore
├── README.md
├── assets
└── preview.png
├── eval.py
├── examples
├── bin_1.txt
├── bin_2.txt
├── bin_3.txt
├── bin_4.txt
├── bin_5.txt
├── select_swap_words.py
└── word_pairs_lowbins.txt
├── logs
├── basic
│ ├── claude-3
│ │ ├── basic10_bin1_temp=0.0.json
│ │ ├── basic10_bin2_temp=0.0.json
│ │ ├── basic10_bin3_temp=0.0.json
│ │ ├── basic10_bin4_temp=0.0.json
│ │ ├── basic10_bin5_temp=0.0.json
│ │ ├── basic11_bin1_temp=0.0.json
│ │ ├── basic11_bin2_temp=0.0.json
│ │ ├── basic11_bin3_temp=0.0.json
│ │ ├── basic11_bin4_temp=0.0.json
│ │ ├── basic11_bin5_temp=0.0.json
│ │ ├── basic12_bin1_temp=0.0.json
│ │ ├── basic12_bin2_temp=0.0.json
│ │ ├── basic12_bin3_temp=0.0.json
│ │ ├── basic12_bin4_temp=0.0.json
│ │ ├── basic12_bin5_temp=0.0.json
│ │ ├── basic13_bin1_temp=0.0.json
│ │ ├── basic13_bin2_temp=0.0.json
│ │ ├── basic13_bin3_temp=0.0.json
│ │ ├── basic13_bin4_temp=0.0.json
│ │ ├── basic13_bin5_temp=0.0.json
│ │ ├── basic14_bin1_temp=0.0.json
│ │ ├── basic14_bin2_temp=0.0.json
│ │ ├── basic14_bin3_temp=0.0.json
│ │ ├── basic14_bin4_temp=0.0.json
│ │ ├── basic14_bin5_temp=0.0.json
│ │ ├── basic15_bin1_temp=0.0.json
│ │ ├── basic15_bin2_temp=0.0.json
│ │ ├── basic15_bin3_temp=0.0.json
│ │ ├── basic15_bin4_temp=0.0.json
│ │ ├── basic15_bin5_temp=0.0.json
│ │ ├── basic16_bin1_temp=0.0.json
│ │ ├── basic16_bin2_temp=0.0.json
│ │ ├── basic16_bin3_temp=0.0.json
│ │ ├── basic16_bin4_temp=0.0.json
│ │ ├── basic16_bin5_temp=0.0.json
│ │ ├── basic17_bin1_temp=0.0.json
│ │ ├── basic17_bin2_temp=0.0.json
│ │ ├── basic17_bin3_temp=0.0.json
│ │ ├── basic17_bin4_temp=0.0.json
│ │ ├── basic17_bin5_temp=0.0.json
│ │ ├── basic18_bin1_temp=0.0.json
│ │ ├── basic18_bin2_temp=0.0.json
│ │ ├── basic18_bin3_temp=0.0.json
│ │ ├── basic18_bin4_temp=0.0.json
│ │ ├── basic18_bin5_temp=0.0.json
│ │ ├── basic19_bin1_temp=0.0.json
│ │ ├── basic19_bin2_temp=0.0.json
│ │ ├── basic19_bin3_temp=0.0.json
│ │ ├── basic19_bin4_temp=0.0.json
│ │ ├── basic19_bin5_temp=0.0.json
│ │ ├── basic1_bin1_temp=0.0.json
│ │ ├── basic1_bin2_temp=0.0.json
│ │ ├── basic1_bin3_temp=0.0.json
│ │ ├── basic1_bin4_temp=0.0.json
│ │ ├── basic1_bin5_temp=0.0.json
│ │ ├── basic20_bin1_temp=0.0.json
│ │ ├── basic20_bin2_temp=0.0.json
│ │ ├── basic20_bin3_temp=0.0.json
│ │ ├── basic20_bin4_temp=0.0.json
│ │ ├── basic20_bin5_temp=0.0.json
│ │ ├── basic21_bin1_temp=0.0.json
│ │ ├── basic21_bin2_temp=0.0.json
│ │ ├── basic21_bin3_temp=0.0.json
│ │ ├── basic21_bin4_temp=0.0.json
│ │ ├── basic21_bin5_temp=0.0.json
│ │ ├── basic22_bin1_temp=0.0.json
│ │ ├── basic22_bin2_temp=0.0.json
│ │ ├── basic22_bin3_temp=0.0.json
│ │ ├── basic22_bin4_temp=0.0.json
│ │ ├── basic22_bin5_temp=0.0.json
│ │ ├── basic23_bin1_temp=0.0.json
│ │ ├── basic23_bin2_temp=0.0.json
│ │ ├── basic23_bin3_temp=0.0.json
│ │ ├── basic23_bin4_temp=0.0.json
│ │ ├── basic23_bin5_temp=0.0.json
│ │ ├── basic24_bin1_temp=0.0.json
│ │ ├── basic24_bin2_temp=0.0.json
│ │ ├── basic24_bin3_temp=0.0.json
│ │ ├── basic24_bin4_temp=0.0.json
│ │ ├── basic24_bin5_temp=0.0.json
│ │ ├── basic25_bin1_temp=0.0.json
│ │ ├── basic25_bin2_temp=0.0.json
│ │ ├── basic25_bin3_temp=0.0.json
│ │ ├── basic25_bin4_temp=0.0.json
│ │ ├── basic25_bin5_temp=0.0.json
│ │ ├── basic2_bin1_temp=0.0.json
│ │ ├── basic2_bin2_temp=0.0.json
│ │ ├── basic2_bin3_temp=0.0.json
│ │ ├── basic2_bin4_temp=0.0.json
│ │ ├── basic2_bin5_temp=0.0.json
│ │ ├── basic3_bin1_temp=0.0.json
│ │ ├── basic3_bin2_temp=0.0.json
│ │ ├── basic3_bin3_temp=0.0.json
│ │ ├── basic3_bin4_temp=0.0.json
│ │ ├── basic3_bin5_temp=0.0.json
│ │ ├── basic4_bin1_temp=0.0.json
│ │ ├── basic4_bin2_temp=0.0.json
│ │ ├── basic4_bin3_temp=0.0.json
│ │ ├── basic4_bin4_temp=0.0.json
│ │ ├── basic4_bin5_temp=0.0.json
│ │ ├── basic5_bin1_temp=0.0.json
│ │ ├── basic5_bin2_temp=0.0.json
│ │ ├── basic5_bin3_temp=0.0.json
│ │ ├── basic5_bin4_temp=0.0.json
│ │ ├── basic5_bin5_temp=0.0.json
│ │ ├── basic6_bin1_temp=0.0.json
│ │ ├── basic6_bin2_temp=0.0.json
│ │ ├── basic6_bin3_temp=0.0.json
│ │ ├── basic6_bin4_temp=0.0.json
│ │ ├── basic6_bin5_temp=0.0.json
│ │ ├── basic7_bin1_temp=0.0.json
│ │ ├── basic7_bin2_temp=0.0.json
│ │ ├── basic7_bin3_temp=0.0.json
│ │ ├── basic7_bin4_temp=0.0.json
│ │ ├── basic7_bin5_temp=0.0.json
│ │ ├── basic8_bin1_temp=0.0.json
│ │ ├── basic8_bin2_temp=0.0.json
│ │ ├── basic8_bin3_temp=0.0.json
│ │ ├── basic8_bin4_temp=0.0.json
│ │ ├── basic8_bin5_temp=0.0.json
│ │ ├── basic9_bin1_temp=0.0.json
│ │ ├── basic9_bin2_temp=0.0.json
│ │ ├── basic9_bin3_temp=0.0.json
│ │ ├── basic9_bin4_temp=0.0.json
│ │ ├── basic9_bin5_temp=0.0.json
│ │ └── results.jsonl
│ ├── llama3.1-405b
│ │ ├── basic10_bin1_temp=0.0.json
│ │ ├── basic10_bin2_temp=0.0.json
│ │ ├── basic10_bin3_temp=0.0.json
│ │ ├── basic10_bin4_temp=0.0.json
│ │ ├── basic10_bin5_temp=0.0.json
│ │ ├── basic11_bin1_temp=0.0.json
│ │ ├── basic11_bin2_temp=0.0.json
│ │ ├── basic11_bin3_temp=0.0.json
│ │ ├── basic11_bin4_temp=0.0.json
│ │ ├── basic11_bin5_temp=0.0.json
│ │ ├── basic12_bin1_temp=0.0.json
│ │ ├── basic12_bin2_temp=0.0.json
│ │ ├── basic12_bin3_temp=0.0.json
│ │ ├── basic12_bin4_temp=0.0.json
│ │ ├── basic12_bin5_temp=0.0.json
│ │ ├── basic13_bin1_temp=0.0.json
│ │ ├── basic13_bin2_temp=0.0.json
│ │ ├── basic13_bin3_temp=0.0.json
│ │ ├── basic13_bin4_temp=0.0.json
│ │ ├── basic13_bin5_temp=0.0.json
│ │ ├── basic14_bin1_temp=0.0.json
│ │ ├── basic14_bin2_temp=0.0.json
│ │ ├── basic14_bin3_temp=0.0.json
│ │ ├── basic14_bin4_temp=0.0.json
│ │ ├── basic14_bin5_temp=0.0.json
│ │ ├── basic15_bin1_temp=0.0.json
│ │ ├── basic15_bin2_temp=0.0.json
│ │ ├── basic15_bin3_temp=0.0.json
│ │ ├── basic15_bin4_temp=0.0.json
│ │ ├── basic15_bin5_temp=0.0.json
│ │ ├── basic16_bin1_temp=0.0.json
│ │ ├── basic16_bin2_temp=0.0.json
│ │ ├── basic16_bin3_temp=0.0.json
│ │ ├── basic16_bin4_temp=0.0.json
│ │ ├── basic16_bin5_temp=0.0.json
│ │ ├── basic17_bin1_temp=0.0.json
│ │ ├── basic17_bin2_temp=0.0.json
│ │ ├── basic17_bin3_temp=0.0.json
│ │ ├── basic17_bin4_temp=0.0.json
│ │ ├── basic17_bin5_temp=0.0.json
│ │ ├── basic18_bin1_temp=0.0.json
│ │ ├── basic18_bin2_temp=0.0.json
│ │ ├── basic18_bin3_temp=0.0.json
│ │ ├── basic18_bin4_temp=0.0.json
│ │ ├── basic18_bin5_temp=0.0.json
│ │ ├── basic19_bin1_temp=0.0.json
│ │ ├── basic19_bin2_temp=0.0.json
│ │ ├── basic19_bin3_temp=0.0.json
│ │ ├── basic19_bin4_temp=0.0.json
│ │ ├── basic19_bin5_temp=0.0.json
│ │ ├── basic1_bin1_temp=0.0.json
│ │ ├── basic1_bin2_temp=0.0.json
│ │ ├── basic1_bin3_temp=0.0.json
│ │ ├── basic1_bin4_temp=0.0.json
│ │ ├── basic1_bin5_temp=0.0.json
│ │ ├── basic20_bin1_temp=0.0.json
│ │ ├── basic20_bin2_temp=0.0.json
│ │ ├── basic20_bin3_temp=0.0.json
│ │ ├── basic20_bin4_temp=0.0.json
│ │ ├── basic20_bin5_temp=0.0.json
│ │ ├── basic21_bin1_temp=0.0.json
│ │ ├── basic21_bin2_temp=0.0.json
│ │ ├── basic21_bin3_temp=0.0.json
│ │ ├── basic21_bin4_temp=0.0.json
│ │ ├── basic21_bin5_temp=0.0.json
│ │ ├── basic22_bin1_temp=0.0.json
│ │ ├── basic22_bin2_temp=0.0.json
│ │ ├── basic22_bin3_temp=0.0.json
│ │ ├── basic22_bin4_temp=0.0.json
│ │ ├── basic22_bin5_temp=0.0.json
│ │ ├── basic23_bin1_temp=0.0.json
│ │ ├── basic23_bin2_temp=0.0.json
│ │ ├── basic23_bin3_temp=0.0.json
│ │ ├── basic23_bin4_temp=0.0.json
│ │ ├── basic23_bin5_temp=0.0.json
│ │ ├── basic24_bin1_temp=0.0.json
│ │ ├── basic24_bin2_temp=0.0.json
│ │ ├── basic24_bin3_temp=0.0.json
│ │ ├── basic24_bin4_temp=0.0.json
│ │ ├── basic24_bin5_temp=0.0.json
│ │ ├── basic25_bin1_temp=0.0.json
│ │ ├── basic25_bin2_temp=0.0.json
│ │ ├── basic25_bin3_temp=0.0.json
│ │ ├── basic25_bin4_temp=0.0.json
│ │ ├── basic25_bin5_temp=0.0.json
│ │ ├── basic2_bin1_temp=0.0.json
│ │ ├── basic2_bin2_temp=0.0.json
│ │ ├── basic2_bin3_temp=0.0.json
│ │ ├── basic2_bin4_temp=0.0.json
│ │ ├── basic2_bin5_temp=0.0.json
│ │ ├── basic3_bin1_temp=0.0.json
│ │ ├── basic3_bin2_temp=0.0.json
│ │ ├── basic3_bin3_temp=0.0.json
│ │ ├── basic3_bin4_temp=0.0.json
│ │ ├── basic3_bin5_temp=0.0.json
│ │ ├── basic4_bin1_temp=0.0.json
│ │ ├── basic4_bin2_temp=0.0.json
│ │ ├── basic4_bin3_temp=0.0.json
│ │ ├── basic4_bin4_temp=0.0.json
│ │ ├── basic4_bin5_temp=0.0.json
│ │ ├── basic5_bin1_temp=0.0.json
│ │ ├── basic5_bin2_temp=0.0.json
│ │ ├── basic5_bin3_temp=0.0.json
│ │ ├── basic5_bin4_temp=0.0.json
│ │ ├── basic5_bin5_temp=0.0.json
│ │ ├── basic6_bin1_temp=0.0.json
│ │ ├── basic6_bin2_temp=0.0.json
│ │ ├── basic6_bin3_temp=0.0.json
│ │ ├── basic6_bin4_temp=0.0.json
│ │ ├── basic6_bin5_temp=0.0.json
│ │ ├── basic7_bin1_temp=0.0.json
│ │ ├── basic7_bin2_temp=0.0.json
│ │ ├── basic7_bin3_temp=0.0.json
│ │ ├── basic7_bin4_temp=0.0.json
│ │ ├── basic7_bin5_temp=0.0.json
│ │ ├── basic8_bin1_temp=0.0.json
│ │ ├── basic8_bin2_temp=0.0.json
│ │ ├── basic8_bin3_temp=0.0.json
│ │ ├── basic8_bin4_temp=0.0.json
│ │ ├── basic8_bin5_temp=0.0.json
│ │ ├── basic9_bin1_temp=0.0.json
│ │ ├── basic9_bin2_temp=0.0.json
│ │ ├── basic9_bin3_temp=0.0.json
│ │ ├── basic9_bin4_temp=0.0.json
│ │ ├── basic9_bin5_temp=0.0.json
│ │ ├── results.jsonl
│ │ └── results1.jsonl
│ └── o1
│ │ ├── basic12_bin1_temp=0.0.json
│ │ ├── basic12_bin5_temp=0.0.json
│ │ ├── basic13_bin1_temp=0.0.json
│ │ ├── basic13_bin5_temp=0.0.json
│ │ ├── basic14_bin1_temp=0.0.json
│ │ └── basic14_bin5_temp=0.0.json
└── text_cot
│ ├── claude-3
│ ├── cot10_bin1_temp=0.0.json
│ ├── cot10_bin2_temp=0.0.json
│ ├── cot10_bin3_temp=0.0.json
│ ├── cot10_bin4_temp=0.0.json
│ ├── cot10_bin5_temp=0.0.json
│ ├── cot11_bin1_temp=0.0.json
│ ├── cot11_bin2_temp=0.0.json
│ ├── cot11_bin3_temp=0.0.json
│ ├── cot11_bin4_temp=0.0.json
│ ├── cot11_bin5_temp=0.0.json
│ ├── cot12_bin1_temp=0.0.json
│ ├── cot12_bin2_temp=0.0.json
│ ├── cot12_bin3_temp=0.0.json
│ ├── cot12_bin4_temp=0.0.json
│ ├── cot12_bin5_temp=0.0.json
│ ├── cot13_bin1_temp=0.0.json
│ ├── cot13_bin2_temp=0.0.json
│ ├── cot13_bin3_temp=0.0.json
│ ├── cot13_bin4_temp=0.0.json
│ ├── cot13_bin5_temp=0.0.json
│ ├── cot14_bin1_temp=0.0.json
│ ├── cot14_bin2_temp=0.0.json
│ ├── cot14_bin3_temp=0.0.json
│ ├── cot14_bin4_temp=0.0.json
│ ├── cot14_bin5_temp=0.0.json
│ ├── cot15_bin1_temp=0.0.json
│ ├── cot15_bin2_temp=0.0.json
│ ├── cot15_bin3_temp=0.0.json
│ ├── cot15_bin4_temp=0.0.json
│ ├── cot15_bin5_temp=0.0.json
│ ├── cot16_bin1_temp=0.0.json
│ ├── cot16_bin2_temp=0.0.json
│ ├── cot16_bin3_temp=0.0.json
│ ├── cot16_bin4_temp=0.0.json
│ ├── cot16_bin5_temp=0.0.json
│ ├── cot17_bin1_temp=0.0.json
│ ├── cot17_bin2_temp=0.0.json
│ ├── cot17_bin3_temp=0.0.json
│ ├── cot17_bin4_temp=0.0.json
│ ├── cot17_bin5_temp=0.0.json
│ ├── cot18_bin1_temp=0.0.json
│ ├── cot18_bin2_temp=0.0.json
│ ├── cot18_bin3_temp=0.0.json
│ ├── cot18_bin4_temp=0.0.json
│ ├── cot18_bin5_temp=0.0.json
│ ├── cot19_bin1_temp=0.0.json
│ ├── cot19_bin2_temp=0.0.json
│ ├── cot19_bin3_temp=0.0.json
│ ├── cot19_bin4_temp=0.0.json
│ ├── cot19_bin5_temp=0.0.json
│ ├── cot1_bin1_temp=0.0.json
│ ├── cot1_bin2_temp=0.0.json
│ ├── cot1_bin3_temp=0.0.json
│ ├── cot1_bin4_temp=0.0.json
│ ├── cot1_bin5_temp=0.0.json
│ ├── cot20_bin1_temp=0.0.json
│ ├── cot20_bin2_temp=0.0.json
│ ├── cot20_bin3_temp=0.0.json
│ ├── cot20_bin4_temp=0.0.json
│ ├── cot20_bin5_temp=0.0.json
│ ├── cot21_bin1_temp=0.0.json
│ ├── cot21_bin2_temp=0.0.json
│ ├── cot21_bin3_temp=0.0.json
│ ├── cot21_bin4_temp=0.0.json
│ ├── cot21_bin5_temp=0.0.json
│ ├── cot22_bin1_temp=0.0.json
│ ├── cot22_bin2_temp=0.0.json
│ ├── cot22_bin3_temp=0.0.json
│ ├── cot22_bin4_temp=0.0.json
│ ├── cot22_bin5_temp=0.0.json
│ ├── cot23_bin1_temp=0.0.json
│ ├── cot23_bin2_temp=0.0.json
│ ├── cot23_bin3_temp=0.0.json
│ ├── cot23_bin4_temp=0.0.json
│ ├── cot23_bin5_temp=0.0.json
│ ├── cot24_bin1_temp=0.0.json
│ ├── cot24_bin2_temp=0.0.json
│ ├── cot24_bin3_temp=0.0.json
│ ├── cot24_bin4_temp=0.0.json
│ ├── cot24_bin5_temp=0.0.json
│ ├── cot25_bin1_temp=0.0.json
│ ├── cot25_bin2_temp=0.0.json
│ ├── cot25_bin3_temp=0.0.json
│ ├── cot25_bin4_temp=0.0.json
│ ├── cot25_bin5_temp=0.0.json
│ ├── cot2_bin1_temp=0.0.json
│ ├── cot2_bin2_temp=0.0.json
│ ├── cot2_bin3_temp=0.0.json
│ ├── cot2_bin4_temp=0.0.json
│ ├── cot2_bin5_temp=0.0.json
│ ├── cot3_bin1_temp=0.0.json
│ ├── cot3_bin2_temp=0.0.json
│ ├── cot3_bin3_temp=0.0.json
│ ├── cot3_bin4_temp=0.0.json
│ ├── cot3_bin5_temp=0.0.json
│ ├── cot4_bin1_temp=0.0.json
│ ├── cot4_bin2_temp=0.0.json
│ ├── cot4_bin3_temp=0.0.json
│ ├── cot4_bin4_temp=0.0.json
│ ├── cot4_bin5_temp=0.0.json
│ ├── cot5_bin1_temp=0.0.json
│ ├── cot5_bin2_temp=0.0.json
│ ├── cot5_bin3_temp=0.0.json
│ ├── cot5_bin4_temp=0.0.json
│ ├── cot5_bin5_temp=0.0.json
│ ├── cot6_bin1_temp=0.0.json
│ ├── cot6_bin2_temp=0.0.json
│ ├── cot6_bin3_temp=0.0.json
│ ├── cot6_bin4_temp=0.0.json
│ ├── cot6_bin5_temp=0.0.json
│ ├── cot7_bin1_temp=0.0.json
│ ├── cot7_bin2_temp=0.0.json
│ ├── cot7_bin3_temp=0.0.json
│ ├── cot7_bin4_temp=0.0.json
│ ├── cot7_bin5_temp=0.0.json
│ ├── cot8_bin1_temp=0.0.json
│ ├── cot8_bin2_temp=0.0.json
│ ├── cot8_bin3_temp=0.0.json
│ ├── cot8_bin4_temp=0.0.json
│ ├── cot8_bin5_temp=0.0.json
│ ├── cot9_bin1_temp=0.0.json
│ ├── cot9_bin2_temp=0.0.json
│ ├── cot9_bin3_temp=0.0.json
│ ├── cot9_bin4_temp=0.0.json
│ ├── cot9_bin5_temp=0.0.json
│ └── results.jsonl
│ ├── gpt-4
│ ├── cot10_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot10_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot10_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot10_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot10_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot11_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot11_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot11_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot11_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot11_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot12_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot12_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot12_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot12_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot12_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot13_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot13_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot13_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot13_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot13_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot14_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot14_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot14_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot14_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot14_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot15_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot15_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot15_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot15_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot15_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot16_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot16_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot16_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot16_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot16_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot17_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot17_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot17_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot17_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot17_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot18_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot18_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot18_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot18_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot18_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot19_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot19_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot19_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot19_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot19_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot1_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot1_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot1_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot1_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot1_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot20_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot20_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot20_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot20_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot20_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot21_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot21_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot21_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot21_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot21_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot22_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot22_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot22_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot22_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot22_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot23_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot23_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot23_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot23_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot23_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot24_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot24_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot24_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot24_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot24_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot25_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot25_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot25_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot25_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot25_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot2_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot2_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot2_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot2_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot2_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot3_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot3_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot3_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot3_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot3_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot4_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot4_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot4_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot4_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot4_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot5_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot5_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot5_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot5_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot5_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot6_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot6_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot6_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot6_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot6_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot7_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot7_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot7_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot7_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot7_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot8_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot8_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot8_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot8_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot8_bin5_gpt-4-0613_temp=0.0.json
│ ├── cot9_bin1_gpt-4-0613_temp=0.0.json
│ ├── cot9_bin2_gpt-4-0613_temp=0.0.json
│ ├── cot9_bin3_gpt-4-0613_temp=0.0.json
│ ├── cot9_bin4_gpt-4-0613_temp=0.0.json
│ ├── cot9_bin5_gpt-4-0613_temp=0.0.json
│ └── results.jsonl
│ └── llama3.1-405b
│ ├── cot10_bin1_temp=0.0.json
│ ├── cot10_bin2_temp=0.0.json
│ ├── cot10_bin3_temp=0.0.json
│ ├── cot10_bin4_temp=0.0.json
│ ├── cot10_bin5_temp=0.0.json
│ ├── cot11_bin1_temp=0.0.json
│ ├── cot11_bin2_temp=0.0.json
│ ├── cot11_bin3_temp=0.0.json
│ ├── cot11_bin4_temp=0.0.json
│ ├── cot11_bin5_temp=0.0.json
│ ├── cot12_bin1_temp=0.0.json
│ ├── cot12_bin2_temp=0.0.json
│ ├── cot12_bin3_temp=0.0.json
│ ├── cot12_bin4_temp=0.0.json
│ ├── cot12_bin5_temp=0.0.json
│ ├── cot13_bin1_temp=0.0.json
│ ├── cot13_bin2_temp=0.0.json
│ ├── cot13_bin3_temp=0.0.json
│ ├── cot13_bin4_temp=0.0.json
│ ├── cot13_bin5_temp=0.0.json
│ ├── cot14_bin1_temp=0.0.json
│ ├── cot14_bin2_temp=0.0.json
│ ├── cot14_bin3_temp=0.0.json
│ ├── cot14_bin4_temp=0.0.json
│ ├── cot14_bin5_temp=0.0.json
│ ├── cot15_bin1_temp=0.0.json
│ ├── cot15_bin2_temp=0.0.json
│ ├── cot15_bin3_temp=0.0.json
│ ├── cot15_bin4_temp=0.0.json
│ ├── cot15_bin5_temp=0.0.json
│ ├── cot16_bin1_temp=0.0.json
│ ├── cot16_bin2_temp=0.0.json
│ ├── cot16_bin3_temp=0.0.json
│ ├── cot16_bin4_temp=0.0.json
│ ├── cot16_bin5_temp=0.0.json
│ ├── cot17_bin1_temp=0.0.json
│ ├── cot17_bin2_temp=0.0.json
│ ├── cot17_bin3_temp=0.0.json
│ ├── cot17_bin4_temp=0.0.json
│ ├── cot17_bin5_temp=0.0.json
│ ├── cot18_bin1_temp=0.0.json
│ ├── cot18_bin2_temp=0.0.json
│ ├── cot18_bin3_temp=0.0.json
│ ├── cot18_bin4_temp=0.0.json
│ ├── cot18_bin5_temp=0.0.json
│ ├── cot19_bin1_temp=0.0.json
│ ├── cot19_bin2_temp=0.0.json
│ ├── cot19_bin3_temp=0.0.json
│ ├── cot19_bin4_temp=0.0.json
│ ├── cot19_bin5_temp=0.0.json
│ ├── cot1_bin1_temp=0.0.json
│ ├── cot1_bin2_temp=0.0.json
│ ├── cot1_bin3_temp=0.0.json
│ ├── cot1_bin4_temp=0.0.json
│ ├── cot1_bin5_temp=0.0.json
│ ├── cot20_bin1_temp=0.0.json
│ ├── cot20_bin2_temp=0.0.json
│ ├── cot20_bin3_temp=0.0.json
│ ├── cot20_bin4_temp=0.0.json
│ ├── cot20_bin5_temp=0.0.json
│ ├── cot21_bin1_temp=0.0.json
│ ├── cot21_bin2_temp=0.0.json
│ ├── cot21_bin3_temp=0.0.json
│ ├── cot21_bin4_temp=0.0.json
│ ├── cot21_bin5_temp=0.0.json
│ ├── cot22_bin1_temp=0.0.json
│ ├── cot22_bin2_temp=0.0.json
│ ├── cot22_bin3_temp=0.0.json
│ ├── cot22_bin4_temp=0.0.json
│ ├── cot22_bin5_temp=0.0.json
│ ├── cot23_bin1_temp=0.0.json
│ ├── cot23_bin2_temp=0.0.json
│ ├── cot23_bin3_temp=0.0.json
│ ├── cot23_bin4_temp=0.0.json
│ ├── cot23_bin5_temp=0.0.json
│ ├── cot24_bin1_temp=0.0.json
│ ├── cot24_bin2_temp=0.0.json
│ ├── cot24_bin3_temp=0.0.json
│ ├── cot24_bin4_temp=0.0.json
│ ├── cot24_bin5_temp=0.0.json
│ ├── cot25_bin1_temp=0.0.json
│ ├── cot25_bin2_temp=0.0.json
│ ├── cot25_bin3_temp=0.0.json
│ ├── cot25_bin4_temp=0.0.json
│ ├── cot25_bin5_temp=0.0.json
│ ├── cot2_bin1_temp=0.0.json
│ ├── cot2_bin2_temp=0.0.json
│ ├── cot2_bin3_temp=0.0.json
│ ├── cot2_bin4_temp=0.0.json
│ ├── cot2_bin5_temp=0.0.json
│ ├── cot3_bin1_temp=0.0.json
│ ├── cot3_bin2_temp=0.0.json
│ ├── cot3_bin3_temp=0.0.json
│ ├── cot3_bin4_temp=0.0.json
│ ├── cot3_bin5_temp=0.0.json
│ ├── cot4_bin1_temp=0.0.json
│ ├── cot4_bin2_temp=0.0.json
│ ├── cot4_bin3_temp=0.0.json
│ ├── cot4_bin4_temp=0.0.json
│ ├── cot4_bin5_temp=0.0.json
│ ├── cot5_bin1_temp=0.0.json
│ ├── cot5_bin2_temp=0.0.json
│ ├── cot5_bin3_temp=0.0.json
│ ├── cot5_bin4_temp=0.0.json
│ ├── cot5_bin5_temp=0.0.json
│ ├── cot6_bin1_temp=0.0.json
│ ├── cot6_bin2_temp=0.0.json
│ ├── cot6_bin3_temp=0.0.json
│ ├── cot6_bin4_temp=0.0.json
│ ├── cot6_bin5_temp=0.0.json
│ ├── cot7_bin1_temp=0.0.json
│ ├── cot7_bin2_temp=0.0.json
│ ├── cot7_bin3_temp=0.0.json
│ ├── cot7_bin4_temp=0.0.json
│ ├── cot7_bin5_temp=0.0.json
│ ├── cot8_bin1_temp=0.0.json
│ ├── cot8_bin2_temp=0.0.json
│ ├── cot8_bin3_temp=0.0.json
│ ├── cot8_bin4_temp=0.0.json
│ ├── cot8_bin5_temp=0.0.json
│ ├── cot9_bin1_temp=0.0.json
│ ├── cot9_bin2_temp=0.0.json
│ ├── cot9_bin3_temp=0.0.json
│ ├── cot9_bin4_temp=0.0.json
│ ├── cot9_bin5_temp=0.0.json
│ └── results.jsonl
├── models
└── openai_help.py
├── regression
├── README.md
├── create_train_table.py
├── regression.ipynb
├── text_cot_test_results.tsv
├── text_cot_test_table.tsv
├── text_cot_train_results.tsv
└── text_cot_train_table.tsv
├── run_claude3.py
├── run_llama3.py
├── run_o1.py
├── run_openai.py
├── seven_letter_words
├── README.md
├── bin1_prob.txt
├── bin2_prob.txt
├── bin3_prob.txt
├── bin4_prob.txt
├── bin5_prob.txt
├── gpt2_prob_sevenletter.py
├── input_scored.txt
├── random_token_combos.py
├── select_words.py
└── words_5bins.txt
├── stimuli
├── math_cot
│ ├── math_cot19_bin1.jsonl
│ ├── math_cot19_bin2.jsonl
│ ├── math_cot19_bin3.jsonl
│ ├── math_cot19_bin4.jsonl
│ ├── math_cot19_bin5.jsonl
│ ├── math_cot20_bin1.jsonl
│ ├── math_cot20_bin2.jsonl
│ ├── math_cot20_bin3.jsonl
│ ├── math_cot20_bin4.jsonl
│ ├── math_cot20_bin5.jsonl
│ ├── math_cot21_bin1.jsonl
│ ├── math_cot21_bin2.jsonl
│ ├── math_cot21_bin3.jsonl
│ ├── math_cot21_bin4.jsonl
│ ├── math_cot21_bin5.jsonl
│ ├── math_cot22_bin1.jsonl
│ ├── math_cot22_bin2.jsonl
│ ├── math_cot22_bin3.jsonl
│ ├── math_cot22_bin4.jsonl
│ ├── math_cot22_bin5.jsonl
│ ├── math_cot23_bin1.jsonl
│ ├── math_cot23_bin2.jsonl
│ ├── math_cot23_bin3.jsonl
│ ├── math_cot23_bin4.jsonl
│ └── math_cot23_bin5.jsonl
├── math_swap
│ └── math_swap4_bin5.jsonl
├── number_cot
│ ├── math10_bin1.jsonl
│ ├── math10_bin2.jsonl
│ ├── math10_bin3.jsonl
│ ├── math10_bin4.jsonl
│ ├── math10_bin5.jsonl
│ ├── math11_bin1.jsonl
│ ├── math11_bin2.jsonl
│ ├── math11_bin3.jsonl
│ ├── math11_bin4.jsonl
│ ├── math11_bin5.jsonl
│ ├── math12_bin1.jsonl
│ ├── math12_bin2.jsonl
│ ├── math12_bin3.jsonl
│ ├── math12_bin4.jsonl
│ ├── math12_bin5.jsonl
│ ├── math13_bin1.jsonl
│ ├── math13_bin2.jsonl
│ ├── math13_bin3.jsonl
│ ├── math13_bin4.jsonl
│ ├── math13_bin5.jsonl
│ ├── math14_bin1.jsonl
│ ├── math14_bin2.jsonl
│ ├── math14_bin3.jsonl
│ ├── math14_bin4.jsonl
│ ├── math14_bin5.jsonl
│ ├── math15_bin1.jsonl
│ ├── math15_bin2.jsonl
│ ├── math15_bin3.jsonl
│ ├── math15_bin4.jsonl
│ ├── math15_bin5.jsonl
│ ├── math16_bin1.jsonl
│ ├── math16_bin2.jsonl
│ ├── math16_bin3.jsonl
│ ├── math16_bin4.jsonl
│ ├── math16_bin5.jsonl
│ ├── math17_bin1.jsonl
│ ├── math17_bin2.jsonl
│ ├── math17_bin3.jsonl
│ ├── math17_bin4.jsonl
│ ├── math17_bin5.jsonl
│ ├── math18_bin1.jsonl
│ ├── math18_bin2.jsonl
│ ├── math18_bin3.jsonl
│ ├── math18_bin4.jsonl
│ ├── math18_bin5.jsonl
│ ├── math19_bin1.jsonl
│ ├── math19_bin2.jsonl
│ ├── math19_bin3.jsonl
│ ├── math19_bin4.jsonl
│ ├── math19_bin5.jsonl
│ ├── math1_bin1.jsonl
│ ├── math1_bin2.jsonl
│ ├── math1_bin3.jsonl
│ ├── math1_bin4.jsonl
│ ├── math1_bin5.jsonl
│ ├── math20_bin1.jsonl
│ ├── math20_bin2.jsonl
│ ├── math20_bin3.jsonl
│ ├── math20_bin4.jsonl
│ ├── math20_bin5.jsonl
│ ├── math21_bin1.jsonl
│ ├── math21_bin2.jsonl
│ ├── math21_bin3.jsonl
│ ├── math21_bin4.jsonl
│ ├── math21_bin5.jsonl
│ ├── math22_bin1.jsonl
│ ├── math22_bin2.jsonl
│ ├── math22_bin3.jsonl
│ ├── math22_bin4.jsonl
│ ├── math22_bin5.jsonl
│ ├── math23_bin1.jsonl
│ ├── math23_bin2.jsonl
│ ├── math23_bin3.jsonl
│ ├── math23_bin4.jsonl
│ ├── math23_bin5.jsonl
│ ├── math24_bin1.jsonl
│ ├── math24_bin2.jsonl
│ ├── math24_bin3.jsonl
│ ├── math24_bin4.jsonl
│ ├── math24_bin5.jsonl
│ ├── math25_bin1.jsonl
│ ├── math25_bin2.jsonl
│ ├── math25_bin3.jsonl
│ ├── math25_bin4.jsonl
│ ├── math25_bin5.jsonl
│ ├── math2_bin1.jsonl
│ ├── math2_bin2.jsonl
│ ├── math2_bin3.jsonl
│ ├── math2_bin4.jsonl
│ ├── math2_bin5.jsonl
│ ├── math3_bin1.jsonl
│ ├── math3_bin2.jsonl
│ ├── math3_bin3.jsonl
│ ├── math3_bin4.jsonl
│ ├── math3_bin5.jsonl
│ ├── math4_bin1.jsonl
│ ├── math4_bin2.jsonl
│ ├── math4_bin3.jsonl
│ ├── math4_bin4.jsonl
│ ├── math4_bin5.jsonl
│ ├── math5_bin1.jsonl
│ ├── math5_bin2.jsonl
│ ├── math5_bin3.jsonl
│ ├── math5_bin4.jsonl
│ ├── math5_bin5.jsonl
│ ├── math6_bin1.jsonl
│ ├── math6_bin2.jsonl
│ ├── math6_bin3.jsonl
│ ├── math6_bin4.jsonl
│ ├── math6_bin5.jsonl
│ ├── math7_bin1.jsonl
│ ├── math7_bin2.jsonl
│ ├── math7_bin3.jsonl
│ ├── math7_bin4.jsonl
│ ├── math7_bin5.jsonl
│ ├── math8_bin1.jsonl
│ ├── math8_bin2.jsonl
│ ├── math8_bin3.jsonl
│ ├── math8_bin4.jsonl
│ ├── math8_bin5.jsonl
│ ├── math9_bin1.jsonl
│ ├── math9_bin2.jsonl
│ ├── math9_bin3.jsonl
│ ├── math9_bin4.jsonl
│ └── math9_bin5.jsonl
├── standard
│ ├── basic10_bin1.jsonl
│ ├── basic10_bin2.jsonl
│ ├── basic10_bin3.jsonl
│ ├── basic10_bin4.jsonl
│ ├── basic10_bin5.jsonl
│ ├── basic11_bin1.jsonl
│ ├── basic11_bin2.jsonl
│ ├── basic11_bin3.jsonl
│ ├── basic11_bin4.jsonl
│ ├── basic11_bin5.jsonl
│ ├── basic12_bin1.jsonl
│ ├── basic12_bin2.jsonl
│ ├── basic12_bin3.jsonl
│ ├── basic12_bin4.jsonl
│ ├── basic12_bin5.jsonl
│ ├── basic13_bin1.jsonl
│ ├── basic13_bin2.jsonl
│ ├── basic13_bin3.jsonl
│ ├── basic13_bin4.jsonl
│ ├── basic13_bin5.jsonl
│ ├── basic14_bin1.jsonl
│ ├── basic14_bin2.jsonl
│ ├── basic14_bin3.jsonl
│ ├── basic14_bin4.jsonl
│ ├── basic14_bin5.jsonl
│ ├── basic15_bin1.jsonl
│ ├── basic15_bin2.jsonl
│ ├── basic15_bin3.jsonl
│ ├── basic15_bin4.jsonl
│ ├── basic15_bin5.jsonl
│ ├── basic16_bin1.jsonl
│ ├── basic16_bin2.jsonl
│ ├── basic16_bin3.jsonl
│ ├── basic16_bin4.jsonl
│ ├── basic16_bin5.jsonl
│ ├── basic17_bin1.jsonl
│ ├── basic17_bin2.jsonl
│ ├── basic17_bin3.jsonl
│ ├── basic17_bin4.jsonl
│ ├── basic17_bin5.jsonl
│ ├── basic18_bin1.jsonl
│ ├── basic18_bin2.jsonl
│ ├── basic18_bin3.jsonl
│ ├── basic18_bin4.jsonl
│ ├── basic18_bin5.jsonl
│ ├── basic19_bin1.jsonl
│ ├── basic19_bin2.jsonl
│ ├── basic19_bin3.jsonl
│ ├── basic19_bin4.jsonl
│ ├── basic19_bin5.jsonl
│ ├── basic1_bin1.jsonl
│ ├── basic1_bin2.jsonl
│ ├── basic1_bin3.jsonl
│ ├── basic1_bin4.jsonl
│ ├── basic1_bin5.jsonl
│ ├── basic20_bin1.jsonl
│ ├── basic20_bin2.jsonl
│ ├── basic20_bin3.jsonl
│ ├── basic20_bin4.jsonl
│ ├── basic20_bin5.jsonl
│ ├── basic21_bin1.jsonl
│ ├── basic21_bin2.jsonl
│ ├── basic21_bin3.jsonl
│ ├── basic21_bin4.jsonl
│ ├── basic21_bin5.jsonl
│ ├── basic22_bin1.jsonl
│ ├── basic22_bin2.jsonl
│ ├── basic22_bin3.jsonl
│ ├── basic22_bin4.jsonl
│ ├── basic22_bin5.jsonl
│ ├── basic23_bin1.jsonl
│ ├── basic23_bin2.jsonl
│ ├── basic23_bin3.jsonl
│ ├── basic23_bin4.jsonl
│ ├── basic23_bin5.jsonl
│ ├── basic24_bin1.jsonl
│ ├── basic24_bin2.jsonl
│ ├── basic24_bin3.jsonl
│ ├── basic24_bin4.jsonl
│ ├── basic24_bin5.jsonl
│ ├── basic25_bin1.jsonl
│ ├── basic25_bin2.jsonl
│ ├── basic25_bin3.jsonl
│ ├── basic25_bin4.jsonl
│ ├── basic25_bin5.jsonl
│ ├── basic2_bin1.jsonl
│ ├── basic2_bin2.jsonl
│ ├── basic2_bin3.jsonl
│ ├── basic2_bin4.jsonl
│ ├── basic2_bin5.jsonl
│ ├── basic3_bin1.jsonl
│ ├── basic3_bin2.jsonl
│ ├── basic3_bin3.jsonl
│ ├── basic3_bin4.jsonl
│ ├── basic3_bin5.jsonl
│ ├── basic4_bin1.jsonl
│ ├── basic4_bin2.jsonl
│ ├── basic4_bin3.jsonl
│ ├── basic4_bin4.jsonl
│ ├── basic4_bin5.jsonl
│ ├── basic5_bin1.jsonl
│ ├── basic5_bin2.jsonl
│ ├── basic5_bin3.jsonl
│ ├── basic5_bin4.jsonl
│ ├── basic5_bin5.jsonl
│ ├── basic6_bin1.jsonl
│ ├── basic6_bin2.jsonl
│ ├── basic6_bin3.jsonl
│ ├── basic6_bin4.jsonl
│ ├── basic6_bin5.jsonl
│ ├── basic7_bin1.jsonl
│ ├── basic7_bin2.jsonl
│ ├── basic7_bin3.jsonl
│ ├── basic7_bin4.jsonl
│ ├── basic7_bin5.jsonl
│ ├── basic8_bin1.jsonl
│ ├── basic8_bin2.jsonl
│ ├── basic8_bin3.jsonl
│ ├── basic8_bin4.jsonl
│ ├── basic8_bin5.jsonl
│ ├── basic9_bin1.jsonl
│ ├── basic9_bin2.jsonl
│ ├── basic9_bin3.jsonl
│ ├── basic9_bin4.jsonl
│ └── basic9_bin5.jsonl
├── swap
│ ├── cot13_bin5.jsonl
│ ├── cot14_bin5.jsonl
│ ├── cot4_bin5.jsonl
│ ├── cot5_bin5.jsonl
│ ├── swap13c_bin5.jsonl
│ ├── swap14c_bin5.jsonl
│ ├── swap4c_bin5.jsonl
│ └── swap5c_bin5.jsonl
└── text_cot
│ ├── cot10_bin1.jsonl
│ ├── cot10_bin2.jsonl
│ ├── cot10_bin3.jsonl
│ ├── cot10_bin4.jsonl
│ ├── cot10_bin5.jsonl
│ ├── cot11_bin1.jsonl
│ ├── cot11_bin2.jsonl
│ ├── cot11_bin3.jsonl
│ ├── cot11_bin4.jsonl
│ ├── cot11_bin5.jsonl
│ ├── cot12_bin1.jsonl
│ ├── cot12_bin2.jsonl
│ ├── cot12_bin3.jsonl
│ ├── cot12_bin4.jsonl
│ ├── cot12_bin5.jsonl
│ ├── cot13_bin1.jsonl
│ ├── cot13_bin2.jsonl
│ ├── cot13_bin3.jsonl
│ ├── cot13_bin4.jsonl
│ ├── cot13_bin5.jsonl
│ ├── cot14_bin1.jsonl
│ ├── cot14_bin2.jsonl
│ ├── cot14_bin3.jsonl
│ ├── cot14_bin4.jsonl
│ ├── cot14_bin5.jsonl
│ ├── cot15_bin1.jsonl
│ ├── cot15_bin2.jsonl
│ ├── cot15_bin3.jsonl
│ ├── cot15_bin4.jsonl
│ ├── cot15_bin5.jsonl
│ ├── cot16_bin1.jsonl
│ ├── cot16_bin2.jsonl
│ ├── cot16_bin3.jsonl
│ ├── cot16_bin4.jsonl
│ ├── cot16_bin5.jsonl
│ ├── cot17_bin1.jsonl
│ ├── cot17_bin2.jsonl
│ ├── cot17_bin3.jsonl
│ ├── cot17_bin4.jsonl
│ ├── cot17_bin5.jsonl
│ ├── cot18_bin1.jsonl
│ ├── cot18_bin2.jsonl
│ ├── cot18_bin3.jsonl
│ ├── cot18_bin4.jsonl
│ ├── cot18_bin5.jsonl
│ ├── cot19_bin1.jsonl
│ ├── cot19_bin2.jsonl
│ ├── cot19_bin3.jsonl
│ ├── cot19_bin4.jsonl
│ ├── cot19_bin5.jsonl
│ ├── cot1_bin1.jsonl
│ ├── cot1_bin2.jsonl
│ ├── cot1_bin3.jsonl
│ ├── cot1_bin4.jsonl
│ ├── cot1_bin5.jsonl
│ ├── cot20_bin1.jsonl
│ ├── cot20_bin2.jsonl
│ ├── cot20_bin3.jsonl
│ ├── cot20_bin4.jsonl
│ ├── cot20_bin5.jsonl
│ ├── cot21_bin1.jsonl
│ ├── cot21_bin2.jsonl
│ ├── cot21_bin3.jsonl
│ ├── cot21_bin4.jsonl
│ ├── cot21_bin5.jsonl
│ ├── cot22_bin1.jsonl
│ ├── cot22_bin2.jsonl
│ ├── cot22_bin3.jsonl
│ ├── cot22_bin4.jsonl
│ ├── cot22_bin5.jsonl
│ ├── cot23_bin1.jsonl
│ ├── cot23_bin2.jsonl
│ ├── cot23_bin3.jsonl
│ ├── cot23_bin4.jsonl
│ ├── cot23_bin5.jsonl
│ ├── cot24_bin1.jsonl
│ ├── cot24_bin2.jsonl
│ ├── cot24_bin3.jsonl
│ ├── cot24_bin4.jsonl
│ ├── cot24_bin5.jsonl
│ ├── cot25_bin1.jsonl
│ ├── cot25_bin2.jsonl
│ ├── cot25_bin3.jsonl
│ ├── cot25_bin4.jsonl
│ ├── cot25_bin5.jsonl
│ ├── cot2_bin1.jsonl
│ ├── cot2_bin2.jsonl
│ ├── cot2_bin3.jsonl
│ ├── cot2_bin4.jsonl
│ ├── cot2_bin5.jsonl
│ ├── cot3_bin1.jsonl
│ ├── cot3_bin2.jsonl
│ ├── cot3_bin3.jsonl
│ ├── cot3_bin4.jsonl
│ ├── cot3_bin5.jsonl
│ ├── cot4_bin1.jsonl
│ ├── cot4_bin2.jsonl
│ ├── cot4_bin3.jsonl
│ ├── cot4_bin4.jsonl
│ ├── cot4_bin5.jsonl
│ ├── cot5_bin1.jsonl
│ ├── cot5_bin2.jsonl
│ ├── cot5_bin3.jsonl
│ ├── cot5_bin4.jsonl
│ ├── cot5_bin5.jsonl
│ ├── cot6_bin1.jsonl
│ ├── cot6_bin2.jsonl
│ ├── cot6_bin3.jsonl
│ ├── cot6_bin4.jsonl
│ ├── cot6_bin5.jsonl
│ ├── cot7_bin1.jsonl
│ ├── cot7_bin2.jsonl
│ ├── cot7_bin3.jsonl
│ ├── cot7_bin4.jsonl
│ ├── cot7_bin5.jsonl
│ ├── cot8_bin1.jsonl
│ ├── cot8_bin2.jsonl
│ ├── cot8_bin3.jsonl
│ ├── cot8_bin4.jsonl
│ ├── cot8_bin5.jsonl
│ ├── cot9_bin1.jsonl
│ ├── cot9_bin2.jsonl
│ ├── cot9_bin3.jsonl
│ ├── cot9_bin4.jsonl
│ └── cot9_bin5.jsonl
└── stimulus_generator.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # custom
2 | seven_letter_words/random_pairs_lower*
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 | cover/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | .pybuilder/
79 | target/
80 |
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 |
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 |
88 | # pyenv
89 | # For a library or package, you might want to ignore these files since the code is
90 | # intended to run in multiple environments; otherwise, check them in:
91 | # .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # poetry
101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102 | # This is especially recommended for binary packages to ensure reproducibility, and is more
103 | # commonly ignored for libraries.
104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105 | #poetry.lock
106 |
107 | # pdm
108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109 | #pdm.lock
110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111 | # in version control.
112 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
113 | .pdm.toml
114 | .pdm-python
115 | .pdm-build/
116 |
117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118 | __pypackages__/
119 |
120 | # Celery stuff
121 | celerybeat-schedule
122 | celerybeat.pid
123 |
124 | # SageMath parsed files
125 | *.sage.py
126 |
127 | # Environments
128 | .env
129 | .venv
130 | env/
131 | venv/
132 | ENV/
133 | env.bak/
134 | venv.bak/
135 |
136 | # Spyder project settings
137 | .spyderproject
138 | .spyproject
139 |
140 | # Rope project settings
141 | .ropeproject
142 |
143 | # mkdocs documentation
144 | /site
145 |
146 | # mypy
147 | .mypy_cache/
148 | .dmypy.json
149 | dmypy.json
150 |
151 | # Pyre type checker
152 | .pyre/
153 |
154 | # pytype static type analyzer
155 | .pytype/
156 |
157 | # Cython debug symbols
158 | cython_debug/
159 |
160 | # PyCharm
161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163 | # and can be added to the global gitignore or merged into this file. For a more nuclear
164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165 | #.idea/
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # deciphering_cot
2 |
3 | Code implementation and data for the paper:
4 |
5 | **[Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: Probability, Memorization, and Noisy Reasoning](https://arxiv.org/abs/2407.01687)**
6 |
7 | [Akshara Prabhakar](https://aksh555.github.io/), [Thomas L. Griffiths](https://cocosci.princeton.edu/tom/index.php), [R. Thomas McCoy](https://rtmccoy.com/)
8 |
9 |
10 |
11 | ## Quickstart
12 | ### Data
13 | We construct a dataset of seven-letter words divided into 5 probability bins {bin1 to bin5}, each having around 150 words (the first 100 are used to evaluate GPT-4, and the remaining words to evaluate the logistic regression model fitted on the first 100 words). The binning is done based on the log probability value assigned by GPT-2.
14 |
15 | The seven-letter word dataset is in [seven_letter_words](seven_letter_words):
16 | - bin1_prob.txt
17 | - bin2_prob.txt
18 | - bin3_prob.txt
19 | - bin4_prob.txt
20 | - bin5_prob.txt
21 |
22 | ### Shift cipher stimuli
23 | Using the seven-letter word dataset, we prepare stimuli -- these are shift cipher encoded versions of the words from the 5 probability bins across 25 shift levels (1 to 25).
24 |
25 | The stimuli are prepared for the different types of prompts we use: `standard`, `text_cot`, `math_cot`, `number_cot`.
26 |
27 | Can be created by running,
28 | ```bash
29 | python stimulus_generator.py --prompt_type <prompt_type>
30 | ```
31 |
32 | ### Evaluating LLMs on shift ciphers
33 | - GPT-4: `run_openai.py`
34 | - Llama 3.1: `run_llama.py`
35 | - Claude 3: `run_claude.py`
36 |
37 | Set appropriate OpenAI, Together, Anthropic keys in the environment before running evaluations.
38 |
39 | For example to run experiments on GPT-4 with Text-CoT for shift_level=1 across all 5 bins run,
40 | ```bash
41 | python run_openai.py --tasks text_cot1 --conditions bin1,bin2,bin3,bin4,bin5 --max_tokens 200 --prompt_type text_cot
42 | ```
43 |
44 | To evaluate the generations, run
45 | ```bash
46 | python eval.py --prompt_type text_cot --create_stats_table
47 | ```
48 | Run this after evaluating GPT-4 across all shift levels and bins. This will generate the evaluation statistics for `text_cot` across all shift levels and the `{prompt_type}_train_table.tsv` file which is the train statistics table for fitting the logistic regression.
49 |
50 | ### Logistic regression
51 | The logistic regression is implemented in R in [regression.ipynb](regression/regression.ipynb). The predictions on the test set are saved in `regression/text_cot_test_results.tsv`.
52 |
53 | ### Outputs
54 | All model generations and outputs are stored in the `logs` directory.
55 |
56 | ## Citation
57 | If you find this repository helpful, feel free to cite our [publication](https://arxiv.org/abs/2407.01687).
58 | ```
59 | @inproceedings{prabhakar-etal-2024-deciphering,
60 | title = "Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: Probability, Memorization, and Noisy Reasoning",
61 | author = "Prabhakar, Akshara and
62 | Griffiths, Thomas L. and
63 | McCoy, R. Thomas",
64 | editor = "Al-Onaizan, Yaser and
65 | Bansal, Mohit and
66 | Chen, Yun-Nung",
67 | booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
68 | month = nov,
69 | year = "2024",
70 | address = "Miami, Florida, USA",
71 | publisher = "Association for Computational Linguistics",
72 | url = "https://aclanthology.org/2024.findings-emnlp.212",
73 | pages = "3710--3724"
74 | }
75 | ```
76 |
--------------------------------------------------------------------------------
/assets/preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aksh555/deciphering_cot/21783c74d88ed690d657544b4503db09a1054239/assets/preview.png
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | from Levenshtein import distance
4 | import statistics
5 | import jsonlines
6 | import sys
7 | import pandas as pd
8 | import argparse
9 |
# Markers after which the model's final answer is expected to appear; the
# text following the first matching marker (up to the end of that line) is
# taken as the prediction.
end_after_strings = ["Original text: ", "message is:", "original text is:", "message is ", "we get:"]
# end_after_strings = ["Therefore, the original sequence of numbers is:","Original sequence:"]
# When the model second-guesses itself ("doesn't make sense"), everything
# from the first such phrase onward is discarded before answer extraction.
delete_after_strings = ["However, this doesn't make sense", "However, this doesn't make much sense", "This sentence still doesn't make", "However, this sentence doesn't make", "This still doesn't make sense"]
# Frequency associated with each shift level 1..25 (index 0 = shift 1);
# shift 13 (ROT-13) dominates.  NOTE(review): presumably corpus counts of
# each cipher shift — confirm against the paper.
shift_freqs = [59,21,117,5,15,12,6,3,1,3,3,7,1225,5,2,4,2,2,1,1,4,2,17,3,7]
14 |
def desc(idx, gt_chain, pred_chain, gt, res):
    """Print a labelled debug dump of one example: its index, the ground-truth
    and predicted reasoning chains, and the final gt/res strings."""
    divider = "----"
    print("#", idx)
    print("gt_chain", gt_chain)
    print(divider)
    print("pred_chain", pred_chain)
    print(divider)
    print("gt", gt, "res", res)
    print("**************")
23 |
def main(args):
    """Score model generations for every shift level and probability bin.

    For each (shift, bin) condition this reads the generations JSON from
    ``logs/{prompt_type}/``, normalizes the predicted text (truncation at
    self-correction phrases, answer-marker extraction, quote stripping),
    and prints exact-match accuracy plus Levenshtein-distance statistics.
    With ``--create_stats_table`` it also accumulates a per-example table
    and writes ``regression/{prompt_type}_train_table.tsv``.
    """
    data_types = ["bin1", "bin2", "bin3", "bin4", "bin5"]
    big_df = pd.DataFrame()
    prompt_type = args.prompt_type
    fo_directory = f"logs/{prompt_type}/"
    temp = 0.0
    corrupt = False       # set True to score the "_nohelp2" corrupted runs
    chain_check = False   # set True for chain-faithfulness analysis (text_cot only)
    chain_directory = "shift_chain/"

    # GPT-2 log-prob column for the first 100 words of each bin; the word
    # value is the second whitespace-separated column of each line.
    # (Loop variable renamed from `bin` to avoid shadowing the builtin.)
    bin_probs = {}
    for bin_name in data_types:
        with open(f"seven_letter_words/{bin_name}_prob.txt", "r") as fh:
            bin_probs[bin_name] = [line.split(' ')[1].strip() for line in fh][:100]

    for shift in range(1, 26):
        for fi_label in data_types:
            pred_nchars = []
            input_nchars = []
            corrects = []
            preds = []
            gts = []
            small_df = pd.DataFrame()
            condition = prompt_type + str(shift) + "_" + fi_label
            if corrupt:
                condition += "_nohelp2"

            json_path = fo_directory + condition + ".json"
            try:
                # Context managers close the handles (the original leaked
                # them), and only missing-file/OS errors are swallowed
                # instead of a bare `except:` hiding every failure.
                with open(json_path, "r") as fh:
                    data = json.load(fh)
                data_chain = []
                if chain_check and prompt_type == "text_cot":
                    chain_path = chain_directory + condition + ".jsonl"
                    with open(chain_path, "r") as fh:
                        for line in fh:
                            data_chain.append(json.loads(line)["chain"])
                print(f"Loading {json_path}")
            except OSError:
                print(f"\t{json_path} not found, skipping {fi_label} {shift}")
                continue
            print("*" * 10)

            count_correct = 0
            count_correct_demo = 0  # kept for output-format compatibility; never incremented
            count_total = 0
            total_dist = 0
            chain_correct_op_incorrect = 0
            chain_correct_op_correct = 0
            chain_incorrect_op_correct = 0
            chain_incorrect_op_incorrect = 0
            distances = []
            for idx, (gt, res) in enumerate(zip(data["gts"], data["res"])):
                orig_res = res[:]

                # Drop everything from the first "doesn't make sense"
                # self-correction marker onward.
                for delete_after_string in delete_after_strings:
                    if delete_after_string in res:
                        starts = [m.start() for m in re.finditer(delete_after_string, res)]
                        res = res[:starts[0]].strip()

                # Keep only the text following an answer marker, up to the
                # end of that line.
                for end_after_string in end_after_strings:
                    if end_after_string in res:
                        res = res.split(end_after_string)[1].split("\n")[0].strip()
                        if len(res) != 0:
                            continue

                # Strip surrounding double quotes (guarded against empty
                # strings, which crashed the original with IndexError).
                if gt and gt[0] == '"':
                    gt = gt[1:]
                if gt and gt[-1] == '"':
                    gt = gt[:-1]
                if len(res) != 0:
                    if res[0] == '"':
                        res = res[1:]
                    if res[-1] == '"':
                        res = res[:-1]

                dist = distance(gt, res)
                total_dist += dist
                distances.append(dist)

                if gt == res:
                    count_correct += 1
                    corrects.append(1)
                else:
                    corrects.append(0)

                if chain_check and prompt_type == "text_cot":
                    # Cross-tabulate chain correctness vs. output correctness
                    # ("unfaithfulness" analysis).
                    gt_chain = data_chain[idx].strip()
                    pred_chain = re.split(r'Original text:', orig_res)[0].strip()
                    if gt_chain == pred_chain:
                        if gt != res:
                            # desc(idx,gt_chain,pred_chain,gt,res)
                            chain_correct_op_incorrect += 1
                        else:
                            chain_correct_op_correct += 1
                    else:
                        if gt == res:
                            chain_incorrect_op_correct += 1
                        else:
                            chain_incorrect_op_incorrect += 1

                # Per-example stats for the regression table.
                pred_nchars.append(len(res.strip()))
                input_nchars.append(len(gt.strip()))
                preds.append(res)
                gts.append(gt)
                count_total += 1

            if count_total == 0:
                # Empty generations file: nothing to score (the original
                # crashed here with ZeroDivisionError / StatisticsError).
                print(condition, "no examples, skipping")
                continue

            median_dist = statistics.median(distances)
            result_dict = {"condition": condition, "accuracy": count_correct * 1.0 / count_total, "lev_dist": total_dist * 1.0 / count_total, "median_levdist": median_dist, "temp": temp}
            print(condition, "acc_inst", count_correct * 1.0 / count_total, "acc_demo", count_correct_demo * 1.0 / count_total, "levdist:", total_dist * 1.0 / count_total, "median levdist:", median_dist)

            ## For fine-grained analysis of 'unfaithfulness'
            if chain_check:
                result_dict.update({"chain_correct_op_correct": chain_correct_op_correct, "chain_correct_op_incorrect": chain_correct_op_incorrect, "chain_incorrect_op_correct": chain_incorrect_op_correct, "chain_incorrect_op_incorrect": chain_incorrect_op_incorrect})
                print("chain correct:")
                print("\toutput correct:", chain_correct_op_correct, "output incorrect:", chain_correct_op_incorrect)
                print("chain incorrect:")
                print("\toutput correct:", chain_incorrect_op_correct, "output incorrect:", chain_incorrect_op_incorrect)

            if args.create_stats_table:
                # Recover the raw stimulus text for each example.
                with open(f'stimuli/{prompt_type}/{condition}.jsonl', 'r') as fh:
                    input_text = [json.loads(line).get('input', '') for line in fh]

                ## write to huge tsv
                small_df["input_nchars"] = input_nchars
                small_df["output_logprob"] = bin_probs[fi_label]
                small_df["correct"] = corrects
                small_df["pred"] = preds
                small_df["gt"] = gts
                small_df["shift_level"] = [shift for _ in range(len(input_nchars))]
                small_df["shift_freq"] = [shift_freqs[shift - 1] for _ in range(len(input_nchars))]
                small_df["input"] = input_text

                assert len(input_nchars) == len(pred_nchars) == len(bin_probs[fi_label]) == len(corrects)
                big_df = pd.concat([big_df, small_df], ignore_index=True)

    if args.create_stats_table:
        # Positional `sep` is deprecated in pandas; pass it by keyword.
        big_df.to_csv(f"regression/{prompt_type}_train_table.tsv", sep="\t", index_label="index")
174 |
if __name__ == "__main__":
    # CLI entry point: parse flags and hand them straight to main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot")
    parser.add_argument("--create_stats_table", action='store_true', help='default = False', default=False)
    main(parser.parse_args())
--------------------------------------------------------------------------------
/examples/bin_1.txt:
--------------------------------------------------------------------------------
1 | choosed
2 | colbert
3 | polenta
4 | modicum
5 | autarch
6 | schisms
7 | mariner
8 | disarms
9 | rescale
10 | paywall
11 | infobox
12 | preston
13 | shrines
14 | implore
15 | alloted
16 | precast
17 | borings
18 | bacilli
19 | matrice
20 | redible
21 | absolve
22 | ourself
23 | ethetic
24 | maynard
25 | calibur
26 | enviros
27 | calzone
28 | sumatra
29 | drywall
30 | impaled
31 | manland
32 | divined
33 | conlang
34 | tablero
35 | redraft
36 | equitas
37 | ratting
38 | errancy
39 | webcast
40 | lowland
41 | boyhood
42 | actuary
43 | catlike
44 | putback
45 | galileo
46 | rivaled
47 | volonte
48 | sunspot
49 | rotunda
50 | notched
51 | taproot
52 | secures
53 | entente
54 | outflow
55 | betters
56 | rumpled
57 | burried
58 | repulse
59 | fillets
60 | relator
61 | sombody
62 | unsaved
63 | ailment
64 | nodules
65 | montero
66 | satires
67 | arcadia
68 | valerie
69 | inglish
70 | dukedom
71 | espouse
72 | bedevil
73 | reticle
74 | matinee
75 | maxwell
76 | picante
77 | baboons
78 | exciter
79 | losings
80 | newbies
81 | serried
82 | curving
83 | narrows
84 | ragging
85 | baneful
86 | pinatas
87 | divison
88 | kinfolk
89 | indiana
90 | caritas
91 | silvery
92 | inkling
93 | absense
94 | lavabit
95 | outsize
96 | rewired
97 | absalom
98 | getback
99 | accuser
100 | striven
101 | maloney
102 | escaper
103 | subtile
104 | colibri
105 | delving
106 | calving
107 | tarheel
108 | herders
109 | grooved
110 | octagon
111 | bisping
112 | alluded
113 | merlion
114 | figural
115 | debater
116 | pigtail
117 | honious
118 | pinches
119 | clojure
120 | equates
121 | refiner
122 | billets
123 | alfalfa
124 | hotshot
125 | nonagon
126 | jacuzzi
127 | vincent
128 | pollock
129 | airtime
--------------------------------------------------------------------------------
/examples/bin_2.txt:
--------------------------------------------------------------------------------
1 | dupasha
2 | makrita
3 | ferisse
4 | murcers
5 | metires
6 | witmost
7 | astause
8 | sekaram
9 | vilgren
10 | belomat
11 | setnest
12 | curadal
13 | viridon
14 | denpick
15 | eraully
16 | ruborie
17 | queimer
18 | cosuits
19 | rutamen
20 | graizen
21 | sonware
22 | infocos
23 | inkwang
24 | rowbots
25 | engeden
26 | vizizen
27 | molenci
28 | indotes
29 | dapener
30 | ireasti
31 | undving
32 | traumpt
33 | redrear
34 | aryanni
35 | brovoir
36 | greised
37 | networm
38 | memwill
39 | gamplus
40 | estplay
41 | sapwhat
42 | indmong
43 | kenafil
44 | denzhou
45 | cosited
46 | perzoek
47 | balinit
48 | mayonal
49 | armemic
50 | henjury
51 | lavplay
52 | calynes
53 | remfold
54 | engdist
55 | armrich
56 | luxfast
57 | mulhatt
58 | allaton
59 | strfair
60 | monachs
61 | kerapat
62 | hergrim
63 | fidgota
64 | decigan
65 | dezella
66 | haypath
67 | resonga
68 | nosband
69 | poligen
70 | mobture
71 | flufrom
72 | willose
73 | desedge
74 | momclub
75 | clobero
76 | mapauth
77 | vitelho
78 | daykick
79 | sysmite
80 | telolon
81 | onsensa
82 | vipaddy
83 | sunrink
84 | namhero
85 | voratio
86 | niliter
87 | droones
88 | zipcord
89 | pagrete
90 | funwich
91 | negbers
92 | belwich
93 | allayah
94 | pakatak
95 | farathy
96 | betweek
97 | rutanim
98 | obsster
99 | ligigid
100 | lidcore
101 | vacassa
102 | pipiday
103 | almorum
104 | sadmore
105 | hayhorn
106 | vinango
107 | cosisty
108 | libikal
109 | dogodes
110 | camcore
111 | ashmann
112 | fibunal
113 | enciere
114 | revrika
115 | perburg
116 | camilan
117 | sumarms
118 | firigin
119 | pelatra
120 | vorvery
121 | purabra
122 | indondo
123 | dogpeak
124 | alllein
125 | actblue
126 | hasvers
127 | freifty
128 | hueving
129 | coratti
130 | saprika
131 | honcoin
132 | joycons
133 | dogoids
134 | nanians
135 | dreanon
136 | spoanna
137 | levieur
138 | jawolla
139 | cowcard
140 | thehalb
141 | lamboys
142 | disorer
143 | pigwiki
144 | embious
145 | detdden
146 | vacibel
--------------------------------------------------------------------------------
/examples/bin_3.txt:
--------------------------------------------------------------------------------
1 | tasvinc
2 | dblshaw
3 | cmbodka
4 | zagbbox
5 | hedoute
6 | cmsdest
7 | leoanje
8 | sitinks
9 | oweorno
10 | advpite
11 | grpwerk
12 | aesasio
13 | atequir
14 | dryhazi
15 | styansa
16 | sunincl
17 | bowamac
18 | xyzunik
19 | awsposs
20 | ogrmode
21 | midbyss
22 | ctlmony
23 | rngmony
24 | rergett
25 | phperti
26 | bfdizzy
27 | srcstit
28 | pktubic
29 | oddourd
30 | mplnick
31 | dccergy
32 | oxyhest
33 | klepled
34 | digydro
35 | aphopez
36 | rifntag
37 | srvlope
38 | emoomez
39 | toyelry
40 | iniilen
41 | iffamma
42 | adsokin
43 | eofpike
44 | dnsavia
45 | uitlesi
46 | owluntu
47 | affesda
48 | mgrulia
49 | foxmsgs
50 | esiaram
51 | subzyst
52 | ottexpo
53 | udpcolo
54 | vakdney
55 | svmvery
56 | dspereo
57 | pngpone
58 | quiilyn
59 | tgtella
60 | ithueur
61 | wynvinc
62 | sezanch
63 | sdkjabi
64 | yaninem
65 | dbgivid
66 | adeardu
67 | paykich
68 | dspdeal
69 | cptwipe
70 | nikaign
71 | pesuell
72 | musropp
73 | ebxside
74 | dnienez
75 | dccscal
76 | cmbheck
77 | stsasks
78 | hapixer
79 | nikuild
80 | wowrapy
81 | txtajes
82 | gtkoooo
83 | sutcmds
84 | erviode
85 | bewikon
86 | hubphas
87 | ervpets
88 | ofsitem
89 | gstivec
90 | utfestr
91 | etaabic
92 | tieibur
93 | islssel
94 | iodvari
95 | zagzept
96 | ustjour
97 | dexonte
98 | bizfilt
99 | adaowns
100 | tetibri
101 | octfirm
102 | weiudos
103 | pwdtick
104 | ttlarry
105 | stuimeo
106 | sqlstre
107 | mieipeg
108 | dueafen
109 | sndurge
110 | vezcorn
111 | ilketch
112 | zugenth
113 | rngiate
114 | ottclud
115 | aprkeep
116 | urlveal
117 | msgourd
118 | xlsboom
119 | wijagma
120 | robisbn
121 | melmlin
122 | samslot
123 | nidoust
124 | begkits
125 | arrflix
126 | ditfrau
127 | aidomid
128 | cptfoto
129 | aimrede
130 | dbgabay
131 | cidlocs
132 | booiedo
133 | mplders
134 | cptpush
135 | nahcalc
136 | amyovel
137 | wonczas
138 | mplrome
139 | edxesis
140 | adcadoo
141 | oudtems
142 | ociirut
143 | balzept
144 | avgcorp
145 | himocos
146 | ignlots
147 | baztrim
--------------------------------------------------------------------------------
/examples/bin_4.txt:
--------------------------------------------------------------------------------
1 | voyxfff
2 | qtyijke
3 | mmculed
4 | jmpytut
5 | vtkprit
6 | oilrxjs
7 | vfsisex
8 | eenqrst
9 | nbrlyph
10 | xmmgota
11 | jmpquiv
12 | rummqtt
13 | xhrdisp
14 | ffturaa
15 | dexocht
16 | xmmgett
17 | lvljspx
18 | zugwpdb
19 | tidmqtt
20 | lhsigua
21 | sshemsp
22 | burrgyz
23 | vtkirie
24 | vtkifar
25 | rpczano
26 | vtkinez
27 | vtkifie
28 | zugymce
29 | xcbwent
30 | watobjs
31 | doiawks
32 | cgiacyj
33 | czyands
34 | mdbgebn
35 | atejspx
36 | rndxito
37 | sdkrxjs
38 | mlxoice
39 | mlxahan
40 | auxjspx
41 | jsxirms
42 | czyrgba
43 | makrgyz
44 | nanighb
45 | jsxobil
46 | jwtgraf
47 | vtkundy
48 | jsxuden
49 | pszglfw
50 | czydamn
51 | csvylko
52 | wijincl
53 | oilrgyz
54 | mlxulan
55 | xmmepar
56 | lodxlsx
57 | uczpeon
58 | sesrgyz
59 | pciavax
60 | gpsilik
61 | lhszion
62 | slaampp
63 | uczhtag
64 | ouiqrst
65 | xhrziel
66 | pcbpiar
67 | yumxfff
68 | fedjspb
69 | xmmtega
70 | segzoek
71 | mezgrpc
72 | xcbophe
73 | ngxantz
74 | aosantd
75 | jejymax
76 | rerlsru
77 | racrgyz
78 | rndquam
79 | mlxneau
80 | rudcych
81 | lotlsru
82 | abyilog
83 | rsaueba
84 | jsxioso
85 | derjspx
86 | vfsgett
87 | vtkjure
88 | phyepar
89 | vesxfff
90 | lcdleri
91 | ifsfeas
92 | mmcubbo
93 | ircemsp
94 | pdbiesz
95 | rpciene
96 | iodpiar
97 | rmslsru
98 | rpcumno
99 | apkckpt
100 | lcdvoir
101 | rhsncia
102 | owlsetq
103 | ifsbrtc
104 | csvowej
105 | xcborgt
106 | sutmobx
107 | iovstmt
108 | nanmqtt
109 | irqphem
110 | wndncia
111 | xcbided
112 | jsxkees
113 | cpscsrf
114 | jmppeon
115 | lhsreta
116 | dezrgyz
117 | elecsrf
118 | atrlymp
119 | iodudev
120 | xhrkses
121 | ngxjspx
122 | uczpear
123 | npmhlen
124 | pcmncmp
125 | biczoek
126 | dosorrh
127 | jejmisc
128 | kenjspx
129 | idxiaux
130 | svgiesz
131 | vtkgems
132 | glmldre
133 | dexumbn
134 | kitxfff
135 | jsxajan
136 | fmtmina
137 | gtkthew
138 | czyuess
139 | iodhait
140 | cafantd
141 | xcbredo
142 | fpswpdb
143 | xcbdogs
144 | jwtlify
145 | rsaellt
146 | pkgughs
147 | jmpccak
148 | pclvais
--------------------------------------------------------------------------------
/examples/bin_5.txt:
--------------------------------------------------------------------------------
1 | czyjspx
2 | xcbabwe
3 | aktjspx
4 | xcbcych
5 | xcbziej
6 | xmmeczy
7 | qeddhcp
8 | xcbilha
9 | xcbacji
10 | xcbzung
11 | xmmobre
12 | xcbquir
13 | xcbrouw
14 | ilkjspx
15 | lijglfw
16 | foxrgyz
17 | jsxrouw
18 | xcbziel
19 | xcbagua
20 | eidtopl
21 | xcbximo
22 | jwtglfw
23 | xcbnerg
24 | xcbateg
25 | befjspx
26 | xcbxlim
27 | xcbsemi
28 | ketglfw
29 | lemjspx
30 | xcbcyan
31 | xcbsequ
32 | xcbemer
33 | eoscsrf
34 | xcbphot
35 | xcbeken
36 | xcbolum
37 | xcbrodu
38 | tepjspx
39 | xcbthro
40 | xcbueue
41 | oscquiv
42 | xcbubah
43 | xcbodzi
44 | mlxquee
45 | xcbmdat
46 | xcbuell
47 | xcbobre
48 | xcbuhan
49 | tasexpl
50 | xcbueil
51 | xcbilos
52 | iodtopl
53 | suttmpl
54 | xcbhots
55 | xcbosph
56 | xcbuego
57 | xcbquam
58 | kolglfw
59 | gesglfw
60 | gccorrh
61 | mezptom
62 | xcbhecy
63 | xcbsemb
64 | yiijspx
65 | meljspx
66 | xcbunos
67 | xcbunei
68 | pisbrtc
69 | vehjspx
70 | vasrgyz
71 | lhsrgyz
72 | xcbighb
73 | phyfidf
74 | kilglfw
75 | dukvrir
76 | levjspx
77 | updrgyz
78 | xcbagas
79 | opcrgyz
80 | ilkjspb
81 | curfidf
82 | rpcighb
83 | xcbacje
84 | xcbilih
85 | zugcsrf
86 | xcbveau
87 | rpcasje
88 | xcbalsy
89 | pcmrouw
90 | xcbafil
91 | doijspx
92 | xcbhtub
93 | xcbhear
94 | xcbuele
95 | opijspx
96 | xcbazzo
97 | xcboufl
98 | akojspx
99 | ninmqtt
100 | xcbguna
101 | idxorrh
102 | xcbheit
103 | czyxfff
104 | voyglfw
105 | dynmqtt
106 | xcbcoln
107 | vezjspx
108 | xcbocre
109 | cueorrh
110 | xmmacje
111 | mlxalsy
112 | ebxorrh
113 | xcbagal
114 | xcbzept
115 | xcbucle
116 | vesjspx
117 | xcbiser
118 | xcbseau
119 | xcbekte
120 | lapmqtt
121 | abyjspx
122 | xcbueba
123 | xcbijke
124 | xcbvoie
125 | xcbudem
126 | xcbivol
127 | xcbquoi
128 | xcbupal
129 | zugjspx
130 | xcbheel
131 | typglfw
132 | rpcinqu
133 | voyorrh
134 | tieglfw
135 | hexmqtt
136 | xcbacyj
137 | aktjspb
138 | amyjspx
139 | ackrgyz
140 | xcbokus
141 | xcbhtag
142 | togjspx
143 | xcbuely
144 | xcbffic
145 | mlxasje
146 | xcbunft
147 | wieglfw
148 | xcbufig
149 | xcbueur
150 | zagmqtt
--------------------------------------------------------------------------------
/examples/select_swap_words.py:
--------------------------------------------------------------------------------
# Pool candidate words from the three lowest-probability bins into one list.
bins = ["bin_3", "bin_4", "bin_5"]

words = []
for bin in bins:
    with open(f"./{bin}.txt") as file:
        words.extend([line.strip() for line in file])

# import tiktoken
# from collections import defaultdict
# gpt4_enc = tiktoken.get_encoding("cl100k_base")

# score_words_dict = defaultdict(list)

# for word in words:
#     tokens = len(gpt4_enc.encode(word))
#     score_words_dict[tokens].append(word)

# Bidirectional letter <-> index maps used for mod-26 shift-cipher arithmetic.
alphabet = "abcdefghijklmnopqrstuvwxyz"
index2char = {}
char2index = {}
for index, char in enumerate(alphabet):
    index2char[index] = char
    char2index[char] = index
24 |
25 | # similar_pairs = []
26 | # for score, words_with_score in score_words_dict.items():
27 | # for i in range(len(words_with_score)):
28 | # word1 = words_with_score[i]
29 | # word2 = ""
30 | # for char in word1:
31 | # word2 += index2char[(char2index[char]+25)%26]
32 | # print(word1, word2)
33 | # if word2 in words:
34 | # similar_pairs.append((word1, word2))
35 |
36 | # print(len(similar_pairs))
37 | # print(similar_pairs)
38 |
39 |
import os
# Redirect every HuggingFace cache to scratch space; must be set before the
# transformers import below reads the environment.
os.environ['TRANSFORMERS_CACHE'] = "/n/fs/codeval/cache"
os.environ['HF_DATASETS_CACHE'] = "/n/fs/codeval/cache"
os.environ['HF_HOME'] = "/n/fs/codeval/cache"
os.environ['HF_HUB_CACHE'] = "/n/fs/codeval/cache"

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import tiktoken
import logging
import json
import pandas as pd

# Log progress both to stdout and to a file.
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")])

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# GPT-2 XL assigns the word probabilities; the GPT-4 tokenizer (cl100k_base)
# is used only to filter candidate words by token count.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device)
gpt4_enc = tiktoken.get_encoding("cl100k_base")
63 |
def pad_batch(batch, pad_idx):
    """Right-pad every token sequence in *batch* with *pad_idx* so that all
    sequences share the length of the longest one; returns new lists."""
    longest = max((len(seq) for seq in batch), default=0)
    return [seq + [pad_idx] * (longest - len(seq)) for seq in batch]
76 |
# Score sentences with GPT-2.  NOTE(review): despite the original "perplexity"
# label, the value returned is a total log-probability per sentence.
def prob_gpt2(sentence_list):
    """Return a 1-D tensor of GPT-2 log-probabilities, one per sentence.

    Sentences are expected to have the form 'The word is "<word>"'; the
    constant added at the end removes the prefix's own log-probability so
    the result approximates the log-probability of the word alone.
    Uses the module-level `gpt2_tokenizer`, `gpt2_model`, and `device`.
    """

    # Tokenize the sentences and pad with GPT-2's EOS id (50256) so they
    # form a rectangular batch.
    all_tokens = []
    for sentence in sentence_list:
        tokens = gpt2_tokenizer.encode(sentence)
        all_tokens.append(tokens)
    tokens = pad_batch(all_tokens, 50256)

    targets = tokens[:]

    # Compute average log likelihood for the generation
    input_ids = torch.LongTensor(tokens).to(device)
    target_ids = torch.LongTensor(targets).to(device)

    with torch.no_grad():
        outputs = gpt2_model(input_ids, labels=target_ids)
        logits = outputs[1]
        # Shift: logits for positions [0, T-1) predict targets [1, T);
        # pad positions are excluded via ignore_index=50256.
        logits = logits.transpose(0,1)[:-1].transpose(0,1)
        target_ids = target_ids.transpose(0,1)[1:].transpose(0,1)
        loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1))
        # Sum per-token NLL over each sequence, then negate -> log-prob.
        loss = loss.reshape(target_ids.shape).sum(dim=1)
        neg_log_likelihood = -1*loss


    # 13.357776641845703 = logprob('The word is"'); removing this to just get
    # the word prob
    return neg_log_likelihood + 13.357776641845703
106 |
107 |
# folder_path = "/n/fs/codeval/embers-of-autoregression/extension/stimuli/word/"
# file_list = sorted([os.path.join(folder_path, f) for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])[:1]
# Dummy single entry: the old per-file loop structure is kept but now runs
# exactly once over the in-memory `words` list.
file_list = [1]

num_token_mis = 0
for finame in file_list:
    # print(finame, end="**\n")
    # with open(finame, 'r') as f:
    #     lines = f.readlines()
    #     lines = [json.loads(line) for line in lines]
    #     fi = [line['input'] for line in lines]
    #     print("Lines", len(fi))
    # fo = open("input_scored.txt", "a")
    word_list = words
    print("Lines", len(word_list))

    words_with_prob = []
    word_pairs = []

    # Sentences accumulate here and are scored by GPT-2 in batches of 3000.
    this_batch_sentences = []
    this_batch_word1s = []
    this_batch_words = []
    num_tokens = []  # NOTE(review): appended to below but never read afterwards
    for index, line in enumerate(word_list):
        if index % 10000 == 0:
            logging.info(str(index))

        word = line.strip()
        check_shifts = [2]
        for check_shift in check_shifts:
            # word1 = word Caesar-shifted by 1, word2 = shifted by check_shift
            # (mod-26 on each letter).
            word2 = ""
            word1 = ""
            for char in word:
                word1 += index2char[(char2index[char]+1)%26]
                word2 += index2char[(char2index[char]+check_shift)%26]

            # Keep only pairs where word1 is at most 4 GPT-4 tokens and both
            # shifted forms tokenize to the same number of tokens.
            tokens = gpt4_enc.encode(word1)
            tokens_word2 = gpt4_enc.encode(word2)
            if len(tokens) > 4 or len(tokens) != len(tokens_word2):
                # print(word1, word2, len(tokens), len(tokens_word2))
                num_token_mis += 1
                continue

            tokens_spaced = gpt4_enc.encode(" " + word2)  # NOTE(review): unused below

            this_batch_sentences.append('The word is "' + word2 + '"')
            this_batch_words.append(word2)
            num_tokens.append(len(tokens))
            this_batch_word1s.append(word1)

            # if len(tokens) == 2 and len(tokens_spaced) == 2 and len(word) == 7:
            #     token1 = gpt4_enc.decode([tokens[0]]).strip()
            #     token2 = gpt4_enc.decode([tokens[1]]).strip()

            #     tokenspaced1 = gpt4_enc.decode([tokens_spaced[0]]).strip()
            #     tokenspaced2 = gpt4_enc.decode([tokens_spaced[1]]).strip()

            #     if len(token1) == 3 and len(token2) == 4 and len(tokenspaced1) == 3 and len(tokenspaced2) == 4:
            #         this_batch_sentences.append('The word is "' + word + '"')
            #         this_batch_words.append(word)
            #     else:
            #         print(index, "Wrong length", word, len(token1), len(token2), len(tokenspaced1), len(tokenspaced2))
            # else:
            #     print(index, "Wrong length", word, len(tokens), len(tokens_spaced), len(word))

        # Flush a full batch through GPT-2; keep pairs whose word2
        # log-probability lies in [-45, -30).
        if len(this_batch_sentences) == 3000:
            logprobs = prob_gpt2(this_batch_sentences)
            for word1, word2, logprob in zip(this_batch_word1s, this_batch_words, logprobs):
                words_with_prob.append(logprob.item())
                if logprob.item() >= -45 and logprob.item() < -30:
                    word_pairs.append([word1, word2])
            this_batch_sentences = []
            this_batch_words = []
            this_batch_word1s = []

    # Final partial batch.  NOTE(review): unlike the full-batch branch this
    # also requires word1's own log-probability to be in-band, and uses a
    # strict > -45 bound instead of >= -45 — confirm this asymmetry is
    # intentional.
    if len(this_batch_sentences) > 0:
        logprobs = prob_gpt2(this_batch_sentences)
        for word1, word2, logprob in zip(this_batch_word1s, this_batch_words, logprobs):
            words_with_prob.append(logprob.item())
            if logprob.item() > -45 and logprob.item() < -30:
                x = prob_gpt2(['The word is "' + word1 + '"'])[-1].item()
                if x > -45 and x < -30:
                    word_pairs.append([word1, word2])
                    print("missed 2", word1, word2, x, logprob.item())
        this_batch_sentences = []
        this_batch_words = []
        this_batch_word1s = []

print(num_token_mis)
print(len(word_pairs))
# Append the selected (word1, word2) pairs as tab-separated lines.
f = open("./word_pairs_lowbins.txt", 'a+')
for pair in word_pairs:
    f.write(pair[0] + "\t" + pair[1] + "\n")

f.close()
203 |
204 |
205 |
--------------------------------------------------------------------------------
/examples/word_pairs_lowbins.txt:
--------------------------------------------------------------------------------
1 | ubtwjod vcuxkpe
2 | ecmtibx fdnujcy
3 | ifepvuf jgfqwvg
4 | dnteftu eoufguv
5 | tjujolt ukvkpmu
6 | bewqjuf cfxrkvg
7 | hsqxfsl itrygtm
8 | bftbtjp cgucukq
9 | bufrvjs cvgswkt
10 | tuzbotb uvacpuc
11 | cpxbnbd dqycoce
12 | yzavojl zabwpkm
13 | phsnpef qitoqfg
14 | dumnpoz evnoqpa
15 | sohnpoz tpioqpa
16 | cgejaaz dhfkbba
17 | tsdtuju uteuvkv
18 | qluvcjd rmvwdke
19 | peepvse qffqwtf
20 | lmfqmfe mngrngf
21 | sjgoubh tkhpvci
22 | fnppnfa goqqogb
23 | jojjmfo kpkkngp
24 | eotbwjb fpucxkc
25 | vjumftj wkvnguk
26 | bggfteb chhgufc
27 | nhsvmjb oitwnkc
28 | ftjbsbn gukctco
29 | tvcaztu uwdbauv
30 | puufyqp qvvgzrq
31 | veqdpmp wfreqnq
32 | wbleofz xcmfpga
33 | etqfsfp furgtgq
34 | qohqpof rpirqpg
35 | uhufmmb vivgnnc
36 | juivfvs kvjwgwt
37 | telkbcj ufmlcdk
38 | echjwje fdikxkf
39 | qbzljdi rcamkej
40 | ojlbjho pkmckip
41 | qftvfmm rguwgnn
42 | nvtspqq owutqrr
43 | fcytjef gdzukfg
44 | eojfofa fpkgpgb
45 | eddtdbm feeuecn
46 | dncifdl eodjgem
47 | hulpppp ivmqqqq
48 | cfxjlpo dgykmqp
49 | vugftus wvhguvt
50 | ujfjcvs vkgkdwt
51 | jtmttfm kunuugn
52 | efypouf fgzqpvg
53 | cjagjmu dkbhknv
54 | ufujcsj vgvkdtk
55 | pdugjsn qevhkto
56 | xfjvept ygkwfqu
57 | qxeujdl ryfvkem
58 | uumbssz vvnctta
59 | tuvjnfp uvwkogq
60 | trmtusf usnuvtg
61 | evfbgfo fwgchgp
62 | toevshf upfwtig
63 | jmlfudi knmgvej
64 | sohjbuf tpikcvg
65 | ymtcppn znudqqo
66 | spcjtco tqdkudp
67 | ojepvtu pkfqwuv
68 | cfhljut dgimkvu
69 | bjepnje ckfqokf
70 | dqugpup ervhqvq
71 | echbcbz fdicdca
72 | bnzpwfm coaqxgn
73 | feyftjt gfzguku
74 | pveufnt qwfvgou
75 | ijnpdpt jkoqequ
76 | wpzyggg xqazhhh
77 | ruzjklf svaklmg
78 | nndvmfe ooewngf
79 | wulqsju xvmrtkv
80 | fforstu ggpstuv
81 | ocsmzqi pdtnarj
82 | yisejtq zjtfkur
83 | gguvsbb hhvwtcc
84 | ynnhfuu zooigvv
85 | avhxqec bwiyrfd
86 | mitjhvb njukiwc
87 | ttifntq uujgour
88 | cvsshza dwttiab
89 | wuljsjf xvmktkg
90 | sqdabop trebcpq
91 | ydcxfou zedygpv
92 | epjbxlt fqkcymu
93 | dhjbdzk eikceal
94 | dazboet ebacpfu
95 | bufktqy cvglurz
96 | telsykt ufmtzlu
97 | nmypjdf onzqkeg
98 | ktyjsnt luzktou
99 | dazshcb ebatidc
100 | nblshza ocmtiab
101 | obojhic pcpkijd
102 | wulvoez xvmwpfa
103 | nmyvmbo onzwncp
104 | vdaqfpo webrgqp
105 | qdjbwby rekcxcz
106 | hqtjmjl iruknkm
107 | vdaiubh webjvci
108 | pvjrstu qwkstuv
109 | gfektqc hgflurd
110 | ydcpqif zedqrjg
111 | ohyboua pizcpvb
112 | sfsmtsv tgtnutw
113 | svedzdi twfeaej
114 | mpumtsv nqvnutw
115 | bczjmph cdaknqi
116 | stbvfcb tucwgdc
117 | wgthfuu xhuigvv
118 | qizfqbs rjagrct
119 | wftyggg xguzhhh
120 | mdemfsj nefngtk
121 | sntmtsv tounutw
122 | sqdvnop trewopq
123 | mdewpjs nefxqkt
124 | jgtcsud khudtve
125 | tvunpcy uwvoqdz
126 | xoeodjb ypfpekc
127 | ydcjefe zedkfgf
128 | ktylfft luzmggu
129 | busmznq cvtnaor
130 | yisltft zjtmugu
131 | vdaqfbs webrgct
132 | eptpssi fquqttj
133 | wulhfnt xvmigou
134 | hmnmesf inonftg
135 | efyvnco fgzwodp
136 | ljuyggg mkvzhhh
137 | ktybkbo luzclcp
138 | huluifx ivmvjgy
139 | ydcepht zedfqiu
140 | stbfmmu tucgnnv
141 | qdmwbjt renxcku
142 | ydcbcxf zedcdyg
143 | bluktqy cmvlurz
144 | ydcdzdi zedeaej
145 | ydcspvx zedtqwy
146 | gpyshza hqztiab
147 | ydcbufh zedcvgi
148 | ydcdzbo zedeacp
149 | ydcflfo zedgmgp
150 | ydcspev zedtqfw
151 | ydcuisp zedvjtq
152 | ydcpeaj zedqfbk
153 | ydcvfmm zedwgnn
154 | ydcpcsf zedqdtg
155 | ubtfyqm vcugzrn
156 | ydcptqi zedqurj
157 | ydcvfhp zedwgiq
158 | hddpssi ieeqttj
159 | ydcifdz zedjgea
160 | qjtcsud rkudtve
161 | ljmhmgx mkninhy
162 | evlwsjs fwmxtkt
163 | vqeshza wrftiab
164 | ydcbhbt zedcicu
165 | pqdshza qretiab
166 | dvsgjeg ewthkfh
167 | qdnspvx reotqwy
168 | ydciuvc zedjvwd
169 | ydcifbs zedjgct
170 | ydcbaap zedcbbq
171 | ydcpvgm zedqwhn
172 | blpktqy cmqlurz
173 | ydcifju zedjgkv
174 | wpzhmgx xqainhy
175 | wfaktqy xgblurz
176 | ydcpdsf zedqetg
177 | ynnbdkf zoocelg
178 | ydcbhbm zedcicn
179 | ydcjtfs zedkugt
180 | mbqnruu ncrosvv
181 | ydcwpjf zedxqkg
182 | ydcrvpj zedswqk
183 | ydciffm zedjggn
184 | sqdjorv trekpsw
185 | ujfhmgx vkginhy
186 | ifynruu jgzosvv
187 | bluktqc cmvlurd
188 | ydcplvt zedqmwu
189 | ydciubh zedjvci
190 | ubtwjod vcuxkpe
191 | ecmtibx fdnujcy
192 | ifepvuf jgfqwvg
193 | dnteftu eoufguv
194 | tjujolt ukvkpmu
195 | bewqjuf cfxrkvg
196 | hsqxfsl itrygtm
197 | bftbtjp cgucukq
198 | bufrvjs cvgswkt
199 | tuzbotb uvacpuc
200 | cpxbnbd dqycoce
201 | yzavojl zabwpkm
202 | phsnpef qitoqfg
203 | dumnpoz evnoqpa
204 | sohnpoz tpioqpa
205 | cgejaaz dhfkbba
206 | tsdtuju uteuvkv
207 | qluvcjd rmvwdke
208 | peepvse qffqwtf
209 | sjgoubh tkhpvci
210 | fnppnfa goqqogb
211 | jojjmfo kpkkngp
212 | eotbwjb fpucxkc
213 | vjumftj wkvnguk
214 | bggfteb chhgufc
215 | nhsvmjb oitwnkc
216 | ftjbsbn gukctco
217 | tvcaztu uwdbauv
218 | puufyqp qvvgzrq
219 | veqdpmp wfreqnq
220 | wbleofz xcmfpga
221 | etqfsfp furgtgq
222 | qohqpof rpirqpg
223 | uhufmmb vivgnnc
224 | juivfvs kvjwgwt
225 | telkbcj ufmlcdk
226 | echjwje fdikxkf
227 | qbzljdi rcamkej
228 | ojlbjho pkmckip
229 | qftvfmm rguwgnn
230 | nvtspqq owutqrr
231 | fcytjef gdzukfg
232 | eojfofa fpkgpgb
233 | eddtdbm feeuecn
234 | dncifdl eodjgem
235 | hulpppp ivmqqqq
236 | cfxjlpo dgykmqp
237 | vugftus wvhguvt
238 | ujfjcvs vkgkdwt
239 | jtmttfm kunuugn
240 | efypouf fgzqpvg
241 | cjagjmu dkbhknv
242 | ufujcsj vgvkdtk
243 | pdugjsn qevhkto
244 | xfjvept ygkwfqu
245 | qxeujdl ryfvkem
246 | uumbssz vvnctta
247 | tuvjnfp uvwkogq
248 | trmtusf usnuvtg
249 | evfbgfo fwgchgp
250 | toevshf upfwtig
251 | jmlfudi knmgvej
252 | sohjbuf tpikcvg
253 | ymtcppn znudqqo
254 | spcjtco tqdkudp
255 | ojepvtu pkfqwuv
256 | cfhljut dgimkvu
257 | bjepnje ckfqokf
258 | dqugpup ervhqvq
259 | echbcbz fdicdca
260 | bnzpwfm coaqxgn
261 | feyftjt gfzguku
262 | pveufnt qwfvgou
263 | ijnpdpt jkoqequ
264 | wpzyggg xqazhhh
265 | ruzjklf svaklmg
266 | nndvmfe ooewngf
267 | wulqsju xvmrtkv
268 | fforstu ggpstuv
269 | ocsmzqi pdtnarj
270 | yisejtq zjtfkur
271 | gguvsbb hhvwtcc
272 | ynnhfuu zooigvv
273 | avhxqec bwiyrfd
274 | mitjhvb njukiwc
275 | ttifntq uujgour
276 | cvsshza dwttiab
277 | wuljsjf xvmktkg
278 | sqdabop trebcpq
279 | ydcxfou zedygpv
280 | epjbxlt fqkcymu
281 | dhjbdzk eikceal
282 | dazboet ebacpfu
283 | bufktqy cvglurz
284 | telsykt ufmtzlu
285 | nmypjdf onzqkeg
286 | ktyjsnt luzktou
287 | dazshcb ebatidc
288 | nblshza ocmtiab
289 | obojhic pcpkijd
290 | wulvoez xvmwpfa
291 | nmyvmbo onzwncp
292 | vdaqfpo webrgqp
293 | qdjbwby rekcxcz
294 | hqtjmjl iruknkm
295 | vdaiubh webjvci
296 | pvjrstu qwkstuv
297 | gfektqc hgflurd
298 | ohyboua pizcpvb
299 | sfsmtsv tgtnutw
300 | svedzdi twfeaej
301 | mpumtsv nqvnutw
302 | bczjmph cdaknqi
303 | stbvfcb tucwgdc
304 | wgthfuu xhuigvv
305 | qizfqbs rjagrct
306 | wftyggg xguzhhh
307 | mdemfsj nefngtk
308 | sntmtsv tounutw
309 | sqdvnop trewopq
310 | mdewpjs nefxqkt
311 | jgtcsud khudtve
312 | tvunpcy uwvoqdz
313 | xoeodjb ypfpekc
314 | ydcjefe zedkfgf
315 | ktylfft luzmggu
316 | busmznq cvtnaor
317 | yisltft zjtmugu
318 | vdaqfbs webrgct
319 | eptpssi fquqttj
320 | wulhfnt xvmigou
321 | hmnmesf inonftg
322 | efyvnco fgzwodp
323 | ljuyggg mkvzhhh
324 | ktybkbo luzclcp
325 | huluifx ivmvjgy
326 | ydcepht zedfqiu
327 | stbfmmu tucgnnv
328 | qdmwbjt renxcku
329 | ydcbcxf zedcdyg
330 | bluktqy cmvlurz
331 | ydcdzdi zedeaej
332 | ydcspvx zedtqwy
333 | gpyshza hqztiab
334 | ydcbufh zedcvgi
335 | ydcdzbo zedeacp
336 | ydcflfo zedgmgp
337 | ydcspev zedtqfw
338 | ydcpeaj zedqfbk
339 | ydcvfmm zedwgnn
340 | ydcpcsf zedqdtg
341 | ubtfyqm vcugzrn
342 | ydcptqi zedqurj
343 | ydcvfhp zedwgiq
344 | hddpssi ieeqttj
345 | ydcifdz zedjgea
346 | qjtcsud rkudtve
347 | ljmhmgx mkninhy
348 | evlwsjs fwmxtkt
349 | vqeshza wrftiab
350 | ydcbhbt zedcicu
351 | pqdshza qretiab
352 | dvsgjeg ewthkfh
353 | qdnspvx reotqwy
354 | ydciuvc zedjvwd
355 | ydcifbs zedjgct
356 | ydcbaap zedcbbq
357 | ydcpvgm zedqwhn
358 | blpktqy cmqlurz
359 | ydcifju zedjgkv
360 | wpzhmgx xqainhy
361 | wfaktqy xgblurz
362 | ydcpdsf zedqetg
363 | ynnbdkf zoocelg
364 | ydcbhbm zedcicn
365 | ydcjtfs zedkugt
366 | mbqnruu ncrosvv
367 | ydcwpjf zedxqkg
368 | ydcrvpj zedswqk
369 | ydciffm zedjggn
370 | sqdjorv trekpsw
371 | ujfhmgx vkginhy
372 | ifynruu jgzosvv
373 | bluktqc cmvlurd
374 | ydcplvt zedqmwu
375 | ydciubh zedjvci
376 |
--------------------------------------------------------------------------------
/logs/basic/claude-3/results.jsonl:
--------------------------------------------------------------------------------
1 | {"condition": "basic1_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
2 | {"condition": "basic1_bin2", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.25, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
3 | {"condition": "basic1_bin3", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 2.2, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
4 | {"condition": "basic1_bin4", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 2.34, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
5 | {"condition": "basic1_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 2.21, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
6 | {"condition": "basic2_bin1", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.36, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
7 | {"condition": "basic2_bin2", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.18, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
8 | {"condition": "basic2_bin3", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.26, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
9 | {"condition": "basic2_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.01, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
10 | {"condition": "basic2_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.76, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
11 | {"condition": "basic3_bin1", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.84, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
12 | {"condition": "basic3_bin2", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 1.56, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.63, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0}
14 | {"condition": "basic3_bin4", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 3.92, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
15 | {"condition": "basic3_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.75, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
16 | {"condition": "basic4_bin1", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 3.48, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
17 | {"condition": "basic4_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.17, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
18 | {"condition": "basic4_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.28, "median_levdist": 4.5, "model": "claude-3", "temp": 0.0}
19 | {"condition": "basic4_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.65, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
20 | {"condition": "basic4_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
21 | {"condition": "basic5_bin1", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.12, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
22 | {"condition": "basic5_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.2, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
23 | {"condition": "basic5_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.85, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
24 | {"condition": "basic5_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.16, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.7, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
26 | {"condition": "basic6_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.54, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.21, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
28 | {"condition": "basic6_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.16, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
29 | {"condition": "basic6_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.3, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
30 | {"condition": "basic6_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.81, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
31 | {"condition": "basic7_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.11, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
32 | {"condition": "basic7_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.55, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.51, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
34 | {"condition": "basic7_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.65, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.31, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
36 | {"condition": "basic8_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
37 | {"condition": "basic8_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.45, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
38 | {"condition": "basic8_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.49, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.68, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
41 | {"condition": "basic9_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.6, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.08, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.96, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.03, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.66, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
46 | {"condition": "basic10_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.75, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
49 | {"condition": "basic10_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.93, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.42, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.79, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.04, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.08, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.4, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.59, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.66, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.67, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.86, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.39, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
61 | {"condition": "basic13_bin1", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 2.58, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
62 | {"condition": "basic13_bin2", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.58, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
63 | {"condition": "basic13_bin3", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 4.53, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
64 | {"condition": "basic13_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 4.28, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
65 | {"condition": "basic13_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.95, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
66 | {"condition": "basic14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.78, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.97, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.26, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.67, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.07, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.95, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.13, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.32, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.68, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.78, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.2, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.65, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
81 | {"condition": "basic17_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.85, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.13, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.39, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.55, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.09, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.64, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.8, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.87, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.98, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.1, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.58, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.57, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.34, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.27, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.47, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.58, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.49, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.0, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.18, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.33, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.17, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.24, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.66, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.69, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
111 | {"condition": "basic23_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.05, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
112 | {"condition": "basic23_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.48, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
113 | {"condition": "basic23_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.7, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
114 | {"condition": "basic23_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.48, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
115 | {"condition": "basic23_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.65, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
116 | {"condition": "basic24_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
117 | {"condition": "basic24_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.31, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
118 | {"condition": "basic24_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.68, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
119 | {"condition": "basic24_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
120 | {"condition": "basic24_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.41, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
121 | {"condition": "basic25_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 2.98, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
122 | {"condition": "basic25_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.81, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
123 | {"condition": "basic25_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.36, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
124 | {"condition": "basic25_bin4", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.83, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
125 | {"condition": "basic25_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.64, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
126 |
--------------------------------------------------------------------------------
/logs/basic/llama3.1-405b/results.jsonl:
--------------------------------------------------------------------------------
1 | {"condition": "basic1_bin1", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 13.15, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
2 | {"condition": "basic1_bin2", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 8.68, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
3 | {"condition": "basic1_bin3", "acc_inst": 0.24, "acc_demo": 0.0, "levdist": 15.0, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
4 | {"condition": "basic1_bin4", "acc_inst": 0.38, "acc_demo": 0.0, "levdist": 34.17, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
5 | {"condition": "basic1_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 13.83, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
6 | {"condition": "basic2_bin1", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 106.78, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
7 | {"condition": "basic2_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 139.94, "median_levdist": 68.0, "model": "claude-3", "temp": 0.0}
8 | {"condition": "basic2_bin3", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 146.08, "median_levdist": 186.5, "model": "claude-3", "temp": 0.0}
9 | {"condition": "basic2_bin4", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 191.0, "model": "claude-3", "temp": 0.0}
10 | {"condition": "basic2_bin5", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 199.82, "median_levdist": 188.0, "model": "claude-3", "temp": 0.0}
11 | {"condition": "basic3_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 107.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
12 | {"condition": "basic3_bin2", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 148.21, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 228.4, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
14 | {"condition": "basic3_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 233.82, "median_levdist": 199.5, "model": "claude-3", "temp": 0.0}
15 | {"condition": "basic3_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 223.57, "median_levdist": 192.5, "model": "claude-3", "temp": 0.0}
16 | {"condition": "basic4_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 135.45, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0}
17 | {"condition": "basic4_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 118.88, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0}
18 | {"condition": "basic4_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 129.08, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
19 | {"condition": "basic4_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 218.76, "median_levdist": 38.0, "model": "claude-3", "temp": 0.0}
20 | {"condition": "basic4_bin5", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 418.91, "median_levdist": 561.5, "model": "claude-3", "temp": 0.0}
21 | {"condition": "basic5_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 266.24, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0}
22 | {"condition": "basic5_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 262.9, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0}
23 | {"condition": "basic5_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 297.63, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0}
24 | {"condition": "basic5_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.22, "median_levdist": 299.5, "model": "claude-3", "temp": 0.0}
25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.26, "median_levdist": 301.0, "model": "claude-3", "temp": 0.0}
26 | {"condition": "basic6_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 256.99, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 234.49, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
28 | {"condition": "basic6_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 203.78, "median_levdist": 196.5, "model": "claude-3", "temp": 0.0}
29 | {"condition": "basic6_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 277.69, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0}
30 | {"condition": "basic6_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 354.46, "median_levdist": 423.0, "model": "claude-3", "temp": 0.0}
31 | {"condition": "basic7_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 165.28, "median_levdist": 191.5, "model": "claude-3", "temp": 0.0}
32 | {"condition": "basic7_bin2", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 187.12, "median_levdist": 192.0, "model": "claude-3", "temp": 0.0}
33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 218.5, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
34 | {"condition": "basic7_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 233.28, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 251.88, "median_levdist": 200.5, "model": "claude-3", "temp": 0.0}
36 | {"condition": "basic8_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 219.92, "median_levdist": 194.0, "model": "claude-3", "temp": 0.0}
37 | {"condition": "basic8_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 213.79, "median_levdist": 201.5, "model": "claude-3", "temp": 0.0}
38 | {"condition": "basic8_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 241.55, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 270.14, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0}
40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 247.3, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
41 | {"condition": "basic9_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.68, "median_levdist": 277.5, "model": "claude-3", "temp": 0.0}
42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.41, "median_levdist": 276.0, "model": "claude-3", "temp": 0.0}
43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 335.97, "median_levdist": 298.5, "model": "claude-3", "temp": 0.0}
44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 381.66, "median_levdist": 390.0, "model": "claude-3", "temp": 0.0}
45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 320.68, "median_levdist": 310.5, "model": "claude-3", "temp": 0.0}
46 | {"condition": "basic10_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 250.25, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 223.85, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
49 | {"condition": "basic10_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 282.61, "median_levdist": 277.0, "model": "claude-3", "temp": 0.0}
50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 310.79, "median_levdist": 291.0, "model": "claude-3", "temp": 0.0}
51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.69, "median_levdist": 244.5, "model": "claude-3", "temp": 0.0}
52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 336.13, "median_levdist": 281.5, "model": "claude-3", "temp": 0.0}
53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.86, "median_levdist": 229.0, "model": "claude-3", "temp": 0.0}
54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 344.3, "median_levdist": 216.0, "model": "claude-3", "temp": 0.0}
55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 505.99, "median_levdist": 632.0, "model": "claude-3", "temp": 0.0}
56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.18, "median_levdist": 220.5, "model": "claude-3", "temp": 0.0}
57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 299.78, "median_levdist": 275.0, "model": "claude-3", "temp": 0.0}
58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 292.63, "median_levdist": 200.0, "model": "claude-3", "temp": 0.0}
59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.36, "median_levdist": 294.5, "model": "claude-3", "temp": 0.0}
60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 385.7, "median_levdist": 325.0, "model": "claude-3", "temp": 0.0}
61 | {"condition": "basic13_bin1", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 22.45, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
62 | {"condition": "basic13_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 42.08, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
63 | {"condition": "basic13_bin3", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 41.13, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
64 | {"condition": "basic13_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 41.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
65 | {"condition": "basic13_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 105.25, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
66 | {"condition": "basic14_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 318.19, "median_levdist": 208.5, "model": "claude-3", "temp": 0.0}
67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 257.47, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.1, "median_levdist": 270.5, "model": "claude-3", "temp": 0.0}
69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 317.07, "median_levdist": 280.5, "model": "claude-3", "temp": 0.0}
70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.46, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0}
71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.07, "median_levdist": 286.0, "model": "claude-3", "temp": 0.0}
72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.85, "median_levdist": 287.5, "model": "claude-3", "temp": 0.0}
73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.29, "median_levdist": 304.0, "model": "claude-3", "temp": 0.0}
74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 357.54, "median_levdist": 306.5, "model": "claude-3", "temp": 0.0}
75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 328.12, "median_levdist": 290.0, "model": "claude-3", "temp": 0.0}
76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 402.07, "median_levdist": 340.0, "model": "claude-3", "temp": 0.0}
77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 370.82, "median_levdist": 317.0, "model": "claude-3", "temp": 0.0}
78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 366.68, "median_levdist": 324.5, "model": "claude-3", "temp": 0.0}
79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 419.24, "median_levdist": 445.0, "model": "claude-3", "temp": 0.0}
80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.26, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
81 | {"condition": "basic17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 296.78, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 318.12, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.44, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.41, "median_levdist": 198.5, "model": "claude-3", "temp": 0.0}
85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 355.59, "median_levdist": 229.5, "model": "claude-3", "temp": 0.0}
86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.81, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0}
87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.19, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0}
88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.76, "median_levdist": 310.0, "model": "claude-3", "temp": 0.0}
89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.78, "median_levdist": 331.0, "model": "claude-3", "temp": 0.0}
90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.42, "median_levdist": 401.0, "model": "claude-3", "temp": 0.0}
91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.4, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 312.73, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 311.94, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 291.75, "median_levdist": 198.0, "model": "claude-3", "temp": 0.0}
95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 258.78, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 354.28, "median_levdist": 360.0, "model": "claude-3", "temp": 0.0}
97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 308.07, "median_levdist": 299.0, "model": "claude-3", "temp": 0.0}
98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 300.86, "median_levdist": 311.5, "model": "claude-3", "temp": 0.0}
99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 254.9, "median_levdist": 194.5, "model": "claude-3", "temp": 0.0}
100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 217.92, "median_levdist": 8.5, "model": "claude-3", "temp": 0.0}
101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.48, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 349.05, "median_levdist": 235.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.02, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 386.56, "median_levdist": 314.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.54, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 426.68, "median_levdist": 339.5, "model": "claude-3", "temp": 0.0}
107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 410.56, "median_levdist": 348.5, "model": "claude-3", "temp": 0.0}
108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 440.15, "median_levdist": 492.5, "model": "claude-3", "temp": 0.0}
109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 430.12, "median_levdist": 435.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.37, "median_levdist": 372.5, "model": "claude-3", "temp": 0.0}
111 |
--------------------------------------------------------------------------------
/logs/basic/llama3.1-405b/results1.jsonl:
--------------------------------------------------------------------------------
1 | {"condition": "basic1_bin1", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 13.15, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
2 | {"condition": "basic1_bin2", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 8.68, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
3 | {"condition": "basic1_bin3", "acc_inst": 0.24, "acc_demo": 0.0, "levdist": 15.0, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
4 | {"condition": "basic1_bin4", "acc_inst": 0.38, "acc_demo": 0.0, "levdist": 34.17, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
5 | {"condition": "basic1_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 13.83, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
6 | {"condition": "basic2_bin1", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 106.78, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
7 | {"condition": "basic2_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 139.94, "median_levdist": 68.0, "model": "claude-3", "temp": 0.0}
8 | {"condition": "basic2_bin3", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 146.08, "median_levdist": 186.5, "model": "claude-3", "temp": 0.0}
9 | {"condition": "basic2_bin4", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 191.0, "model": "claude-3", "temp": 0.0}
10 | {"condition": "basic2_bin5", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 199.82, "median_levdist": 188.0, "model": "claude-3", "temp": 0.0}
11 | {"condition": "basic3_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 107.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
12 | {"condition": "basic3_bin2", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 148.21, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 228.4, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
14 | {"condition": "basic3_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 233.82, "median_levdist": 199.5, "model": "claude-3", "temp": 0.0}
15 | {"condition": "basic3_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 223.57, "median_levdist": 192.5, "model": "claude-3", "temp": 0.0}
16 | {"condition": "basic4_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 135.45, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0}
17 | {"condition": "basic4_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 118.88, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0}
18 | {"condition": "basic4_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 129.08, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
19 | {"condition": "basic4_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 218.76, "median_levdist": 38.0, "model": "claude-3", "temp": 0.0}
20 | {"condition": "basic4_bin5", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 418.91, "median_levdist": 561.5, "model": "claude-3", "temp": 0.0}
21 | {"condition": "basic5_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 266.24, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0}
22 | {"condition": "basic5_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 262.9, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0}
23 | {"condition": "basic5_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 297.63, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0}
24 | {"condition": "basic5_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.22, "median_levdist": 299.5, "model": "claude-3", "temp": 0.0}
25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.26, "median_levdist": 301.0, "model": "claude-3", "temp": 0.0}
26 | {"condition": "basic6_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 256.99, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 234.49, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
28 | {"condition": "basic6_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 203.78, "median_levdist": 196.5, "model": "claude-3", "temp": 0.0}
29 | {"condition": "basic6_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 277.69, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0}
30 | {"condition": "basic6_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 354.46, "median_levdist": 423.0, "model": "claude-3", "temp": 0.0}
31 | {"condition": "basic7_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 165.28, "median_levdist": 191.5, "model": "claude-3", "temp": 0.0}
32 | {"condition": "basic7_bin2", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 187.12, "median_levdist": 192.0, "model": "claude-3", "temp": 0.0}
33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 218.5, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
34 | {"condition": "basic7_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 233.28, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 251.88, "median_levdist": 200.5, "model": "claude-3", "temp": 0.0}
36 | {"condition": "basic8_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 219.92, "median_levdist": 194.0, "model": "claude-3", "temp": 0.0}
37 | {"condition": "basic8_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 213.79, "median_levdist": 201.5, "model": "claude-3", "temp": 0.0}
38 | {"condition": "basic8_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 241.55, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 270.14, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0}
40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 247.3, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
41 | {"condition": "basic9_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.68, "median_levdist": 277.5, "model": "claude-3", "temp": 0.0}
42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.41, "median_levdist": 276.0, "model": "claude-3", "temp": 0.0}
43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 335.97, "median_levdist": 298.5, "model": "claude-3", "temp": 0.0}
44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 381.66, "median_levdist": 390.0, "model": "claude-3", "temp": 0.0}
45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 320.68, "median_levdist": 310.5, "model": "claude-3", "temp": 0.0}
46 | {"condition": "basic10_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 250.25, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 223.85, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
49 | {"condition": "basic10_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 282.61, "median_levdist": 277.0, "model": "claude-3", "temp": 0.0}
50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 310.79, "median_levdist": 291.0, "model": "claude-3", "temp": 0.0}
51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.69, "median_levdist": 244.5, "model": "claude-3", "temp": 0.0}
52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 336.13, "median_levdist": 281.5, "model": "claude-3", "temp": 0.0}
53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.86, "median_levdist": 229.0, "model": "claude-3", "temp": 0.0}
54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 344.3, "median_levdist": 216.0, "model": "claude-3", "temp": 0.0}
55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 505.99, "median_levdist": 632.0, "model": "claude-3", "temp": 0.0}
56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.18, "median_levdist": 220.5, "model": "claude-3", "temp": 0.0}
57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 299.78, "median_levdist": 275.0, "model": "claude-3", "temp": 0.0}
58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 292.63, "median_levdist": 200.0, "model": "claude-3", "temp": 0.0}
59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.36, "median_levdist": 294.5, "model": "claude-3", "temp": 0.0}
60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 385.7, "median_levdist": 325.0, "model": "claude-3", "temp": 0.0}
61 | {"condition": "basic13_bin1", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 22.45, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
62 | {"condition": "basic13_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 42.08, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
63 | {"condition": "basic13_bin3", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 41.13, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
64 | {"condition": "basic13_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 41.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
65 | {"condition": "basic13_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 105.25, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
66 | {"condition": "basic14_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 318.19, "median_levdist": 208.5, "model": "claude-3", "temp": 0.0}
67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 257.47, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.1, "median_levdist": 270.5, "model": "claude-3", "temp": 0.0}
69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 317.07, "median_levdist": 280.5, "model": "claude-3", "temp": 0.0}
70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.46, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0}
71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.07, "median_levdist": 286.0, "model": "claude-3", "temp": 0.0}
72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.85, "median_levdist": 287.5, "model": "claude-3", "temp": 0.0}
73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.29, "median_levdist": 304.0, "model": "claude-3", "temp": 0.0}
74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 357.54, "median_levdist": 306.5, "model": "claude-3", "temp": 0.0}
75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 328.12, "median_levdist": 290.0, "model": "claude-3", "temp": 0.0}
76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 402.07, "median_levdist": 340.0, "model": "claude-3", "temp": 0.0}
77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 370.82, "median_levdist": 317.0, "model": "claude-3", "temp": 0.0}
78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 366.68, "median_levdist": 324.5, "model": "claude-3", "temp": 0.0}
79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 419.24, "median_levdist": 445.0, "model": "claude-3", "temp": 0.0}
80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.26, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
81 | {"condition": "basic17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 296.78, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 318.12, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.44, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.41, "median_levdist": 198.5, "model": "claude-3", "temp": 0.0}
85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 355.59, "median_levdist": 229.5, "model": "claude-3", "temp": 0.0}
86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.81, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0}
87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.19, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0}
88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.76, "median_levdist": 310.0, "model": "claude-3", "temp": 0.0}
89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.78, "median_levdist": 331.0, "model": "claude-3", "temp": 0.0}
90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.42, "median_levdist": 401.0, "model": "claude-3", "temp": 0.0}
91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.4, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 312.73, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 311.94, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 291.75, "median_levdist": 198.0, "model": "claude-3", "temp": 0.0}
95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 258.78, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 354.28, "median_levdist": 360.0, "model": "claude-3", "temp": 0.0}
97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 308.07, "median_levdist": 299.0, "model": "claude-3", "temp": 0.0}
98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 300.86, "median_levdist": 311.5, "model": "claude-3", "temp": 0.0}
99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 254.9, "median_levdist": 194.5, "model": "claude-3", "temp": 0.0}
100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 217.92, "median_levdist": 8.5, "model": "claude-3", "temp": 0.0}
101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.48, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 349.05, "median_levdist": 235.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.02, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 386.56, "median_levdist": 314.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.54, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 426.68, "median_levdist": 339.5, "model": "claude-3", "temp": 0.0}
107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 410.56, "median_levdist": 348.5, "model": "claude-3", "temp": 0.0}
108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 440.15, "median_levdist": 492.5, "model": "claude-3", "temp": 0.0}
109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 430.12, "median_levdist": 435.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.37, "median_levdist": 372.5, "model": "claude-3", "temp": 0.0}
111 | {"condition": "basic23_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 278.38, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
112 | {"condition": "basic23_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 331.5, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
113 | {"condition": "basic23_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 339.51, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
114 | {"condition": "basic23_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 363.82, "median_levdist": 202.0, "model": "claude-3", "temp": 0.0}
115 | {"condition": "basic23_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 403.8, "median_levdist": 358.0, "model": "claude-3", "temp": 0.0}
116 | {"condition": "basic24_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 375.29, "median_levdist": 312.0, "model": "claude-3", "temp": 0.0}
117 | {"condition": "basic24_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 415.72, "median_levdist": 354.0, "model": "claude-3", "temp": 0.0}
118 | {"condition": "basic24_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 395.21, "median_levdist": 316.0, "model": "claude-3", "temp": 0.0}
119 | {"condition": "basic24_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 418.06, "median_levdist": 374.5, "model": "claude-3", "temp": 0.0}
120 | {"condition": "basic24_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 402.51, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0}
121 | {"condition": "basic25_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 246.23, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
122 | {"condition": "basic25_bin2", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 352.15, "median_levdist": 303.0, "model": "claude-3", "temp": 0.0}
123 | {"condition": "basic25_bin3", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 309.63, "median_levdist": 195.0, "model": "claude-3", "temp": 0.0}
124 | {"condition": "basic25_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 437.08, "median_levdist": 559.0, "model": "claude-3", "temp": 0.0}
125 | {"condition": "basic25_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 421.89, "median_levdist": 541.5, "model": "claude-3", "temp": 0.0}
126 |
--------------------------------------------------------------------------------
/logs/text_cot/claude-3/results.jsonl:
--------------------------------------------------------------------------------
1 | {"condition": "cot1_bin1", "acc_inst": 0.87, "acc_demo": 0.0, "levdist": 0.38, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
2 | {"condition": "cot1_bin2", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.38, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
3 | {"condition": "cot1_bin3", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.61, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
4 | {"condition": "cot1_bin4", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.62, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
5 | {"condition": "cot1_bin5", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.41, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
6 | {"condition": "cot2_bin1", "acc_inst": 0.9, "acc_demo": 0.0, "levdist": 0.21, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
7 | {"condition": "cot2_bin2", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
8 | {"condition": "cot2_bin3", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.56, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
9 | {"condition": "cot2_bin4", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 0.56, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
10 | {"condition": "cot2_bin5", "acc_inst": 0.78, "acc_demo": 0.0, "levdist": 0.32, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
11 | {"condition": "cot3_bin1", "acc_inst": 0.96, "acc_demo": 0.0, "levdist": 0.05, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
12 | {"condition": "cot3_bin2", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.31, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
13 | {"condition": "cot3_bin3", "acc_inst": 0.68, "acc_demo": 0.0, "levdist": 0.6, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
14 | {"condition": "cot3_bin4", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.98, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
15 | {"condition": "cot3_bin5", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 2.18, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
16 | {"condition": "cot4_bin1", "acc_inst": 0.93, "acc_demo": 0.0, "levdist": 0.07, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
17 | {"condition": "cot4_bin2", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.22, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
18 | {"condition": "cot4_bin3", "acc_inst": 0.66, "acc_demo": 0.0, "levdist": 0.54, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
19 | {"condition": "cot4_bin4", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.35, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
20 | {"condition": "cot4_bin5", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 0.72, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
21 | {"condition": "cot5_bin1", "acc_inst": 0.88, "acc_demo": 0.0, "levdist": 0.29, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
22 | {"condition": "cot5_bin2", "acc_inst": 0.8, "acc_demo": 0.0, "levdist": 0.36, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
23 | {"condition": "cot5_bin3", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.48, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
24 | {"condition": "cot5_bin4", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.34, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
25 | {"condition": "cot5_bin5", "acc_inst": 0.86, "acc_demo": 0.0, "levdist": 0.19, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
26 | {"condition": "cot6_bin1", "acc_inst": 0.9, "acc_demo": 0.0, "levdist": 0.17, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
27 | {"condition": "cot6_bin2", "acc_inst": 0.75, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
28 | {"condition": "cot6_bin3", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.7, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
29 | {"condition": "cot6_bin4", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 1.28, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
30 | {"condition": "cot6_bin5", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 1.79, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
31 | {"condition": "cot7_bin1", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 1.18, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
32 | {"condition": "cot7_bin2", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 1.44, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
33 | {"condition": "cot7_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.77, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
34 | {"condition": "cot7_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.93, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
35 | {"condition": "cot7_bin5", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
36 | {"condition": "cot8_bin1", "acc_inst": 0.8, "acc_demo": 0.0, "levdist": 0.68, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
37 | {"condition": "cot8_bin2", "acc_inst": 0.69, "acc_demo": 0.0, "levdist": 0.77, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
38 | {"condition": "cot8_bin3", "acc_inst": 0.57, "acc_demo": 0.0, "levdist": 1.41, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
39 | {"condition": "cot8_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 1.03, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
40 | {"condition": "cot8_bin5", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 2.95, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
41 | {"condition": "cot9_bin1", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.99, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
42 | {"condition": "cot9_bin2", "acc_inst": 0.42, "acc_demo": 0.0, "levdist": 1.6, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
43 | {"condition": "cot9_bin3", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 2.02, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
44 | {"condition": "cot9_bin4", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.33, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
45 | {"condition": "cot9_bin5", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 1.85, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
46 | {"condition": "cot10_bin1", "acc_inst": 0.51, "acc_demo": 0.0, "levdist": 1.46, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
47 | {"condition": "cot10_bin2", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 2.01, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
48 | {"condition": "cot10_bin3", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 3.44, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
49 | {"condition": "cot10_bin4", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.39, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
50 | {"condition": "cot10_bin5", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 3.01, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
51 | {"condition": "cot11_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.13, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
52 | {"condition": "cot11_bin2", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 3.07, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
53 | {"condition": "cot11_bin3", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 3.03, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
54 | {"condition": "cot11_bin4", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 3.52, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
55 | {"condition": "cot11_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.65, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
56 | {"condition": "cot12_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.04, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
57 | {"condition": "cot12_bin2", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 2.38, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
58 | {"condition": "cot12_bin3", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.94, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
59 | {"condition": "cot12_bin4", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.19, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
61 | {"condition": "cot13_bin1", "acc_inst": 0.88, "acc_demo": 0.0, "levdist": 0.2, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
62 | {"condition": "cot13_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.4, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
63 | {"condition": "cot13_bin3", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.61, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
64 | {"condition": "cot13_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.74, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
65 | {"condition": "cot13_bin5", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
66 | {"condition": "cot14_bin1", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 2.92, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
67 | {"condition": "cot14_bin2", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 3.29, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
68 | {"condition": "cot14_bin3", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.78, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
69 | {"condition": "cot14_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 3.85, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.22, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
71 | {"condition": "cot15_bin1", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.7, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.77, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
73 | {"condition": "cot15_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.04, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.84, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.5, "model": "claude-3", "temp": 0.0}
76 | {"condition": "cot16_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.64, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
77 | {"condition": "cot16_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.98, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
78 | {"condition": "cot16_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.2, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
79 | {"condition": "cot16_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.47, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
81 | {"condition": "cot17_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.52, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.82, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.33, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.51, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
86 | {"condition": "cot18_bin1", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 3.62, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
87 | {"condition": "cot18_bin2", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.14, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
88 | {"condition": "cot18_bin3", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.02, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
89 | {"condition": "cot18_bin4", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.25, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
90 | {"condition": "cot18_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 5.58, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
91 | {"condition": "cot19_bin1", "acc_inst": 0.12, "acc_demo": 0.0, "levdist": 3.78, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
92 | {"condition": "cot19_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.03, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
93 | {"condition": "cot19_bin3", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.57, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
94 | {"condition": "cot19_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.04, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0}
95 | {"condition": "cot19_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
96 | {"condition": "cot20_bin1", "acc_inst": 0.18, "acc_demo": 0.0, "levdist": 3.41, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
97 | {"condition": "cot20_bin2", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.18, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
98 | {"condition": "cot20_bin3", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.17, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
99 | {"condition": "cot20_bin4", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 6.57, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
100 | {"condition": "cot20_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.17, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
101 | {"condition": "cot21_bin1", "acc_inst": 0.3, "acc_demo": 0.0, "levdist": 2.9, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "cot21_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.89, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "cot21_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 2.84, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "cot21_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 3.14, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "cot21_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 3.62, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "cot22_bin1", "acc_inst": 0.31, "acc_demo": 0.0, "levdist": 2.7, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0}
107 | {"condition": "cot22_bin2", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.98, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0}
108 | {"condition": "cot22_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 3.64, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
109 | {"condition": "cot22_bin4", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 4.32, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "cot22_bin5", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.17, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
111 | {"condition": "cot23_bin1", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 2.8, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
112 | {"condition": "cot23_bin2", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 3.29, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
113 | {"condition": "cot23_bin3", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.83, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
114 | {"condition": "cot23_bin4", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 3.77, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
115 | {"condition": "cot23_bin5", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 3.75, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
116 | {"condition": "cot24_bin1", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 2.6, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
117 | {"condition": "cot24_bin2", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 3.19, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
118 | {"condition": "cot24_bin3", "acc_inst": 0.39, "acc_demo": 0.0, "levdist": 2.63, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
119 | {"condition": "cot24_bin4", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
120 | {"condition": "cot24_bin5", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 4.04, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
121 | {"condition": "cot25_bin1", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 1.89, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
122 | {"condition": "cot25_bin2", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 1.77, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
123 | {"condition": "cot25_bin3", "acc_inst": 0.42, "acc_demo": 0.0, "levdist": 2.74, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
124 | {"condition": "cot25_bin4", "acc_inst": 0.56, "acc_demo": 0.0, "levdist": 1.88, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
125 | {"condition": "cot25_bin5", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
126 |
--------------------------------------------------------------------------------
/logs/text_cot/gpt-4/results.jsonl:
--------------------------------------------------------------------------------
1 | {"condition": "cot1_bin1", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.4, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
2 | {"condition": "cot1_bin2", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.72, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
3 | {"condition": "cot1_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
4 | {"condition": "cot1_bin4", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.07, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
5 | {"condition": "cot1_bin5", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 1.91, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
6 | {"condition": "cot2_bin1", "acc_inst": 0.83, "acc_demo": 0.0, "levdist": 0.26, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
7 | {"condition": "cot2_bin2", "acc_inst": 0.71, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
8 | {"condition": "cot2_bin3", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.06, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
9 | {"condition": "cot2_bin4", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.0, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
10 | {"condition": "cot2_bin5", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 1.65, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
11 | {"condition": "cot3_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
12 | {"condition": "cot3_bin2", "acc_inst": 0.71, "acc_demo": 0.0, "levdist": 0.54, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
13 | {"condition": "cot3_bin3", "acc_inst": 0.48, "acc_demo": 0.0, "levdist": 1.06, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
14 | {"condition": "cot3_bin4", "acc_inst": 0.4, "acc_demo": 0.0, "levdist": 1.14, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
15 | {"condition": "cot3_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.58, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
16 | {"condition": "cot4_bin1", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.44, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
17 | {"condition": "cot4_bin2", "acc_inst": 0.66, "acc_demo": 0.0, "levdist": 0.74, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
18 | {"condition": "cot4_bin3", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
19 | {"condition": "cot4_bin4", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.0, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
20 | {"condition": "cot4_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.26, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
21 | {"condition": "cot5_bin1", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.5, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
22 | {"condition": "cot5_bin2", "acc_inst": 0.68, "acc_demo": 0.0, "levdist": 0.66, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
23 | {"condition": "cot5_bin3", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.32, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
24 | {"condition": "cot5_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.15, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
25 | {"condition": "cot5_bin5", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 1.58, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
26 | {"condition": "cot6_bin1", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.49, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
27 | {"condition": "cot6_bin2", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 0.46, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
28 | {"condition": "cot6_bin3", "acc_inst": 0.45, "acc_demo": 0.0, "levdist": 1.18, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
29 | {"condition": "cot6_bin4", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
30 | {"condition": "cot6_bin5", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 1.59, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
31 | {"condition": "cot7_bin1", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
32 | {"condition": "cot7_bin2", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 0.89, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
33 | {"condition": "cot7_bin3", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.61, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
34 | {"condition": "cot7_bin4", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 1.5, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
35 | {"condition": "cot7_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 1.91, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
36 | {"condition": "cot8_bin1", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
37 | {"condition": "cot8_bin2", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.63, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
38 | {"condition": "cot8_bin3", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
39 | {"condition": "cot8_bin4", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 0.98, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
40 | {"condition": "cot8_bin5", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 1.56, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
41 | {"condition": "cot9_bin1", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
42 | {"condition": "cot9_bin2", "acc_inst": 0.51, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
43 | {"condition": "cot9_bin3", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 1.68, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
44 | {"condition": "cot9_bin4", "acc_inst": 0.3, "acc_demo": 0.0, "levdist": 1.48, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
45 | {"condition": "cot9_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.68, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
46 | {"condition": "cot10_bin1", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.3, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
47 | {"condition": "cot10_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.25, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
48 | {"condition": "cot10_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 2.4, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
49 | {"condition": "cot10_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 2.7, "median_levdist": 3.0, "model": "gpt-4", "temp": 0.0}
50 | {"condition": "cot10_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 2.24, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
51 | {"condition": "cot11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.23, "median_levdist": 4.0, "model": "gpt-4", "temp": 0.0}
52 | {"condition": "cot11_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.3, "median_levdist": 4.5, "model": "gpt-4", "temp": 0.0}
53 | {"condition": "cot11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.24, "median_levdist": 4.0, "model": "gpt-4", "temp": 0.0}
54 | {"condition": "cot11_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.42, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
55 | {"condition": "cot11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.08, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
56 | {"condition": "cot12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.81, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
57 | {"condition": "cot12_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.86, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
58 | {"condition": "cot12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.8, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
59 | {"condition": "cot12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.86, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.93, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
61 | {"condition": "cot13_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.45, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
62 | {"condition": "cot13_bin2", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.66, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
63 | {"condition": "cot13_bin3", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.79, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
64 | {"condition": "cot13_bin4", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 0.75, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
65 | {"condition": "cot13_bin5", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 1.31, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
66 | {"condition": "cot14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.24, "median_levdist": 4.0, "model": "gpt-4", "temp": 0.0}
67 | {"condition": "cot14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.46, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
68 | {"condition": "cot14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.51, "median_levdist": 4.5, "model": "gpt-4", "temp": 0.0}
69 | {"condition": "cot14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.63, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.76, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
71 | {"condition": "cot15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.35, "median_levdist": 5.5, "model": "gpt-4", "temp": 0.0}
72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.28, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
73 | {"condition": "cot15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.47, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.59, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
76 | {"condition": "cot16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.39, "median_levdist": 5.5, "model": "gpt-4", "temp": 0.0}
77 | {"condition": "cot16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.33, "median_levdist": 5.5, "model": "gpt-4", "temp": 0.0}
78 | {"condition": "cot16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.3, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
79 | {"condition": "cot16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.49, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.96, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
81 | {"condition": "cot17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.43, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.5, "model": "gpt-4", "temp": 0.0}
83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.65, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.87, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.31, "median_levdist": 7.0, "model": "gpt-4", "temp": 0.0}
86 | {"condition": "cot18_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.03, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
87 | {"condition": "cot18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.0, "median_levdist": 5.0, "model": "gpt-4", "temp": 0.0}
88 | {"condition": "cot18_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.39, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
89 | {"condition": "cot18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
90 | {"condition": "cot18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
91 | {"condition": "cot19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
92 | {"condition": "cot19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
93 | {"condition": "cot19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
94 | {"condition": "cot19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.88, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
95 | {"condition": "cot19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.3, "median_levdist": 7.0, "model": "gpt-4", "temp": 0.0}
96 | {"condition": "cot20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.93, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
97 | {"condition": "cot20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.82, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
98 | {"condition": "cot20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.03, "median_levdist": 6.0, "model": "gpt-4", "temp": 0.0}
99 | {"condition": "cot20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.28, "median_levdist": 6.5, "model": "gpt-4", "temp": 0.0}
100 | {"condition": "cot20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.47, "median_levdist": 7.0, "model": "gpt-4", "temp": 0.0}
101 | {"condition": "cot21_bin1", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 2.84, "median_levdist": 3.0, "model": "gpt-4", "temp": 0.0}
102 | {"condition": "cot21_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.93, "median_levdist": 3.0, "model": "gpt-4", "temp": 0.0}
103 | {"condition": "cot21_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 3.0, "model": "gpt-4", "temp": 0.0}
104 | {"condition": "cot21_bin4", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 2.39, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
105 | {"condition": "cot21_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 2.28, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
106 | {"condition": "cot22_bin1", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 3.31, "median_levdist": 3.0, "model": "gpt-4", "temp": 0.0}
107 | {"condition": "cot22_bin2", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 2.86, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
108 | {"condition": "cot22_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 3.57, "median_levdist": 3.0, "model": "gpt-4", "temp": 0.0}
109 | {"condition": "cot22_bin4", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 3.0, "model": "gpt-4", "temp": 0.0}
110 | {"condition": "cot22_bin5", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 3.25, "median_levdist": 3.0, "model": "gpt-4", "temp": 0.0}
111 | {"condition": "cot23_bin1", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.39, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
112 | {"condition": "cot23_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.73, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
113 | {"condition": "cot23_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.36, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
114 | {"condition": "cot23_bin4", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 0.9, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
115 | {"condition": "cot23_bin5", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 1.38, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
116 | {"condition": "cot24_bin1", "acc_inst": 0.84, "acc_demo": 0.0, "levdist": 0.45, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
117 | {"condition": "cot24_bin2", "acc_inst": 0.75, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
118 | {"condition": "cot24_bin3", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 1.28, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
119 | {"condition": "cot24_bin4", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.13, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
120 | {"condition": "cot24_bin5", "acc_inst": 0.18, "acc_demo": 0.0, "levdist": 1.7, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
121 | {"condition": "cot25_bin1", "acc_inst": 0.81, "acc_demo": 0.0, "levdist": 0.37, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
122 | {"condition": "cot25_bin2", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.69, "median_levdist": 0.0, "model": "gpt-4", "temp": 0.0}
123 | {"condition": "cot25_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.22, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
124 | {"condition": "cot25_bin4", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.05, "median_levdist": 1.0, "model": "gpt-4", "temp": 0.0}
125 | {"condition": "cot25_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.54, "median_levdist": 2.0, "model": "gpt-4", "temp": 0.0}
126 |
--------------------------------------------------------------------------------
/logs/text_cot/llama3.1-405b/results.jsonl:
--------------------------------------------------------------------------------
1 | {"condition": "cot1_bin1", "acc_inst": 0.81, "acc_demo": 0.0, "levdist": 8.96, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
2 | {"condition": "cot1_bin2", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 15.86, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
3 | {"condition": "cot1_bin3", "acc_inst": 0.54, "acc_demo": 0.0, "levdist": 35.77, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
4 | {"condition": "cot1_bin4", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 54.97, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
5 | {"condition": "cot1_bin5", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 35.06, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
6 | {"condition": "cot2_bin1", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
7 | {"condition": "cot2_bin2", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 8.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
8 | {"condition": "cot2_bin3", "acc_inst": 0.61, "acc_demo": 0.0, "levdist": 8.78, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
9 | {"condition": "cot2_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 44.3, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
10 | {"condition": "cot2_bin5", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 31.37, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
11 | {"condition": "cot3_bin1", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.94, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
12 | {"condition": "cot3_bin2", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 2.64, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
13 | {"condition": "cot3_bin3", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 11.29, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
14 | {"condition": "cot3_bin4", "acc_inst": 0.41, "acc_demo": 0.0, "levdist": 85.19, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
15 | {"condition": "cot3_bin5", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 220.52, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
16 | {"condition": "cot4_bin1", "acc_inst": 0.48, "acc_demo": 0.0, "levdist": 14.4, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
17 | {"condition": "cot4_bin2", "acc_inst": 0.39, "acc_demo": 0.0, "levdist": 8.15, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
18 | {"condition": "cot4_bin3", "acc_inst": 0.31, "acc_demo": 0.0, "levdist": 37.04, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
19 | {"condition": "cot4_bin4", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 131.24, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
20 | {"condition": "cot4_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 350.72, "median_levdist": 475.5, "model": "llama3.1-405b", "temp": 0.0}
21 | {"condition": "cot5_bin1", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 8.51, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
22 | {"condition": "cot5_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 9.12, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
23 | {"condition": "cot5_bin3", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 45.34, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
24 | {"condition": "cot5_bin4", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 92.35, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
25 | {"condition": "cot5_bin5", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 316.79, "median_levdist": 448.5, "model": "llama3.1-405b", "temp": 0.0}
26 | {"condition": "cot6_bin1", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 27.74, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
27 | {"condition": "cot6_bin2", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 24.1, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
28 | {"condition": "cot6_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 54.37, "median_levdist": 2.5, "model": "llama3.1-405b", "temp": 0.0}
29 | {"condition": "cot6_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 120.19, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
30 | {"condition": "cot6_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 372.41, "median_levdist": 595.5, "model": "llama3.1-405b", "temp": 0.0}
31 | {"condition": "cot7_bin1", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 7.91, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
32 | {"condition": "cot7_bin2", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 29.64, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
33 | {"condition": "cot7_bin3", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 2.75, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
34 | {"condition": "cot7_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 28.98, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
35 | {"condition": "cot7_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 13.81, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
36 | {"condition": "cot8_bin1", "acc_inst": 0.12, "acc_demo": 0.0, "levdist": 2.44, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
37 | {"condition": "cot8_bin2", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 2.62, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
38 | {"condition": "cot8_bin3", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 2.83, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
39 | {"condition": "cot8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.69, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
40 | {"condition": "cot8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.82, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
41 | {"condition": "cot9_bin1", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 10.9, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0}
42 | {"condition": "cot9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.74, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
43 | {"condition": "cot9_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 17.15, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0}
44 | {"condition": "cot9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.09, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
45 | {"condition": "cot9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.38, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0}
46 | {"condition": "cot10_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.52, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0}
47 | {"condition": "cot10_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.79, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
48 | {"condition": "cot10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.97, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
49 | {"condition": "cot10_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 17.13, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
50 | {"condition": "cot10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
51 | {"condition": "cot11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.68, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
52 | {"condition": "cot11_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 20.52, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
53 | {"condition": "cot11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 18.2, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
54 | {"condition": "cot11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
55 | {"condition": "cot11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.82, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
56 | {"condition": "cot12_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 9.84, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
57 | {"condition": "cot12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 23.49, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
58 | {"condition": "cot12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 22.01, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
59 | {"condition": "cot12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 43.66, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 19.28, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
61 | {"condition": "cot13_bin1", "acc_inst": 0.86, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
62 | {"condition": "cot13_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
63 | {"condition": "cot13_bin3", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
64 | {"condition": "cot13_bin4", "acc_inst": 0.61, "acc_demo": 0.0, "levdist": 0.9, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
65 | {"condition": "cot13_bin5", "acc_inst": 0.53, "acc_demo": 0.0, "levdist": 0.92, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
66 | {"condition": "cot14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.59, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
67 | {"condition": "cot14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.8, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
68 | {"condition": "cot14_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.79, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
69 | {"condition": "cot14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 17.9, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.55, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
71 | {"condition": "cot15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 7.26, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 19.1, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
73 | {"condition": "cot15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.02, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.36, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0}
76 | {"condition": "cot16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.69, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
77 | {"condition": "cot16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.13, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
78 | {"condition": "cot16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.48, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
79 | {"condition": "cot16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.61, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.79, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
81 | {"condition": "cot17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.03, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.63, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.56, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.25, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
86 | {"condition": "cot18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.63, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
87 | {"condition": "cot18_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.78, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
88 | {"condition": "cot18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.92, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
89 | {"condition": "cot18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.17, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
90 | {"condition": "cot18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.73, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
91 | {"condition": "cot19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 15.24, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
92 | {"condition": "cot19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.81, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
93 | {"condition": "cot19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.89, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
94 | {"condition": "cot19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.31, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
95 | {"condition": "cot19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 23.55, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
96 | {"condition": "cot20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 13.5, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
97 | {"condition": "cot20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.76, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
98 | {"condition": "cot20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.66, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
99 | {"condition": "cot20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.41, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
100 | {"condition": "cot20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.48, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0}
101 | {"condition": "cot21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.68, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
102 | {"condition": "cot21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.91, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
103 | {"condition": "cot21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.87, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
104 | {"condition": "cot21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.84, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
105 | {"condition": "cot21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 13.04, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0}
106 | {"condition": "cot22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.33, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
107 | {"condition": "cot22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.48, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
108 | {"condition": "cot22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.38, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
109 | {"condition": "cot22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
110 | {"condition": "cot22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.24, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
111 | {"condition": "cot23_bin1", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.68, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
112 | {"condition": "cot23_bin2", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 2.55, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
113 | {"condition": "cot23_bin3", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 1.09, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
114 | {"condition": "cot23_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
115 | {"condition": "cot23_bin5", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
116 | {"condition": "cot24_bin1", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 1.5, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
117 | {"condition": "cot24_bin2", "acc_inst": 0.26, "acc_demo": 0.0, "levdist": 2.35, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
118 | {"condition": "cot24_bin3", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.98, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
119 | {"condition": "cot24_bin4", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 2.31, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
120 | {"condition": "cot24_bin5", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.45, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
121 | {"condition": "cot25_bin1", "acc_inst": 0.83, "acc_demo": 0.0, "levdist": 0.5, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
122 | {"condition": "cot25_bin2", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
123 | {"condition": "cot25_bin3", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
124 | {"condition": "cot25_bin4", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.63, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
125 | {"condition": "cot25_bin5", "acc_inst": 0.56, "acc_demo": 0.0, "levdist": 2.45, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
126 |
--------------------------------------------------------------------------------
/models/openai_help.py:
--------------------------------------------------------------------------------
1 | import os
2 | import openai
3 | import random
4 | import aiolimiter
5 | from aiohttp import ClientSession
6 | import asyncio
7 | import logging
8 | from typing import Any, List, Dict, Union
9 | from tqdm.asyncio import tqdm_asyncio
10 |
# Cumulative token-usage counters keyed by model name. Updated in chatgpts()
# after each batch and read by gpt_usage() to estimate API cost in USD.
completion_tokens = {"gpt-4": 0, "gpt-3.5-turbo": 0, "gpt-4-0613": 0, "gpt-3.5-turbo-0613": 0}
prompt_tokens = {"gpt-4": 0, "gpt-3.5-turbo": 0, "gpt-4-0613": 0, "gpt-3.5-turbo-0613": 0}
13 |
async def _throttled_openai_chat_completion_acreate(
    model: str,
    messages: List[Dict[str, str]],
    temperature: float,
    max_tokens: int,
    top_p: float,
    stop: Union[str, List[str]],
    limiter: aiolimiter.AsyncLimiter,
) -> Dict[str, Any]:
    """Call openai.ChatCompletion.acreate under a shared rate limiter, retrying on failure.

    Args:
        model: OpenAI model name.
        messages: chat messages for a single request.
        temperature / max_tokens / top_p / stop: passed straight to the API.
        limiter: rate limiter shared across the whole batch of requests.

    Returns:
        The raw API response dict, or a response-shaped dict with empty
        content if every retry fails.
    """
    # Bug fix: the original looped over range(10000000000) — effectively
    # forever — so the fallback return below was unreachable and a persistent
    # API outage would hang the batch. Bound the retries instead.
    max_retries = 100
    retry_sleep_seconds = 20  # original logged "10 seconds" but slept 20
    async with limiter:
        for _ in range(max_retries):
            try:
                return await openai.ChatCompletion.acreate(
                    model=model,
                    messages=messages,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    top_p=top_p,
                    stop=stop,
                )
            except openai.error.OpenAIError:
                # Most commonly a rate-limit error; back off before retrying.
                logging.warning(
                    "OpenAI API error. Sleeping for %d seconds.", retry_sleep_seconds
                )
                await asyncio.sleep(retry_sleep_seconds)
            except asyncio.exceptions.TimeoutError:
                logging.warning(
                    "OpenAI API timeout. Sleeping for %d seconds.", retry_sleep_seconds
                )
                await asyncio.sleep(retry_sleep_seconds)
    # All retries exhausted: return an empty, response-shaped placeholder so
    # downstream indexing (choices[0].message.content) still works.
    return {"choices": [{"message": {"content": ""}}]}
43 |
44 |
async def generate_from_openai_chat_completion(
    messages_list: List[Dict[str, str]],
    model: str,
    temperature: float,
    max_tokens: int,
    top_p: float,
    stop: Union[str, List[str]],
    requests_per_minute: int = 300,
) -> List[Dict[str, Any]]:
    """Fan out a batch of chat requests to the OpenAI API with rate limiting.

    Args:
        messages_list: one chat-message list per request.
        model: OpenAI model name; "gpt-4" is throttled harder (200 rpm).
        temperature / max_tokens / top_p / stop: passed through to the API.
        requests_per_minute: shared rate limit for the whole batch.

    Returns:
        Raw API response dicts in the same order as messages_list.
        (Annotation fixed: the function returns dicts, not strings —
        chatgpts() extracts the text downstream.)

    Raises:
        ValueError: if OPENAI_API_KEY is not set in the environment.
    """
    if model == "gpt-4":
        requests_per_minute = 200
    if "OPENAI_API_KEY" not in os.environ:
        raise ValueError(
            "OPENAI_API_KEY environment variable must be set when using OpenAI API."
        )
    # Security fix: the API key was previously print()ed here, leaking the
    # secret into console output and logs.
    openai.api_key = os.environ["OPENAI_API_KEY"]
    session = ClientSession()
    openai.aiosession.set(session)
    limiter = aiolimiter.AsyncLimiter(requests_per_minute)
    async_responses = [
        _throttled_openai_chat_completion_acreate(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            stop=stop,
            limiter=limiter,
        )
        for messages in messages_list
    ]
    try:
        responses = await tqdm_asyncio.gather(*async_responses)
    finally:
        # Always close the HTTP session, even if gather() raises,
        # so the aiohttp connector is not leaked.
        await session.close()
    return responses
81 |
82 |
def gpt(prompt, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop=None) -> list:
    """Query the chat model with one prompt, sampled `n` times; returns the texts."""
    repeated = [prompt] * n
    return gpts(repeated, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop)
85 |
def gpts(prompts, model="gpt-4", temperature=0.7, max_tokens=1000, stop=None) -> list:
    """Wrap each plain-text prompt as a single user message and run the batch."""
    print(f"Model: {model}, temperature: {temperature}, max_tokens: {max_tokens}")
    conversations = []
    for prompt in prompts:
        conversations.append([{"role": "user", "content": prompt}])
    return chatgpts(conversations, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop)
90 |
def chatgpt(messages, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop=None) -> list:
    """Run the same chat conversation `n` times; returns the response texts."""
    batch = [messages for _ in range(n)]
    return chatgpts(batch, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop)
93 |
def chatgpts(messages_list, model="gpt-4", temperature=0.7, max_tokens=1000, stop=None) -> list:
    """Run a batch of chat conversations and return the response texts.

    Side effect: accumulates per-model token usage into the module-level
    completion_tokens / prompt_tokens counters (consumed by gpt_usage()).
    """
    responses = asyncio.run(generate_from_openai_chat_completion(
        model=model, messages_list=messages_list, temperature=temperature,
        max_tokens=max_tokens, top_p=1, stop=stop))
    texts = [x["choices"][0]["message"]["content"] for x in responses]
    # Bug fix: a model name missing from the hard-coded counter dicts
    # (e.g. a newer snapshot) previously raised KeyError below; register
    # unseen models lazily instead. (No `global` needed: the dicts are
    # mutated in place, not rebound.)
    completion_tokens.setdefault(model, 0)
    prompt_tokens.setdefault(model, 0)
    completion_tokens[model] += sum(x["usage"]["completion_tokens"] for x in responses if "usage" in x and "completion_tokens" in x["usage"])
    prompt_tokens[model] += sum(x["usage"]["prompt_tokens"] for x in responses if "usage" in x and "prompt_tokens" in x["usage"])
    return texts
102 |
def gpt_usage():
    """Report cumulative token counts plus an estimated USD cost.

    Pricing used: gpt-4 at $0.06/1K completion and $0.03/1K prompt tokens;
    gpt-3.5-turbo at a flat $0.0002/1K tokens in either direction.
    """
    gpt4_usd = completion_tokens["gpt-4"] / 1000 * 0.06 + prompt_tokens["gpt-4"] / 1000 * 0.03
    gpt35_usd = (completion_tokens["gpt-3.5-turbo"] + prompt_tokens["gpt-3.5-turbo"]) / 1000 * 0.0002
    return {
        "completion_tokens": completion_tokens,
        "prompt_tokens": prompt_tokens,
        "cost": gpt4_usd + gpt35_usd,
    }
108 |
--------------------------------------------------------------------------------
/regression/README.md:
--------------------------------------------------------------------------------
1 | # Logistic Regression
2 |
- `text_cot_train_table.tsv` - train table statistics where `correct` indicates whether GPT-4 solved the example correctly. A logistic regression model is fitted on this data in `regression.ipynb`. Obtained by running [eval.py](https://github.com/aksh555/deciphering_cot/eval.py) and `create_train_table.py`
4 | - `text_cot_test_table.tsv` - test table statistics
5 | - `text_cot_test_table_results.tsv` - test table statistics with predictions from the LR model.
--------------------------------------------------------------------------------
/regression/create_train_table.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
4 | import tiktoken
5 | import logging
6 | import json
7 | import pandas as pd
8 |
# Log progress to stdout and to a file so long scoring runs can be monitored.
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")])

# Prefer the GPU when available; GPT-2-XL scoring below is batched over
# thousands of sentences.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# GPT-2-XL scores word log-probabilities; the cl100k_base (GPT-4) tokenizer
# is only used to count tokens per input word.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device)
gpt4_enc = tiktoken.get_encoding("cl100k_base")
19 |
def pad_batch(batch, pad_idx):
    """Right-pad every token sequence in `batch` to the batch's max length.

    Args:
        batch: list of token-id lists, possibly of differing lengths.
        pad_idx: token id used as padding (e.g. 50256, GPT-2's EOS).

    Returns:
        A new list of lists, each padded with `pad_idx` to the length of the
        longest sequence. An empty batch yields [].
    """
    # Idiom: built-in max() and a comprehension replace the manual
    # max-tracking loop and append loop of the original.
    max_length = max((len(seq) for seq in batch), default=0)
    return [seq + [pad_idx] * (max_length - len(seq)) for seq in batch]
32 |
# Score sentences with GPT-2 (the returned value is a summed token
# log-likelihood per sentence, not a perplexity, despite the original comment).
def prob_gpt2(sentence_list):

    # Tokenize the sentences
    all_tokens = []
    for sentence in sentence_list:
        tokens = gpt2_tokenizer.encode(sentence)
        all_tokens.append(tokens)
    # Pad with GPT-2's EOS id (50256) so the batch is rectangular; these
    # positions are excluded from the loss via ignore_index below.
    tokens = pad_batch(all_tokens, 50256)

    targets = tokens[:]

    # Compute average log likelihood for the generation
    input_ids = torch.LongTensor(tokens).to(device)
    target_ids = torch.LongTensor(targets).to(device)

    with torch.no_grad():
        outputs = gpt2_model(input_ids, labels=target_ids)
        logits = outputs[1]
        # Shift for next-token prediction: drop the last logit position and
        # the first target position.
        logits = logits.transpose(0,1)[:-1].transpose(0,1)
        target_ids = target_ids.transpose(0,1)[1:].transpose(0,1)
        # Per-token cross entropy over the 50257-entry GPT-2 vocab, summed per
        # sentence. NOTE(review): ignore_index=50256 also ignores a genuine
        # EOS token if one ever appears in the text — confirm acceptable.
        loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1))
        loss = loss.reshape(target_ids.shape).sum(dim=1)
        # Negating the summed NLL yields the (negative-valued) log likelihood;
        # the variable name is misleading but kept for byte-compatibility.
        neg_log_likelihood = -1*loss

    # 13.357776641845703 = logprob('The word is"'); removing this to just get
    # the word prob
    # NOTE(review): adding the prefix's NLL is meant to cancel the prefix
    # tokens' contribution — confirm the constant matches this exact
    # checkpoint and tokenization.
    return neg_log_likelihood + 13.357776641845703
61 |
# Build the regression feature table: read the train TSV, attach per-word
# GPT-4 token counts and GPT-2 log-probabilities, and write the table back.
df = pd.read_csv("text_cot_train_table.tsv",sep="\t")
word_list = df["input"].to_list()
print("Rows", len(word_list))

words_with_prob = []
this_batch_sentences = []
this_batch_words = []
num_tokens = []
for index, line in enumerate(word_list):
    if index % 10000 == 0:
        logging.info(str(index))

    word = line.strip()

    # Token count under the GPT-4 tokenizer (cl100k_base).
    tokens = gpt4_enc.encode(word)
    # NOTE(review): tokens_spaced is computed but never used.
    tokens_spaced = gpt4_enc.encode(" " + word)

    this_batch_sentences.append('The word is "' + word + '"')
    this_batch_words.append(word)
    num_tokens.append(len(tokens))

    # Score in batches of 3000 sentences to bound GPU memory.
    if len(this_batch_sentences) == 3000:
        logprobs = prob_gpt2(this_batch_sentences)
        for word, logprob in zip(this_batch_words, logprobs):
            words_with_prob.append(logprob.item())
        this_batch_sentences = []
        this_batch_words = []

# Flush the final partial batch.
if len(this_batch_sentences) > 0:
    logprobs = prob_gpt2(this_batch_sentences)
    for word, logprob in zip(this_batch_words, logprobs):
        words_with_prob.append(logprob.item())
    this_batch_sentences = []
    this_batch_words = []

df["input_logprob"] = words_with_prob
df["input_ntokens"] = num_tokens

# Keep only the numeric feature columns for the regression.
# NOTE(review): this overwrites the input TSV in place, and the positional
# "\t" argument to to_csv relies on it binding to `sep` — keyword-only in
# newer pandas; confirm the pinned pandas version accepts it.
df.drop(["pred","gt","input"], axis=1, inplace=True)
df = df[['input_ntokens', 'input_logprob', 'output_logprob', 'shift_level', 'shift_freq', 'bin']]
df.to_csv("./text_cot_train_table.tsv", "\t",index_label="index")
103 |
104 |
105 |
106 |
107 |
108 |
109 |
--------------------------------------------------------------------------------
/run_claude3.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import json
3 | import argparse
4 | from tqdm import tqdm
5 | import os
6 | import anthropic
7 | import time
# Silence library logging below ERROR so tqdm output stays clean.
logging.getLogger().setLevel(logging.ERROR)

# Anthropic client; presumably reads ANTHROPIC_API_KEY from the environment
# (the SDK default) — no key is passed explicitly here.
client = anthropic.Anthropic()
11 |
12 |
def claude_responses(prompt_list, model="claude-3-opus-20240229", max_tokens=1000, temperature=0.0):
    """Query the Anthropic Messages API once per prompt and return the texts.

    Each prompt is retried up to 10 times, sleeping 60 seconds after a
    failure; a prompt that never succeeds yields an empty string so the
    output list stays aligned with `prompt_list`.

    Args:
        prompt_list: list of user-prompt strings.
        model: Anthropic model identifier.
        max_tokens: per-response generation cap.
        temperature: sampling temperature.

    Returns:
        List of response strings, one per prompt, in order.
    """
    responses = []
    for prompt in tqdm(prompt_list):
        output = None
        for _ in range(10):
            try:
                completion = client.messages.create(
                    model=model,
                    max_tokens=max_tokens,
                    temperature=temperature,
                    system="Provide only your answer, without any explanation.",
                    messages=[{"role":"user", "content": prompt}]
                )

                output = completion.content[0].text
                if output is None:
                    output = ""
            except Exception:
                # Bug fix: was a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit and hid all errors silently.
                logging.exception("Anthropic API call failed; retrying in 60 seconds.")
                time.sleep(60)

            if not (output is None):
                break

        if output is None:
            responses.append("")
        else:
            responses.append(output)
    return responses
41 |
42 |
43 |
44 |
def solve_file(name, model, temperature, max_tokens, prompt_type):
    """Run the model on one stimuli file and write responses plus a rough accuracy.

    Reads stimuli/{prompt_type}/{name}.jsonl, queries the model on every
    example, and dumps prompts/ground-truths/responses/accuracies to
    logs/{prompt_type}/{model}/{name}.json.

    Args:
        name: stimuli file stem, e.g. "cot1_bin1".
        model: API model identifier.
        temperature: sampling temperature forwarded to the API.
        max_tokens: per-response generation cap.
        prompt_type: stimuli subfolder (standard, text_cot, ...).

    Returns:
        The result dict, or None if the stimuli file does not exist.
    """
    file = f'stimuli/{prompt_type}/{name}.jsonl'
    if not os.path.exists(file):
        print(f'File {file} does not exist')
        return None
    with open(file, 'r') as f:
        lines = f.readlines()
    lines = [json.loads(line) for line in lines]
    prompts = [line['instruction_plus_input'] for line in lines]
    gts = [line['correct_output'] for line in lines]
    # Bug fix: temperature was hard-coded to 0.0 here, silently ignoring the
    # `temperature` parameter passed by the caller.
    res = claude_responses(prompts, model=model, temperature=temperature, max_tokens=max_tokens)

    # These accs are not what we use in the paper - they're just for quick estimates.
    # The stats used in the paper are computed in the evaluation/ folder
    accs = [(gt.replace('"', "") in r.replace('"', "")) for r, gt in zip(res, gts)]
    acc = sum(accs) / len(accs)
    print(f'Accuracy: {acc}')

    d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc}

    fo_directory = f'logs/{prompt_type}/{model}'
    # exist_ok=True already makes the prior os.path.exists() check redundant.
    os.makedirs(fo_directory, exist_ok=True)

    output_file = f'{fo_directory}/{name}.json'
    with open(output_file, 'w') as f:
        json.dump(d, f)

    return d
74 |
75 |
def parse_args():
    """Parse command-line options for the Claude runner."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--tasks', type=str, required=True, help='split by comma')
    parser.add_argument('--conditions', type=str, required=True, help='split by comma')
    parser.add_argument('--model', type=str, required=True, choices=['claude-3'])
    parser.add_argument('--max_tokens', type=int, help='default = 1000', default=1000)
    parser.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot")
    return parser.parse_args()
85 |
if __name__ == '__main__':
    # Driver: run every task x condition pair and print quick-look accuracy.
    args = parse_args()
    tasks = args.tasks.split(',')
    conditions = args.conditions.split(',')
    model = args.model
    prompt_type = args.prompt_type

    # Map the CLI alias to the provider's full model identifier.
    if model == "claude-3":
        model = "claude-3-opus-20240229"
    max_tokens = args.max_tokens

    for task in tasks:
        for condition in conditions:
            # Stimulus files are named {task}_{condition}.jsonl.
            name = f'{task}_{condition}'
            d = solve_file(name, model=model, temperature=0.0, max_tokens=max_tokens, prompt_type=prompt_type)
            if d is not None:
                print(f'{name}, {model}: {d["acc"]:.2f}')
103 |
104 |
--------------------------------------------------------------------------------
/run_llama3.py:
--------------------------------------------------------------------------------
1 | # python run_llama3.py --tasks cot1 --conditions bin1 --max_tokens 200 --model llama-3.1-405b
2 |
3 | import logging
4 | import json
5 | import argparse
6 | from tqdm import tqdm
7 | import os
8 | import together
9 | import time
# Suppress library log output below ERROR level.
logging.getLogger().setLevel(logging.ERROR)

# Shared Together API client used by llama_responses below.
client = together.Together()
13 |
def process_prompt(prompt):
    """Wrap a raw prompt in the Llama-3 chat template (user turn, then assistant header)."""
    prefix = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
    suffix = "\n<|start_header_id|>assistant<|end_header_id|>"
    return f"{prefix}{prompt}{suffix}"
17 |
18 |
def llama_responses(prompt_list, model="llama-3-70b-chat-hf", max_tokens=1000, temperature=0.0):
    """Query the Together API for each prompt and return the list of responses.

    Each prompt is retried up to 10 times, sleeping 1s after a failure.
    "chat" models use the chat endpoint; base models use plain completions.
    A prompt that never succeeds yields an empty string so the output list
    stays aligned with ``prompt_list``.
    """
    responses = []
    for prompt in tqdm(prompt_list):
        prompt = process_prompt(prompt)
        output = None
        for _ in range(10):
            try:
                if "chat" in model:
                    output = client.chat.completions.create(
                        messages = [{"role": "user", "content": prompt}],
                        model = "meta-llama/" + model,
                        max_tokens = max_tokens,
                        temperature = temperature,
                    )
                else:
                    output = client.completions.create(
                        prompt=prompt,
                        model = "meta-llama/" + model,
                        max_tokens = max_tokens,
                        temperature = temperature,
                    )
            except Exception:
                # Narrowed from a bare `except:` so Ctrl-C still interrupts.
                time.sleep(1)

            if output is not None:
                break

        if output is None:
            # Bug fix: previously a prompt that failed all 10 retries crashed
            # with AttributeError on `output.choices`; record "" instead.
            responses.append("")
        elif "chat" in model:
            responses.append(output.choices[0].message.content)
        else:
            responses.append(output.choices[0].text)
    return responses
50 |
51 |
def solve_file(name, model, temperature, max_tokens, prompt_type):
    """Run a Llama model on one stimulus file and log results.

    Reads ``stimuli/{prompt_type}/{name}.jsonl``, queries the model, writes a
    JSON log to ``logs/{prompt_type}/{model}/{name}.json``, and returns the
    result dict — or None if the stimulus file is missing.
    """
    file = f'stimuli/{prompt_type}/{name}.jsonl'
    print(f"Loading {file}")
    if not os.path.exists(file):
        print(f'File {file} does not exist')
        return None
    with open(file, 'r') as f:
        lines = f.readlines()
    lines = [json.loads(line) for line in lines]
    prompts = [line['instruction_plus_input'] for line in lines]
    gts = [line['correct_output'] for line in lines]
    # Bug fix: temperature was previously hard-coded to 0.0 here, silently
    # ignoring the caller-supplied value.
    res = llama_responses(prompts, model=model, temperature=temperature, max_tokens=max_tokens)

    # These accs are not what we use in the paper - they're just for quick estimates.
    # The stats used in the paper are computed in the evaluation/ folder
    accs = [(gt.replace('"', '') in r.replace('"', '')) for r, gt in zip(res, gts)]
    acc = sum(accs) / len(accs)
    print(f"Done {name}")
    print(f'Accuracy: {acc}')

    d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc}

    # exist_ok=True makes a separate exists() check redundant.
    fo_directory = f'logs/{prompt_type}/{model}'
    os.makedirs(fo_directory, exist_ok=True)

    output_file = f'{fo_directory}/{name}.json'
    with open(output_file, 'w') as f:
        json.dump(d, f)
    return d
82 |
83 |
def parse_args():
    """Parse command-line options for the Llama runner.

    Bug fix: 'llama3.1-405b' was missing from --model choices even though it
    is the declared default and has an alias mapping in __main__, so the
    405B model could never actually be selected. The old (unmapped)
    'llama3-405b' spelling is kept for backward compatibility.
    """
    args = argparse.ArgumentParser()
    args.add_argument('--tasks', type=str, required=True, help='split by comma')
    args.add_argument('--conditions', type=str, required=True, help='split by comma')
    args.add_argument('--model', type=str, required=True, choices=['llama-3-70b-chat', 'llama-3-70b', 'llama3-405b', 'llama3.1-405b', 'llama3.1-70b'], default='llama3.1-405b')
    args.add_argument('--max_tokens', type=int, help='default = 1000', default=1000)
    args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot")
    args = args.parse_args()
    return args
93 |
if __name__ == '__main__':
    # Driver: run every task x condition pair and print quick-look accuracy.
    args = parse_args()
    tasks = args.tasks.split(',')
    conditions = args.conditions.split(',')
    model = args.model
    prompt_type = args.prompt_type
    # Map CLI aliases to the provider's full model identifiers.
    if model == 'llama-3-70b-chat':
        model = 'llama-3-70b-chat-hf'
    elif model == 'llama-3-70b':
        model = 'meta-llama-3-70b'
    elif model == 'llama3.1-405b':
        model = 'Meta-Llama-3.1-405B-Instruct-Turbo'
    elif model == 'llama3.1-70b':
        model = 'Meta-Llama-3.1-70B-Instruct-Turbo'
    max_tokens = args.max_tokens

    for task in tasks:
        for condition in conditions:
            # Stimulus files are named {task}_{condition}.jsonl.
            name = f'{task}_{condition}'
            d = solve_file(name, model=model, temperature=0.0, max_tokens=max_tokens, prompt_type=prompt_type)
            if d is not None:
                print(f'{name}, {model}: {d["acc"]:.2f}')
116 |
117 |
--------------------------------------------------------------------------------
/run_o1.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import json
3 | import argparse
4 | from tqdm import tqdm
5 | import os
# INFO-level logging (unlike the other runners, which suppress below ERROR).
logging.getLogger().setLevel(logging.INFO)
from openai import OpenAI,BadRequestError
# Shared OpenAI client used by o1_responses below.
client = OpenAI()
9 |
def o1_responses(prompt_list):
    """Query o1-preview for each prompt.

    Returns ``(responses, completion_tokens)``, both aligned with
    ``prompt_list``. Prompts rejected by OpenAI's filter are recorded as
    "BLOCKED_BY_OPENAI"; any other failure is recorded as "ERROR". Both
    failure cases record 0 reasoning tokens.
    """
    responses = []
    completion_tokens = []
    for prompt in tqdm(prompt_list):
        try:
            response = client.chat.completions.create(
                model="o1-preview",
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ]
            )
            responses.append(response.choices[0].message.content)
            # NOTE(review): recent openai SDKs expose this as an attribute
            # (.reasoning_tokens) rather than a mapping — confirm the pinned
            # SDK version supports subscripting here.
            completion_tokens.append(response.usage.completion_tokens_details["reasoning_tokens"])
        except BadRequestError:
            responses.append("BLOCKED_BY_OPENAI")
            completion_tokens.append(0)
        except Exception as e:
            print(e)
            # Bug fix: the "ERROR" marker was previously never appended, so
            # responses/completion_tokens fell out of alignment with prompts.
            responses.append("ERROR")
            completion_tokens.append(0)

    return responses, completion_tokens
35 |
def solve_file(name, model):
    """Run o1-preview on one stimulus file (first 50 items) and log results.

    o1 does not require CoT prompts, so the 'standard' stimuli are used.
    Returns the result dict, or None if the stimulus file is missing.
    """
    file = f'stimuli/standard/{name}.jsonl'
    if not os.path.exists(file):
        print(f'File {file} does not exist')
        return None
    with open(file, 'r') as f:
        lines = f.readlines()
    lines = [json.loads(line) for line in lines]
    print(file)
    # Only the first 50 items are run.
    prompts = [line['instruction_plus_input'] for line in lines][:50]
    gts = [line['correct_output'] for line in lines][:50]

    res, completion_tokens = o1_responses(prompts)
    mean_tokens = sum(completion_tokens)/len(completion_tokens)

    # These accs are not what we use in the paper - they're just for quick estimates.
    # The stats used in the paper are computed in the evaluation/ folder
    accs = [(gt.replace('"', "") in r.replace('"', "")) for r, gt in zip(res, gts)]
    acc = sum(accs) / len(accs)
    print("Completion tokens", mean_tokens)

    d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc, 'mean_completion_tokens':mean_tokens}

    # Bug fix: the log path previously omitted {name}, so every task/condition
    # overwrote the single file logs/standard/{model} (which is the *directory*
    # the other runners create). Write one file per task like the other scripts.
    fo_directory = f'logs/standard/{model}'
    os.makedirs(fo_directory, exist_ok=True)
    output_file = f'{fo_directory}/{name}.json'
    with open(output_file, 'w') as f:
        json.dump(d, f)

    return d
65 |
66 |
def parse_args():
    """Parse command-line options for the o1 runner."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--tasks', type=str, required=True, help='split by comma')
    parser.add_argument('--conditions', type=str, required=True, help='split by comma')
    parser.add_argument('--model', type=str, default='o1-preview-2024-09-12')
    return parser.parse_args()
75 |
if __name__ == '__main__':
    # Driver: run every task x condition pair and print quick-look stats.
    args = parse_args()
    tasks = args.tasks.split(',')
    conditions = args.conditions.split(',')
    model = args.model

    for task in tasks:
        for condition in conditions:
            # Stimulus files are named {task}_{condition}.jsonl.
            name = f'{task}_{condition}'
            d = solve_file(name, model=model)
            if d is not None:
                print(f'{name}, {model}: {d["acc"]:.2f}')
                print("Completion tokens", d["mean_completion_tokens"])
89 |
90 |
--------------------------------------------------------------------------------
/run_openai.py:
--------------------------------------------------------------------------------
1 | from models.openai_help import gpts
2 | import logging
3 | import json
4 | import argparse
5 | import os
6 |
7 | logging.getLogger().setLevel(logging.WARNING)
8 |
def edit_distance(s1: str, s2: str) -> int:
    """Compute the Levenshtein distance between two strings."""
    # Keep the shorter string as s2 so the DP row stays small.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        return len(s1)
    prev = list(range(len(s2) + 1))
    for i, ch1 in enumerate(s1):
        curr = [i + 1]
        for j, ch2 in enumerate(s2):
            cost_insert = prev[j + 1] + 1
            cost_delete = curr[j] + 1
            cost_substitute = prev[j] + (ch1 != ch2)
            curr.append(min(cost_insert, cost_delete, cost_substitute))
        prev = curr
    return prev[-1]
25 |
26 |
def solve_file(name, model, temperature, max_tokens, prompt_type):
    """Run an OpenAI model on one stimulus file and log results.

    Reads ``stimuli/{prompt_type}/{name}.jsonl``, queries the model via
    ``gpts()``, and writes a JSON log (exact-match accuracy plus mean edit
    distance) to ``logs/{prompt_type}/{model}/{name}.json``. Returns the
    result dict, or None when the stimulus file is missing.
    """
    file = f'stimuli/{prompt_type}/{name}.jsonl'
    print(f"Loading {file}")
    if not os.path.exists(file):
        print(f'File {file} does not exist')
        return None

    with open(file, 'r') as f:
        raw_lines = f.readlines()
    records = [json.loads(raw) for raw in raw_lines]
    prompts = [rec['instruction_plus_input'] for rec in records]
    # Ground truths are compared quoted, matching how the model is prompted.
    gts = ['"' + rec['correct_output'] + '"' for rec in records]

    res = gpts(prompts, model=model, temperature=temperature, max_tokens=max_tokens)

    accs = [(r == gt) for r, gt in zip(res, gts)]
    eds = [edit_distance(r, gt) for r, gt in zip(res, gts)]
    acc = sum(accs) / len(accs)
    ed = sum(eds) / len(eds)
    print(f"Done {name}")
    d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc, 'eds': eds, 'ed': ed}

    fo_directory = f'logs/{prompt_type}/{model}'
    if not os.path.exists(fo_directory):
        os.makedirs(fo_directory, exist_ok=True)

    output_file = f'{fo_directory}/{name}.json'
    with open(output_file, 'w') as f:
        json.dump(d, f)

    return d
55 |
56 |
def parse_args():
    """Parse command-line options for the OpenAI runner."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--tasks', type=str, required=True, help='split by comma')
    parser.add_argument('--conditions', type=str, required=True, help='split by comma')
    parser.add_argument('--model', type=str, default='gpt-4-0613')
    parser.add_argument('--max_tokens', type=int, help='default = 200', default=200)
    parser.add_argument('--temperature', type=float, help='default = 0.0', default=0.0)
    parser.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot")
    return parser.parse_args()
67 |
if __name__ == '__main__':
    # Driver: run every task x condition pair and print quick-look stats.
    args = parse_args()
    tasks = args.tasks.split(',')
    conditions = args.conditions.split(',')
    model = args.model
    max_tokens = args.max_tokens
    temperature = args.temperature
    prompt_type = args.prompt_type

    for task in tasks:
        for condition in conditions:
            # Stimulus files are named {task}_{condition}.jsonl.
            name = f'{task}_{condition}'
            d = solve_file(name, model=model, temperature=temperature, max_tokens=max_tokens, prompt_type=prompt_type)
            if d is not None:
                # Exact-match accuracy (mean edit distance in parentheses).
                print(f'{name}, {model}: {d["acc"]:.2f} ({d["ed"]:.2f})')
83 |
84 |
--------------------------------------------------------------------------------
/seven_letter_words/README.md:
--------------------------------------------------------------------------------
1 | ## Dataset
2 | 1. First, run `python random_token_combos.py`. This generates `random_pairs_lower.txt`, which lists all words that fulfill the following criteria:
3 | - 7 letters long
4 | - 2 subword tokens long (using the tokenizer that both GPT-3.5 and GPT-4 use; it needs to be 2 tokens long whether the word follows a space or not)
5 | - The first subword token is 3 letters long, and the second is 4 letters long (again, these lengths need to be identical whether the word follows a space or not).
6 | 2. Then, sort these words by the probability assigned to them by GPT-2 by running `python gpt2_prob_sevenletter.py`. This generates `random_pairs_lower_scored.txt`, which lists each word along with a log probability. The log probability is computed as the log probability that GPT-2 assigns to the sentence `The word is "WORD"`, minus the log probability that it assigns to `The word is "`; thus, this yields the log probability assigned to just the word and the following quotation mark in the context of `The word is "`. The closing quotation mark is included because it serves to indicate the end of the word.
7 | 3. Then, bin the words by running `python select_words.py` to create `words_5bins.txt`.
8 | 4. The final list of words can be found in `bin1_prob.txt`, `bin2_prob.txt`, `bin3_prob.txt`, `bin4_prob.txt`, and `bin5_prob.txt`.
9 |
--------------------------------------------------------------------------------
/seven_letter_words/bin1_prob.txt:
--------------------------------------------------------------------------------
1 | choosed -14.997272491455078
2 | colbert -14.996980667114258
3 | polenta -14.99655532836914
4 | modicum -15.007698059082031
5 | autarch -14.99172592163086
6 | schisms -14.989496231079102
7 | mariner -15.0106201171875
8 | disarms -15.0106201171875
9 | rescale -14.989356994628906
10 | paywall -14.986217498779297
11 | infobox -14.98541259765625
12 | preston -15.015327453613281
13 | shrines -15.016551971435547
14 | implore -14.982894897460938
15 | alloted -15.01815414428711
16 | precast -15.020370483398438
17 | borings -14.978897094726562
18 | bacilli -15.022220611572266
19 | matrice -15.022846221923828
20 | redible -14.974870681762695
21 | absolve -15.026111602783203
22 | ourself -14.973335266113281
23 | ethetic -15.026788711547852
24 | maynard -15.027372360229492
25 | calibur -15.027730941772461
26 | enviros -15.02823257446289
27 | calzone -14.970394134521484
28 | sumatra -14.96739387512207
29 | drywall -15.033981323242188
30 | impaled -14.965522766113281
31 | manland -15.03862190246582
32 | divined -14.960699081420898
33 | conlang -14.959224700927734
34 | tablero -14.95616340637207
35 | redraft -14.955455780029297
36 | equitas -15.044797897338867
37 | ratting -14.953641891479492
38 | errancy -15.04793930053711
39 | webcast -14.94735336303711
40 | lowland -15.053237915039062
41 | boyhood -15.053678512573242
42 | actuary -14.945014953613281
43 | catlike -15.055164337158203
44 | putback -15.056617736816406
45 | galileo -14.942996978759766
46 | rivaled -15.057003021240234
47 | volonte -14.942134857177734
48 | sunspot -15.059274673461914
49 | rotunda -14.940404891967773
50 | notched -15.06007194519043
51 | taproot -14.935928344726562
52 | secures -15.066566467285156
53 | entente -14.93320083618164
54 | outflow -15.066858291625977
55 | betters -15.067663192749023
56 | rumpled -14.930889129638672
57 | burried -15.070535659790039
58 | repulse -14.92904281616211
59 | fillets -14.926876068115234
60 | relator -14.92681884765625
61 | sombody -15.074382781982422
62 | unsaved -15.074520111083984
63 | ailment -15.075027465820312
64 | nodules -15.075050354003906
65 | montero -14.922632217407227
66 | satires -15.080968856811523
67 | arcadia -14.916393280029297
68 | valerie -14.915924072265625
69 | inglish -15.085016250610352
70 | dukedom -15.086551666259766
71 | espouse -14.913402557373047
72 | bedevil -14.911296844482422
73 | reticle -15.089393615722656
74 | matinee -15.089693069458008
75 | maxwell -14.909908294677734
76 | picante -14.90963363647461
77 | baboons -14.908744812011719
78 | exciter -15.092048645019531
79 | losings -14.907678604125977
80 | newbies -14.906318664550781
81 | serried -14.90548324584961
82 | curving -14.904655456542969
83 | narrows -15.09649658203125
84 | ragging -14.901836395263672
85 | baneful -15.099411010742188
86 | pinatas -14.89979362487793
87 | divison -15.100841522216797
88 | kinfolk -14.898719787597656
89 | indiana -14.898597717285156
90 | caritas -14.8953857421875
91 | silvery -14.893852233886719
92 | inkling -14.893333435058594
93 | absense -15.10746955871582
94 | lavabit -14.890359878540039
95 | outsize -14.88975715637207
96 | rewired -15.111268997192383
97 | absalom -15.113567352294922
98 | getback -15.114919662475586
99 | accuser -14.884925842285156
100 | striven -15.115121841430664
101 | maloney -15.116886138916016
102 | escaper -14.882984161376953
103 | subtile -15.119136810302734
104 | colibri -14.879827499389648
105 | delving -14.87982177734375
106 | calving -14.879753112792969
107 | tarheel -14.878677368164062
108 | herders -14.876302719116211
109 | grooved -14.875177383422852
110 | octagon -15.125707626342773
111 | bisping -15.126806259155273
112 | alluded -14.872251510620117
113 | merlion -15.128215789794922
114 | figural -15.129623413085938
115 | debater -14.869804382324219
116 | pigtail -14.867530822753906
117 | honious -15.13395881652832
118 | pinches -15.135322570800781
119 | clojure -14.863956451416016
120 | equates -14.861526489257812
121 | refiner -15.138694763183594
122 | billets -15.140663146972656
123 | alfalfa -15.141242980957031
124 | hotshot -14.858383178710938
125 | nonagon -15.142745971679688
126 | jacuzzi -14.857048034667969
127 | vincent -15.143632888793945
128 | pollock -14.855628967285156
129 | airtime -14.85552978515625
--------------------------------------------------------------------------------
/seven_letter_words/bin2_prob.txt:
--------------------------------------------------------------------------------
1 | dupasha -22.5
2 | makrita -22.499996185302734
3 | ferisse -22.499996185302734
4 | murcers -22.49999237060547
5 | metires -22.49999237060547
6 | witmost -22.50000762939453
7 | astause -22.50000762939453
8 | sekaram -22.500011444091797
9 | vilgren -22.500015258789062
10 | belomat -22.500019073486328
11 | setnest -22.499977111816406
12 | curadal -22.49997329711914
13 | viridon -22.50002670288086
14 | denpick -22.50002670288086
15 | eraully -22.50003433227539
16 | ruborie -22.500041961669922
17 | queimer -22.499950408935547
18 | cosuits -22.499950408935547
19 | rutamen -22.499942779541016
20 | graizen -22.499942779541016
21 | sonware -22.500057220458984
22 | infocos -22.500057220458984
23 | inkwang -22.49993896484375
24 | rowbots -22.499935150146484
25 | engeden -22.500064849853516
26 | vizizen -22.50006866455078
27 | molenci -22.499927520751953
28 | indotes -22.499927520751953
29 | dapener -22.500076293945312
30 | ireasti -22.50008773803711
31 | undving -22.499900817871094
32 | traumpt -22.499900817871094
33 | redrear -22.500099182128906
34 | aryanni -22.499897003173828
35 | brovoir -22.500102996826172
36 | greised -22.499893188476562
37 | networm -22.499889373779297
38 | memwill -22.500110626220703
39 | gamplus -22.499881744384766
40 | estplay -22.499881744384766
41 | sapwhat -22.500118255615234
42 | indmong -22.500118255615234
43 | kenafil -22.5001220703125
44 | denzhou -22.5001220703125
45 | cosited -22.5001220703125
46 | perzoek -22.500125885009766
47 | balinit -22.500125885009766
48 | mayonal -22.499866485595703
49 | armemic -22.499866485595703
50 | henjury -22.500133514404297
51 | lavplay -22.500141143798828
52 | calynes -22.49985122680664
53 | remfold -22.50014877319336
54 | engdist -22.50014877319336
55 | armrich -22.50014877319336
56 | luxfast -22.499847412109375
57 | mulhatt -22.49984359741211
58 | allaton -22.49984359741211
59 | strfair -22.50015640258789
60 | monachs -22.50015640258789
61 | kerapat -22.50015640258789
62 | hergrim -22.50015640258789
63 | fidgota -22.50015640258789
64 | decigan -22.500160217285156
65 | dezella -22.499835968017578
66 | haypath -22.500164031982422
67 | resonga -22.499820709228516
68 | nosband -22.499820709228516
69 | poligen -22.500179290771484
70 | mobture -22.49981689453125
71 | flufrom -22.50018310546875
72 | willose -22.49980926513672
73 | desedge -22.50019073486328
74 | momclub -22.499805450439453
75 | clobero -22.499801635742188
76 | mapauth -22.499797821044922
77 | vitelho -22.500205993652344
78 | daykick -22.500205993652344
79 | sysmite -22.500213623046875
80 | telolon -22.50021743774414
81 | onsensa -22.50021743774414
82 | vipaddy -22.500225067138672
83 | sunrink -22.500225067138672
84 | namhero -22.500225067138672
85 | voratio -22.499771118164062
86 | niliter -22.499771118164062
87 | droones -22.499767303466797
88 | zipcord -22.500232696533203
89 | pagrete -22.500232696533203
90 | funwich -22.500232696533203
91 | negbers -22.499759674072266
92 | belwich -22.499759674072266
93 | allayah -22.499759674072266
94 | pakatak -22.500240325927734
95 | farathy -22.500240325927734
96 | betweek -22.500244140625
97 | rutanim -22.500247955322266
98 | obsster -22.500255584716797
99 | ligigid -22.500255584716797
100 | lidcore -22.500255584716797
101 | vacassa -22.499740600585938
102 | pipiday -22.499736785888672
103 | almorum -22.499736785888672
104 | sadmore -22.500263214111328
105 | hayhorn -22.49972915649414
106 | vinango -22.49972152709961
107 | cosisty -22.50027847290039
108 | libikal -22.499713897705078
109 | dogodes -22.500286102294922
110 | camcore -22.500286102294922
111 | ashmann -22.500286102294922
112 | fibunal -22.500289916992188
113 | enciere -22.499706268310547
114 | revrika -22.49969482421875
115 | perburg -22.500308990478516
116 | camilan -22.500308990478516
117 | sumarms -22.50031280517578
118 | firigin -22.500316619873047
119 | pelatra -22.499675750732422
120 | vorvery -22.500328063964844
121 | purabra -22.500328063964844
122 | indondo -22.50033187866211
123 | dogpeak -22.50033187866211
124 | alllein -22.50033187866211
125 | actblue -22.49966049194336
126 | hasvers -22.50033950805664
127 | freifty -22.499652862548828
128 | hueving -22.500347137451172
129 | coratti -22.499649047851562
130 | saprika -22.499645233154297
131 | honcoin -22.499645233154297
132 | joycons -22.50035858154297
133 | dogoids -22.50035858154297
134 | nanians -22.499637603759766
135 | dreanon -22.499637603759766
136 | spoanna -22.4996337890625
137 | levieur -22.4996337890625
138 | jawolla -22.5003662109375
139 | cowcard -22.5003662109375
140 | thehalb -22.499629974365234
141 | lamboys -22.499629974365234
142 | disorer -22.499629974365234
143 | pigwiki -22.500370025634766
144 | embious -22.500370025634766
145 | detdden -22.500370025634766
146 | vacibel -22.499622344970703
--------------------------------------------------------------------------------
/seven_letter_words/bin3_prob.txt:
--------------------------------------------------------------------------------
1 | tasvinc -30.0
2 | dblshaw -29.999996185302734
3 | cmbodka -29.999996185302734
4 | zagbbox -30.000003814697266
5 | hedoute -30.000003814697266
6 | cmsdest -30.00000762939453
7 | leoanje -29.999988555908203
8 | sitinks -29.999984741210938
9 | oweorno -29.999984741210938
10 | advpite -29.999984741210938
11 | grpwerk -30.000015258789062
12 | aesasio -29.999980926513672
13 | atequir -30.000019073486328
14 | dryhazi -30.000022888183594
15 | styansa -29.99997329711914
16 | sunincl -30.00002670288086
17 | bowamac -30.00002670288086
18 | xyzunik -29.999969482421875
19 | awsposs -30.000030517578125
20 | ogrmode -29.99996566772461
21 | midbyss -29.99996566772461
22 | ctlmony -29.99996566772461
23 | rngmony -30.00003433227539
24 | rergett -29.999961853027344
25 | phperti -29.999961853027344
26 | bfdizzy -30.000041961669922
27 | srcstit -29.999950408935547
28 | pktubic -29.999950408935547
29 | oddourd -29.999950408935547
30 | mplnick -29.999950408935547
31 | dccergy -29.999942779541016
32 | oxyhest -30.000057220458984
33 | klepled -29.99993896484375
34 | digydro -29.99993896484375
35 | aphopez -29.99993896484375
36 | rifntag -30.00006103515625
37 | srvlope -29.999935150146484
38 | emoomez -29.999935150146484
39 | toyelry -30.000064849853516
40 | iniilen -30.000064849853516
41 | iffamma -30.000064849853516
42 | adsokin -29.99993133544922
43 | eofpike -30.00006866455078
44 | dnsavia -30.00006866455078
45 | uitlesi -30.000072479248047
46 | owluntu -30.000072479248047
47 | affesda -29.999923706054688
48 | mgrulia -30.000080108642578
49 | foxmsgs -30.000080108642578
50 | esiaram -30.000080108642578
51 | subzyst -29.999916076660156
52 | ottexpo -30.000083923339844
53 | udpcolo -29.999908447265625
54 | vakdney -29.99990463256836
55 | svmvery -29.99990463256836
56 | dspereo -29.99990463256836
57 | pngpone -30.00009536743164
58 | quiilyn -29.999900817871094
59 | tgtella -30.000102996826172
60 | ithueur -30.000102996826172
61 | wynvinc -30.000106811523438
62 | sezanch -30.000106811523438
63 | sdkjabi -30.000106811523438
64 | yaninem -29.999889373779297
65 | dbgivid -29.999889373779297
66 | adeardu -29.999889373779297
67 | paykich -30.000110626220703
68 | dspdeal -30.000110626220703
69 | cptwipe -30.000110626220703
70 | nikaign -29.99988555908203
71 | pesuell -30.00011444091797
72 | musropp -30.00011444091797
73 | ebxside -30.00011444091797
74 | dnienez -30.000118255615234
75 | dccscal -30.000118255615234
76 | cmbheck -30.000118255615234
77 | stsasks -29.999874114990234
78 | hapixer -29.99987030029297
79 | nikuild -30.00012969970703
80 | wowrapy -30.000133514404297
81 | txtajes -30.000133514404297
82 | gtkoooo -30.000133514404297
83 | sutcmds -30.000137329101562
84 | erviode -29.999858856201172
85 | bewikon -30.000141143798828
86 | hubphas -29.99985122680664
87 | ervpets -29.99985122680664
88 | ofsitem -29.99984359741211
89 | gstivec -29.99984359741211
90 | utfestr -30.00015640258789
91 | etaabic -30.00015640258789
92 | tieibur -29.999839782714844
93 | islssel -30.000160217285156
94 | iodvari -30.000160217285156
95 | zagzept -29.999835968017578
96 | ustjour -29.999835968017578
97 | dexonte -29.999835968017578
98 | bizfilt -29.999835968017578
99 | adaowns -29.999835968017578
100 | tetibri -30.000164031982422
101 | octfirm -29.999828338623047
102 | weiudos -30.000171661376953
103 | pwdtick -30.000171661376953
104 | ttlarry -29.99981689453125
105 | stuimeo -29.999813079833984
106 | sqlstre -29.999813079833984
107 | mieipeg -29.999813079833984
108 | dueafen -29.999813079833984
109 | sndurge -29.99980926513672
110 | vezcorn -30.00019073486328
111 | ilketch -29.999805450439453
112 | zugenth -30.000194549560547
113 | rngiate -30.000194549560547
114 | ottclud -30.000194549560547
115 | aprkeep -30.000194549560547
116 | urlveal -30.000198364257812
117 | msgourd -30.000198364257812
118 | xlsboom -29.999797821044922
119 | wijagma -29.999797821044922
120 | robisbn -29.999797821044922
121 | melmlin -29.999797821044922
122 | samslot -30.000202178955078
123 | nidoust -29.999794006347656
124 | begkits -29.999794006347656
125 | arrflix -29.999794006347656
126 | ditfrau -30.000205993652344
127 | aidomid -30.000205993652344
128 | cptfoto -29.99979019165039
129 | aimrede -29.99979019165039
130 | dbgabay -30.00020980834961
131 | cidlocs -30.00020980834961
132 | booiedo -30.000221252441406
133 | mplders -29.999774932861328
134 | cptpush -30.000225067138672
135 | nahcalc -29.999767303466797
136 | amyovel -29.999767303466797
137 | wonczas -30.00023651123047
138 | mplrome -30.00023651123047
139 | edxesis -30.00023651123047
140 | adcadoo -30.00023651123047
141 | oudtems -29.999759674072266
142 | ociirut -29.999759674072266
143 | balzept -29.999759674072266
144 | avgcorp -29.999759674072266
145 | himocos -30.000240325927734
146 | ignlots -29.999755859375
147 | baztrim -29.999755859375
--------------------------------------------------------------------------------
/seven_letter_words/bin4_prob.txt:
--------------------------------------------------------------------------------
1 | voyxfff -37.500118255615234
2 | qtyijke -37.50014877319336
3 | mmculed -37.50022888183594
4 | jmpytut -37.500362396240234
5 | vtkprit -37.500396728515625
6 | oilrxjs -37.50044631958008
7 | vfsisex -37.499473571777344
8 | eenqrst -37.49935531616211
9 | nbrlyph -37.50071334838867
10 | xmmgota -37.49924850463867
11 | jmpquiv -37.49921798706055
12 | rummqtt -37.50099182128906
13 | xhrdisp -37.49892044067383
14 | ffturaa -37.498897552490234
15 | dexocht -37.50111770629883
16 | xmmgett -37.501121520996094
17 | lvljspx -37.49882125854492
18 | zugwpdb -37.501182556152344
19 | tidmqtt -37.49877166748047
20 | lhsigua -37.498714447021484
21 | sshemsp -37.50141525268555
22 | burrgyz -37.49848556518555
23 | vtkirie -37.498477935791016
24 | vtkifar -37.501522064208984
25 | rpczano -37.50154495239258
26 | vtkinez -37.501609802246094
27 | vtkifie -37.49838638305664
28 | zugymce -37.50162124633789
29 | xcbwent -37.49831008911133
30 | watobjs -37.49827194213867
31 | doiawks -37.49827194213867
32 | cgiacyj -37.498165130615234
33 | czyands -37.501853942871094
34 | mdbgebn -37.49811553955078
35 | atejspx -37.50190353393555
36 | rndxito -37.49806594848633
37 | sdkrxjs -37.501953125
38 | mlxoice -37.501956939697266
39 | mlxahan -37.50198745727539
40 | auxjspx -37.5020751953125
41 | jsxirms -37.50211715698242
42 | czyrgba -37.49782943725586
43 | makrgyz -37.5021858215332
44 | nanighb -37.49776840209961
45 | jsxobil -37.502262115478516
46 | jwtgraf -37.49773406982422
47 | vtkundy -37.49770736694336
48 | jsxuden -37.49759292602539
49 | pszglfw -37.50242233276367
50 | czydamn -37.49753952026367
51 | csvylko -37.502559661865234
52 | wijincl -37.497379302978516
53 | oilrgyz -37.49725341796875
54 | mlxulan -37.497215270996094
55 | xmmepar -37.50278854370117
56 | lodxlsx -37.502803802490234
57 | uczpeon -37.502864837646484
58 | sesrgyz -37.49709701538086
59 | pciavax -37.497066497802734
60 | gpsilik -37.497066497802734
61 | lhszion -37.49706268310547
62 | slaampp -37.49705505371094
63 | uczhtag -37.502952575683594
64 | ouiqrst -37.50295639038086
65 | xhrziel -37.49697494506836
66 | pcbpiar -37.49697494506836
67 | yumxfff -37.49691390991211
68 | fedjspb -37.50309371948242
69 | xmmtega -37.49677658081055
70 | segzoek -37.50347137451172
71 | mezgrpc -37.503543853759766
72 | xcbophe -37.503658294677734
73 | ngxantz -37.49628829956055
74 | aosantd -37.49628829956055
75 | jejymax -37.50380325317383
76 | rerlsru -37.50386428833008
77 | racrgyz -37.50387954711914
78 | rndquam -37.4961051940918
79 | mlxneau -37.50391387939453
80 | rudcych -37.503944396972656
81 | lotlsru -37.50399398803711
82 | abyilog -37.496002197265625
83 | rsaueba -37.504032135009766
84 | jsxioso -37.49593734741211
85 | derjspx -37.50411605834961
86 | vfsgett -37.49586486816406
87 | vtkjure -37.495849609375
88 | phyepar -37.4958381652832
89 | vesxfff -37.5041618347168
90 | lcdleri -37.50421142578125
91 | ifsfeas -37.49577713012695
92 | mmcubbo -37.50423812866211
93 | ircemsp -37.49563217163086
94 | pdbiesz -37.495601654052734
95 | rpciene -37.49557876586914
96 | iodpiar -37.50454330444336
97 | rmslsru -37.504615783691406
98 | rpcumno -37.50465774536133
99 | apkckpt -37.50466537475586
100 | lcdvoir -37.495269775390625
101 | rhsncia -37.50473403930664
102 | owlsetq -37.4952278137207
103 | ifsbrtc -37.50477600097656
104 | csvowej -37.495140075683594
105 | xcborgt -37.495121002197266
106 | sutmobx -37.495079040527344
107 | iovstmt -37.50493240356445
108 | nanmqtt -37.504947662353516
109 | irqphem -37.504947662353516
110 | wndncia -37.494964599609375
111 | xcbided -37.49495315551758
112 | jsxkees -37.49488067626953
113 | cpscsrf -37.494773864746094
114 | jmppeon -37.49476623535156
115 | lhsreta -37.5052375793457
116 | dezrgyz -37.50527572631836
117 | elecsrf -37.50535202026367
118 | atrlymp -37.505374908447266
119 | iodudev -37.494544982910156
120 | xhrkses -37.505516052246094
121 | ngxjspx -37.49443435668945
122 | uczpear -37.49442672729492
123 | npmhlen -37.49440002441406
124 | pcmncmp -37.505611419677734
125 | biczoek -37.49436569213867
126 | dosorrh -37.50564956665039
127 | jejmisc -37.49434280395508
128 | kenjspx -37.494293212890625
129 | idxiaux -37.505767822265625
130 | svgiesz -37.494205474853516
131 | vtkgems -37.49415969848633
132 | glmldre -37.49413299560547
133 | dexumbn -37.50587844848633
134 | kitxfff -37.49406814575195
135 | jsxajan -37.4940071105957
136 | fmtmina -37.49399185180664
137 | gtkthew -37.49397659301758
138 | czyuess -37.50605010986328
139 | iodhait -37.49386978149414
140 | cafantd -37.506141662597656
141 | xcbredo -37.49382400512695
142 | fpswpdb -37.50624465942383
143 | xcbdogs -37.50633239746094
144 | jwtlify -37.493656158447266
145 | rsaellt -37.493629455566406
146 | pkgughs -37.50637435913086
147 | jmpccak -37.49350357055664
148 | pclvais -37.49347686767578
--------------------------------------------------------------------------------
/seven_letter_words/bin5_prob.txt:
--------------------------------------------------------------------------------
1 | czyjspx -44.995792388916016
2 | xcbabwe -45.006473541259766
3 | aktjspx -44.99137878417969
4 | xcbcych -44.979515075683594
5 | xcbziej -45.07548141479492
6 | xmmeczy -44.91748046875
7 | qeddhcp -45.09950637817383
8 | xcbilha -44.897335052490234
9 | xcbacji -44.8853874206543
10 | xcbzung -45.1260871887207
11 | xmmobre -44.83869552612305
12 | xcbquir -45.17741775512695
13 | xcbrouw -45.2041015625
14 | ilkjspx -45.20814895629883
15 | lijglfw -44.79149627685547
16 | foxrgyz -45.21918869018555
17 | jsxrouw -44.767459869384766
18 | xcbziel -45.23471450805664
19 | xcbagua -44.763145446777344
20 | eidtopl -45.24649429321289
21 | xcbximo -44.73112106323242
22 | jwtglfw -44.719486236572266
23 | xcbnerg -44.71344757080078
24 | xcbateg -44.693031311035156
25 | befjspx -44.69113540649414
26 | xcbxlim -44.65083694458008
27 | xcbsemi -44.63022994995117
28 | ketglfw -45.387977600097656
29 | lemjspx -44.60933303833008
30 | xcbcyan -44.60453414916992
31 | xcbsequ -45.410953521728516
32 | xcbemer -45.411563873291016
33 | eoscsrf -44.56328201293945
34 | xcbphot -44.541378021240234
35 | xcbeken -44.509586334228516
36 | xcbolum -44.500850677490234
37 | xcbrodu -45.50664520263672
38 | tepjspx -44.49314880371094
39 | xcbthro -44.48517990112305
40 | xcbueue -44.48493957519531
41 | oscquiv -44.44233322143555
42 | xcbubah -45.56185531616211
43 | xcbodzi -44.43584060668945
44 | mlxquee -45.57368850708008
45 | xcbmdat -45.59005355834961
46 | xcbuell -44.409183502197266
47 | xcbobre -44.40824890136719
48 | xcbuhan -44.403106689453125
49 | tasexpl -45.62323760986328
50 | xcbueil -44.36052322387695
51 | xcbilos -45.64400100708008
52 | iodtopl -45.644203186035156
53 | suttmpl -44.34950637817383
54 | xcbhots -44.319889068603516
55 | xcbosph -44.319034576416016
56 | xcbuego -44.309486389160156
57 | xcbquam -44.30044174194336
58 | kolglfw -44.29965591430664
59 | gesglfw -44.296722412109375
60 | gccorrh -44.29584503173828
61 | mezptom -44.289695739746094
62 | xcbhecy -45.71607971191406
63 | xcbsemb -44.264095306396484
64 | yiijspx -44.26384353637695
65 | meljspx -44.260704040527344
66 | xcbunos -45.74428939819336
67 | xcbunei -44.22948455810547
68 | pisbrtc -44.21781539916992
69 | vehjspx -44.210479736328125
70 | vasrgyz -44.190887451171875
71 | lhsrgyz -44.180213928222656
72 | xcbighb -45.82477951049805
73 | phyfidf -44.17029571533203
74 | kilglfw -45.8333625793457
75 | dukvrir -44.16157150268555
76 | levjspx -44.15993881225586
77 | updrgyz -44.14170837402344
78 | xcbagas -44.1334228515625
79 | opcrgyz -44.13212585449219
80 | ilkjspb -44.12828063964844
81 | curfidf -44.114540100097656
82 | rpcighb -45.8897590637207
83 | xcbacje -44.10778045654297
84 | xcbilih -45.9096794128418
85 | zugcsrf -44.060035705566406
86 | xcbveau -44.05826187133789
87 | rpcasje -44.04568862915039
88 | xcbalsy -44.04135513305664
89 | pcmrouw -44.037845611572266
90 | xcbafil -44.035858154296875
91 | doijspx -44.03323745727539
92 | xcbhtub -44.029544830322266
93 | xcbhear -45.983673095703125
94 | xcbuele -45.988529205322266
95 | opijspx -43.99332809448242
96 | xcbazzo -43.992305755615234
97 | xcboufl -46.008460998535156
98 | akojspx -43.9888801574707
99 | ninmqtt -43.98078536987305
100 | xcbguna -43.96329879760742
101 | idxorrh -43.9370002746582
102 | xcbheit -43.93656921386719
103 | czyxfff -43.92329406738281
104 | voyglfw -43.90713882446289
105 | dynmqtt -43.902496337890625
106 | xcbcoln -46.09786605834961
107 | vezjspx -43.87360763549805
108 | xcbocre -46.13079071044922
109 | cueorrh -43.85930633544922
110 | xmmacje -43.854305267333984
111 | mlxalsy -43.84138870239258
112 | ebxorrh -43.837650299072266
113 | xcbagal -43.82956314086914
114 | xcbzept -43.82637405395508
115 | xcbucle -43.81629180908203
116 | vesjspx -43.8125
117 | xcbiser -43.809242248535156
118 | xcbseau -43.80495834350586
119 | xcbekte -43.8006477355957
120 | lapmqtt -43.79780960083008
121 | abyjspx -43.78347396850586
122 | xcbueba -46.222286224365234
123 | xcbijke -43.77728271484375
124 | xcbvoie -43.76816940307617
125 | xcbudem -43.76424026489258
126 | xcbivol -46.23701095581055
127 | xcbquoi -43.75960159301758
128 | xcbupal -43.75864791870117
129 | zugjspx -43.75846481323242
130 | xcbheel -46.244380950927734
131 | typglfw -43.74939727783203
132 | rpcinqu -43.74385452270508
133 | voyorrh -43.73942947387695
134 | tieglfw -43.73161315917969
135 | hexmqtt -43.7115592956543
136 | xcbacyj -43.708465576171875
137 | aktjspb -43.69775390625
138 | amyjspx -43.6917610168457
139 | ackrgyz -43.690940856933594
140 | xcbokus -43.688011169433594
141 | xcbhtag -43.65958023071289
142 | togjspx -43.652225494384766
143 | xcbuely -43.64830780029297
144 | xcbffic -43.64610290527344
145 | mlxasje -43.64008331298828
146 | xcbunft -43.63233184814453
147 | wieglfw -43.62156677246094
148 | xcbufig -43.615196228027344
149 | xcbueur -43.613521575927734
150 | zagmqtt -43.60862350463867
--------------------------------------------------------------------------------
/seven_letter_words/gpt2_prob_sevenletter.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
4 | import tiktoken
5 | import logging
6 |
# Log to both stdout and a file so long scoring runs can be monitored later.
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")])

# Prefer GPU when available; the batched scoring below is slow on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# GPT-2 XL scores the candidate words; the cl100k_base (GPT-4) tokenizer is
# used only to filter candidates by their token structure.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device)
gpt4_enc = tiktoken.get_encoding("cl100k_base")
17 |
def pad_batch(batch, pad_idx):
    """Right-pad every token sequence in `batch` with `pad_idx`.

    Args:
        batch: list of token-id lists, possibly of differing lengths.
        pad_idx: token id used for padding (50256 is GPT-2's eos/pad id).

    Returns:
        A new list of lists, all padded to the length of the longest
        sequence. Returns [] for an empty batch.
    """
    # max() with default=0 covers the empty-batch case that the original
    # handled implicitly via its manual max loop.
    max_length = max(map(len, batch), default=0)
    return [seq + [pad_idx] * (max_length - len(seq)) for seq in batch]
30 |
# Score sentences with GPT-2. Despite the original "perplexity" comment,
# this returns the total log-probability of each sentence (a tensor of
# per-sentence log-likelihoods, offset by the fixed prefix correction below).
def prob_gpt2(sentence_list):

    # Tokenize the sentences and pad to a rectangular batch
    # (50256 is GPT-2's eos token, reused here as the pad id).
    all_tokens = []
    for sentence in sentence_list:
        tokens = gpt2_tokenizer.encode(sentence)
        all_tokens.append(tokens)
    tokens = pad_batch(all_tokens, 50256)

    # Targets are the same ids; the next-token shift happens below.
    targets = tokens[:]

    # Compute average log likelihood for the generation
    input_ids = torch.LongTensor(tokens).to(device)
    target_ids = torch.LongTensor(targets).to(device)

    with torch.no_grad():
        outputs = gpt2_model(input_ids, labels=target_ids)
        # outputs[1] is the logits tensor when labels are supplied
        # (transformers returns (loss, logits, ...) in that case).
        logits = outputs[1]
        # Drop the last time step's logits and the first target token so
        # logits[t] predicts target[t+1] (equivalent to logits[:, :-1, :]).
        logits = logits.transpose(0,1)[:-1].transpose(0,1)
        target_ids = target_ids.transpose(0,1)[1:].transpose(0,1)
        # Per-token NLL; pad positions (id 50256) are excluded via ignore_index.
        loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1))
        # Sum over the sequence dimension -> total NLL per sentence.
        loss = loss.reshape(target_ids.shape).sum(dim=1)
        neg_log_likelihood = -1*loss


    # 13.357776641845703 = logprob('The word is"'); removing this to just get
    # the word prob
    return neg_log_likelihood + 13.357776641845703
60 |
61 |
# Score every candidate word with GPT-2 and write "<logprob>\t<word>" lines
# sorted from most to least probable.
for finame in ["random_pairs_lower"]:
    # `with` guarantees the output file is flushed and closed even if scoring
    # raises part-way through (the original never closed either file).
    with open(finame + ".txt", "r") as fi, open(finame + "_scored.txt", "w") as fo:

        words_with_prob = []

        this_batch_sentences = []
        this_batch_words = []

        def flush_batch():
            # Score the pending batch and record (logprob, word) pairs.
            # Deduplicates the two identical flush stanzas of the original.
            logprobs = prob_gpt2(this_batch_sentences)
            for word, logprob in zip(this_batch_words, logprobs):
                words_with_prob.append([logprob.item(), word])
            this_batch_sentences.clear()
            this_batch_words.clear()

        for index, line in enumerate(fi):
            if index % 10000 == 0:
                logging.info(str(index))

            word = line.strip()

            tokens = gpt4_enc.encode(word)
            tokens_spaced = gpt4_enc.encode(" " + word)

            # Keep only 7-letter words that tokenize as exactly one 3-letter
            # token followed by one 4-letter token, both with and without a
            # leading space.
            if len(tokens) == 2 and len(tokens_spaced) == 2 and len(word) == 7:
                token1 = gpt4_enc.decode([tokens[0]]).strip()
                token2 = gpt4_enc.decode([tokens[1]]).strip()

                tokenspaced1 = gpt4_enc.decode([tokens_spaced[0]]).strip()
                tokenspaced2 = gpt4_enc.decode([tokens_spaced[1]]).strip()

                if len(token1) == 3 and len(token2) == 4 and len(tokenspaced1) == 3 and len(tokenspaced2) == 4:
                    this_batch_sentences.append('The word is "' + word + '"')
                    this_batch_words.append(word)
                else:
                    print(index, "Wrong length", word, len(token1), len(token2), len(tokenspaced1), len(tokenspaced2))
            else:
                print(index, "Wrong length", word, len(tokens), len(tokens_spaced), len(word))

            if len(this_batch_sentences) == 3000:
                flush_batch()

        # Score whatever is left over after the last full batch.
        if len(this_batch_sentences) > 0:
            flush_batch()

        # Highest (least negative) log-probability first.
        for prob, word in sorted(words_with_prob)[::-1]:
            fo.write(str(prob) + "\t" + word + "\n")
110 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/seven_letter_words/input_scored.txt:
--------------------------------------------------------------------------------
1 | mryycon -33.4009895324707
2 | myvlobd -33.97520446777344
3 | zyvoxdk -42.584041595458984
4 | wynsmew -30.42465591430664
5 | kedkbmr -34.5813102722168
6 | cmrscwc -35.037437438964844
7 | wkbsxob -39.7088508605957
8 | nsckbwc -36.846317291259766
9 | bocmkvo -39.4564094543457
10 | zkigkvv -42.60762405395508
11 | sxpylyh -35.01577377319336
12 | zbocdyx -42.305076599121094
13 | crbsxoc -38.06560134887695
14 | swzvybo -37.10409164428711
15 | kvvydon -34.48329162597656
16 | zbomkcd -39.263160705566406
17 | lybsxqc -42.85857391357422
18 | lkmsvvs -36.32170867919922
19 | wkdbsmo -36.486995697021484
20 | bonslvo -34.44765090942383
21 | klcyvfo -40.78521728515625
22 | yebcovp -34.895774841308594
23 | odrodsm -34.4904670715332
24 | wkixkbn -40.274925231933594
25 | mkvsleb -35.38396072387695
26 | oxfsbyc -39.76297378540039
27 | mkvjyxo -41.52894592285156
28 | cewkdbk -40.9130973815918
29 | nbigkvv -39.965938568115234
30 | swzkvon -37.17292404174805
31 | wkxvkxn -40.5903205871582
32 | nsfsxon -38.56913757324219
33 | myxvkxq -43.61052703857422
34 | dklvoby -37.12471389770508
35 | bonbkpd -34.73928451538086
36 | oaesdkc -42.28152084350586
37 | bkddsxq -40.66632843017578
38 | obbkxmi -41.12004470825195
39 | golmkcd -35.0498161315918
40 | vygvkxn -44.4805908203125
41 | lyiryyn -34.94938659667969
42 | kmdekbi -37.709075927734375
43 | mkdvsuo -38.730655670166016
44 | zedlkmu -36.18986511230469
45 | qkvsvoy -42.10275650024414
46 | bsfkvon -35.44425582885742
47 | fyvyxdo -42.35395812988281
48 | cexczyd -39.47360610961914
49 | bydexnk -37.63880157470703
50 | xydmron -34.41005325317383
51 | dkzbyyd -40.756038665771484
52 | comeboc -25.816699981689453
53 | oxdoxdo -36.08238220214844
54 | yedpvyg -38.156593322753906
55 | loddobc -34.54991912841797
56 | bewzvon -36.644874572753906
57 | lebbson -29.779876708984375
58 | bozevco -32.21761703491211
59 | psvvodc -39.616676330566406
60 | bovkdyb -44.093204498291016
61 | cywlyni -31.04549789428711
62 | exckfon -32.31464767456055
63 | ksvwoxd -38.20396423339844
64 | xynevoc -36.27880096435547
65 | wyxdoby -34.911991119384766
66 | ckdsboc -34.813270568847656
67 | kbmknsk -35.1013069152832
68 | fkvobso -37.41843032836914
69 | sxqvscr -38.215450286865234
70 | neuonyw -33.95500946044922
71 | oczyeco -34.99745178222656
72 | lonofsv -34.87450408935547
73 | bodsmvo -37.443511962890625
74 | wkdsxoo -35.750003814697266
75 | wkhgovv -39.59453201293945
76 | zsmkxdo -41.09931564331055
77 | lklyyxc -38.89170455932617
78 | ohmsdob -35.976707458496094
79 | vycsxqc -43.72141647338867
80 | xoglsoc -33.673892974853516
81 | cobbson -27.039085388183594
82 | mebfsxq -43.0044059753418
83 | xkbbygc -36.12342071533203
84 | bkqqsxq -38.069557189941406
85 | lkxopev -37.932037353515625
86 | zsxkdkc -42.61277389526367
87 | nsfscyx -34.74595642089844
88 | usxpyvu -37.7536735534668
89 | sxnskxk -36.24659729003906
90 | mkbsdkc -36.59408950805664
91 | csvfobi -32.64740753173828
92 | sxuvsxq -40.83545684814453
93 | klcoxco -35.745391845703125
94 | vkfklsd -37.36144256591797
95 | yedcsjo -37.90355682373047
96 | bogsbon -28.6097412109375
97 | klckvyw -38.35791015625
98 | qodlkmu -39.189537048339844
99 | kmmecob -35.23679733276367
100 | cdbsfox -33.410850524902344
101 |
--------------------------------------------------------------------------------
/seven_letter_words/random_token_combos.py:
--------------------------------------------------------------------------------
1 |
2 | import tiktoken
3 |
4 | enc = tiktoken.get_encoding("cl100k_base")
5 |
alphabet = "abcdefghijklmnopqrstuvwxyz"
# Kept as a dict for backward compatibility with the rest of the script,
# but it is really just a membership set of the 26 ASCII lowercase letters.
alphabet_dict = {char: 1 for char in alphabet}

def is_roman_lower(string):
    """Return True iff every character of `string` is an ASCII lowercase
    letter a-z (vacuously True for the empty string)."""
    # all() replaces the original manual loop-with-early-return.
    return all(char in alphabet_dict for char in string)
16 |
# Collect every cl100k token that is exactly 4 characters long and is either
# " xyz" (leading space + 3 lowercase letters) or "wxyz" (4 lowercase letters).
all_threes_lower = []
all_fours_lower = []


for i in range(100256):
    token = enc.decode([i])
    if len(token) == 4:
        if token[0] == " " and is_roman_lower(token[1:]):
            all_threes_lower.append(token)
        elif is_roman_lower(token):
            all_fours_lower.append(token)

print(len(all_threes_lower), len(all_fours_lower), len(all_threes_lower)*len(all_fours_lower))
print(all_threes_lower[:10])
print(all_fours_lower[:10])
print("")

# For every 3-letter + 4-letter combination, keep only candidates that
# re-tokenize into the same 3+4 structure both with and without a leading
# space. The `with` block ensures the output file is closed and flushed
# (the original never closed it).
with open("random_pairs_lower.txt", "w") as fo_lower:
    for start in all_threes_lower:
        for end in all_fours_lower:
            candidate = start.strip() + end.strip()
            tokens_unspaced = enc.encode(candidate)
            tokens_spaced = enc.encode(" " + candidate)

            if len(tokens_unspaced) == 2 and len(tokens_spaced) == 2:
                if len(enc.decode([tokens_unspaced[0]]).strip()) == 3 and len(enc.decode([tokens_unspaced[1]]).strip()) == 4 and len(enc.decode([tokens_spaced[0]]).strip()) == 3 and len(enc.decode([tokens_spaced[1]]).strip()) == 4:
                    # `candidate` already equals start.strip() + end.strip().
                    fo_lower.write(candidate + "\n")
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/seven_letter_words/select_words.py:
--------------------------------------------------------------------------------
1 | import random
2 |
# Parallel lists: all_scores[i] is the GPT-2 log-probability of all_words[i].
all_scores, all_words = [], []

# Load the scored word list produced by gpt2_prob_sevenletter.py; each line
# is "<logprob><whitespace><word>".
# NOTE(review): the scorer writes random_pairs_lower_scored.txt into its own
# working directory — confirm the seven_letter_words/ prefix here matches
# where that file actually lives.
with open("seven_letter_words/random_pairs_lower_scored.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        score, word = line.split()
        all_scores.append(float(score))
        all_words.append(word)
11 |
# Function to select the words whose scores are closest to a given score
def select_closest_words(score, num_words=150, scores=None, words=None):
    """Return [words, scores] for the `num_words` entries nearest `score`.

    Args:
        score: target log-probability level.
        num_words: number of closest words to keep (default 150; the
            original comments said 100, which was stale).
        scores, words: optional parallel lists to select from; they default
            to the module-level all_scores / all_words loaded above, so
            existing call sites are unchanged.

    Returns:
        [selected_words, selected_scores], ordered from closest to farthest
        from `score`.
    """
    if scores is None:
        scores = all_scores
    if words is None:
        words = all_words

    # Sort indices by proximity to the target score
    sorted_indices = sorted(range(len(scores)), key=lambda i: abs(scores[i] - score))

    # Keep the num_words closest
    selected_indices = sorted_indices[:num_words]
    selected_words = [words[i] for i in selected_indices]
    selected_scores = [scores[i] for i in selected_indices]

    return [selected_words, selected_scores]
24 |
# Select 150 words closest to each specified score level (bin)
selected_words_closest_to_levels = {}
selected_words = []
for score_level in [-15, -22.5, -30, -37.5, -45]:
    selected_words_closest_to_levels[score_level] = select_closest_words(score_level)
    selected_words += selected_words_closest_to_levels[score_level][0]

# De-duplicate across bins only to report how many distinct words were
# selected; the per-bin lists written below may still overlap.
selected_words = set(selected_words)
print("Number of selected words: " + str(len(selected_words)))

# One "<word> <score>" line per selection, grouped by target level.
with open("seven_letter_words/words_5bins.txt", "w") as f:
    for score in [-15, -22.5, -30, -37.5, -45]:
        for word,sc in zip(selected_words_closest_to_levels[score][0], selected_words_closest_to_levels[score][1]):
            f.write(word + " " + str(sc) + "\n")
39 |
--------------------------------------------------------------------------------
/stimulus_generator.py:
--------------------------------------------------------------------------------
1 | import jsonlines
2 | import os
3 | import random
4 | import argparse
5 |
6 | # Functions for encoding in rot-1 or rot-3
# Lookup tables between letters and their 0-based positions in the alphabet.
alphabet = "abcdefghijklmnopqrstuvwxyz"
index2char = dict(enumerate(alphabet))
char2index = {letter: position for position, letter in enumerate(alphabet)}


def rot_encode(sequence, n):
    """Shift every letter of `sequence` forward by `n` places (mod 26),
    preserving case; non-alphabetic characters pass through unchanged."""
    encoded = []
    for symbol in sequence:
        if not symbol.isalpha():
            encoded.append(symbol)
            continue
        shifted = index2char[(char2index[symbol.lower()] + n) % 26]
        encoded.append(shifted.upper() if symbol.isupper() else shifted)
    return "".join(encoded)
29 |
30 |
def create_chain(sequence, n):
    """Render a numbered decoding chain — one "<i>. <cipher> -> <plain>\n"
    line per character — undoing a forward rot-`n` via a 26-n forward shift."""
    rendered = [
        str(position + 1) + ". " + symbol + " -> " + rot_encode(symbol, 26 - n) + "\n"
        for position, symbol in enumerate(sequence)
    ]
    return "".join(rendered)
37 |
38 |
def create_math_cot_chain(sequence, n):
    """Build a "math" chain-of-thought demonstration: the full letter->position
    table, the modular-arithmetic decoding rule, then one worked line per
    character of `sequence` (assumed to be rot-`n` ciphertext)."""
    s = f'''Let’s start by writing the letter-position mapping for the alphabet:
a -> 0
b -> 1
c -> 2
d -> 3
e -> 4
f -> 5
g -> 6
h -> 7
i -> 8
j -> 9
k -> 10
l -> 11
m -> 12
n -> 13
o -> 14
p -> 15
q -> 16
r -> 17
s -> 18
t -> 19
u -> 20
v -> 21
w -> 22
x -> 23
y -> 24
z -> 25

Next, we find the encoded letter as follows:
Position of original letter = (Position of given letter − {n}) mod 26

Then map the found position to the corresponding letter using the letter-position mapping.

Using this,\n
'''
    chain = []
    for index, char in enumerate(sequence):
        # Shifting forward by 26-n is the same as shifting back by n.
        new_char = rot_encode(char, 26-n)
        chain.append(str(index+1) + ". " + char + " -> " +
                     f"({char2index[char]} - {n}) mod 26" " -> " + new_char + "\n")
    return s + "".join(chain)
81 |
82 |
def create_number_cot_chain(sequence, n):
    """Numeric variant of the chain-of-thought: states the position-shift
    rule, then shows, per character, cipher position -> arithmetic -> plain
    position (only 0-25 indices appear, never letters)."""
    s = f'''
New position = (Given position − {n}) mod 26
Using this,\n
'''
    chain = []
    for index, char in enumerate(sequence):
        # rot_encode by 26-n undoes the forward shift of n.
        new_char = rot_encode(char, 26-n)
        chain.append(str(index+1) + ". " + str(char2index[char]) + " -> " +
                     f"({char2index[char]} - {n}) mod 26" " -> " + str(char2index[new_char]) + "\n")
    return s + "".join(chain)
94 |
95 |
def create_step_chain_forward(sequence, n):
    """Demonstrate decoding by stepping each letter FORWARD 26-n positions
    one letter at a time (e.g. "1. d -> e -> f"), wrapping from z back to a
    when the walk passes the end of the alphabet."""
    chain = []
    for index, char in enumerate(sequence):
        new_char = rot_encode(char, 26-n)
        start_ord, end_ord = ord(char), ord(new_char)
        part_chain = ""
        if char == new_char:
            # No net shift (also covers non-alphabetic chars, which
            # rot_encode leaves unchanged): show the identity step.
            part_chain = new_char + " -> " + new_char
        else:
            if start_ord > end_ord:
                # The walk wraps past 'z'/'Z': first walk to the end of
                # the alphabet...
                if char.isupper():
                    end_ord = ord("Z")
                else:
                    end_ord = ord("z")
            for char_ord in range(start_ord, end_ord+1, 1):
                part_chain += chr(char_ord)
                if char_ord != end_ord:
                    part_chain += " -> "
            if char_ord != ord(new_char):
                # ...then continue from 'a'/'A' up to the target letter.
                part_chain += " -> "
                if char.isupper():
                    start_ord = ord("A")
                else:
                    start_ord = ord("a")
                for char_ord in range(start_ord, ord(new_char)+1, 1):
                    part_chain += chr(char_ord)
                    if char_ord != ord(new_char):
                        part_chain += " -> "

        chain.append(str(index+1) + ". " + part_chain + "\n")
    return "".join(chain)
127 |
128 |
def create_math_corrupt_chain(sequence, n):
    """Like create_math_cot_chain, but every decoded letter is masked with
    '*' so the worked demonstration never reveals the answer.

    (A dead `chain = []` that was immediately shadowed by the second
    assignment below has been removed.)
    """
    s = f'''Let’s start by writing the letter-position mapping for the alphabet:
a -> 0
b -> 1
c -> 2
d -> 3
e -> 4
f -> 5
g -> 6
h -> 7
i -> 8
j -> 9
k -> 10
l -> 11
m -> 12
n -> 13
o -> 14
p -> 15
q -> 16
r -> 17
s -> 18
t -> 19
u -> 20
v -> 21
w -> 22
x -> 23
y -> 24
z -> 25

Next, we find the encoded letter as follows:
Position of original letter = (Position of given letter − {n}) mod 26

Then map the found position to the corresponding letter using the letter-position mapping.

Using this,\n
'''
    chain = []
    for index, char in enumerate(sequence):
        # The decoded letter is deliberately hidden behind '*'.
        new_char = '*'
        chain.append(str(index+1) + ". " + char + " -> " +
                     f"({char2index[char]} - {n}) mod 26" " -> " + new_char + "\n")
    return s + "".join(chain)
172 |
173 |
def create_corrupt_chain(sequence, n):
    """Render a numbered chain in which every character maps to '*'
    ("<i>. <char> -> *\n" per character). `n` is accepted for signature
    parity with the other chain builders but is unused.

    (Several commented-out alternative corruption strategies were removed
    as dead code.)
    """
    chain = []
    for index, char in enumerate(sequence):
        # Mask the decoded letter; the prompt reveals answers only at the end.
        new_char = "*"
        chain.append(str(index+1) + ". " + char + " -> " + new_char + "\n")
    return "".join(chain)
190 |
191 |
192 | # print(rot_encode("stay", 1))
193 | # print(rot_encode("stay", 3))
194 |
195 |
def create_swap_chain(sequence, n):
    """Numbered per-character decode chain ("<i>. <cipher> -> <plain>\n").
    NOTE: currently behaviorally identical to create_chain."""
    rendered = []
    for position, symbol in enumerate(sequence, start=1):
        rendered.append(f"{position}. {symbol} -> {rot_encode(symbol, 26 - n)}\n")
    return "".join(rendered)
202 |
203 |
def string_to_seq(msg):
    """Encode `msg` as comma-separated alphabet positions ("abc" -> "0,1,2";
    the empty string yields "")."""
    return ",".join(str(char2index[symbol]) for symbol in msg)
209 |
def main(args):
    """Generate rot-N decoding stimuli for every shift (1-25) and word bin.

    For the chosen prompt style, writes up to 100 JSONL records per
    shift/bin to stimuli/<prompt_type>/<prompt_type><shift>_<bin>.jsonl.
    Each record carries the task name, the bin label, the instruction
    template, the rot-encoded input, and the expected decoded output.
    """
    data = [
        ("examples/bin_1.txt", "bin1"),
        ("examples/bin_2.txt", "bin2"),
        ("examples/bin_3.txt", "bin3"),
        ("examples/bin_4.txt", "bin4"),
        ("examples/bin_5.txt", "bin5")
    ]
    prompt_type = args.prompt_type
    fo_directory = f"stimuli/{prompt_type}"

    # exist_ok=True makes the former os.path.exists() pre-check redundant.
    os.makedirs(fo_directory, exist_ok=True)

    for shift in range(1, 26):
        for task in ["dec"]:
            for fi_name, fi_label in data:
                fo_name = f"{fo_directory}/{prompt_type+str(shift)}_{fi_label}.jsonl"

                # Context managers close both files every iteration; the
                # original opened 125 file pairs without closing any of them.
                with open(fi_name, "r") as fi, open(fo_name, "w") as fo:
                    jsl = jsonlines.Writer(fo)

                    count_encoded = 0
                    for line_num, line in enumerate(fi):
                        example = {}

                        # Task
                        example["task_name"] = "rot-" + str(shift)

                        # Condition: use the bin label directly. (The original
                        # re-parsed it from fo_name with split("_"), which broke
                        # whenever prompt_type contained an underscore, e.g.
                        # "text_cot" yielded "cot/text" instead of "binN".)
                        example["example_type"] = fi_label

                        # Each input line is "<word>\t..."; only the word is used.
                        word = line.strip().split("\t")[0]
                        sentence = word
                        encoded = rot_encode(word, shift)

                        # Instruction. shift == 1 uses singular "position".
                        # NOTE(review): prompt types not handled by the active
                        # branch (e.g. "one-step-fwd" at shift == 1) leave
                        # task_instruction unset and raise KeyError below.
                        if task == "dec":
                            if shift == 1:
                                if prompt_type == "standard":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ' along with the original text that it was created from:\nRot-' + str(
                                        shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\nOriginal text: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message to produce the original text:\nRot-' + str(shift) + ' text: "%s"\nOriginal text:'
                                elif prompt_type == "text_cot":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str(
                                        shift) + ' position backward.' + create_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "Stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
                                elif prompt_type == "math_cot":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we need to shift each letter ' + str(
                                        shift) + ' position backward. ' + create_math_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
                                elif prompt_type == "number_cot":
                                    # NOTE(review): ':\shift-' below is a literal backslash + 's'
                                    # (not a newline); it likely was meant to be '\nshift-'.
                                    # Preserved as-is to keep emitted stimuli identical.
                                    example["task_instruction"] = 'Shift-' + str(shift) + ' is a process in which each number is shifted ' + str(shift) + ' position forward until it reaches 26 and subsequently circles back to 1. For example, here is a sequence of numbers written in shift-' + str(shift) + ':\shift-' + str(shift) + ' sequence: "' + string_to_seq(rot_encode("stay", shift)) + '"\n\nTo decode this sequence, we need to shift each number ' + str(
                                        shift) + ' position backward. ' + create_number_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original sequence of numbers is: ' + f'"{string_to_seq("stay")}"' + '\n\nHere is another sequence of numbers in shift-' + str(shift) + '. Decode this sequence one number at a time. On the last line, write the words "Original sequence:" followed by the decoded sequence:\nshift-' + str(shift) + ' sequence: "%s"'
                            else:
                                if prompt_type == "standard":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ' along with the original text that it was created from:\nRot-' + str(
                                        shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\nOriginal text: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message to produce the original text:\nRot-' + str(shift) + ' text: "%s"\nOriginal text:'
                                elif prompt_type == "text_cot":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str(
                                        shift) + ' positions backward:\n' + create_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
                                elif prompt_type == "cot_hidden_1":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\n\nTo decode this message, we shift each letter ' + str(shift) + " positions backward; but instead of revealing what each letter becomes, we will replace it with a '*' until we write the final answer:\n" + create_corrupt_chain(
                                        rot_encode("Stay here!", shift), shift) + """\nIf we put together the letters that were hidden behind each '*', we get that the original text is: "Stay here!"\n\nHere is another message in rot-""" + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
                                elif prompt_type == "math_cot":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we need to shift each letter ' + str(
                                        shift) + ' positions backward. ' + create_math_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
                                elif prompt_type == "number_cot":
                                    # NOTE(review): same ':\shift-' literal-backslash issue as above.
                                    example["task_instruction"] = 'Shift-' + str(shift) + ' is a process in which each number is shifted ' + str(shift) + ' positions forward until it reaches 26 and subsequently circles back to 1. For example, here is a sequence of numbers written in shift-' + str(shift) + ':\shift-' + str(shift) + ' sequence: "' + string_to_seq(rot_encode("stay", shift)) + '"\n\nTo decode this sequence, we need to shift each number ' + str(
                                        shift) + ' positions backward. ' + create_number_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original sequence of numbers is:' + f'"{string_to_seq("stay")}"' + '\n\nHere is another sequence of numbers in shift-' + str(shift) + '. Decode this sequence one number at a time. On the last line, write the words "Original sequence:" followed by the decoded sequence:\nshift-' + str(shift) + ' sequence: "%s"'
                                elif prompt_type == "one-step-fwd":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\n\nTo decode this message, we shift each letter ' + str(
                                        26-shift) + ' positions forward one step at a time:\n' + create_step_chain_forward(rot_encode("Stay here!", shift), shift) + '\nTherefore, the original text is: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
                                elif prompt_type == "math_swap":
                                    # Deliberately demonstrates the WRONG shift (shift+1) in the example.
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift+1) + '"\n\nTo decode this message, we shift each letter ' + str(
                                        shift) + ' positions backward:\n' + create_math_cot_chain(rot_encode("stay", shift+1), shift+1) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
                                elif prompt_type == "math_corrupt":
                                    example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str(shift) + " positions backward; but instead of revealing what each letter becomes, we will replace it with a '*' until we write the final answer:\n" + create_math_corrupt_chain(
                                        rot_encode("stay", shift), shift) + """\nIf we put together the letters that were hidden behind each '*', we get that the original text is: "stay"\n\nHere is another message in rot-""" + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'

                        # Input and correct output
                        if task == "dec":
                            example["input"] = encoded
                            example["correct_output"] = sentence
                        else:
                            example["input"] = sentence
                            example["correct_output"] = encoded

                        # Combining the instruction and input (this is the string that should be given to the model)
                        example["instruction_plus_input"] = example["task_instruction"] % example["input"]

                        jsl.write(example)

                        # Cap each output file at 100 examples.
                        count_encoded += 1
                        if count_encoded == 100:
                            break
306 |
if __name__ == "__main__":
    # Build the CLI and hand the parsed namespace to main().
    parser = argparse.ArgumentParser()
    # BUG FIX: the original `choices` list ("standard", "text_cot", "math_cot",
    # "number_cot") rejected prompt types the generator actually implements —
    # "one-step-fwd", "math_swap" and "math_corrupt" each have a branch in the
    # prompt-building elif chain — so those modes were unreachable from the
    # command line. They are now accepted; the original four are kept, so all
    # previous invocations still work.
    # TODO(review): the elif chain is longer than this excerpt shows; keep this
    # list in sync with every prompt_type branch it handles.
    parser.add_argument(
        "--prompt_type",
        type=str,
        default="text_cot",
        help="Prompt type to use",
        choices=[
            "standard",
            "text_cot",
            "math_cot",
            "number_cot",
            "one-step-fwd",
            "math_swap",
            "math_corrupt",
        ],
    )
    # Also renamed the parser variable: the original bound both the
    # ArgumentParser and the parsed Namespace to `args`, shadowing one with
    # the other.
    args = parser.parse_args()
    main(args)
--------------------------------------------------------------------------------