├── .gitignore
├── README.md
├── assets
    └── preview.png
├── eval.py
├── examples
    ├── bin_1.txt
    ├── bin_2.txt
    ├── bin_3.txt
    ├── bin_4.txt
    ├── bin_5.txt
    ├── select_swap_words.py
    └── word_pairs_lowbins.txt
├── logs
    ├── basic
    │   ├── claude-3
    │   │   ├── basic10_bin1_temp=0.0.json
    │   │   ├── basic10_bin2_temp=0.0.json
    │   │   ├── basic10_bin3_temp=0.0.json
    │   │   ├── basic10_bin4_temp=0.0.json
    │   │   ├── basic10_bin5_temp=0.0.json
    │   │   ├── basic11_bin1_temp=0.0.json
    │   │   ├── basic11_bin2_temp=0.0.json
    │   │   ├── basic11_bin3_temp=0.0.json
    │   │   ├── basic11_bin4_temp=0.0.json
    │   │   ├── basic11_bin5_temp=0.0.json
    │   │   ├── basic12_bin1_temp=0.0.json
    │   │   ├── basic12_bin2_temp=0.0.json
    │   │   ├── basic12_bin3_temp=0.0.json
    │   │   ├── basic12_bin4_temp=0.0.json
    │   │   ├── basic12_bin5_temp=0.0.json
    │   │   ├── basic13_bin1_temp=0.0.json
    │   │   ├── basic13_bin2_temp=0.0.json
    │   │   ├── basic13_bin3_temp=0.0.json
    │   │   ├── basic13_bin4_temp=0.0.json
    │   │   ├── basic13_bin5_temp=0.0.json
    │   │   ├── basic14_bin1_temp=0.0.json
    │   │   ├── basic14_bin2_temp=0.0.json
    │   │   ├── basic14_bin3_temp=0.0.json
    │   │   ├── basic14_bin4_temp=0.0.json
    │   │   ├── basic14_bin5_temp=0.0.json
    │   │   ├── basic15_bin1_temp=0.0.json
    │   │   ├── basic15_bin2_temp=0.0.json
    │   │   ├── basic15_bin3_temp=0.0.json
    │   │   ├── basic15_bin4_temp=0.0.json
    │   │   ├── basic15_bin5_temp=0.0.json
    │   │   ├── basic16_bin1_temp=0.0.json
    │   │   ├── basic16_bin2_temp=0.0.json
    │   │   ├── basic16_bin3_temp=0.0.json
    │   │   ├── basic16_bin4_temp=0.0.json
    │   │   ├── basic16_bin5_temp=0.0.json
    │   │   ├── basic17_bin1_temp=0.0.json
    │   │   ├── basic17_bin2_temp=0.0.json
    │   │   ├── basic17_bin3_temp=0.0.json
    │   │   ├── basic17_bin4_temp=0.0.json
    │   │   ├── basic17_bin5_temp=0.0.json
    │   │   ├── basic18_bin1_temp=0.0.json
    │   │   ├── basic18_bin2_temp=0.0.json
    │   │   ├── basic18_bin3_temp=0.0.json
    │   │   ├── basic18_bin4_temp=0.0.json
    │   │   ├── basic18_bin5_temp=0.0.json
    │   │   ├── basic19_bin1_temp=0.0.json
    │   │   ├── basic19_bin2_temp=0.0.json
    │   │   ├── basic19_bin3_temp=0.0.json
    │   │   ├── basic19_bin4_temp=0.0.json
    │   │   ├── basic19_bin5_temp=0.0.json
    │   │   ├── basic1_bin1_temp=0.0.json
    │   │   ├── basic1_bin2_temp=0.0.json
    │   │   ├── basic1_bin3_temp=0.0.json
    │   │   ├── basic1_bin4_temp=0.0.json
    │   │   ├── basic1_bin5_temp=0.0.json
    │   │   ├── basic20_bin1_temp=0.0.json
    │   │   ├── basic20_bin2_temp=0.0.json
    │   │   ├── basic20_bin3_temp=0.0.json
    │   │   ├── basic20_bin4_temp=0.0.json
    │   │   ├── basic20_bin5_temp=0.0.json
    │   │   ├── basic21_bin1_temp=0.0.json
    │   │   ├── basic21_bin2_temp=0.0.json
    │   │   ├── basic21_bin3_temp=0.0.json
    │   │   ├── basic21_bin4_temp=0.0.json
    │   │   ├── basic21_bin5_temp=0.0.json
    │   │   ├── basic22_bin1_temp=0.0.json
    │   │   ├── basic22_bin2_temp=0.0.json
    │   │   ├── basic22_bin3_temp=0.0.json
    │   │   ├── basic22_bin4_temp=0.0.json
    │   │   ├── basic22_bin5_temp=0.0.json
    │   │   ├── basic23_bin1_temp=0.0.json
    │   │   ├── basic23_bin2_temp=0.0.json
    │   │   ├── basic23_bin3_temp=0.0.json
    │   │   ├── basic23_bin4_temp=0.0.json
    │   │   ├── basic23_bin5_temp=0.0.json
    │   │   ├── basic24_bin1_temp=0.0.json
    │   │   ├── basic24_bin2_temp=0.0.json
    │   │   ├── basic24_bin3_temp=0.0.json
    │   │   ├── basic24_bin4_temp=0.0.json
    │   │   ├── basic24_bin5_temp=0.0.json
    │   │   ├── basic25_bin1_temp=0.0.json
    │   │   ├── basic25_bin2_temp=0.0.json
    │   │   ├── basic25_bin3_temp=0.0.json
    │   │   ├── basic25_bin4_temp=0.0.json
    │   │   ├── basic25_bin5_temp=0.0.json
    │   │   ├── basic2_bin1_temp=0.0.json
    │   │   ├── basic2_bin2_temp=0.0.json
    │   │   ├── basic2_bin3_temp=0.0.json
    │   │   ├── basic2_bin4_temp=0.0.json
    │   │   ├── basic2_bin5_temp=0.0.json
    │   │   ├── basic3_bin1_temp=0.0.json
    │   │   ├── basic3_bin2_temp=0.0.json
    │   │   ├── basic3_bin3_temp=0.0.json
    │   │   ├── basic3_bin4_temp=0.0.json
    │   │   ├── basic3_bin5_temp=0.0.json
    │   │   ├── basic4_bin1_temp=0.0.json
    │   │   ├── basic4_bin2_temp=0.0.json
    │   │   ├── basic4_bin3_temp=0.0.json
    │   │   ├── basic4_bin4_temp=0.0.json
    │   │   ├── basic4_bin5_temp=0.0.json
    │   │   ├── basic5_bin1_temp=0.0.json
    │   │   ├── basic5_bin2_temp=0.0.json
    │   │   ├── basic5_bin3_temp=0.0.json
    │   │   ├── basic5_bin4_temp=0.0.json
    │   │   ├── basic5_bin5_temp=0.0.json
    │   │   ├── basic6_bin1_temp=0.0.json
    │   │   ├── basic6_bin2_temp=0.0.json
    │   │   ├── basic6_bin3_temp=0.0.json
    │   │   ├── basic6_bin4_temp=0.0.json
    │   │   ├── basic6_bin5_temp=0.0.json
    │   │   ├── basic7_bin1_temp=0.0.json
    │   │   ├── basic7_bin2_temp=0.0.json
    │   │   ├── basic7_bin3_temp=0.0.json
    │   │   ├── basic7_bin4_temp=0.0.json
    │   │   ├── basic7_bin5_temp=0.0.json
    │   │   ├── basic8_bin1_temp=0.0.json
    │   │   ├── basic8_bin2_temp=0.0.json
    │   │   ├── basic8_bin3_temp=0.0.json
    │   │   ├── basic8_bin4_temp=0.0.json
    │   │   ├── basic8_bin5_temp=0.0.json
    │   │   ├── basic9_bin1_temp=0.0.json
    │   │   ├── basic9_bin2_temp=0.0.json
    │   │   ├── basic9_bin3_temp=0.0.json
    │   │   ├── basic9_bin4_temp=0.0.json
    │   │   ├── basic9_bin5_temp=0.0.json
    │   │   └── results.jsonl
    │   ├── llama3.1-405b
    │   │   ├── basic10_bin1_temp=0.0.json
    │   │   ├── basic10_bin2_temp=0.0.json
    │   │   ├── basic10_bin3_temp=0.0.json
    │   │   ├── basic10_bin4_temp=0.0.json
    │   │   ├── basic10_bin5_temp=0.0.json
    │   │   ├── basic11_bin1_temp=0.0.json
    │   │   ├── basic11_bin2_temp=0.0.json
    │   │   ├── basic11_bin3_temp=0.0.json
    │   │   ├── basic11_bin4_temp=0.0.json
    │   │   ├── basic11_bin5_temp=0.0.json
    │   │   ├── basic12_bin1_temp=0.0.json
    │   │   ├── basic12_bin2_temp=0.0.json
    │   │   ├── basic12_bin3_temp=0.0.json
    │   │   ├── basic12_bin4_temp=0.0.json
    │   │   ├── basic12_bin5_temp=0.0.json
    │   │   ├── basic13_bin1_temp=0.0.json
    │   │   ├── basic13_bin2_temp=0.0.json
    │   │   ├── basic13_bin3_temp=0.0.json
    │   │   ├── basic13_bin4_temp=0.0.json
    │   │   ├── basic13_bin5_temp=0.0.json
    │   │   ├── basic14_bin1_temp=0.0.json
    │   │   ├── basic14_bin2_temp=0.0.json
    │   │   ├── basic14_bin3_temp=0.0.json
    │   │   ├── basic14_bin4_temp=0.0.json
    │   │   ├── basic14_bin5_temp=0.0.json
    │   │   ├── basic15_bin1_temp=0.0.json
    │   │   ├── basic15_bin2_temp=0.0.json
    │   │   ├── basic15_bin3_temp=0.0.json
    │   │   ├── basic15_bin4_temp=0.0.json
    │   │   ├── basic15_bin5_temp=0.0.json
    │   │   ├── basic16_bin1_temp=0.0.json
    │   │   ├── basic16_bin2_temp=0.0.json
    │   │   ├── basic16_bin3_temp=0.0.json
    │   │   ├── basic16_bin4_temp=0.0.json
    │   │   ├── basic16_bin5_temp=0.0.json
    │   │   ├── basic17_bin1_temp=0.0.json
    │   │   ├── basic17_bin2_temp=0.0.json
    │   │   ├── basic17_bin3_temp=0.0.json
    │   │   ├── basic17_bin4_temp=0.0.json
    │   │   ├── basic17_bin5_temp=0.0.json
    │   │   ├── basic18_bin1_temp=0.0.json
    │   │   ├── basic18_bin2_temp=0.0.json
    │   │   ├── basic18_bin3_temp=0.0.json
    │   │   ├── basic18_bin4_temp=0.0.json
    │   │   ├── basic18_bin5_temp=0.0.json
    │   │   ├── basic19_bin1_temp=0.0.json
    │   │   ├── basic19_bin2_temp=0.0.json
    │   │   ├── basic19_bin3_temp=0.0.json
    │   │   ├── basic19_bin4_temp=0.0.json
    │   │   ├── basic19_bin5_temp=0.0.json
    │   │   ├── basic1_bin1_temp=0.0.json
    │   │   ├── basic1_bin2_temp=0.0.json
    │   │   ├── basic1_bin3_temp=0.0.json
    │   │   ├── basic1_bin4_temp=0.0.json
    │   │   ├── basic1_bin5_temp=0.0.json
    │   │   ├── basic20_bin1_temp=0.0.json
    │   │   ├── basic20_bin2_temp=0.0.json
    │   │   ├── basic20_bin3_temp=0.0.json
    │   │   ├── basic20_bin4_temp=0.0.json
    │   │   ├── basic20_bin5_temp=0.0.json
    │   │   ├── basic21_bin1_temp=0.0.json
    │   │   ├── basic21_bin2_temp=0.0.json
    │   │   ├── basic21_bin3_temp=0.0.json
    │   │   ├── basic21_bin4_temp=0.0.json
    │   │   ├── basic21_bin5_temp=0.0.json
    │   │   ├── basic22_bin1_temp=0.0.json
    │   │   ├── basic22_bin2_temp=0.0.json
    │   │   ├── basic22_bin3_temp=0.0.json
    │   │   ├── basic22_bin4_temp=0.0.json
    │   │   ├── basic22_bin5_temp=0.0.json
    │   │   ├── basic23_bin1_temp=0.0.json
    │   │   ├── basic23_bin2_temp=0.0.json
    │   │   ├── basic23_bin3_temp=0.0.json
    │   │   ├── basic23_bin4_temp=0.0.json
    │   │   ├── basic23_bin5_temp=0.0.json
    │   │   ├── basic24_bin1_temp=0.0.json
    │   │   ├── basic24_bin2_temp=0.0.json
    │   │   ├── basic24_bin3_temp=0.0.json
    │   │   ├── basic24_bin4_temp=0.0.json
    │   │   ├── basic24_bin5_temp=0.0.json
    │   │   ├── basic25_bin1_temp=0.0.json
    │   │   ├── basic25_bin2_temp=0.0.json
    │   │   ├── basic25_bin3_temp=0.0.json
    │   │   ├── basic25_bin4_temp=0.0.json
    │   │   ├── basic25_bin5_temp=0.0.json
    │   │   ├── basic2_bin1_temp=0.0.json
    │   │   ├── basic2_bin2_temp=0.0.json
    │   │   ├── basic2_bin3_temp=0.0.json
    │   │   ├── basic2_bin4_temp=0.0.json
    │   │   ├── basic2_bin5_temp=0.0.json
    │   │   ├── basic3_bin1_temp=0.0.json
    │   │   ├── basic3_bin2_temp=0.0.json
    │   │   ├── basic3_bin3_temp=0.0.json
    │   │   ├── basic3_bin4_temp=0.0.json
    │   │   ├── basic3_bin5_temp=0.0.json
    │   │   ├── basic4_bin1_temp=0.0.json
    │   │   ├── basic4_bin2_temp=0.0.json
    │   │   ├── basic4_bin3_temp=0.0.json
    │   │   ├── basic4_bin4_temp=0.0.json
    │   │   ├── basic4_bin5_temp=0.0.json
    │   │   ├── basic5_bin1_temp=0.0.json
    │   │   ├── basic5_bin2_temp=0.0.json
    │   │   ├── basic5_bin3_temp=0.0.json
    │   │   ├── basic5_bin4_temp=0.0.json
    │   │   ├── basic5_bin5_temp=0.0.json
    │   │   ├── basic6_bin1_temp=0.0.json
    │   │   ├── basic6_bin2_temp=0.0.json
    │   │   ├── basic6_bin3_temp=0.0.json
    │   │   ├── basic6_bin4_temp=0.0.json
    │   │   ├── basic6_bin5_temp=0.0.json
    │   │   ├── basic7_bin1_temp=0.0.json
    │   │   ├── basic7_bin2_temp=0.0.json
    │   │   ├── basic7_bin3_temp=0.0.json
    │   │   ├── basic7_bin4_temp=0.0.json
    │   │   ├── basic7_bin5_temp=0.0.json
    │   │   ├── basic8_bin1_temp=0.0.json
    │   │   ├── basic8_bin2_temp=0.0.json
    │   │   ├── basic8_bin3_temp=0.0.json
    │   │   ├── basic8_bin4_temp=0.0.json
    │   │   ├── basic8_bin5_temp=0.0.json
    │   │   ├── basic9_bin1_temp=0.0.json
    │   │   ├── basic9_bin2_temp=0.0.json
    │   │   ├── basic9_bin3_temp=0.0.json
    │   │   ├── basic9_bin4_temp=0.0.json
    │   │   ├── basic9_bin5_temp=0.0.json
    │   │   ├── results.jsonl
    │   │   └── results1.jsonl
    │   └── o1
    │   │   ├── basic12_bin1_temp=0.0.json
    │   │   ├── basic12_bin5_temp=0.0.json
    │   │   ├── basic13_bin1_temp=0.0.json
    │   │   ├── basic13_bin5_temp=0.0.json
    │   │   ├── basic14_bin1_temp=0.0.json
    │   │   └── basic14_bin5_temp=0.0.json
    └── text_cot
    │   ├── claude-3
    │       ├── cot10_bin1_temp=0.0.json
    │       ├── cot10_bin2_temp=0.0.json
    │       ├── cot10_bin3_temp=0.0.json
    │       ├── cot10_bin4_temp=0.0.json
    │       ├── cot10_bin5_temp=0.0.json
    │       ├── cot11_bin1_temp=0.0.json
    │       ├── cot11_bin2_temp=0.0.json
    │       ├── cot11_bin3_temp=0.0.json
    │       ├── cot11_bin4_temp=0.0.json
    │       ├── cot11_bin5_temp=0.0.json
    │       ├── cot12_bin1_temp=0.0.json
    │       ├── cot12_bin2_temp=0.0.json
    │       ├── cot12_bin3_temp=0.0.json
    │       ├── cot12_bin4_temp=0.0.json
    │       ├── cot12_bin5_temp=0.0.json
    │       ├── cot13_bin1_temp=0.0.json
    │       ├── cot13_bin2_temp=0.0.json
    │       ├── cot13_bin3_temp=0.0.json
    │       ├── cot13_bin4_temp=0.0.json
    │       ├── cot13_bin5_temp=0.0.json
    │       ├── cot14_bin1_temp=0.0.json
    │       ├── cot14_bin2_temp=0.0.json
    │       ├── cot14_bin3_temp=0.0.json
    │       ├── cot14_bin4_temp=0.0.json
    │       ├── cot14_bin5_temp=0.0.json
    │       ├── cot15_bin1_temp=0.0.json
    │       ├── cot15_bin2_temp=0.0.json
    │       ├── cot15_bin3_temp=0.0.json
    │       ├── cot15_bin4_temp=0.0.json
    │       ├── cot15_bin5_temp=0.0.json
    │       ├── cot16_bin1_temp=0.0.json
    │       ├── cot16_bin2_temp=0.0.json
    │       ├── cot16_bin3_temp=0.0.json
    │       ├── cot16_bin4_temp=0.0.json
    │       ├── cot16_bin5_temp=0.0.json
    │       ├── cot17_bin1_temp=0.0.json
    │       ├── cot17_bin2_temp=0.0.json
    │       ├── cot17_bin3_temp=0.0.json
    │       ├── cot17_bin4_temp=0.0.json
    │       ├── cot17_bin5_temp=0.0.json
    │       ├── cot18_bin1_temp=0.0.json
    │       ├── cot18_bin2_temp=0.0.json
    │       ├── cot18_bin3_temp=0.0.json
    │       ├── cot18_bin4_temp=0.0.json
    │       ├── cot18_bin5_temp=0.0.json
    │       ├── cot19_bin1_temp=0.0.json
    │       ├── cot19_bin2_temp=0.0.json
    │       ├── cot19_bin3_temp=0.0.json
    │       ├── cot19_bin4_temp=0.0.json
    │       ├── cot19_bin5_temp=0.0.json
    │       ├── cot1_bin1_temp=0.0.json
    │       ├── cot1_bin2_temp=0.0.json
    │       ├── cot1_bin3_temp=0.0.json
    │       ├── cot1_bin4_temp=0.0.json
    │       ├── cot1_bin5_temp=0.0.json
    │       ├── cot20_bin1_temp=0.0.json
    │       ├── cot20_bin2_temp=0.0.json
    │       ├── cot20_bin3_temp=0.0.json
    │       ├── cot20_bin4_temp=0.0.json
    │       ├── cot20_bin5_temp=0.0.json
    │       ├── cot21_bin1_temp=0.0.json
    │       ├── cot21_bin2_temp=0.0.json
    │       ├── cot21_bin3_temp=0.0.json
    │       ├── cot21_bin4_temp=0.0.json
    │       ├── cot21_bin5_temp=0.0.json
    │       ├── cot22_bin1_temp=0.0.json
    │       ├── cot22_bin2_temp=0.0.json
    │       ├── cot22_bin3_temp=0.0.json
    │       ├── cot22_bin4_temp=0.0.json
    │       ├── cot22_bin5_temp=0.0.json
    │       ├── cot23_bin1_temp=0.0.json
    │       ├── cot23_bin2_temp=0.0.json
    │       ├── cot23_bin3_temp=0.0.json
    │       ├── cot23_bin4_temp=0.0.json
    │       ├── cot23_bin5_temp=0.0.json
    │       ├── cot24_bin1_temp=0.0.json
    │       ├── cot24_bin2_temp=0.0.json
    │       ├── cot24_bin3_temp=0.0.json
    │       ├── cot24_bin4_temp=0.0.json
    │       ├── cot24_bin5_temp=0.0.json
    │       ├── cot25_bin1_temp=0.0.json
    │       ├── cot25_bin2_temp=0.0.json
    │       ├── cot25_bin3_temp=0.0.json
    │       ├── cot25_bin4_temp=0.0.json
    │       ├── cot25_bin5_temp=0.0.json
    │       ├── cot2_bin1_temp=0.0.json
    │       ├── cot2_bin2_temp=0.0.json
    │       ├── cot2_bin3_temp=0.0.json
    │       ├── cot2_bin4_temp=0.0.json
    │       ├── cot2_bin5_temp=0.0.json
    │       ├── cot3_bin1_temp=0.0.json
    │       ├── cot3_bin2_temp=0.0.json
    │       ├── cot3_bin3_temp=0.0.json
    │       ├── cot3_bin4_temp=0.0.json
    │       ├── cot3_bin5_temp=0.0.json
    │       ├── cot4_bin1_temp=0.0.json
    │       ├── cot4_bin2_temp=0.0.json
    │       ├── cot4_bin3_temp=0.0.json
    │       ├── cot4_bin4_temp=0.0.json
    │       ├── cot4_bin5_temp=0.0.json
    │       ├── cot5_bin1_temp=0.0.json
    │       ├── cot5_bin2_temp=0.0.json
    │       ├── cot5_bin3_temp=0.0.json
    │       ├── cot5_bin4_temp=0.0.json
    │       ├── cot5_bin5_temp=0.0.json
    │       ├── cot6_bin1_temp=0.0.json
    │       ├── cot6_bin2_temp=0.0.json
    │       ├── cot6_bin3_temp=0.0.json
    │       ├── cot6_bin4_temp=0.0.json
    │       ├── cot6_bin5_temp=0.0.json
    │       ├── cot7_bin1_temp=0.0.json
    │       ├── cot7_bin2_temp=0.0.json
    │       ├── cot7_bin3_temp=0.0.json
    │       ├── cot7_bin4_temp=0.0.json
    │       ├── cot7_bin5_temp=0.0.json
    │       ├── cot8_bin1_temp=0.0.json
    │       ├── cot8_bin2_temp=0.0.json
    │       ├── cot8_bin3_temp=0.0.json
    │       ├── cot8_bin4_temp=0.0.json
    │       ├── cot8_bin5_temp=0.0.json
    │       ├── cot9_bin1_temp=0.0.json
    │       ├── cot9_bin2_temp=0.0.json
    │       ├── cot9_bin3_temp=0.0.json
    │       ├── cot9_bin4_temp=0.0.json
    │       ├── cot9_bin5_temp=0.0.json
    │       └── results.jsonl
    │   ├── gpt-4
    │       ├── cot10_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot10_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot10_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot10_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot10_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot11_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot11_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot11_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot11_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot11_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot12_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot12_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot12_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot12_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot12_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot13_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot13_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot13_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot13_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot13_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot14_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot14_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot14_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot14_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot14_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot15_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot15_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot15_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot15_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot15_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot16_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot16_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot16_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot16_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot16_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot17_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot17_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot17_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot17_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot17_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot18_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot18_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot18_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot18_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot18_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot19_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot19_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot19_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot19_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot19_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot1_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot1_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot1_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot1_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot1_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot20_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot20_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot20_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot20_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot20_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot21_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot21_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot21_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot21_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot21_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot22_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot22_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot22_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot22_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot22_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot23_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot23_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot23_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot23_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot23_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot24_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot24_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot24_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot24_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot24_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot25_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot25_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot25_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot25_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot25_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot2_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot2_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot2_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot2_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot2_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot3_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot3_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot3_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot3_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot3_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot4_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot4_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot4_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot4_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot4_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot5_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot5_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot5_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot5_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot5_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot6_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot6_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot6_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot6_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot6_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot7_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot7_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot7_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot7_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot7_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot8_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot8_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot8_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot8_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot8_bin5_gpt-4-0613_temp=0.0.json
    │       ├── cot9_bin1_gpt-4-0613_temp=0.0.json
    │       ├── cot9_bin2_gpt-4-0613_temp=0.0.json
    │       ├── cot9_bin3_gpt-4-0613_temp=0.0.json
    │       ├── cot9_bin4_gpt-4-0613_temp=0.0.json
    │       ├── cot9_bin5_gpt-4-0613_temp=0.0.json
    │       └── results.jsonl
    │   └── llama3.1-405b
    │       ├── cot10_bin1_temp=0.0.json
    │       ├── cot10_bin2_temp=0.0.json
    │       ├── cot10_bin3_temp=0.0.json
    │       ├── cot10_bin4_temp=0.0.json
    │       ├── cot10_bin5_temp=0.0.json
    │       ├── cot11_bin1_temp=0.0.json
    │       ├── cot11_bin2_temp=0.0.json
    │       ├── cot11_bin3_temp=0.0.json
    │       ├── cot11_bin4_temp=0.0.json
    │       ├── cot11_bin5_temp=0.0.json
    │       ├── cot12_bin1_temp=0.0.json
    │       ├── cot12_bin2_temp=0.0.json
    │       ├── cot12_bin3_temp=0.0.json
    │       ├── cot12_bin4_temp=0.0.json
    │       ├── cot12_bin5_temp=0.0.json
    │       ├── cot13_bin1_temp=0.0.json
    │       ├── cot13_bin2_temp=0.0.json
    │       ├── cot13_bin3_temp=0.0.json
    │       ├── cot13_bin4_temp=0.0.json
    │       ├── cot13_bin5_temp=0.0.json
    │       ├── cot14_bin1_temp=0.0.json
    │       ├── cot14_bin2_temp=0.0.json
    │       ├── cot14_bin3_temp=0.0.json
    │       ├── cot14_bin4_temp=0.0.json
    │       ├── cot14_bin5_temp=0.0.json
    │       ├── cot15_bin1_temp=0.0.json
    │       ├── cot15_bin2_temp=0.0.json
    │       ├── cot15_bin3_temp=0.0.json
    │       ├── cot15_bin4_temp=0.0.json
    │       ├── cot15_bin5_temp=0.0.json
    │       ├── cot16_bin1_temp=0.0.json
    │       ├── cot16_bin2_temp=0.0.json
    │       ├── cot16_bin3_temp=0.0.json
    │       ├── cot16_bin4_temp=0.0.json
    │       ├── cot16_bin5_temp=0.0.json
    │       ├── cot17_bin1_temp=0.0.json
    │       ├── cot17_bin2_temp=0.0.json
    │       ├── cot17_bin3_temp=0.0.json
    │       ├── cot17_bin4_temp=0.0.json
    │       ├── cot17_bin5_temp=0.0.json
    │       ├── cot18_bin1_temp=0.0.json
    │       ├── cot18_bin2_temp=0.0.json
    │       ├── cot18_bin3_temp=0.0.json
    │       ├── cot18_bin4_temp=0.0.json
    │       ├── cot18_bin5_temp=0.0.json
    │       ├── cot19_bin1_temp=0.0.json
    │       ├── cot19_bin2_temp=0.0.json
    │       ├── cot19_bin3_temp=0.0.json
    │       ├── cot19_bin4_temp=0.0.json
    │       ├── cot19_bin5_temp=0.0.json
    │       ├── cot1_bin1_temp=0.0.json
    │       ├── cot1_bin2_temp=0.0.json
    │       ├── cot1_bin3_temp=0.0.json
    │       ├── cot1_bin4_temp=0.0.json
    │       ├── cot1_bin5_temp=0.0.json
    │       ├── cot20_bin1_temp=0.0.json
    │       ├── cot20_bin2_temp=0.0.json
    │       ├── cot20_bin3_temp=0.0.json
    │       ├── cot20_bin4_temp=0.0.json
    │       ├── cot20_bin5_temp=0.0.json
    │       ├── cot21_bin1_temp=0.0.json
    │       ├── cot21_bin2_temp=0.0.json
    │       ├── cot21_bin3_temp=0.0.json
    │       ├── cot21_bin4_temp=0.0.json
    │       ├── cot21_bin5_temp=0.0.json
    │       ├── cot22_bin1_temp=0.0.json
    │       ├── cot22_bin2_temp=0.0.json
    │       ├── cot22_bin3_temp=0.0.json
    │       ├── cot22_bin4_temp=0.0.json
    │       ├── cot22_bin5_temp=0.0.json
    │       ├── cot23_bin1_temp=0.0.json
    │       ├── cot23_bin2_temp=0.0.json
    │       ├── cot23_bin3_temp=0.0.json
    │       ├── cot23_bin4_temp=0.0.json
    │       ├── cot23_bin5_temp=0.0.json
    │       ├── cot24_bin1_temp=0.0.json
    │       ├── cot24_bin2_temp=0.0.json
    │       ├── cot24_bin3_temp=0.0.json
    │       ├── cot24_bin4_temp=0.0.json
    │       ├── cot24_bin5_temp=0.0.json
    │       ├── cot25_bin1_temp=0.0.json
    │       ├── cot25_bin2_temp=0.0.json
    │       ├── cot25_bin3_temp=0.0.json
    │       ├── cot25_bin4_temp=0.0.json
    │       ├── cot25_bin5_temp=0.0.json
    │       ├── cot2_bin1_temp=0.0.json
    │       ├── cot2_bin2_temp=0.0.json
    │       ├── cot2_bin3_temp=0.0.json
    │       ├── cot2_bin4_temp=0.0.json
    │       ├── cot2_bin5_temp=0.0.json
    │       ├── cot3_bin1_temp=0.0.json
    │       ├── cot3_bin2_temp=0.0.json
    │       ├── cot3_bin3_temp=0.0.json
    │       ├── cot3_bin4_temp=0.0.json
    │       ├── cot3_bin5_temp=0.0.json
    │       ├── cot4_bin1_temp=0.0.json
    │       ├── cot4_bin2_temp=0.0.json
    │       ├── cot4_bin3_temp=0.0.json
    │       ├── cot4_bin4_temp=0.0.json
    │       ├── cot4_bin5_temp=0.0.json
    │       ├── cot5_bin1_temp=0.0.json
    │       ├── cot5_bin2_temp=0.0.json
    │       ├── cot5_bin3_temp=0.0.json
    │       ├── cot5_bin4_temp=0.0.json
    │       ├── cot5_bin5_temp=0.0.json
    │       ├── cot6_bin1_temp=0.0.json
    │       ├── cot6_bin2_temp=0.0.json
    │       ├── cot6_bin3_temp=0.0.json
    │       ├── cot6_bin4_temp=0.0.json
    │       ├── cot6_bin5_temp=0.0.json
    │       ├── cot7_bin1_temp=0.0.json
    │       ├── cot7_bin2_temp=0.0.json
    │       ├── cot7_bin3_temp=0.0.json
    │       ├── cot7_bin4_temp=0.0.json
    │       ├── cot7_bin5_temp=0.0.json
    │       ├── cot8_bin1_temp=0.0.json
    │       ├── cot8_bin2_temp=0.0.json
    │       ├── cot8_bin3_temp=0.0.json
    │       ├── cot8_bin4_temp=0.0.json
    │       ├── cot8_bin5_temp=0.0.json
    │       ├── cot9_bin1_temp=0.0.json
    │       ├── cot9_bin2_temp=0.0.json
    │       ├── cot9_bin3_temp=0.0.json
    │       ├── cot9_bin4_temp=0.0.json
    │       ├── cot9_bin5_temp=0.0.json
    │       └── results.jsonl
├── models
    └── openai_help.py
├── regression
    ├── README.md
    ├── create_train_table.py
    ├── regression.ipynb
    ├── text_cot_test_results.tsv
    ├── text_cot_test_table.tsv
    ├── text_cot_train_results.tsv
    └── text_cot_train_table.tsv
├── run_claude3.py
├── run_llama3.py
├── run_o1.py
├── run_openai.py
├── seven_letter_words
    ├── README.md
    ├── bin1_prob.txt
    ├── bin2_prob.txt
    ├── bin3_prob.txt
    ├── bin4_prob.txt
    ├── bin5_prob.txt
    ├── gpt2_prob_sevenletter.py
    ├── input_scored.txt
    ├── random_token_combos.py
    ├── select_words.py
    └── words_5bins.txt
├── stimuli
    ├── math_cot
    │   ├── math_cot19_bin1.jsonl
    │   ├── math_cot19_bin2.jsonl
    │   ├── math_cot19_bin3.jsonl
    │   ├── math_cot19_bin4.jsonl
    │   ├── math_cot19_bin5.jsonl
    │   ├── math_cot20_bin1.jsonl
    │   ├── math_cot20_bin2.jsonl
    │   ├── math_cot20_bin3.jsonl
    │   ├── math_cot20_bin4.jsonl
    │   ├── math_cot20_bin5.jsonl
    │   ├── math_cot21_bin1.jsonl
    │   ├── math_cot21_bin2.jsonl
    │   ├── math_cot21_bin3.jsonl
    │   ├── math_cot21_bin4.jsonl
    │   ├── math_cot21_bin5.jsonl
    │   ├── math_cot22_bin1.jsonl
    │   ├── math_cot22_bin2.jsonl
    │   ├── math_cot22_bin3.jsonl
    │   ├── math_cot22_bin4.jsonl
    │   ├── math_cot22_bin5.jsonl
    │   ├── math_cot23_bin1.jsonl
    │   ├── math_cot23_bin2.jsonl
    │   ├── math_cot23_bin3.jsonl
    │   ├── math_cot23_bin4.jsonl
    │   └── math_cot23_bin5.jsonl
    ├── math_swap
    │   └── math_swap4_bin5.jsonl
    ├── number_cot
    │   ├── math10_bin1.jsonl
    │   ├── math10_bin2.jsonl
    │   ├── math10_bin3.jsonl
    │   ├── math10_bin4.jsonl
    │   ├── math10_bin5.jsonl
    │   ├── math11_bin1.jsonl
    │   ├── math11_bin2.jsonl
    │   ├── math11_bin3.jsonl
    │   ├── math11_bin4.jsonl
    │   ├── math11_bin5.jsonl
    │   ├── math12_bin1.jsonl
    │   ├── math12_bin2.jsonl
    │   ├── math12_bin3.jsonl
    │   ├── math12_bin4.jsonl
    │   ├── math12_bin5.jsonl
    │   ├── math13_bin1.jsonl
    │   ├── math13_bin2.jsonl
    │   ├── math13_bin3.jsonl
    │   ├── math13_bin4.jsonl
    │   ├── math13_bin5.jsonl
    │   ├── math14_bin1.jsonl
    │   ├── math14_bin2.jsonl
    │   ├── math14_bin3.jsonl
    │   ├── math14_bin4.jsonl
    │   ├── math14_bin5.jsonl
    │   ├── math15_bin1.jsonl
    │   ├── math15_bin2.jsonl
    │   ├── math15_bin3.jsonl
    │   ├── math15_bin4.jsonl
    │   ├── math15_bin5.jsonl
    │   ├── math16_bin1.jsonl
    │   ├── math16_bin2.jsonl
    │   ├── math16_bin3.jsonl
    │   ├── math16_bin4.jsonl
    │   ├── math16_bin5.jsonl
    │   ├── math17_bin1.jsonl
    │   ├── math17_bin2.jsonl
    │   ├── math17_bin3.jsonl
    │   ├── math17_bin4.jsonl
    │   ├── math17_bin5.jsonl
    │   ├── math18_bin1.jsonl
    │   ├── math18_bin2.jsonl
    │   ├── math18_bin3.jsonl
    │   ├── math18_bin4.jsonl
    │   ├── math18_bin5.jsonl
    │   ├── math19_bin1.jsonl
    │   ├── math19_bin2.jsonl
    │   ├── math19_bin3.jsonl
    │   ├── math19_bin4.jsonl
    │   ├── math19_bin5.jsonl
    │   ├── math1_bin1.jsonl
    │   ├── math1_bin2.jsonl
    │   ├── math1_bin3.jsonl
    │   ├── math1_bin4.jsonl
    │   ├── math1_bin5.jsonl
    │   ├── math20_bin1.jsonl
    │   ├── math20_bin2.jsonl
    │   ├── math20_bin3.jsonl
    │   ├── math20_bin4.jsonl
    │   ├── math20_bin5.jsonl
    │   ├── math21_bin1.jsonl
    │   ├── math21_bin2.jsonl
    │   ├── math21_bin3.jsonl
    │   ├── math21_bin4.jsonl
    │   ├── math21_bin5.jsonl
    │   ├── math22_bin1.jsonl
    │   ├── math22_bin2.jsonl
    │   ├── math22_bin3.jsonl
    │   ├── math22_bin4.jsonl
    │   ├── math22_bin5.jsonl
    │   ├── math23_bin1.jsonl
    │   ├── math23_bin2.jsonl
    │   ├── math23_bin3.jsonl
    │   ├── math23_bin4.jsonl
    │   ├── math23_bin5.jsonl
    │   ├── math24_bin1.jsonl
    │   ├── math24_bin2.jsonl
    │   ├── math24_bin3.jsonl
    │   ├── math24_bin4.jsonl
    │   ├── math24_bin5.jsonl
    │   ├── math25_bin1.jsonl
    │   ├── math25_bin2.jsonl
    │   ├── math25_bin3.jsonl
    │   ├── math25_bin4.jsonl
    │   ├── math25_bin5.jsonl
    │   ├── math2_bin1.jsonl
    │   ├── math2_bin2.jsonl
    │   ├── math2_bin3.jsonl
    │   ├── math2_bin4.jsonl
    │   ├── math2_bin5.jsonl
    │   ├── math3_bin1.jsonl
    │   ├── math3_bin2.jsonl
    │   ├── math3_bin3.jsonl
    │   ├── math3_bin4.jsonl
    │   ├── math3_bin5.jsonl
    │   ├── math4_bin1.jsonl
    │   ├── math4_bin2.jsonl
    │   ├── math4_bin3.jsonl
    │   ├── math4_bin4.jsonl
    │   ├── math4_bin5.jsonl
    │   ├── math5_bin1.jsonl
    │   ├── math5_bin2.jsonl
    │   ├── math5_bin3.jsonl
    │   ├── math5_bin4.jsonl
    │   ├── math5_bin5.jsonl
    │   ├── math6_bin1.jsonl
    │   ├── math6_bin2.jsonl
    │   ├── math6_bin3.jsonl
    │   ├── math6_bin4.jsonl
    │   ├── math6_bin5.jsonl
    │   ├── math7_bin1.jsonl
    │   ├── math7_bin2.jsonl
    │   ├── math7_bin3.jsonl
    │   ├── math7_bin4.jsonl
    │   ├── math7_bin5.jsonl
    │   ├── math8_bin1.jsonl
    │   ├── math8_bin2.jsonl
    │   ├── math8_bin3.jsonl
    │   ├── math8_bin4.jsonl
    │   ├── math8_bin5.jsonl
    │   ├── math9_bin1.jsonl
    │   ├── math9_bin2.jsonl
    │   ├── math9_bin3.jsonl
    │   ├── math9_bin4.jsonl
    │   └── math9_bin5.jsonl
    ├── standard
    │   ├── basic10_bin1.jsonl
    │   ├── basic10_bin2.jsonl
    │   ├── basic10_bin3.jsonl
    │   ├── basic10_bin4.jsonl
    │   ├── basic10_bin5.jsonl
    │   ├── basic11_bin1.jsonl
    │   ├── basic11_bin2.jsonl
    │   ├── basic11_bin3.jsonl
    │   ├── basic11_bin4.jsonl
    │   ├── basic11_bin5.jsonl
    │   ├── basic12_bin1.jsonl
    │   ├── basic12_bin2.jsonl
    │   ├── basic12_bin3.jsonl
    │   ├── basic12_bin4.jsonl
    │   ├── basic12_bin5.jsonl
    │   ├── basic13_bin1.jsonl
    │   ├── basic13_bin2.jsonl
    │   ├── basic13_bin3.jsonl
    │   ├── basic13_bin4.jsonl
    │   ├── basic13_bin5.jsonl
    │   ├── basic14_bin1.jsonl
    │   ├── basic14_bin2.jsonl
    │   ├── basic14_bin3.jsonl
    │   ├── basic14_bin4.jsonl
    │   ├── basic14_bin5.jsonl
    │   ├── basic15_bin1.jsonl
    │   ├── basic15_bin2.jsonl
    │   ├── basic15_bin3.jsonl
    │   ├── basic15_bin4.jsonl
    │   ├── basic15_bin5.jsonl
    │   ├── basic16_bin1.jsonl
    │   ├── basic16_bin2.jsonl
    │   ├── basic16_bin3.jsonl
    │   ├── basic16_bin4.jsonl
    │   ├── basic16_bin5.jsonl
    │   ├── basic17_bin1.jsonl
    │   ├── basic17_bin2.jsonl
    │   ├── basic17_bin3.jsonl
    │   ├── basic17_bin4.jsonl
    │   ├── basic17_bin5.jsonl
    │   ├── basic18_bin1.jsonl
    │   ├── basic18_bin2.jsonl
    │   ├── basic18_bin3.jsonl
    │   ├── basic18_bin4.jsonl
    │   ├── basic18_bin5.jsonl
    │   ├── basic19_bin1.jsonl
    │   ├── basic19_bin2.jsonl
    │   ├── basic19_bin3.jsonl
    │   ├── basic19_bin4.jsonl
    │   ├── basic19_bin5.jsonl
    │   ├── basic1_bin1.jsonl
    │   ├── basic1_bin2.jsonl
    │   ├── basic1_bin3.jsonl
    │   ├── basic1_bin4.jsonl
    │   ├── basic1_bin5.jsonl
    │   ├── basic20_bin1.jsonl
    │   ├── basic20_bin2.jsonl
    │   ├── basic20_bin3.jsonl
    │   ├── basic20_bin4.jsonl
    │   ├── basic20_bin5.jsonl
    │   ├── basic21_bin1.jsonl
    │   ├── basic21_bin2.jsonl
    │   ├── basic21_bin3.jsonl
    │   ├── basic21_bin4.jsonl
    │   ├── basic21_bin5.jsonl
    │   ├── basic22_bin1.jsonl
    │   ├── basic22_bin2.jsonl
    │   ├── basic22_bin3.jsonl
    │   ├── basic22_bin4.jsonl
    │   ├── basic22_bin5.jsonl
    │   ├── basic23_bin1.jsonl
    │   ├── basic23_bin2.jsonl
    │   ├── basic23_bin3.jsonl
    │   ├── basic23_bin4.jsonl
    │   ├── basic23_bin5.jsonl
    │   ├── basic24_bin1.jsonl
    │   ├── basic24_bin2.jsonl
    │   ├── basic24_bin3.jsonl
    │   ├── basic24_bin4.jsonl
    │   ├── basic24_bin5.jsonl
    │   ├── basic25_bin1.jsonl
    │   ├── basic25_bin2.jsonl
    │   ├── basic25_bin3.jsonl
    │   ├── basic25_bin4.jsonl
    │   ├── basic25_bin5.jsonl
    │   ├── basic2_bin1.jsonl
    │   ├── basic2_bin2.jsonl
    │   ├── basic2_bin3.jsonl
    │   ├── basic2_bin4.jsonl
    │   ├── basic2_bin5.jsonl
    │   ├── basic3_bin1.jsonl
    │   ├── basic3_bin2.jsonl
    │   ├── basic3_bin3.jsonl
    │   ├── basic3_bin4.jsonl
    │   ├── basic3_bin5.jsonl
    │   ├── basic4_bin1.jsonl
    │   ├── basic4_bin2.jsonl
    │   ├── basic4_bin3.jsonl
    │   ├── basic4_bin4.jsonl
    │   ├── basic4_bin5.jsonl
    │   ├── basic5_bin1.jsonl
    │   ├── basic5_bin2.jsonl
    │   ├── basic5_bin3.jsonl
    │   ├── basic5_bin4.jsonl
    │   ├── basic5_bin5.jsonl
    │   ├── basic6_bin1.jsonl
    │   ├── basic6_bin2.jsonl
    │   ├── basic6_bin3.jsonl
    │   ├── basic6_bin4.jsonl
    │   ├── basic6_bin5.jsonl
    │   ├── basic7_bin1.jsonl
    │   ├── basic7_bin2.jsonl
    │   ├── basic7_bin3.jsonl
    │   ├── basic7_bin4.jsonl
    │   ├── basic7_bin5.jsonl
    │   ├── basic8_bin1.jsonl
    │   ├── basic8_bin2.jsonl
    │   ├── basic8_bin3.jsonl
    │   ├── basic8_bin4.jsonl
    │   ├── basic8_bin5.jsonl
    │   ├── basic9_bin1.jsonl
    │   ├── basic9_bin2.jsonl
    │   ├── basic9_bin3.jsonl
    │   ├── basic9_bin4.jsonl
    │   └── basic9_bin5.jsonl
    ├── swap
    │   ├── cot13_bin5.jsonl
    │   ├── cot14_bin5.jsonl
    │   ├── cot4_bin5.jsonl
    │   ├── cot5_bin5.jsonl
    │   ├── swap13c_bin5.jsonl
    │   ├── swap14c_bin5.jsonl
    │   ├── swap4c_bin5.jsonl
    │   └── swap5c_bin5.jsonl
    └── text_cot
    │   ├── cot10_bin1.jsonl
    │   ├── cot10_bin2.jsonl
    │   ├── cot10_bin3.jsonl
    │   ├── cot10_bin4.jsonl
    │   ├── cot10_bin5.jsonl
    │   ├── cot11_bin1.jsonl
    │   ├── cot11_bin2.jsonl
    │   ├── cot11_bin3.jsonl
    │   ├── cot11_bin4.jsonl
    │   ├── cot11_bin5.jsonl
    │   ├── cot12_bin1.jsonl
    │   ├── cot12_bin2.jsonl
    │   ├── cot12_bin3.jsonl
    │   ├── cot12_bin4.jsonl
    │   ├── cot12_bin5.jsonl
    │   ├── cot13_bin1.jsonl
    │   ├── cot13_bin2.jsonl
    │   ├── cot13_bin3.jsonl
    │   ├── cot13_bin4.jsonl
    │   ├── cot13_bin5.jsonl
    │   ├── cot14_bin1.jsonl
    │   ├── cot14_bin2.jsonl
    │   ├── cot14_bin3.jsonl
    │   ├── cot14_bin4.jsonl
    │   ├── cot14_bin5.jsonl
    │   ├── cot15_bin1.jsonl
    │   ├── cot15_bin2.jsonl
    │   ├── cot15_bin3.jsonl
    │   ├── cot15_bin4.jsonl
    │   ├── cot15_bin5.jsonl
    │   ├── cot16_bin1.jsonl
    │   ├── cot16_bin2.jsonl
    │   ├── cot16_bin3.jsonl
    │   ├── cot16_bin4.jsonl
    │   ├── cot16_bin5.jsonl
    │   ├── cot17_bin1.jsonl
    │   ├── cot17_bin2.jsonl
    │   ├── cot17_bin3.jsonl
    │   ├── cot17_bin4.jsonl
    │   ├── cot17_bin5.jsonl
    │   ├── cot18_bin1.jsonl
    │   ├── cot18_bin2.jsonl
    │   ├── cot18_bin3.jsonl
    │   ├── cot18_bin4.jsonl
    │   ├── cot18_bin5.jsonl
    │   ├── cot19_bin1.jsonl
    │   ├── cot19_bin2.jsonl
    │   ├── cot19_bin3.jsonl
    │   ├── cot19_bin4.jsonl
    │   ├── cot19_bin5.jsonl
    │   ├── cot1_bin1.jsonl
    │   ├── cot1_bin2.jsonl
    │   ├── cot1_bin3.jsonl
    │   ├── cot1_bin4.jsonl
    │   ├── cot1_bin5.jsonl
    │   ├── cot20_bin1.jsonl
    │   ├── cot20_bin2.jsonl
    │   ├── cot20_bin3.jsonl
    │   ├── cot20_bin4.jsonl
    │   ├── cot20_bin5.jsonl
    │   ├── cot21_bin1.jsonl
    │   ├── cot21_bin2.jsonl
    │   ├── cot21_bin3.jsonl
    │   ├── cot21_bin4.jsonl
    │   ├── cot21_bin5.jsonl
    │   ├── cot22_bin1.jsonl
    │   ├── cot22_bin2.jsonl
    │   ├── cot22_bin3.jsonl
    │   ├── cot22_bin4.jsonl
    │   ├── cot22_bin5.jsonl
    │   ├── cot23_bin1.jsonl
    │   ├── cot23_bin2.jsonl
    │   ├── cot23_bin3.jsonl
    │   ├── cot23_bin4.jsonl
    │   ├── cot23_bin5.jsonl
    │   ├── cot24_bin1.jsonl
    │   ├── cot24_bin2.jsonl
    │   ├── cot24_bin3.jsonl
    │   ├── cot24_bin4.jsonl
    │   ├── cot24_bin5.jsonl
    │   ├── cot25_bin1.jsonl
    │   ├── cot25_bin2.jsonl
    │   ├── cot25_bin3.jsonl
    │   ├── cot25_bin4.jsonl
    │   ├── cot25_bin5.jsonl
    │   ├── cot2_bin1.jsonl
    │   ├── cot2_bin2.jsonl
    │   ├── cot2_bin3.jsonl
    │   ├── cot2_bin4.jsonl
    │   ├── cot2_bin5.jsonl
    │   ├── cot3_bin1.jsonl
    │   ├── cot3_bin2.jsonl
    │   ├── cot3_bin3.jsonl
    │   ├── cot3_bin4.jsonl
    │   ├── cot3_bin5.jsonl
    │   ├── cot4_bin1.jsonl
    │   ├── cot4_bin2.jsonl
    │   ├── cot4_bin3.jsonl
    │   ├── cot4_bin4.jsonl
    │   ├── cot4_bin5.jsonl
    │   ├── cot5_bin1.jsonl
    │   ├── cot5_bin2.jsonl
    │   ├── cot5_bin3.jsonl
    │   ├── cot5_bin4.jsonl
    │   ├── cot5_bin5.jsonl
    │   ├── cot6_bin1.jsonl
    │   ├── cot6_bin2.jsonl
    │   ├── cot6_bin3.jsonl
    │   ├── cot6_bin4.jsonl
    │   ├── cot6_bin5.jsonl
    │   ├── cot7_bin1.jsonl
    │   ├── cot7_bin2.jsonl
    │   ├── cot7_bin3.jsonl
    │   ├── cot7_bin4.jsonl
    │   ├── cot7_bin5.jsonl
    │   ├── cot8_bin1.jsonl
    │   ├── cot8_bin2.jsonl
    │   ├── cot8_bin3.jsonl
    │   ├── cot8_bin4.jsonl
    │   ├── cot8_bin5.jsonl
    │   ├── cot9_bin1.jsonl
    │   ├── cot9_bin2.jsonl
    │   ├── cot9_bin3.jsonl
    │   ├── cot9_bin4.jsonl
    │   └── cot9_bin5.jsonl
└── stimulus_generator.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # custom
  2 | seven_letter_words/random_pairs_lower*
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | cover/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | .pybuilder/
 79 | target/
 80 | 
 81 | # Jupyter Notebook
 82 | .ipynb_checkpoints
 83 | 
 84 | # IPython
 85 | profile_default/
 86 | ipython_config.py
 87 | 
 88 | # pyenv
 89 | #   For a library or package, you might want to ignore these files since the code is
 90 | #   intended to run in multiple environments; otherwise, check them in:
 91 | # .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # poetry
101 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
103 | #   commonly ignored for libraries.
104 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105 | #poetry.lock
106 | 
107 | # pdm
108 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109 | #pdm.lock
110 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111 | #   in version control.
112 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
113 | .pdm.toml
114 | .pdm-python
115 | .pdm-build/
116 | 
117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118 | __pypackages__/
119 | 
120 | # Celery stuff
121 | celerybeat-schedule
122 | celerybeat.pid
123 | 
124 | # SageMath parsed files
125 | *.sage.py
126 | 
127 | # Environments
128 | .env
129 | .venv
130 | env/
131 | venv/
132 | ENV/
133 | env.bak/
134 | venv.bak/
135 | 
136 | # Spyder project settings
137 | .spyderproject
138 | .spyproject
139 | 
140 | # Rope project settings
141 | .ropeproject
142 | 
143 | # mkdocs documentation
144 | /site
145 | 
146 | # mypy
147 | .mypy_cache/
148 | .dmypy.json
149 | dmypy.json
150 | 
151 | # Pyre type checker
152 | .pyre/
153 | 
154 | # pytype static type analyzer
155 | .pytype/
156 | 
157 | # Cython debug symbols
158 | cython_debug/
159 | 
160 | # PyCharm
161 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
164 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
165 | #.idea/
166 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # deciphering_cot
 2 | 
 3 | Code implementation and data for the paper: 
 4 | 
 5 | **[Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: Probability, Memorization, and Noisy Reasoning](https://arxiv.org/abs/2407.01687)**
 6 | 
 7 | [Akshara Prabhakar](https://aksh555.github.io/), [Thomas L. Griffiths](https://cocosci.princeton.edu/tom/index.php), [R. Thomas McCoy](https://rtmccoy.com/)
 8 | 
 9 | <img src="assets/preview.png">
10 | 
11 | ## Quickstart
12 | ### Data
13 | We construct a dataset of seven-letter words divided into 5 probability bins {bin1 to bin 5} each having around 150 words (first 100 to evaluate GPT-4 and remaining to evaluate the logistic regression model that was fitted on the first 100 words). The binning is done based on the log probability value assigned by GPT-2. 
14 | 
15 | The seven-letter word dataset is in [seven_letter_words](seven_letter_words):
16 | - bin1_prob.txt
17 | - bin2_prob.txt
18 | - bin3_prob.txt
19 | - bin4_prob.txt
20 | - bin5_prob.txt
21 | 
22 | ### Shift cipher stimuli
23 | Using the seven-letter word dataset, we prepare stimuli -- these are shift cipher encoded versions of the words from the 5 probability bins across 25 shift levels (1 to 25).
24 | 
25 | The stimuli are prepared for the different types of prompts we use: `standard`, `text_cot`, `math_cot`, `number_cot`.
26 | 
27 | Can be created by running,
28 | ```bash
29 | python stimulus_generator.py --prompt_type <text_cot> 
30 | ```
31 | 
32 | ### Evaluating LLMs on shift ciphers
33 | - GPT-4: `run_openai.py`
34 | - Llama 3.1: `run_llama.py`
35 | - Claude 3: `run_claude.py`
36 | 
37 | Set appropriate OpenAI, Together, Anthropic keys in the environment before running evaluations.
38 | 
39 | For example to run experiments on GPT-4 with Text-CoT for shift_level=1 across all 5 bins run,
40 | ```bash
41 | python run_openai.py --tasks text_cot1 --conditions bin1,bin2,bin3,bin4,bin5 --max_tokens 200 --prompt_type text_cot
42 | ```
43 | 
44 | To evaluate the generations, run
45 | ```bash 
46 | python eval.py --prompt_type text_cot --create_stats_table
47 | ```
48 | Run this after evaluating GPT-4 across all shift levels and bins. This will generate the evluation statistics for `text_cot` across all shift levels and the `{prompt_type}_train_table.tsv` file which is the train statistics table for fitting the logistic regression.
49 | 
50 | ### Logistic regression
51 | The logistic regression is implemented in R in [regression.ipynb](regression/regression.ipynb). The predictions on the test set are saved in `regression/text_cot_test_results.tsv`.
52 | 
53 | ### Outputs
54 | All model generations and outputs are stored in the `logs` directory.
55 | 
56 | ## Citation
57 | If you find this repository helpful, feel free to cite our [publication](https://arxiv.org/abs/2407.01687).
58 | ```
59 | @inproceedings{prabhakar-etal-2024-deciphering,
60 |     title = "Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: Probability, Memorization, and Noisy Reasoning",
61 |     author = "Prabhakar, Akshara  and
62 |       Griffiths, Thomas L.  and
63 |       McCoy, R. Thomas",
64 |     editor = "Al-Onaizan, Yaser  and
65 |       Bansal, Mohit  and
66 |       Chen, Yun-Nung",
67 |     booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
68 |     month = nov,
69 |     year = "2024",
70 |     address = "Miami, Florida, USA",
71 |     publisher = "Association for Computational Linguistics",
72 |     url = "https://aclanthology.org/2024.findings-emnlp.212",
73 |     pages = "3710--3724"
74 | }
75 | ```
76 | 


--------------------------------------------------------------------------------
/assets/preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aksh555/deciphering_cot/21783c74d88ed690d657544b4503db09a1054239/assets/preview.png


--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | from Levenshtein import distance
  4 | import statistics
  5 | import jsonlines
  6 | import sys
  7 | import pandas as pd
  8 | import argparse
  9 | 
 10 | end_after_strings = ["Original text: ", "message is:", "original text is:", "message is ", "we get:"]
 11 | # end_after_strings = ["Therefore, the original sequence of numbers is:","Original sequence:"]
 12 | delete_after_strings = ["However, this doesn't make sense", "However, this doesn't make much sense", "This sentence still doesn't make", "However, this sentence doesn't make", "This still doesn't make sense"]
 13 | shift_freqs = [59,21,117,5,15,12,6,3,1,3,3,7,1225,5,2,4,2,2,1,1,4,2,17,3,7]
 14 | 
 15 | def desc(idx,gt_chain,pred_chain,gt,res):
 16 |     print("#", idx)
 17 |     print("gt_chain", gt_chain)
 18 |     print("----")
 19 |     print("pred_chain", pred_chain)
 20 |     print("----")
 21 |     print("gt", gt, "res", res)
 22 |     print("**************")
 23 | 
 24 | def main(args):
 25 |     data_types = ["bin1","bin2","bin3","bin4","bin5"]
 26 |     big_df = pd.DataFrame()
 27 |     prompt_type = args.prompt_type
 28 |     fo_directory = f"logs/{prompt_type}/"
 29 |     temp = 0.0
 30 |     corrupt = False
 31 |     chain_check = False
 32 |     chain_directory = "shift_chain/"
 33 |     bin_probs = {}
 34 |     for bin in data_types:
 35 |         with open(f"seven_letter_words/{bin}_prob.txt", 'r') as file:
 36 |             second_column_words = [line.split(' ')[1].strip() for line in file][:100]
 37 |             bin_probs[bin] = second_column_words
 38 |     
 39 |     for shift in range(1,26):
 40 |         for fi_label in data_types:
 41 |             pred_nchars = []
 42 |             input_nchars = []
 43 |             corrects = []
 44 |             preds = []
 45 |             gts = []
 46 |             small_df = pd.DataFrame()
 47 |             condition = prompt_type + str(shift) + "_" + fi_label
 48 |             if corrupt:
 49 |                 condition += "_nohelp2"
 50 |     
 51 |             try:
 52 |                 file = fo_directory + condition + ".json"
 53 |                 fi = open(file, "r")
 54 |                 if chain_check and prompt_type == "text_cot":
 55 |                     chain_file = chain_directory + condition + ".jsonl"
 56 |                     fi_chain = open(chain_file, "r")
 57 |                 print(f"Loading {file}")
 58 |             except:
 59 |                 print(f"\t{file} not found, skipping {fi_label} {shift}")
 60 |                 continue
 61 |             print("*"*10)
 62 |             data = json.load(fi)
 63 |             if chain_check and prompt_type == "text_cot":
 64 |                 data_chain = []
 65 |                 for line in fi_chain:
 66 |                     x = json.loads(line)
 67 |                     data_chain.append(x["chain"])
 68 | 
 69 |             count_correct = 0
 70 |             count_correct_demo = 0
 71 |             count_total = 0
 72 |             total_dist = 0
 73 |             chain_correct_op_incorrect = 0
 74 |             chain_correct_op_correct = 0
 75 |             chain_incorrect_op_correct = 0
 76 |             chain_incorrect_op_incorrect = 0
 77 |             distances = []
 78 |             for idx,(gt,res) in enumerate(zip(data["gts"], data["res"])):
 79 |                 orig_res = res[:]
 80 | 
 81 |                 for delete_after_string in delete_after_strings:
 82 |                     if delete_after_string in res:
 83 |                         starts = [m.start() for m in re.finditer(delete_after_string, res)]
 84 |                         res = res[:starts[0]].strip()
 85 |                     
 86 |                 for end_after_string in end_after_strings:
 87 |                     if end_after_string in res:
 88 |                         res = res.split(end_after_string)[1].split("\n")[0].strip()
 89 |                         if len(res) != 0:
 90 |                             continue       
 91 | 
 92 |                 if gt[0] == '"':
 93 |                     gt = gt[1:]
 94 |                 if gt[-1] == '"':
 95 |                     gt = gt[:-1]
 96 | 
 97 |                 # if gt1[0] == '"':
 98 |                 #     gt1 = gt1[1:]
 99 |                 # if gt1[-1] == '"':
100 |                 #     gt1 = gt1[:-1]
101 | 
102 |                 if len(res) != 0:
103 |                     if res[0] == '"':
104 |                         res = res[1:]
105 |                     if res[-1] == '"':
106 |                         res = res[:-1]
107 | 
108 |                 dist = distance(gt, res)
109 |                 total_dist += dist
110 |                 distances.append(dist)
111 | 
112 |                 if gt == res:
113 |                     count_correct += 1
114 |                     corrects.append(1)
115 |                 else:
116 |                     corrects.append(0)
117 |                 
118 |                 if chain_check and prompt_type == "text_cot":
119 |                     # find counts of chain correct but not output correct
120 |                     gt_chain = data_chain[idx].strip()
121 |                     pred_chain = re.split(r'Original text:', orig_res)[0].strip()
122 |                     if gt_chain == pred_chain:
123 |                         if gt != res:
124 |                             # desc(idx,gt_chain,pred_chain,gt,res)
125 |                             chain_correct_op_incorrect += 1
126 |                         else:
127 |                             chain_correct_op_correct += 1
128 |                     else:
129 |                         if gt == res:
130 |                             # desc()
131 |                             chain_incorrect_op_correct += 1
132 |                         else:
133 |                             chain_incorrect_op_incorrect += 1
134 |                 # stats
135 |                 pred_nchars.append(len(res.strip()))
136 |                 input_nchars.append(len(gt.strip()))
137 |                 preds.append(res)
138 |                 gts.append(gt)
139 | 
140 |                 count_total += 1
141 |             result_dict = {"condition": condition, "accuracy": count_correct*1.0/count_total, "lev_dist": total_dist*1.0/count_total, "median_levdist": statistics.median(distances), "temp": temp}
142 |             print(condition, "acc_inst", count_correct*1.0/count_total, "acc_demo", count_correct_demo*1.0/count_total, "levdist:", total_dist*1.0/count_total, "median levdist:", statistics.median(distances))
143 | 
144 |             ## For fine-grained analysis of 'unfaithfulness'
145 |             if chain_check:
146 |                 result_dict.update({"chain_correct_op_correct" : chain_correct_op_correct, "chain_correct_op_incorrect" : chain_correct_op_incorrect, "chain_incorrect_op_correct" : chain_incorrect_op_correct, "chain_incorrect_op_incorrect" : chain_incorrect_op_incorrect})
147 |                 print("chain correct:")
148 |                 print("\toutput correct:", chain_correct_op_correct, "output incorrect:", chain_correct_op_incorrect)
149 |                 print("chain incorrect:")
150 |                 print("\toutput correct:", chain_incorrect_op_correct, "output incorrect:", chain_incorrect_op_incorrect)
151 |                     
152 |             if args.create_stats_table:
153 |                 with open(f'stimuli/{prompt_type}/{condition}.jsonl', 'r') as file:
154 |                     input_text = []
155 |                     for line in file:
156 |                         json_obj = json.loads(line)
157 |                         input_text.append(json_obj.get('input', ''))
158 |                     
159 |                 ## write to huge tsv
160 |                 small_df["input_nchars"] =  input_nchars
161 |                 small_df["output_logprob"] =  bin_probs[fi_label]
162 |                 small_df["correct"] = corrects
163 |                 small_df["pred"] = preds
164 |                 small_df["gt"] = gts
165 |                 small_df["shift_level"] = [shift for _ in range(len(input_nchars))]
166 |                 small_df["shift_freq"] = [shift_freqs[shift-1] for _ in range(len(input_nchars))]
167 |                 small_df["input"] = input_text
168 | 
169 |                 assert len(input_nchars) == len(pred_nchars) == len(bin_probs[fi_label]) == len(corrects)
170 |                 big_df = pd.concat([big_df, small_df], ignore_index=True)
171 | 
172 |     if args.create_stats_table: 
173 |         big_df.to_csv(f"regression/{prompt_type}_train_table.tsv","\t",index_label="index")
174 | 
175 | if __name__ == "__main__":
176 |     args = argparse.ArgumentParser()
177 |     args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot")
178 |     args.add_argument("--create_stats_table", action='store_true', help='default = False', default=False)
179 |     args = args.parse_args()
180 |     main(args)


--------------------------------------------------------------------------------
/examples/bin_1.txt:
--------------------------------------------------------------------------------
  1 | choosed
  2 | colbert
  3 | polenta
  4 | modicum
  5 | autarch
  6 | schisms
  7 | mariner
  8 | disarms
  9 | rescale
 10 | paywall
 11 | infobox
 12 | preston
 13 | shrines
 14 | implore
 15 | alloted
 16 | precast
 17 | borings
 18 | bacilli
 19 | matrice
 20 | redible
 21 | absolve
 22 | ourself
 23 | ethetic
 24 | maynard
 25 | calibur
 26 | enviros
 27 | calzone
 28 | sumatra
 29 | drywall
 30 | impaled
 31 | manland
 32 | divined
 33 | conlang
 34 | tablero
 35 | redraft
 36 | equitas
 37 | ratting
 38 | errancy
 39 | webcast
 40 | lowland
 41 | boyhood
 42 | actuary
 43 | catlike
 44 | putback
 45 | galileo
 46 | rivaled
 47 | volonte
 48 | sunspot
 49 | rotunda
 50 | notched
 51 | taproot
 52 | secures
 53 | entente
 54 | outflow
 55 | betters
 56 | rumpled
 57 | burried
 58 | repulse
 59 | fillets
 60 | relator
 61 | sombody
 62 | unsaved
 63 | ailment
 64 | nodules
 65 | montero
 66 | satires
 67 | arcadia
 68 | valerie
 69 | inglish
 70 | dukedom
 71 | espouse
 72 | bedevil
 73 | reticle
 74 | matinee
 75 | maxwell
 76 | picante
 77 | baboons
 78 | exciter
 79 | losings
 80 | newbies
 81 | serried
 82 | curving
 83 | narrows
 84 | ragging
 85 | baneful
 86 | pinatas
 87 | divison
 88 | kinfolk
 89 | indiana
 90 | caritas
 91 | silvery
 92 | inkling
 93 | absense
 94 | lavabit
 95 | outsize
 96 | rewired
 97 | absalom
 98 | getback
 99 | accuser
100 | striven
101 | maloney
102 | escaper
103 | subtile
104 | colibri
105 | delving
106 | calving
107 | tarheel
108 | herders
109 | grooved
110 | octagon
111 | bisping
112 | alluded
113 | merlion
114 | figural
115 | debater
116 | pigtail
117 | honious
118 | pinches
119 | clojure
120 | equates
121 | refiner
122 | billets
123 | alfalfa
124 | hotshot
125 | nonagon
126 | jacuzzi
127 | vincent
128 | pollock
129 | airtime


--------------------------------------------------------------------------------
/examples/bin_2.txt:
--------------------------------------------------------------------------------
  1 | dupasha
  2 | makrita
  3 | ferisse
  4 | murcers
  5 | metires
  6 | witmost
  7 | astause
  8 | sekaram
  9 | vilgren
 10 | belomat
 11 | setnest
 12 | curadal
 13 | viridon
 14 | denpick
 15 | eraully
 16 | ruborie
 17 | queimer
 18 | cosuits
 19 | rutamen
 20 | graizen
 21 | sonware
 22 | infocos
 23 | inkwang
 24 | rowbots
 25 | engeden
 26 | vizizen
 27 | molenci
 28 | indotes
 29 | dapener
 30 | ireasti
 31 | undving
 32 | traumpt
 33 | redrear
 34 | aryanni
 35 | brovoir
 36 | greised
 37 | networm
 38 | memwill
 39 | gamplus
 40 | estplay
 41 | sapwhat
 42 | indmong
 43 | kenafil
 44 | denzhou
 45 | cosited
 46 | perzoek
 47 | balinit
 48 | mayonal
 49 | armemic
 50 | henjury
 51 | lavplay
 52 | calynes
 53 | remfold
 54 | engdist
 55 | armrich
 56 | luxfast
 57 | mulhatt
 58 | allaton
 59 | strfair
 60 | monachs
 61 | kerapat
 62 | hergrim
 63 | fidgota
 64 | decigan
 65 | dezella
 66 | haypath
 67 | resonga
 68 | nosband
 69 | poligen
 70 | mobture
 71 | flufrom
 72 | willose
 73 | desedge
 74 | momclub
 75 | clobero
 76 | mapauth
 77 | vitelho
 78 | daykick
 79 | sysmite
 80 | telolon
 81 | onsensa
 82 | vipaddy
 83 | sunrink
 84 | namhero
 85 | voratio
 86 | niliter
 87 | droones
 88 | zipcord
 89 | pagrete
 90 | funwich
 91 | negbers
 92 | belwich
 93 | allayah
 94 | pakatak
 95 | farathy
 96 | betweek
 97 | rutanim
 98 | obsster
 99 | ligigid
100 | lidcore
101 | vacassa
102 | pipiday
103 | almorum
104 | sadmore
105 | hayhorn
106 | vinango
107 | cosisty
108 | libikal
109 | dogodes
110 | camcore
111 | ashmann
112 | fibunal
113 | enciere
114 | revrika
115 | perburg
116 | camilan
117 | sumarms
118 | firigin
119 | pelatra
120 | vorvery
121 | purabra
122 | indondo
123 | dogpeak
124 | alllein
125 | actblue
126 | hasvers
127 | freifty
128 | hueving
129 | coratti
130 | saprika
131 | honcoin
132 | joycons
133 | dogoids
134 | nanians
135 | dreanon
136 | spoanna
137 | levieur
138 | jawolla
139 | cowcard
140 | thehalb
141 | lamboys
142 | disorer
143 | pigwiki
144 | embious
145 | detdden
146 | vacibel


--------------------------------------------------------------------------------
/examples/bin_3.txt:
--------------------------------------------------------------------------------
  1 | tasvinc
  2 | dblshaw
  3 | cmbodka
  4 | zagbbox
  5 | hedoute
  6 | cmsdest
  7 | leoanje
  8 | sitinks
  9 | oweorno
 10 | advpite
 11 | grpwerk
 12 | aesasio
 13 | atequir
 14 | dryhazi
 15 | styansa
 16 | sunincl
 17 | bowamac
 18 | xyzunik
 19 | awsposs
 20 | ogrmode
 21 | midbyss
 22 | ctlmony
 23 | rngmony
 24 | rergett
 25 | phperti
 26 | bfdizzy
 27 | srcstit
 28 | pktubic
 29 | oddourd
 30 | mplnick
 31 | dccergy
 32 | oxyhest
 33 | klepled
 34 | digydro
 35 | aphopez
 36 | rifntag
 37 | srvlope
 38 | emoomez
 39 | toyelry
 40 | iniilen
 41 | iffamma
 42 | adsokin
 43 | eofpike
 44 | dnsavia
 45 | uitlesi
 46 | owluntu
 47 | affesda
 48 | mgrulia
 49 | foxmsgs
 50 | esiaram
 51 | subzyst
 52 | ottexpo
 53 | udpcolo
 54 | vakdney
 55 | svmvery
 56 | dspereo
 57 | pngpone
 58 | quiilyn
 59 | tgtella
 60 | ithueur
 61 | wynvinc
 62 | sezanch
 63 | sdkjabi
 64 | yaninem
 65 | dbgivid
 66 | adeardu
 67 | paykich
 68 | dspdeal
 69 | cptwipe
 70 | nikaign
 71 | pesuell
 72 | musropp
 73 | ebxside
 74 | dnienez
 75 | dccscal
 76 | cmbheck
 77 | stsasks
 78 | hapixer
 79 | nikuild
 80 | wowrapy
 81 | txtajes
 82 | gtkoooo
 83 | sutcmds
 84 | erviode
 85 | bewikon
 86 | hubphas
 87 | ervpets
 88 | ofsitem
 89 | gstivec
 90 | utfestr
 91 | etaabic
 92 | tieibur
 93 | islssel
 94 | iodvari
 95 | zagzept
 96 | ustjour
 97 | dexonte
 98 | bizfilt
 99 | adaowns
100 | tetibri
101 | octfirm
102 | weiudos
103 | pwdtick
104 | ttlarry
105 | stuimeo
106 | sqlstre
107 | mieipeg
108 | dueafen
109 | sndurge
110 | vezcorn
111 | ilketch
112 | zugenth
113 | rngiate
114 | ottclud
115 | aprkeep
116 | urlveal
117 | msgourd
118 | xlsboom
119 | wijagma
120 | robisbn
121 | melmlin
122 | samslot
123 | nidoust
124 | begkits
125 | arrflix
126 | ditfrau
127 | aidomid
128 | cptfoto
129 | aimrede
130 | dbgabay
131 | cidlocs
132 | booiedo
133 | mplders
134 | cptpush
135 | nahcalc
136 | amyovel
137 | wonczas
138 | mplrome
139 | edxesis
140 | adcadoo
141 | oudtems
142 | ociirut
143 | balzept
144 | avgcorp
145 | himocos
146 | ignlots
147 | baztrim


--------------------------------------------------------------------------------
/examples/bin_4.txt:
--------------------------------------------------------------------------------
  1 | voyxfff
  2 | qtyijke
  3 | mmculed
  4 | jmpytut
  5 | vtkprit
  6 | oilrxjs
  7 | vfsisex
  8 | eenqrst
  9 | nbrlyph
 10 | xmmgota
 11 | jmpquiv
 12 | rummqtt
 13 | xhrdisp
 14 | ffturaa
 15 | dexocht
 16 | xmmgett
 17 | lvljspx
 18 | zugwpdb
 19 | tidmqtt
 20 | lhsigua
 21 | sshemsp
 22 | burrgyz
 23 | vtkirie
 24 | vtkifar
 25 | rpczano
 26 | vtkinez
 27 | vtkifie
 28 | zugymce
 29 | xcbwent
 30 | watobjs
 31 | doiawks
 32 | cgiacyj
 33 | czyands
 34 | mdbgebn
 35 | atejspx
 36 | rndxito
 37 | sdkrxjs
 38 | mlxoice
 39 | mlxahan
 40 | auxjspx
 41 | jsxirms
 42 | czyrgba
 43 | makrgyz
 44 | nanighb
 45 | jsxobil
 46 | jwtgraf
 47 | vtkundy
 48 | jsxuden
 49 | pszglfw
 50 | czydamn
 51 | csvylko
 52 | wijincl
 53 | oilrgyz
 54 | mlxulan
 55 | xmmepar
 56 | lodxlsx
 57 | uczpeon
 58 | sesrgyz
 59 | pciavax
 60 | gpsilik
 61 | lhszion
 62 | slaampp
 63 | uczhtag
 64 | ouiqrst
 65 | xhrziel
 66 | pcbpiar
 67 | yumxfff
 68 | fedjspb
 69 | xmmtega
 70 | segzoek
 71 | mezgrpc
 72 | xcbophe
 73 | ngxantz
 74 | aosantd
 75 | jejymax
 76 | rerlsru
 77 | racrgyz
 78 | rndquam
 79 | mlxneau
 80 | rudcych
 81 | lotlsru
 82 | abyilog
 83 | rsaueba
 84 | jsxioso
 85 | derjspx
 86 | vfsgett
 87 | vtkjure
 88 | phyepar
 89 | vesxfff
 90 | lcdleri
 91 | ifsfeas
 92 | mmcubbo
 93 | ircemsp
 94 | pdbiesz
 95 | rpciene
 96 | iodpiar
 97 | rmslsru
 98 | rpcumno
 99 | apkckpt
100 | lcdvoir
101 | rhsncia
102 | owlsetq
103 | ifsbrtc
104 | csvowej
105 | xcborgt
106 | sutmobx
107 | iovstmt
108 | nanmqtt
109 | irqphem
110 | wndncia
111 | xcbided
112 | jsxkees
113 | cpscsrf
114 | jmppeon
115 | lhsreta
116 | dezrgyz
117 | elecsrf
118 | atrlymp
119 | iodudev
120 | xhrkses
121 | ngxjspx
122 | uczpear
123 | npmhlen
124 | pcmncmp
125 | biczoek
126 | dosorrh
127 | jejmisc
128 | kenjspx
129 | idxiaux
130 | svgiesz
131 | vtkgems
132 | glmldre
133 | dexumbn
134 | kitxfff
135 | jsxajan
136 | fmtmina
137 | gtkthew
138 | czyuess
139 | iodhait
140 | cafantd
141 | xcbredo
142 | fpswpdb
143 | xcbdogs
144 | jwtlify
145 | rsaellt
146 | pkgughs
147 | jmpccak
148 | pclvais


--------------------------------------------------------------------------------
/examples/bin_5.txt:
--------------------------------------------------------------------------------
  1 | czyjspx
  2 | xcbabwe
  3 | aktjspx
  4 | xcbcych
  5 | xcbziej
  6 | xmmeczy
  7 | qeddhcp
  8 | xcbilha
  9 | xcbacji
 10 | xcbzung
 11 | xmmobre
 12 | xcbquir
 13 | xcbrouw
 14 | ilkjspx
 15 | lijglfw
 16 | foxrgyz
 17 | jsxrouw
 18 | xcbziel
 19 | xcbagua
 20 | eidtopl
 21 | xcbximo
 22 | jwtglfw
 23 | xcbnerg
 24 | xcbateg
 25 | befjspx
 26 | xcbxlim
 27 | xcbsemi
 28 | ketglfw
 29 | lemjspx
 30 | xcbcyan
 31 | xcbsequ
 32 | xcbemer
 33 | eoscsrf
 34 | xcbphot
 35 | xcbeken
 36 | xcbolum
 37 | xcbrodu
 38 | tepjspx
 39 | xcbthro
 40 | xcbueue
 41 | oscquiv
 42 | xcbubah
 43 | xcbodzi
 44 | mlxquee
 45 | xcbmdat
 46 | xcbuell
 47 | xcbobre
 48 | xcbuhan
 49 | tasexpl
 50 | xcbueil
 51 | xcbilos
 52 | iodtopl
 53 | suttmpl
 54 | xcbhots
 55 | xcbosph
 56 | xcbuego
 57 | xcbquam
 58 | kolglfw
 59 | gesglfw
 60 | gccorrh
 61 | mezptom
 62 | xcbhecy
 63 | xcbsemb
 64 | yiijspx
 65 | meljspx
 66 | xcbunos
 67 | xcbunei
 68 | pisbrtc
 69 | vehjspx
 70 | vasrgyz
 71 | lhsrgyz
 72 | xcbighb
 73 | phyfidf
 74 | kilglfw
 75 | dukvrir
 76 | levjspx
 77 | updrgyz
 78 | xcbagas
 79 | opcrgyz
 80 | ilkjspb
 81 | curfidf
 82 | rpcighb
 83 | xcbacje
 84 | xcbilih
 85 | zugcsrf
 86 | xcbveau
 87 | rpcasje
 88 | xcbalsy
 89 | pcmrouw
 90 | xcbafil
 91 | doijspx
 92 | xcbhtub
 93 | xcbhear
 94 | xcbuele
 95 | opijspx
 96 | xcbazzo
 97 | xcboufl
 98 | akojspx
 99 | ninmqtt
100 | xcbguna
101 | idxorrh
102 | xcbheit
103 | czyxfff
104 | voyglfw
105 | dynmqtt
106 | xcbcoln
107 | vezjspx
108 | xcbocre
109 | cueorrh
110 | xmmacje
111 | mlxalsy
112 | ebxorrh
113 | xcbagal
114 | xcbzept
115 | xcbucle
116 | vesjspx
117 | xcbiser
118 | xcbseau
119 | xcbekte
120 | lapmqtt
121 | abyjspx
122 | xcbueba
123 | xcbijke
124 | xcbvoie
125 | xcbudem
126 | xcbivol
127 | xcbquoi
128 | xcbupal
129 | zugjspx
130 | xcbheel
131 | typglfw
132 | rpcinqu
133 | voyorrh
134 | tieglfw
135 | hexmqtt
136 | xcbacyj
137 | aktjspb
138 | amyjspx
139 | ackrgyz
140 | xcbokus
141 | xcbhtag
142 | togjspx
143 | xcbuely
144 | xcbffic
145 | mlxasje
146 | xcbunft
147 | wieglfw
148 | xcbufig
149 | xcbueur
150 | zagmqtt


--------------------------------------------------------------------------------
/examples/select_swap_words.py:
--------------------------------------------------------------------------------
  1 | bins = ["bin_3", "bin_4", "bin_5"]
  2 | 
  3 | words = []
  4 | for bin in bins:
  5 |     with open(f"./{bin}.txt") as file:
  6 |         words.extend([line.strip() for line in file])
  7 | 
  8 | # import tiktoken
  9 | # from collections import defaultdict
 10 | # gpt4_enc = tiktoken.get_encoding("cl100k_base")
 11 | 
 12 | # score_words_dict = defaultdict(list)
 13 | 
 14 | # for word in words:
 15 | #     tokens = len(gpt4_enc.encode(word))
 16 | #     score_words_dict[tokens].append(word)
 17 | 
 18 | alphabet = "abcdefghijklmnopqrstuvwxyz"
 19 | index2char = {}
 20 | char2index = {}
 21 | for index, char in enumerate(alphabet):
 22 |     index2char[index] = char
 23 |     char2index[char] = index
 24 | 
 25 | # similar_pairs = []
 26 | # for score, words_with_score in score_words_dict.items():
 27 | #     for i in range(len(words_with_score)):
 28 | #         word1 = words_with_score[i]
 29 | #         word2 = ""
 30 | #         for char in word1:
 31 | #             word2 += index2char[(char2index[char]+25)%26] 
 32 | #         print(word1, word2)
 33 | #         if word2 in words:
 34 | #             similar_pairs.append((word1, word2))
 35 | 
 36 | # print(len(similar_pairs))
 37 | # print(similar_pairs)
 38 | 
 39 | 
 40 | import os
 41 | os.environ['TRANSFORMERS_CACHE'] = "/n/fs/codeval/cache"
 42 | os.environ['HF_DATASETS_CACHE'] = "/n/fs/codeval/cache"
 43 | os.environ['HF_HOME'] = "/n/fs/codeval/cache"
 44 | os.environ['HF_HUB_CACHE'] = "/n/fs/codeval/cache"
 45 | 
 46 | import torch
 47 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
 48 | import tiktoken
 49 | import logging
 50 | import json
 51 | import pandas as pd
 52 | 
 53 | logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")])
 54 | 
 55 | if torch.cuda.is_available():
 56 |     device = "cuda"
 57 | else:
 58 |     device = "cpu"
 59 | 
 60 | gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
 61 | gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device)
 62 | gpt4_enc = tiktoken.get_encoding("cl100k_base")
 63 | 
 64 | def pad_batch(batch, pad_idx):
 65 |     max_length = 0
 66 |     for seq in batch:
 67 |         if len(seq) > max_length:
 68 |             max_length = len(seq)
 69 | 
 70 |     new_batch = []
 71 |     for seq in batch:
 72 |         padding = [pad_idx for i in range(max_length - len(seq))]
 73 |         new_batch.append(seq + padding)
 74 | 
 75 |     return new_batch
 76 | 
 77 | # Get perplexity using GPT-2
 78 | def prob_gpt2(sentence_list):
 79 | 
 80 |     # Tokenize the sentences
 81 |     all_tokens = []
 82 |     for sentence in sentence_list:
 83 |         tokens = gpt2_tokenizer.encode(sentence)
 84 |         all_tokens.append(tokens)
 85 |     tokens = pad_batch(all_tokens, 50256)
 86 | 
 87 |     targets = tokens[:]
 88 | 
 89 |     # Compute average log likelihood for the generation
 90 |     input_ids = torch.LongTensor(tokens).to(device)
 91 |     target_ids = torch.LongTensor(targets).to(device)
 92 | 
 93 |     with torch.no_grad():
 94 |         outputs = gpt2_model(input_ids, labels=target_ids)
 95 |         logits = outputs[1]
 96 |         logits = logits.transpose(0,1)[:-1].transpose(0,1)
 97 |         target_ids = target_ids.transpose(0,1)[1:].transpose(0,1)
 98 |         loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1))
 99 |         loss = loss.reshape(target_ids.shape).sum(dim=1)
100 |         neg_log_likelihood = -1*loss
101 | 
102 | 
103 |     # 13.357776641845703 = logprob('The word is"'); removing this to just get
104 |     # the word prob
105 |     return neg_log_likelihood + 13.357776641845703
106 | 
107 | 
108 | # folder_path = "/n/fs/codeval/embers-of-autoregression/extension/stimuli/word/"
109 | # file_list = sorted([os.path.join(folder_path, f) for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])[:1]
110 | file_list = [1]
111 | 
112 | num_token_mis = 0
113 | for finame in file_list:
114 |     # print(finame, end="**\n")
115 |     # with open(finame, 'r') as f:
116 |     #     lines = f.readlines()
117 |     # lines = [json.loads(line) for line in lines]
118 |     # fi = [line['input'] for line in lines]
119 |     # print("Lines", len(fi))
120 |     # fo = open("input_scored.txt", "a")
121 |     word_list = words
122 |     print("Lines", len(word_list))
123 | 
124 |     words_with_prob = []
125 |     word_pairs = []
126 | 
127 |     this_batch_sentences = []
128 |     this_batch_word1s = []
129 |     this_batch_words = []
130 |     num_tokens = []
131 |     for index, line in enumerate(word_list):
132 |         if index % 10000 == 0:
133 |             logging.info(str(index))
134 |     
135 |         word = line.strip()
136 |         check_shifts = [2]
137 |         for check_shift in check_shifts:
138 |             word2 = ""
139 |             word1 = ""
140 |             for char in word:
141 |                 word1 += index2char[(char2index[char]+1)%26]
142 |                 word2 += index2char[(char2index[char]+check_shift)%26]
143 | 
144 |             tokens = gpt4_enc.encode(word1)
145 |             tokens_word2 = gpt4_enc.encode(word2)
146 |             if len(tokens) > 4 or len(tokens) != len(tokens_word2):
147 |                 # print(word1, word2, len(tokens), len(tokens_word2))
148 |                 num_token_mis += 1
149 |                 continue
150 | 
151 |             tokens_spaced = gpt4_enc.encode(" " + word2)
152 | 
153 |             this_batch_sentences.append('The word is "' + word2 + '"')
154 |             this_batch_words.append(word2)
155 |             num_tokens.append(len(tokens))
156 |             this_batch_word1s.append(word1)
157 | 
158 |         # if len(tokens) == 2 and len(tokens_spaced) == 2 and len(word) == 7:
159 |         #     token1 = gpt4_enc.decode([tokens[0]]).strip()
160 |         #     token2 = gpt4_enc.decode([tokens[1]]).strip()
161 | 
162 |         #     tokenspaced1 = gpt4_enc.decode([tokens_spaced[0]]).strip()
163 |         #     tokenspaced2 = gpt4_enc.decode([tokens_spaced[1]]).strip()
164 | 
165 |         #     if len(token1) == 3 and len(token2) == 4 and len(tokenspaced1) == 3 and len(tokenspaced2) == 4:
166 |         #         this_batch_sentences.append('The word is "' + word + '"')
167 |         #         this_batch_words.append(word)
168 |         #     else:
169 |         #         print(index, "Wrong length", word, len(token1), len(token2), len(tokenspaced1), len(tokenspaced2))
170 |         # else:
171 |         #     print(index, "Wrong length", word, len(tokens), len(tokens_spaced), len(word))
172 | 
173 |         if len(this_batch_sentences) == 3000:
174 |             logprobs = prob_gpt2(this_batch_sentences)
175 |             for word1, word2, logprob in zip(this_batch_word1s, this_batch_words, logprobs):
176 |                 words_with_prob.append(logprob.item())
177 |                 if logprob.item() >= -45 and logprob.item() < -30:
178 |                     word_pairs.append([word1, word2])
179 |             this_batch_sentences = []
180 |             this_batch_words = []
181 |             this_batch_word1s = []
182 | 
183 |     if len(this_batch_sentences) > 0:
184 |         logprobs = prob_gpt2(this_batch_sentences)
185 |         for word1, word2, logprob in zip(this_batch_word1s, this_batch_words, logprobs):
186 |                 words_with_prob.append(logprob.item())
187 |                 if logprob.item() > -45 and logprob.item() < -30:
188 |                     x = prob_gpt2(['The word is "' + word1 + '"'])[-1].item()
189 |                     if x > -45 and x < -30:
190 |                         word_pairs.append([word1, word2])
191 |                         print("missed 2", word1, word2, x, logprob.item())
192 |         this_batch_sentences = []
193 |         this_batch_words = []
194 |         this_batch_word1s = []
195 | 
196 | print(num_token_mis)
197 | print(len(word_pairs))
198 | f = open("./word_pairs_lowbins.txt", 'a+')
199 | for pair in word_pairs:
200 |     f.write(pair[0] + "\t" + pair[1] + "\n")
201 | 
202 | f.close()
203 | 
204 | 
205 | 


--------------------------------------------------------------------------------
/examples/word_pairs_lowbins.txt:
--------------------------------------------------------------------------------
  1 | ubtwjod	vcuxkpe
  2 | ecmtibx	fdnujcy
  3 | ifepvuf	jgfqwvg
  4 | dnteftu	eoufguv
  5 | tjujolt	ukvkpmu
  6 | bewqjuf	cfxrkvg
  7 | hsqxfsl	itrygtm
  8 | bftbtjp	cgucukq
  9 | bufrvjs	cvgswkt
 10 | tuzbotb	uvacpuc
 11 | cpxbnbd	dqycoce
 12 | yzavojl	zabwpkm
 13 | phsnpef	qitoqfg
 14 | dumnpoz	evnoqpa
 15 | sohnpoz	tpioqpa
 16 | cgejaaz	dhfkbba
 17 | tsdtuju	uteuvkv
 18 | qluvcjd	rmvwdke
 19 | peepvse	qffqwtf
 20 | lmfqmfe	mngrngf
 21 | sjgoubh	tkhpvci
 22 | fnppnfa	goqqogb
 23 | jojjmfo	kpkkngp
 24 | eotbwjb	fpucxkc
 25 | vjumftj	wkvnguk
 26 | bggfteb	chhgufc
 27 | nhsvmjb	oitwnkc
 28 | ftjbsbn	gukctco
 29 | tvcaztu	uwdbauv
 30 | puufyqp	qvvgzrq
 31 | veqdpmp	wfreqnq
 32 | wbleofz	xcmfpga
 33 | etqfsfp	furgtgq
 34 | qohqpof	rpirqpg
 35 | uhufmmb	vivgnnc
 36 | juivfvs	kvjwgwt
 37 | telkbcj	ufmlcdk
 38 | echjwje	fdikxkf
 39 | qbzljdi	rcamkej
 40 | ojlbjho	pkmckip
 41 | qftvfmm	rguwgnn
 42 | nvtspqq	owutqrr
 43 | fcytjef	gdzukfg
 44 | eojfofa	fpkgpgb
 45 | eddtdbm	feeuecn
 46 | dncifdl	eodjgem
 47 | hulpppp	ivmqqqq
 48 | cfxjlpo	dgykmqp
 49 | vugftus	wvhguvt
 50 | ujfjcvs	vkgkdwt
 51 | jtmttfm	kunuugn
 52 | efypouf	fgzqpvg
 53 | cjagjmu	dkbhknv
 54 | ufujcsj	vgvkdtk
 55 | pdugjsn	qevhkto
 56 | xfjvept	ygkwfqu
 57 | qxeujdl	ryfvkem
 58 | uumbssz	vvnctta
 59 | tuvjnfp	uvwkogq
 60 | trmtusf	usnuvtg
 61 | evfbgfo	fwgchgp
 62 | toevshf	upfwtig
 63 | jmlfudi	knmgvej
 64 | sohjbuf	tpikcvg
 65 | ymtcppn	znudqqo
 66 | spcjtco	tqdkudp
 67 | ojepvtu	pkfqwuv
 68 | cfhljut	dgimkvu
 69 | bjepnje	ckfqokf
 70 | dqugpup	ervhqvq
 71 | echbcbz	fdicdca
 72 | bnzpwfm	coaqxgn
 73 | feyftjt	gfzguku
 74 | pveufnt	qwfvgou
 75 | ijnpdpt	jkoqequ
 76 | wpzyggg	xqazhhh
 77 | ruzjklf	svaklmg
 78 | nndvmfe	ooewngf
 79 | wulqsju	xvmrtkv
 80 | fforstu	ggpstuv
 81 | ocsmzqi	pdtnarj
 82 | yisejtq	zjtfkur
 83 | gguvsbb	hhvwtcc
 84 | ynnhfuu	zooigvv
 85 | avhxqec	bwiyrfd
 86 | mitjhvb	njukiwc
 87 | ttifntq	uujgour
 88 | cvsshza	dwttiab
 89 | wuljsjf	xvmktkg
 90 | sqdabop	trebcpq
 91 | ydcxfou	zedygpv
 92 | epjbxlt	fqkcymu
 93 | dhjbdzk	eikceal
 94 | dazboet	ebacpfu
 95 | bufktqy	cvglurz
 96 | telsykt	ufmtzlu
 97 | nmypjdf	onzqkeg
 98 | ktyjsnt	luzktou
 99 | dazshcb	ebatidc
100 | nblshza	ocmtiab
101 | obojhic	pcpkijd
102 | wulvoez	xvmwpfa
103 | nmyvmbo	onzwncp
104 | vdaqfpo	webrgqp
105 | qdjbwby	rekcxcz
106 | hqtjmjl	iruknkm
107 | vdaiubh	webjvci
108 | pvjrstu	qwkstuv
109 | gfektqc	hgflurd
110 | ydcpqif	zedqrjg
111 | ohyboua	pizcpvb
112 | sfsmtsv	tgtnutw
113 | svedzdi	twfeaej
114 | mpumtsv	nqvnutw
115 | bczjmph	cdaknqi
116 | stbvfcb	tucwgdc
117 | wgthfuu	xhuigvv
118 | qizfqbs	rjagrct
119 | wftyggg	xguzhhh
120 | mdemfsj	nefngtk
121 | sntmtsv	tounutw
122 | sqdvnop	trewopq
123 | mdewpjs	nefxqkt
124 | jgtcsud	khudtve
125 | tvunpcy	uwvoqdz
126 | xoeodjb	ypfpekc
127 | ydcjefe	zedkfgf
128 | ktylfft	luzmggu
129 | busmznq	cvtnaor
130 | yisltft	zjtmugu
131 | vdaqfbs	webrgct
132 | eptpssi	fquqttj
133 | wulhfnt	xvmigou
134 | hmnmesf	inonftg
135 | efyvnco	fgzwodp
136 | ljuyggg	mkvzhhh
137 | ktybkbo	luzclcp
138 | huluifx	ivmvjgy
139 | ydcepht	zedfqiu
140 | stbfmmu	tucgnnv
141 | qdmwbjt	renxcku
142 | ydcbcxf	zedcdyg
143 | bluktqy	cmvlurz
144 | ydcdzdi	zedeaej
145 | ydcspvx	zedtqwy
146 | gpyshza	hqztiab
147 | ydcbufh	zedcvgi
148 | ydcdzbo	zedeacp
149 | ydcflfo	zedgmgp
150 | ydcspev	zedtqfw
151 | ydcuisp	zedvjtq
152 | ydcpeaj	zedqfbk
153 | ydcvfmm	zedwgnn
154 | ydcpcsf	zedqdtg
155 | ubtfyqm	vcugzrn
156 | ydcptqi	zedqurj
157 | ydcvfhp	zedwgiq
158 | hddpssi	ieeqttj
159 | ydcifdz	zedjgea
160 | qjtcsud	rkudtve
161 | ljmhmgx	mkninhy
162 | evlwsjs	fwmxtkt
163 | vqeshza	wrftiab
164 | ydcbhbt	zedcicu
165 | pqdshza	qretiab
166 | dvsgjeg	ewthkfh
167 | qdnspvx	reotqwy
168 | ydciuvc	zedjvwd
169 | ydcifbs	zedjgct
170 | ydcbaap	zedcbbq
171 | ydcpvgm	zedqwhn
172 | blpktqy	cmqlurz
173 | ydcifju	zedjgkv
174 | wpzhmgx	xqainhy
175 | wfaktqy	xgblurz
176 | ydcpdsf	zedqetg
177 | ynnbdkf	zoocelg
178 | ydcbhbm	zedcicn
179 | ydcjtfs	zedkugt
180 | mbqnruu	ncrosvv
181 | ydcwpjf	zedxqkg
182 | ydcrvpj	zedswqk
183 | ydciffm	zedjggn
184 | sqdjorv	trekpsw
185 | ujfhmgx	vkginhy
186 | ifynruu	jgzosvv
187 | bluktqc	cmvlurd
188 | ydcplvt	zedqmwu
189 | ydciubh	zedjvci
190 | ubtwjod	vcuxkpe
191 | ecmtibx	fdnujcy
192 | ifepvuf	jgfqwvg
193 | dnteftu	eoufguv
194 | tjujolt	ukvkpmu
195 | bewqjuf	cfxrkvg
196 | hsqxfsl	itrygtm
197 | bftbtjp	cgucukq
198 | bufrvjs	cvgswkt
199 | tuzbotb	uvacpuc
200 | cpxbnbd	dqycoce
201 | yzavojl	zabwpkm
202 | phsnpef	qitoqfg
203 | dumnpoz	evnoqpa
204 | sohnpoz	tpioqpa
205 | cgejaaz	dhfkbba
206 | tsdtuju	uteuvkv
207 | qluvcjd	rmvwdke
208 | peepvse	qffqwtf
209 | sjgoubh	tkhpvci
210 | fnppnfa	goqqogb
211 | jojjmfo	kpkkngp
212 | eotbwjb	fpucxkc
213 | vjumftj	wkvnguk
214 | bggfteb	chhgufc
215 | nhsvmjb	oitwnkc
216 | ftjbsbn	gukctco
217 | tvcaztu	uwdbauv
218 | puufyqp	qvvgzrq
219 | veqdpmp	wfreqnq
220 | wbleofz	xcmfpga
221 | etqfsfp	furgtgq
222 | qohqpof	rpirqpg
223 | uhufmmb	vivgnnc
224 | juivfvs	kvjwgwt
225 | telkbcj	ufmlcdk
226 | echjwje	fdikxkf
227 | qbzljdi	rcamkej
228 | ojlbjho	pkmckip
229 | qftvfmm	rguwgnn
230 | nvtspqq	owutqrr
231 | fcytjef	gdzukfg
232 | eojfofa	fpkgpgb
233 | eddtdbm	feeuecn
234 | dncifdl	eodjgem
235 | hulpppp	ivmqqqq
236 | cfxjlpo	dgykmqp
237 | vugftus	wvhguvt
238 | ujfjcvs	vkgkdwt
239 | jtmttfm	kunuugn
240 | efypouf	fgzqpvg
241 | cjagjmu	dkbhknv
242 | ufujcsj	vgvkdtk
243 | pdugjsn	qevhkto
244 | xfjvept	ygkwfqu
245 | qxeujdl	ryfvkem
246 | uumbssz	vvnctta
247 | tuvjnfp	uvwkogq
248 | trmtusf	usnuvtg
249 | evfbgfo	fwgchgp
250 | toevshf	upfwtig
251 | jmlfudi	knmgvej
252 | sohjbuf	tpikcvg
253 | ymtcppn	znudqqo
254 | spcjtco	tqdkudp
255 | ojepvtu	pkfqwuv
256 | cfhljut	dgimkvu
257 | bjepnje	ckfqokf
258 | dqugpup	ervhqvq
259 | echbcbz	fdicdca
260 | bnzpwfm	coaqxgn
261 | feyftjt	gfzguku
262 | pveufnt	qwfvgou
263 | ijnpdpt	jkoqequ
264 | wpzyggg	xqazhhh
265 | ruzjklf	svaklmg
266 | nndvmfe	ooewngf
267 | wulqsju	xvmrtkv
268 | fforstu	ggpstuv
269 | ocsmzqi	pdtnarj
270 | yisejtq	zjtfkur
271 | gguvsbb	hhvwtcc
272 | ynnhfuu	zooigvv
273 | avhxqec	bwiyrfd
274 | mitjhvb	njukiwc
275 | ttifntq	uujgour
276 | cvsshza	dwttiab
277 | wuljsjf	xvmktkg
278 | sqdabop	trebcpq
279 | ydcxfou	zedygpv
280 | epjbxlt	fqkcymu
281 | dhjbdzk	eikceal
282 | dazboet	ebacpfu
283 | bufktqy	cvglurz
284 | telsykt	ufmtzlu
285 | nmypjdf	onzqkeg
286 | ktyjsnt	luzktou
287 | dazshcb	ebatidc
288 | nblshza	ocmtiab
289 | obojhic	pcpkijd
290 | wulvoez	xvmwpfa
291 | nmyvmbo	onzwncp
292 | vdaqfpo	webrgqp
293 | qdjbwby	rekcxcz
294 | hqtjmjl	iruknkm
295 | vdaiubh	webjvci
296 | pvjrstu	qwkstuv
297 | gfektqc	hgflurd
298 | ohyboua	pizcpvb
299 | sfsmtsv	tgtnutw
300 | svedzdi	twfeaej
301 | mpumtsv	nqvnutw
302 | bczjmph	cdaknqi
303 | stbvfcb	tucwgdc
304 | wgthfuu	xhuigvv
305 | qizfqbs	rjagrct
306 | wftyggg	xguzhhh
307 | mdemfsj	nefngtk
308 | sntmtsv	tounutw
309 | sqdvnop	trewopq
310 | mdewpjs	nefxqkt
311 | jgtcsud	khudtve
312 | tvunpcy	uwvoqdz
313 | xoeodjb	ypfpekc
314 | ydcjefe	zedkfgf
315 | ktylfft	luzmggu
316 | busmznq	cvtnaor
317 | yisltft	zjtmugu
318 | vdaqfbs	webrgct
319 | eptpssi	fquqttj
320 | wulhfnt	xvmigou
321 | hmnmesf	inonftg
322 | efyvnco	fgzwodp
323 | ljuyggg	mkvzhhh
324 | ktybkbo	luzclcp
325 | huluifx	ivmvjgy
326 | ydcepht	zedfqiu
327 | stbfmmu	tucgnnv
328 | qdmwbjt	renxcku
329 | ydcbcxf	zedcdyg
330 | bluktqy	cmvlurz
331 | ydcdzdi	zedeaej
332 | ydcspvx	zedtqwy
333 | gpyshza	hqztiab
334 | ydcbufh	zedcvgi
335 | ydcdzbo	zedeacp
336 | ydcflfo	zedgmgp
337 | ydcspev	zedtqfw
338 | ydcpeaj	zedqfbk
339 | ydcvfmm	zedwgnn
340 | ydcpcsf	zedqdtg
341 | ubtfyqm	vcugzrn
342 | ydcptqi	zedqurj
343 | ydcvfhp	zedwgiq
344 | hddpssi	ieeqttj
345 | ydcifdz	zedjgea
346 | qjtcsud	rkudtve
347 | ljmhmgx	mkninhy
348 | evlwsjs	fwmxtkt
349 | vqeshza	wrftiab
350 | ydcbhbt	zedcicu
351 | pqdshza	qretiab
352 | dvsgjeg	ewthkfh
353 | qdnspvx	reotqwy
354 | ydciuvc	zedjvwd
355 | ydcifbs	zedjgct
356 | ydcbaap	zedcbbq
357 | ydcpvgm	zedqwhn
358 | blpktqy	cmqlurz
359 | ydcifju	zedjgkv
360 | wpzhmgx	xqainhy
361 | wfaktqy	xgblurz
362 | ydcpdsf	zedqetg
363 | ynnbdkf	zoocelg
364 | ydcbhbm	zedcicn
365 | ydcjtfs	zedkugt
366 | mbqnruu	ncrosvv
367 | ydcwpjf	zedxqkg
368 | ydcrvpj	zedswqk
369 | ydciffm	zedjggn
370 | sqdjorv	trekpsw
371 | ujfhmgx	vkginhy
372 | ifynruu	jgzosvv
373 | bluktqc	cmvlurd
374 | ydcplvt	zedqmwu
375 | ydciubh	zedjvci
376 | 


--------------------------------------------------------------------------------
/logs/basic/claude-3/results.jsonl:
--------------------------------------------------------------------------------
  1 | {"condition": "basic1_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  2 | {"condition": "basic1_bin2", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.25, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
  3 | {"condition": "basic1_bin3", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 2.2, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  4 | {"condition": "basic1_bin4", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 2.34, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  5 | {"condition": "basic1_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 2.21, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  6 | {"condition": "basic2_bin1", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.36, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
  7 | {"condition": "basic2_bin2", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.18, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  8 | {"condition": "basic2_bin3", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.26, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
  9 | {"condition": "basic2_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.01, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 10 | {"condition": "basic2_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.76, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 11 | {"condition": "basic3_bin1", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.84, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 12 | {"condition": "basic3_bin2", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 1.56, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.63, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0}
 14 | {"condition": "basic3_bin4", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 3.92, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 15 | {"condition": "basic3_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.75, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 16 | {"condition": "basic4_bin1", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 3.48, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 17 | {"condition": "basic4_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.17, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 18 | {"condition": "basic4_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.28, "median_levdist": 4.5, "model": "claude-3", "temp": 0.0}
 19 | {"condition": "basic4_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.65, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 20 | {"condition": "basic4_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 21 | {"condition": "basic5_bin1", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.12, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 22 | {"condition": "basic5_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.2, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 23 | {"condition": "basic5_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.85, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 24 | {"condition": "basic5_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.16, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.7, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 26 | {"condition": "basic6_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.54, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.21, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 28 | {"condition": "basic6_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.16, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 29 | {"condition": "basic6_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.3, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 30 | {"condition": "basic6_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.81, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 31 | {"condition": "basic7_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.11, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 32 | {"condition": "basic7_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.55, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.51, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 34 | {"condition": "basic7_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.65, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.31, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 36 | {"condition": "basic8_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 37 | {"condition": "basic8_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.45, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 38 | {"condition": "basic8_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.49, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.68, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 41 | {"condition": "basic9_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.6, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.08, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.96, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.03, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.66, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 46 | {"condition": "basic10_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.75, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 49 | {"condition": "basic10_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.93, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.42, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.79, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.04, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.08, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.4, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.59, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.66, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.67, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.86, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.39, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 61 | {"condition": "basic13_bin1", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 2.58, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 62 | {"condition": "basic13_bin2", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.58, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 63 | {"condition": "basic13_bin3", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 4.53, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 64 | {"condition": "basic13_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 4.28, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 65 | {"condition": "basic13_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.95, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 66 | {"condition": "basic14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.78, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.97, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.26, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.67, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.07, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.95, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.13, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.32, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.68, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.78, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.2, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.65, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 81 | {"condition": "basic17_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.85, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.13, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.39, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.55, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.09, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.64, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.8, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.87, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.98, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.1, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.58, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.57, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.34, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.27, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.47, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.58, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.49, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.0, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.06, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.18, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.33, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.5, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.17, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.24, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.66, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.69, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
111 | {"condition": "basic23_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.05, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
112 | {"condition": "basic23_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.48, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
113 | {"condition": "basic23_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.7, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
114 | {"condition": "basic23_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.48, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
115 | {"condition": "basic23_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.65, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
116 | {"condition": "basic24_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
117 | {"condition": "basic24_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.31, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
118 | {"condition": "basic24_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.68, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
119 | {"condition": "basic24_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
120 | {"condition": "basic24_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.41, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
121 | {"condition": "basic25_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 2.98, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
122 | {"condition": "basic25_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.81, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
123 | {"condition": "basic25_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.36, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
124 | {"condition": "basic25_bin4", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.83, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
125 | {"condition": "basic25_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.64, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
126 | 


--------------------------------------------------------------------------------
/logs/basic/llama3.1-405b/results.jsonl:
--------------------------------------------------------------------------------
  1 | {"condition": "basic1_bin1", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 13.15, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  2 | {"condition": "basic1_bin2", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 8.68, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  3 | {"condition": "basic1_bin3", "acc_inst": 0.24, "acc_demo": 0.0, "levdist": 15.0, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  4 | {"condition": "basic1_bin4", "acc_inst": 0.38, "acc_demo": 0.0, "levdist": 34.17, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
  5 | {"condition": "basic1_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 13.83, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  6 | {"condition": "basic2_bin1", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 106.78, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
  7 | {"condition": "basic2_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 139.94, "median_levdist": 68.0, "model": "claude-3", "temp": 0.0}
  8 | {"condition": "basic2_bin3", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 146.08, "median_levdist": 186.5, "model": "claude-3", "temp": 0.0}
  9 | {"condition": "basic2_bin4", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 191.0, "model": "claude-3", "temp": 0.0}
 10 | {"condition": "basic2_bin5", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 199.82, "median_levdist": 188.0, "model": "claude-3", "temp": 0.0}
 11 | {"condition": "basic3_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 107.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 12 | {"condition": "basic3_bin2", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 148.21, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 228.4, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
 14 | {"condition": "basic3_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 233.82, "median_levdist": 199.5, "model": "claude-3", "temp": 0.0}
 15 | {"condition": "basic3_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 223.57, "median_levdist": 192.5, "model": "claude-3", "temp": 0.0}
 16 | {"condition": "basic4_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 135.45, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0}
 17 | {"condition": "basic4_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 118.88, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0}
 18 | {"condition": "basic4_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 129.08, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 19 | {"condition": "basic4_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 218.76, "median_levdist": 38.0, "model": "claude-3", "temp": 0.0}
 20 | {"condition": "basic4_bin5", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 418.91, "median_levdist": 561.5, "model": "claude-3", "temp": 0.0}
 21 | {"condition": "basic5_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 266.24, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0}
 22 | {"condition": "basic5_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 262.9, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0}
 23 | {"condition": "basic5_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 297.63, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0}
 24 | {"condition": "basic5_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.22, "median_levdist": 299.5, "model": "claude-3", "temp": 0.0}
 25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.26, "median_levdist": 301.0, "model": "claude-3", "temp": 0.0}
 26 | {"condition": "basic6_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 256.99, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
 27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 234.49, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
 28 | {"condition": "basic6_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 203.78, "median_levdist": 196.5, "model": "claude-3", "temp": 0.0}
 29 | {"condition": "basic6_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 277.69, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0}
 30 | {"condition": "basic6_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 354.46, "median_levdist": 423.0, "model": "claude-3", "temp": 0.0}
 31 | {"condition": "basic7_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 165.28, "median_levdist": 191.5, "model": "claude-3", "temp": 0.0}
 32 | {"condition": "basic7_bin2", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 187.12, "median_levdist": 192.0, "model": "claude-3", "temp": 0.0}
 33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 218.5, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
 34 | {"condition": "basic7_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 233.28, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
 35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 251.88, "median_levdist": 200.5, "model": "claude-3", "temp": 0.0}
 36 | {"condition": "basic8_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 219.92, "median_levdist": 194.0, "model": "claude-3", "temp": 0.0}
 37 | {"condition": "basic8_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 213.79, "median_levdist": 201.5, "model": "claude-3", "temp": 0.0}
 38 | {"condition": "basic8_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 241.55, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
 39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 270.14, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0}
 40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 247.3, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
 41 | {"condition": "basic9_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.68, "median_levdist": 277.5, "model": "claude-3", "temp": 0.0}
 42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.41, "median_levdist": 276.0, "model": "claude-3", "temp": 0.0}
 43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 335.97, "median_levdist": 298.5, "model": "claude-3", "temp": 0.0}
 44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 381.66, "median_levdist": 390.0, "model": "claude-3", "temp": 0.0}
 45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 320.68, "median_levdist": 310.5, "model": "claude-3", "temp": 0.0}
 46 | {"condition": "basic10_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
 47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 250.25, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
 48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 223.85, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
 49 | {"condition": "basic10_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 282.61, "median_levdist": 277.0, "model": "claude-3", "temp": 0.0}
 50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 310.79, "median_levdist": 291.0, "model": "claude-3", "temp": 0.0}
 51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.69, "median_levdist": 244.5, "model": "claude-3", "temp": 0.0}
 52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 336.13, "median_levdist": 281.5, "model": "claude-3", "temp": 0.0}
 53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.86, "median_levdist": 229.0, "model": "claude-3", "temp": 0.0}
 54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 344.3, "median_levdist": 216.0, "model": "claude-3", "temp": 0.0}
 55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 505.99, "median_levdist": 632.0, "model": "claude-3", "temp": 0.0}
 56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.18, "median_levdist": 220.5, "model": "claude-3", "temp": 0.0}
 57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 299.78, "median_levdist": 275.0, "model": "claude-3", "temp": 0.0}
 58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 292.63, "median_levdist": 200.0, "model": "claude-3", "temp": 0.0}
 59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.36, "median_levdist": 294.5, "model": "claude-3", "temp": 0.0}
 60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 385.7, "median_levdist": 325.0, "model": "claude-3", "temp": 0.0}
 61 | {"condition": "basic13_bin1", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 22.45, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 62 | {"condition": "basic13_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 42.08, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 63 | {"condition": "basic13_bin3", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 41.13, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 64 | {"condition": "basic13_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 41.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 65 | {"condition": "basic13_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 105.25, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 66 | {"condition": "basic14_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 318.19, "median_levdist": 208.5, "model": "claude-3", "temp": 0.0}
 67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 257.47, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.1, "median_levdist": 270.5, "model": "claude-3", "temp": 0.0}
 69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 317.07, "median_levdist": 280.5, "model": "claude-3", "temp": 0.0}
 70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.46, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0}
 71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.07, "median_levdist": 286.0, "model": "claude-3", "temp": 0.0}
 72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.85, "median_levdist": 287.5, "model": "claude-3", "temp": 0.0}
 73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.29, "median_levdist": 304.0, "model": "claude-3", "temp": 0.0}
 74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 357.54, "median_levdist": 306.5, "model": "claude-3", "temp": 0.0}
 75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 328.12, "median_levdist": 290.0, "model": "claude-3", "temp": 0.0}
 76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 402.07, "median_levdist": 340.0, "model": "claude-3", "temp": 0.0}
 77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 370.82, "median_levdist": 317.0, "model": "claude-3", "temp": 0.0}
 78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 366.68, "median_levdist": 324.5, "model": "claude-3", "temp": 0.0}
 79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 419.24, "median_levdist": 445.0, "model": "claude-3", "temp": 0.0}
 80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.26, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
 81 | {"condition": "basic17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 296.78, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
 82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 318.12, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.44, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.41, "median_levdist": 198.5, "model": "claude-3", "temp": 0.0}
 85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 355.59, "median_levdist": 229.5, "model": "claude-3", "temp": 0.0}
 86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.81, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0}
 87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.19, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0}
 88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.76, "median_levdist": 310.0, "model": "claude-3", "temp": 0.0}
 89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.78, "median_levdist": 331.0, "model": "claude-3", "temp": 0.0}
 90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.42, "median_levdist": 401.0, "model": "claude-3", "temp": 0.0}
 91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.4, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 312.73, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 311.94, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 291.75, "median_levdist": 198.0, "model": "claude-3", "temp": 0.0}
 95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 258.78, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 354.28, "median_levdist": 360.0, "model": "claude-3", "temp": 0.0}
 97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 308.07, "median_levdist": 299.0, "model": "claude-3", "temp": 0.0}
 98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 300.86, "median_levdist": 311.5, "model": "claude-3", "temp": 0.0}
 99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 254.9, "median_levdist": 194.5, "model": "claude-3", "temp": 0.0}
100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 217.92, "median_levdist": 8.5, "model": "claude-3", "temp": 0.0}
101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.48, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 349.05, "median_levdist": 235.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.02, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 386.56, "median_levdist": 314.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.54, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 426.68, "median_levdist": 339.5, "model": "claude-3", "temp": 0.0}
107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 410.56, "median_levdist": 348.5, "model": "claude-3", "temp": 0.0}
108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 440.15, "median_levdist": 492.5, "model": "claude-3", "temp": 0.0}
109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 430.12, "median_levdist": 435.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.37, "median_levdist": 372.5, "model": "claude-3", "temp": 0.0}
111 | 


--------------------------------------------------------------------------------
/logs/basic/llama3.1-405b/results1.jsonl:
--------------------------------------------------------------------------------
  1 | {"condition": "basic1_bin1", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 13.15, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  2 | {"condition": "basic1_bin2", "acc_inst": 0.22, "acc_demo": 0.0, "levdist": 8.68, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  3 | {"condition": "basic1_bin3", "acc_inst": 0.24, "acc_demo": 0.0, "levdist": 15.0, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  4 | {"condition": "basic1_bin4", "acc_inst": 0.38, "acc_demo": 0.0, "levdist": 34.17, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
  5 | {"condition": "basic1_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 13.83, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  6 | {"condition": "basic2_bin1", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 106.78, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
  7 | {"condition": "basic2_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 139.94, "median_levdist": 68.0, "model": "claude-3", "temp": 0.0}
  8 | {"condition": "basic2_bin3", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 146.08, "median_levdist": 186.5, "model": "claude-3", "temp": 0.0}
  9 | {"condition": "basic2_bin4", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 191.0, "model": "claude-3", "temp": 0.0}
 10 | {"condition": "basic2_bin5", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 199.82, "median_levdist": 188.0, "model": "claude-3", "temp": 0.0}
 11 | {"condition": "basic3_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 107.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 12 | {"condition": "basic3_bin2", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 148.21, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 13 | {"condition": "basic3_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 228.4, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
 14 | {"condition": "basic3_bin4", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 233.82, "median_levdist": 199.5, "model": "claude-3", "temp": 0.0}
 15 | {"condition": "basic3_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 223.57, "median_levdist": 192.5, "model": "claude-3", "temp": 0.0}
 16 | {"condition": "basic4_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 135.45, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0}
 17 | {"condition": "basic4_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 118.88, "median_levdist": 3.5, "model": "claude-3", "temp": 0.0}
 18 | {"condition": "basic4_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 129.08, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 19 | {"condition": "basic4_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 218.76, "median_levdist": 38.0, "model": "claude-3", "temp": 0.0}
 20 | {"condition": "basic4_bin5", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 418.91, "median_levdist": 561.5, "model": "claude-3", "temp": 0.0}
 21 | {"condition": "basic5_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 266.24, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0}
 22 | {"condition": "basic5_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 262.9, "median_levdist": 276.5, "model": "claude-3", "temp": 0.0}
 23 | {"condition": "basic5_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 297.63, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0}
 24 | {"condition": "basic5_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.22, "median_levdist": 299.5, "model": "claude-3", "temp": 0.0}
 25 | {"condition": "basic5_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.26, "median_levdist": 301.0, "model": "claude-3", "temp": 0.0}
 26 | {"condition": "basic6_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 256.99, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
 27 | {"condition": "basic6_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 234.49, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
 28 | {"condition": "basic6_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 203.78, "median_levdist": 196.5, "model": "claude-3", "temp": 0.0}
 29 | {"condition": "basic6_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 277.69, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0}
 30 | {"condition": "basic6_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 354.46, "median_levdist": 423.0, "model": "claude-3", "temp": 0.0}
 31 | {"condition": "basic7_bin1", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 165.28, "median_levdist": 191.5, "model": "claude-3", "temp": 0.0}
 32 | {"condition": "basic7_bin2", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 187.12, "median_levdist": 192.0, "model": "claude-3", "temp": 0.0}
 33 | {"condition": "basic7_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 218.5, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
 34 | {"condition": "basic7_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 233.28, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
 35 | {"condition": "basic7_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 251.88, "median_levdist": 200.5, "model": "claude-3", "temp": 0.0}
 36 | {"condition": "basic8_bin1", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 219.92, "median_levdist": 194.0, "model": "claude-3", "temp": 0.0}
 37 | {"condition": "basic8_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 213.79, "median_levdist": 201.5, "model": "claude-3", "temp": 0.0}
 38 | {"condition": "basic8_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 241.55, "median_levdist": 203.0, "model": "claude-3", "temp": 0.0}
 39 | {"condition": "basic8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 270.14, "median_levdist": 205.0, "model": "claude-3", "temp": 0.0}
 40 | {"condition": "basic8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 247.3, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
 41 | {"condition": "basic9_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 317.68, "median_levdist": 277.5, "model": "claude-3", "temp": 0.0}
 42 | {"condition": "basic9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.41, "median_levdist": 276.0, "model": "claude-3", "temp": 0.0}
 43 | {"condition": "basic9_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 335.97, "median_levdist": 298.5, "model": "claude-3", "temp": 0.0}
 44 | {"condition": "basic9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 381.66, "median_levdist": 390.0, "model": "claude-3", "temp": 0.0}
 45 | {"condition": "basic9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 320.68, "median_levdist": 310.5, "model": "claude-3", "temp": 0.0}
 46 | {"condition": "basic10_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 221.93, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
 47 | {"condition": "basic10_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 250.25, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
 48 | {"condition": "basic10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 223.85, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
 49 | {"condition": "basic10_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 282.61, "median_levdist": 277.0, "model": "claude-3", "temp": 0.0}
 50 | {"condition": "basic10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 310.79, "median_levdist": 291.0, "model": "claude-3", "temp": 0.0}
 51 | {"condition": "basic11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.69, "median_levdist": 244.5, "model": "claude-3", "temp": 0.0}
 52 | {"condition": "basic11_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 336.13, "median_levdist": 281.5, "model": "claude-3", "temp": 0.0}
 53 | {"condition": "basic11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.86, "median_levdist": 229.0, "model": "claude-3", "temp": 0.0}
 54 | {"condition": "basic11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 344.3, "median_levdist": 216.0, "model": "claude-3", "temp": 0.0}
 55 | {"condition": "basic11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 505.99, "median_levdist": 632.0, "model": "claude-3", "temp": 0.0}
 56 | {"condition": "basic12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.18, "median_levdist": 220.5, "model": "claude-3", "temp": 0.0}
 57 | {"condition": "basic12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 299.78, "median_levdist": 275.0, "model": "claude-3", "temp": 0.0}
 58 | {"condition": "basic12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 292.63, "median_levdist": 200.0, "model": "claude-3", "temp": 0.0}
 59 | {"condition": "basic12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.36, "median_levdist": 294.5, "model": "claude-3", "temp": 0.0}
 60 | {"condition": "basic12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 385.7, "median_levdist": 325.0, "model": "claude-3", "temp": 0.0}
 61 | {"condition": "basic13_bin1", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 22.45, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 62 | {"condition": "basic13_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 42.08, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 63 | {"condition": "basic13_bin3", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 41.13, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 64 | {"condition": "basic13_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 41.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 65 | {"condition": "basic13_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 105.25, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 66 | {"condition": "basic14_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 318.19, "median_levdist": 208.5, "model": "claude-3", "temp": 0.0}
 67 | {"condition": "basic14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 257.47, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 68 | {"condition": "basic14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 324.1, "median_levdist": 270.5, "model": "claude-3", "temp": 0.0}
 69 | {"condition": "basic14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 317.07, "median_levdist": 280.5, "model": "claude-3", "temp": 0.0}
 70 | {"condition": "basic14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.46, "median_levdist": 279.0, "model": "claude-3", "temp": 0.0}
 71 | {"condition": "basic15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.07, "median_levdist": 286.0, "model": "claude-3", "temp": 0.0}
 72 | {"condition": "basic15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 316.85, "median_levdist": 287.5, "model": "claude-3", "temp": 0.0}
 73 | {"condition": "basic15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 346.29, "median_levdist": 304.0, "model": "claude-3", "temp": 0.0}
 74 | {"condition": "basic15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 357.54, "median_levdist": 306.5, "model": "claude-3", "temp": 0.0}
 75 | {"condition": "basic15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 328.12, "median_levdist": 290.0, "model": "claude-3", "temp": 0.0}
 76 | {"condition": "basic16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 402.07, "median_levdist": 340.0, "model": "claude-3", "temp": 0.0}
 77 | {"condition": "basic16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 370.82, "median_levdist": 317.0, "model": "claude-3", "temp": 0.0}
 78 | {"condition": "basic16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 366.68, "median_levdist": 324.5, "model": "claude-3", "temp": 0.0}
 79 | {"condition": "basic16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 419.24, "median_levdist": 445.0, "model": "claude-3", "temp": 0.0}
 80 | {"condition": "basic16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.26, "median_levdist": 206.0, "model": "claude-3", "temp": 0.0}
 81 | {"condition": "basic17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 296.78, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
 82 | {"condition": "basic17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 318.12, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 83 | {"condition": "basic17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 293.44, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 84 | {"condition": "basic17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 319.41, "median_levdist": 198.5, "model": "claude-3", "temp": 0.0}
 85 | {"condition": "basic17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 355.59, "median_levdist": 229.5, "model": "claude-3", "temp": 0.0}
 86 | {"condition": "basic18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 309.81, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0}
 87 | {"condition": "basic18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.19, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0}
 88 | {"condition": "basic18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 327.76, "median_levdist": 310.0, "model": "claude-3", "temp": 0.0}
 89 | {"condition": "basic18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 352.78, "median_levdist": 331.0, "model": "claude-3", "temp": 0.0}
 90 | {"condition": "basic18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.42, "median_levdist": 401.0, "model": "claude-3", "temp": 0.0}
 91 | {"condition": "basic19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 330.4, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 92 | {"condition": "basic19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 312.73, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 93 | {"condition": "basic19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 311.94, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 94 | {"condition": "basic19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 291.75, "median_levdist": 198.0, "model": "claude-3", "temp": 0.0}
 95 | {"condition": "basic19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 258.78, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
 96 | {"condition": "basic20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 354.28, "median_levdist": 360.0, "model": "claude-3", "temp": 0.0}
 97 | {"condition": "basic20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 308.07, "median_levdist": 299.0, "model": "claude-3", "temp": 0.0}
 98 | {"condition": "basic20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 300.86, "median_levdist": 311.5, "model": "claude-3", "temp": 0.0}
 99 | {"condition": "basic20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 254.9, "median_levdist": 194.5, "model": "claude-3", "temp": 0.0}
100 | {"condition": "basic20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 217.92, "median_levdist": 8.5, "model": "claude-3", "temp": 0.0}
101 | {"condition": "basic21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.48, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "basic21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 349.05, "median_levdist": 235.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "basic21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 406.02, "median_levdist": 378.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "basic21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 386.56, "median_levdist": 314.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "basic21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 286.54, "median_levdist": 292.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "basic22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 426.68, "median_levdist": 339.5, "model": "claude-3", "temp": 0.0}
107 | {"condition": "basic22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 410.56, "median_levdist": 348.5, "model": "claude-3", "temp": 0.0}
108 | {"condition": "basic22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 440.15, "median_levdist": 492.5, "model": "claude-3", "temp": 0.0}
109 | {"condition": "basic22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 430.12, "median_levdist": 435.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "basic22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 416.37, "median_levdist": 372.5, "model": "claude-3", "temp": 0.0}
111 | {"condition": "basic23_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 278.38, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
112 | {"condition": "basic23_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 331.5, "median_levdist": 196.0, "model": "claude-3", "temp": 0.0}
113 | {"condition": "basic23_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 339.51, "median_levdist": 197.0, "model": "claude-3", "temp": 0.0}
114 | {"condition": "basic23_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 363.82, "median_levdist": 202.0, "model": "claude-3", "temp": 0.0}
115 | {"condition": "basic23_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 403.8, "median_levdist": 358.0, "model": "claude-3", "temp": 0.0}
116 | {"condition": "basic24_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 375.29, "median_levdist": 312.0, "model": "claude-3", "temp": 0.0}
117 | {"condition": "basic24_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 415.72, "median_levdist": 354.0, "model": "claude-3", "temp": 0.0}
118 | {"condition": "basic24_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 395.21, "median_levdist": 316.0, "model": "claude-3", "temp": 0.0}
119 | {"condition": "basic24_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 418.06, "median_levdist": 374.5, "model": "claude-3", "temp": 0.0}
120 | {"condition": "basic24_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 402.51, "median_levdist": 308.0, "model": "claude-3", "temp": 0.0}
121 | {"condition": "basic25_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 246.23, "median_levdist": 193.0, "model": "claude-3", "temp": 0.0}
122 | {"condition": "basic25_bin2", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 352.15, "median_levdist": 303.0, "model": "claude-3", "temp": 0.0}
123 | {"condition": "basic25_bin3", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 309.63, "median_levdist": 195.0, "model": "claude-3", "temp": 0.0}
124 | {"condition": "basic25_bin4", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 437.08, "median_levdist": 559.0, "model": "claude-3", "temp": 0.0}
125 | {"condition": "basic25_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 421.89, "median_levdist": 541.5, "model": "claude-3", "temp": 0.0}
126 | 


--------------------------------------------------------------------------------
/logs/text_cot/claude-3/results.jsonl:
--------------------------------------------------------------------------------
  1 | {"condition": "cot1_bin1", "acc_inst": 0.87, "acc_demo": 0.0, "levdist": 0.38, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  2 | {"condition": "cot1_bin2", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.38, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  3 | {"condition": "cot1_bin3", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.61, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  4 | {"condition": "cot1_bin4", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.62, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  5 | {"condition": "cot1_bin5", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.41, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
  6 | {"condition": "cot2_bin1", "acc_inst": 0.9, "acc_demo": 0.0, "levdist": 0.21, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  7 | {"condition": "cot2_bin2", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  8 | {"condition": "cot2_bin3", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.56, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  9 | {"condition": "cot2_bin4", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 0.56, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 10 | {"condition": "cot2_bin5", "acc_inst": 0.78, "acc_demo": 0.0, "levdist": 0.32, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 11 | {"condition": "cot3_bin1", "acc_inst": 0.96, "acc_demo": 0.0, "levdist": 0.05, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 12 | {"condition": "cot3_bin2", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.31, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 13 | {"condition": "cot3_bin3", "acc_inst": 0.68, "acc_demo": 0.0, "levdist": 0.6, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 14 | {"condition": "cot3_bin4", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.98, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 15 | {"condition": "cot3_bin5", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 2.18, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 16 | {"condition": "cot4_bin1", "acc_inst": 0.93, "acc_demo": 0.0, "levdist": 0.07, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 17 | {"condition": "cot4_bin2", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.22, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 18 | {"condition": "cot4_bin3", "acc_inst": 0.66, "acc_demo": 0.0, "levdist": 0.54, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 19 | {"condition": "cot4_bin4", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.35, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 20 | {"condition": "cot4_bin5", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 0.72, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 21 | {"condition": "cot5_bin1", "acc_inst": 0.88, "acc_demo": 0.0, "levdist": 0.29, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 22 | {"condition": "cot5_bin2", "acc_inst": 0.8, "acc_demo": 0.0, "levdist": 0.36, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 23 | {"condition": "cot5_bin3", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.48, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 24 | {"condition": "cot5_bin4", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.34, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 25 | {"condition": "cot5_bin5", "acc_inst": 0.86, "acc_demo": 0.0, "levdist": 0.19, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 26 | {"condition": "cot6_bin1", "acc_inst": 0.9, "acc_demo": 0.0, "levdist": 0.17, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 27 | {"condition": "cot6_bin2", "acc_inst": 0.75, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 28 | {"condition": "cot6_bin3", "acc_inst": 0.65, "acc_demo": 0.0, "levdist": 0.7, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 29 | {"condition": "cot6_bin4", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 1.28, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 30 | {"condition": "cot6_bin5", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 1.79, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 31 | {"condition": "cot7_bin1", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 1.18, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 32 | {"condition": "cot7_bin2", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 1.44, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 33 | {"condition": "cot7_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.77, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 34 | {"condition": "cot7_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.93, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 35 | {"condition": "cot7_bin5", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 36 | {"condition": "cot8_bin1", "acc_inst": 0.8, "acc_demo": 0.0, "levdist": 0.68, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 37 | {"condition": "cot8_bin2", "acc_inst": 0.69, "acc_demo": 0.0, "levdist": 0.77, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 38 | {"condition": "cot8_bin3", "acc_inst": 0.57, "acc_demo": 0.0, "levdist": 1.41, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 39 | {"condition": "cot8_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 1.03, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 40 | {"condition": "cot8_bin5", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 2.95, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 41 | {"condition": "cot9_bin1", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.99, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 42 | {"condition": "cot9_bin2", "acc_inst": 0.42, "acc_demo": 0.0, "levdist": 1.6, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 43 | {"condition": "cot9_bin3", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 2.02, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 44 | {"condition": "cot9_bin4", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.33, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 45 | {"condition": "cot9_bin5", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 1.85, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 46 | {"condition": "cot10_bin1", "acc_inst": 0.51, "acc_demo": 0.0, "levdist": 1.46, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 47 | {"condition": "cot10_bin2", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 2.01, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 48 | {"condition": "cot10_bin3", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 3.44, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 49 | {"condition": "cot10_bin4", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.39, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 50 | {"condition": "cot10_bin5", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 3.01, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 51 | {"condition": "cot11_bin1", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.13, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 52 | {"condition": "cot11_bin2", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 3.07, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 53 | {"condition": "cot11_bin3", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 3.03, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 54 | {"condition": "cot11_bin4", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 3.52, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 55 | {"condition": "cot11_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.65, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 56 | {"condition": "cot12_bin1", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.04, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 57 | {"condition": "cot12_bin2", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 2.38, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 58 | {"condition": "cot12_bin3", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.94, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 59 | {"condition": "cot12_bin4", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.19, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.4, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 61 | {"condition": "cot13_bin1", "acc_inst": 0.88, "acc_demo": 0.0, "levdist": 0.2, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 62 | {"condition": "cot13_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.4, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 63 | {"condition": "cot13_bin3", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.61, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 64 | {"condition": "cot13_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.74, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 65 | {"condition": "cot13_bin5", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 66 | {"condition": "cot14_bin1", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 2.92, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 67 | {"condition": "cot14_bin2", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 3.29, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 68 | {"condition": "cot14_bin3", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.78, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 69 | {"condition": "cot14_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 3.85, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.22, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 71 | {"condition": "cot15_bin1", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.7, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.77, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 73 | {"condition": "cot15_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.04, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.84, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.5, "model": "claude-3", "temp": 0.0}
 76 | {"condition": "cot16_bin1", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.64, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 77 | {"condition": "cot16_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.98, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 78 | {"condition": "cot16_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.2, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 79 | {"condition": "cot16_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.47, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 81 | {"condition": "cot17_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.52, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.76, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.82, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.33, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.51, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 86 | {"condition": "cot18_bin1", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 3.62, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 87 | {"condition": "cot18_bin2", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.14, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 88 | {"condition": "cot18_bin3", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.02, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 89 | {"condition": "cot18_bin4", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 4.25, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 90 | {"condition": "cot18_bin5", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 5.58, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 91 | {"condition": "cot19_bin1", "acc_inst": 0.12, "acc_demo": 0.0, "levdist": 3.78, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 92 | {"condition": "cot19_bin2", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.03, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 93 | {"condition": "cot19_bin3", "acc_inst": 0.04, "acc_demo": 0.0, "levdist": 4.57, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 94 | {"condition": "cot19_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.04, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0}
 95 | {"condition": "cot19_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 96 | {"condition": "cot20_bin1", "acc_inst": 0.18, "acc_demo": 0.0, "levdist": 3.41, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 97 | {"condition": "cot20_bin2", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 4.18, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 98 | {"condition": "cot20_bin3", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 4.17, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 99 | {"condition": "cot20_bin4", "acc_inst": 0.03, "acc_demo": 0.0, "levdist": 6.57, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
100 | {"condition": "cot20_bin5", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.17, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
101 | {"condition": "cot21_bin1", "acc_inst": 0.3, "acc_demo": 0.0, "levdist": 2.9, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "cot21_bin2", "acc_inst": 0.27, "acc_demo": 0.0, "levdist": 2.89, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "cot21_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 2.84, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "cot21_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 3.14, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "cot21_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 3.62, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "cot22_bin1", "acc_inst": 0.31, "acc_demo": 0.0, "levdist": 2.7, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0}
107 | {"condition": "cot22_bin2", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 2.98, "median_levdist": 2.5, "model": "claude-3", "temp": 0.0}
108 | {"condition": "cot22_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 3.64, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
109 | {"condition": "cot22_bin4", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 4.32, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "cot22_bin5", "acc_inst": 0.05, "acc_demo": 0.0, "levdist": 3.17, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
111 | {"condition": "cot23_bin1", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 2.8, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
112 | {"condition": "cot23_bin2", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 3.29, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
113 | {"condition": "cot23_bin3", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 3.83, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
114 | {"condition": "cot23_bin4", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 3.77, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
115 | {"condition": "cot23_bin5", "acc_inst": 0.11, "acc_demo": 0.0, "levdist": 3.75, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
116 | {"condition": "cot24_bin1", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 2.6, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
117 | {"condition": "cot24_bin2", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 3.19, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
118 | {"condition": "cot24_bin3", "acc_inst": 0.39, "acc_demo": 0.0, "levdist": 2.63, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
119 | {"condition": "cot24_bin4", "acc_inst": 0.34, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
120 | {"condition": "cot24_bin5", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 4.04, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
121 | {"condition": "cot25_bin1", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 1.89, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
122 | {"condition": "cot25_bin2", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 1.77, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
123 | {"condition": "cot25_bin3", "acc_inst": 0.42, "acc_demo": 0.0, "levdist": 2.74, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
124 | {"condition": "cot25_bin4", "acc_inst": 0.56, "acc_demo": 0.0, "levdist": 1.88, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
125 | {"condition": "cot25_bin5", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
126 | 


--------------------------------------------------------------------------------
/logs/text_cot/gpt-4/results.jsonl:
--------------------------------------------------------------------------------
  1 | {"condition": "cot1_bin1", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.4, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  2 | {"condition": "cot1_bin2", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.72, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  3 | {"condition": "cot1_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
  4 | {"condition": "cot1_bin4", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.07, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
  5 | {"condition": "cot1_bin5", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 1.91, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
  6 | {"condition": "cot2_bin1", "acc_inst": 0.83, "acc_demo": 0.0, "levdist": 0.26, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  7 | {"condition": "cot2_bin2", "acc_inst": 0.71, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
  8 | {"condition": "cot2_bin3", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.06, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
  9 | {"condition": "cot2_bin4", "acc_inst": 0.43, "acc_demo": 0.0, "levdist": 1.0, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 10 | {"condition": "cot2_bin5", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 1.65, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 11 | {"condition": "cot3_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 12 | {"condition": "cot3_bin2", "acc_inst": 0.71, "acc_demo": 0.0, "levdist": 0.54, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 13 | {"condition": "cot3_bin3", "acc_inst": 0.48, "acc_demo": 0.0, "levdist": 1.06, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 14 | {"condition": "cot3_bin4", "acc_inst": 0.4, "acc_demo": 0.0, "levdist": 1.14, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 15 | {"condition": "cot3_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.58, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 16 | {"condition": "cot4_bin1", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.44, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 17 | {"condition": "cot4_bin2", "acc_inst": 0.66, "acc_demo": 0.0, "levdist": 0.74, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 18 | {"condition": "cot4_bin3", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 19 | {"condition": "cot4_bin4", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.0, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 20 | {"condition": "cot4_bin5", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.26, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 21 | {"condition": "cot5_bin1", "acc_inst": 0.76, "acc_demo": 0.0, "levdist": 0.5, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 22 | {"condition": "cot5_bin2", "acc_inst": 0.68, "acc_demo": 0.0, "levdist": 0.66, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 23 | {"condition": "cot5_bin3", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.32, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 24 | {"condition": "cot5_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.15, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 25 | {"condition": "cot5_bin5", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 1.58, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 26 | {"condition": "cot6_bin1", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.49, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 27 | {"condition": "cot6_bin2", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 0.46, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 28 | {"condition": "cot6_bin3", "acc_inst": 0.45, "acc_demo": 0.0, "levdist": 1.18, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 29 | {"condition": "cot6_bin4", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 30 | {"condition": "cot6_bin5", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 1.59, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 31 | {"condition": "cot7_bin1", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 32 | {"condition": "cot7_bin2", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 0.89, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 33 | {"condition": "cot7_bin3", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.61, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 34 | {"condition": "cot7_bin4", "acc_inst": 0.19, "acc_demo": 0.0, "levdist": 1.5, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 35 | {"condition": "cot7_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 1.91, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 36 | {"condition": "cot8_bin1", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.52, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 37 | {"condition": "cot8_bin2", "acc_inst": 0.63, "acc_demo": 0.0, "levdist": 0.63, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 38 | {"condition": "cot8_bin3", "acc_inst": 0.44, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 39 | {"condition": "cot8_bin4", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 0.98, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 40 | {"condition": "cot8_bin5", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 1.56, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 41 | {"condition": "cot9_bin1", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 42 | {"condition": "cot9_bin2", "acc_inst": 0.51, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 43 | {"condition": "cot9_bin3", "acc_inst": 0.36, "acc_demo": 0.0, "levdist": 1.68, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 44 | {"condition": "cot9_bin4", "acc_inst": 0.3, "acc_demo": 0.0, "levdist": 1.48, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 45 | {"condition": "cot9_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.68, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 46 | {"condition": "cot10_bin1", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 2.3, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 47 | {"condition": "cot10_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.25, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 48 | {"condition": "cot10_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 2.4, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 49 | {"condition": "cot10_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 2.7, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
 50 | {"condition": "cot10_bin5", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 2.24, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
 51 | {"condition": "cot11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.23, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 52 | {"condition": "cot11_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.3, "median_levdist": 4.5, "model": "claude-3", "temp": 0.0}
 53 | {"condition": "cot11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.24, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 54 | {"condition": "cot11_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.42, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 55 | {"condition": "cot11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.08, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 56 | {"condition": "cot12_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.81, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 57 | {"condition": "cot12_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.86, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 58 | {"condition": "cot12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.8, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 59 | {"condition": "cot12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.86, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.93, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 61 | {"condition": "cot13_bin1", "acc_inst": 0.79, "acc_demo": 0.0, "levdist": 0.45, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 62 | {"condition": "cot13_bin2", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 0.66, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 63 | {"condition": "cot13_bin3", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.79, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 64 | {"condition": "cot13_bin4", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 0.75, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
 65 | {"condition": "cot13_bin5", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 1.31, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
 66 | {"condition": "cot14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.24, "median_levdist": 4.0, "model": "claude-3", "temp": 0.0}
 67 | {"condition": "cot14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.46, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 68 | {"condition": "cot14_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.51, "median_levdist": 4.5, "model": "claude-3", "temp": 0.0}
 69 | {"condition": "cot14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.63, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.76, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 71 | {"condition": "cot15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.35, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0}
 72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.28, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 73 | {"condition": "cot15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.47, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.59, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 76 | {"condition": "cot16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.39, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0}
 77 | {"condition": "cot16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.33, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0}
 78 | {"condition": "cot16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.3, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 79 | {"condition": "cot16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.49, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.96, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 81 | {"condition": "cot17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.43, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.5, "model": "claude-3", "temp": 0.0}
 83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.65, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.87, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.31, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 86 | {"condition": "cot18_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.03, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 87 | {"condition": "cot18_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.0, "median_levdist": 5.0, "model": "claude-3", "temp": 0.0}
 88 | {"condition": "cot18_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 5.39, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 89 | {"condition": "cot18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 90 | {"condition": "cot18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.12, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 91 | {"condition": "cot19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.83, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 92 | {"condition": "cot19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 93 | {"condition": "cot19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.11, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 94 | {"condition": "cot19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.88, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 95 | {"condition": "cot19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.3, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
 96 | {"condition": "cot20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.93, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 97 | {"condition": "cot20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.82, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 98 | {"condition": "cot20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.03, "median_levdist": 6.0, "model": "claude-3", "temp": 0.0}
 99 | {"condition": "cot20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.28, "median_levdist": 6.5, "model": "claude-3", "temp": 0.0}
100 | {"condition": "cot20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.47, "median_levdist": 7.0, "model": "claude-3", "temp": 0.0}
101 | {"condition": "cot21_bin1", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 2.84, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
102 | {"condition": "cot21_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 2.93, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
103 | {"condition": "cot21_bin3", "acc_inst": 0.14, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
104 | {"condition": "cot21_bin4", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 2.39, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
105 | {"condition": "cot21_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 2.28, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
106 | {"condition": "cot22_bin1", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 3.31, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
107 | {"condition": "cot22_bin2", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 2.86, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
108 | {"condition": "cot22_bin3", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 3.57, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
109 | {"condition": "cot22_bin4", "acc_inst": 0.25, "acc_demo": 0.0, "levdist": 3.05, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
110 | {"condition": "cot22_bin5", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 3.25, "median_levdist": 3.0, "model": "claude-3", "temp": 0.0}
111 | {"condition": "cot23_bin1", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.39, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
112 | {"condition": "cot23_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.73, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
113 | {"condition": "cot23_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.36, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
114 | {"condition": "cot23_bin4", "acc_inst": 0.58, "acc_demo": 0.0, "levdist": 0.9, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
115 | {"condition": "cot23_bin5", "acc_inst": 0.28, "acc_demo": 0.0, "levdist": 1.38, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
116 | {"condition": "cot24_bin1", "acc_inst": 0.84, "acc_demo": 0.0, "levdist": 0.45, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
117 | {"condition": "cot24_bin2", "acc_inst": 0.75, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
118 | {"condition": "cot24_bin3", "acc_inst": 0.5, "acc_demo": 0.0, "levdist": 1.28, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
119 | {"condition": "cot24_bin4", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.13, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
120 | {"condition": "cot24_bin5", "acc_inst": 0.18, "acc_demo": 0.0, "levdist": 1.7, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
121 | {"condition": "cot25_bin1", "acc_inst": 0.81, "acc_demo": 0.0, "levdist": 0.37, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
122 | {"condition": "cot25_bin2", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 0.69, "median_levdist": 0.0, "model": "claude-3", "temp": 0.0}
123 | {"condition": "cot25_bin3", "acc_inst": 0.46, "acc_demo": 0.0, "levdist": 1.22, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
124 | {"condition": "cot25_bin4", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.05, "median_levdist": 1.0, "model": "claude-3", "temp": 0.0}
125 | {"condition": "cot25_bin5", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 1.54, "median_levdist": 2.0, "model": "claude-3", "temp": 0.0}
126 | 


--------------------------------------------------------------------------------
/logs/text_cot/llama3.1-405b/results.jsonl:
--------------------------------------------------------------------------------
  1 | {"condition": "cot1_bin1", "acc_inst": 0.81, "acc_demo": 0.0, "levdist": 8.96, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
  2 | {"condition": "cot1_bin2", "acc_inst": 0.74, "acc_demo": 0.0, "levdist": 15.86, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
  3 | {"condition": "cot1_bin3", "acc_inst": 0.54, "acc_demo": 0.0, "levdist": 35.77, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
  4 | {"condition": "cot1_bin4", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 54.97, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
  5 | {"condition": "cot1_bin5", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 35.06, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
  6 | {"condition": "cot2_bin1", "acc_inst": 0.82, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
  7 | {"condition": "cot2_bin2", "acc_inst": 0.67, "acc_demo": 0.0, "levdist": 8.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
  8 | {"condition": "cot2_bin3", "acc_inst": 0.61, "acc_demo": 0.0, "levdist": 8.78, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
  9 | {"condition": "cot2_bin4", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 44.3, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
 10 | {"condition": "cot2_bin5", "acc_inst": 0.64, "acc_demo": 0.0, "levdist": 31.37, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
 11 | {"condition": "cot3_bin1", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.94, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
 12 | {"condition": "cot3_bin2", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 2.64, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
 13 | {"condition": "cot3_bin3", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 11.29, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
 14 | {"condition": "cot3_bin4", "acc_inst": 0.41, "acc_demo": 0.0, "levdist": 85.19, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
 15 | {"condition": "cot3_bin5", "acc_inst": 0.35, "acc_demo": 0.0, "levdist": 220.52, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 16 | {"condition": "cot4_bin1", "acc_inst": 0.48, "acc_demo": 0.0, "levdist": 14.4, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
 17 | {"condition": "cot4_bin2", "acc_inst": 0.39, "acc_demo": 0.0, "levdist": 8.15, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
 18 | {"condition": "cot4_bin3", "acc_inst": 0.31, "acc_demo": 0.0, "levdist": 37.04, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
 19 | {"condition": "cot4_bin4", "acc_inst": 0.2, "acc_demo": 0.0, "levdist": 131.24, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 20 | {"condition": "cot4_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 350.72, "median_levdist": 475.5, "model": "llama3.1-405b", "temp": 0.0}
 21 | {"condition": "cot5_bin1", "acc_inst": 0.21, "acc_demo": 0.0, "levdist": 8.51, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 22 | {"condition": "cot5_bin2", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 9.12, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 23 | {"condition": "cot5_bin3", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 45.34, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 24 | {"condition": "cot5_bin4", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 92.35, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 25 | {"condition": "cot5_bin5", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 316.79, "median_levdist": 448.5, "model": "llama3.1-405b", "temp": 0.0}
 26 | {"condition": "cot6_bin1", "acc_inst": 0.17, "acc_demo": 0.0, "levdist": 27.74, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 27 | {"condition": "cot6_bin2", "acc_inst": 0.13, "acc_demo": 0.0, "levdist": 24.1, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 28 | {"condition": "cot6_bin3", "acc_inst": 0.16, "acc_demo": 0.0, "levdist": 54.37, "median_levdist": 2.5, "model": "llama3.1-405b", "temp": 0.0}
 29 | {"condition": "cot6_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 120.19, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 30 | {"condition": "cot6_bin5", "acc_inst": 0.08, "acc_demo": 0.0, "levdist": 372.41, "median_levdist": 595.5, "model": "llama3.1-405b", "temp": 0.0}
 31 | {"condition": "cot7_bin1", "acc_inst": 0.23, "acc_demo": 0.0, "levdist": 7.91, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 32 | {"condition": "cot7_bin2", "acc_inst": 0.1, "acc_demo": 0.0, "levdist": 29.64, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 33 | {"condition": "cot7_bin3", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 2.75, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 34 | {"condition": "cot7_bin4", "acc_inst": 0.07, "acc_demo": 0.0, "levdist": 28.98, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 35 | {"condition": "cot7_bin5", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 13.81, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 36 | {"condition": "cot8_bin1", "acc_inst": 0.12, "acc_demo": 0.0, "levdist": 2.44, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 37 | {"condition": "cot8_bin2", "acc_inst": 0.09, "acc_demo": 0.0, "levdist": 2.62, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 38 | {"condition": "cot8_bin3", "acc_inst": 0.06, "acc_demo": 0.0, "levdist": 2.83, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
 39 | {"condition": "cot8_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.69, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 40 | {"condition": "cot8_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 3.82, "median_levdist": 3.0, "model": "llama3.1-405b", "temp": 0.0}
 41 | {"condition": "cot9_bin1", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 10.9, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0}
 42 | {"condition": "cot9_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.74, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 43 | {"condition": "cot9_bin3", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 17.15, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0}
 44 | {"condition": "cot9_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.09, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 45 | {"condition": "cot9_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.38, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0}
 46 | {"condition": "cot10_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.52, "median_levdist": 4.0, "model": "llama3.1-405b", "temp": 0.0}
 47 | {"condition": "cot10_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.79, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 48 | {"condition": "cot10_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.97, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 49 | {"condition": "cot10_bin4", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 17.13, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 50 | {"condition": "cot10_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.37, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 51 | {"condition": "cot11_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 10.68, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 52 | {"condition": "cot11_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 20.52, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 53 | {"condition": "cot11_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 18.2, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 54 | {"condition": "cot11_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.44, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 55 | {"condition": "cot11_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.82, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 56 | {"condition": "cot12_bin1", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 9.84, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 57 | {"condition": "cot12_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 23.49, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 58 | {"condition": "cot12_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 22.01, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 59 | {"condition": "cot12_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 43.66, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 60 | {"condition": "cot12_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 19.28, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 61 | {"condition": "cot13_bin1", "acc_inst": 0.86, "acc_demo": 0.0, "levdist": 0.33, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
 62 | {"condition": "cot13_bin2", "acc_inst": 0.73, "acc_demo": 0.0, "levdist": 0.55, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
 63 | {"condition": "cot13_bin3", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
 64 | {"condition": "cot13_bin4", "acc_inst": 0.61, "acc_demo": 0.0, "levdist": 0.9, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
 65 | {"condition": "cot13_bin5", "acc_inst": 0.53, "acc_demo": 0.0, "levdist": 0.92, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
 66 | {"condition": "cot14_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.59, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 67 | {"condition": "cot14_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.8, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 68 | {"condition": "cot14_bin3", "acc_inst": 0.02, "acc_demo": 0.0, "levdist": 4.79, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 69 | {"condition": "cot14_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 17.9, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 70 | {"condition": "cot14_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.55, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 71 | {"condition": "cot15_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 7.26, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 72 | {"condition": "cot15_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 19.1, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 73 | {"condition": "cot15_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.72, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 74 | {"condition": "cot15_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.02, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 75 | {"condition": "cot15_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.36, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0}
 76 | {"condition": "cot16_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.69, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 77 | {"condition": "cot16_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.13, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 78 | {"condition": "cot16_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.48, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 79 | {"condition": "cot16_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.61, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 80 | {"condition": "cot16_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.79, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 81 | {"condition": "cot17_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.03, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 82 | {"condition": "cot17_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.63, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 83 | {"condition": "cot17_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.56, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 84 | {"condition": "cot17_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.01, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 85 | {"condition": "cot17_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.25, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 86 | {"condition": "cot18_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.63, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 87 | {"condition": "cot18_bin2", "acc_inst": 0.01, "acc_demo": 0.0, "levdist": 4.78, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 88 | {"condition": "cot18_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 4.92, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 89 | {"condition": "cot18_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.17, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 90 | {"condition": "cot18_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.73, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 91 | {"condition": "cot19_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 15.24, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 92 | {"condition": "cot19_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.81, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 93 | {"condition": "cot19_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 16.89, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
 94 | {"condition": "cot19_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.31, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 95 | {"condition": "cot19_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 23.55, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 96 | {"condition": "cot20_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 13.5, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 97 | {"condition": "cot20_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.76, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 98 | {"condition": "cot20_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.66, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
 99 | {"condition": "cot20_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.41, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
100 | {"condition": "cot20_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.48, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0}
101 | {"condition": "cot21_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.68, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
102 | {"condition": "cot21_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.91, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
103 | {"condition": "cot21_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 11.87, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
104 | {"condition": "cot21_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 12.84, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
105 | {"condition": "cot21_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 13.04, "median_levdist": 7.0, "model": "llama3.1-405b", "temp": 0.0}
106 | {"condition": "cot22_bin1", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.33, "median_levdist": 5.0, "model": "llama3.1-405b", "temp": 0.0}
107 | {"condition": "cot22_bin2", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.48, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
108 | {"condition": "cot22_bin3", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.38, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
109 | {"condition": "cot22_bin4", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 5.73, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
110 | {"condition": "cot22_bin5", "acc_inst": 0.0, "acc_demo": 0.0, "levdist": 6.24, "median_levdist": 6.0, "model": "llama3.1-405b", "temp": 0.0}
111 | {"condition": "cot23_bin1", "acc_inst": 0.77, "acc_demo": 0.0, "levdist": 0.68, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
112 | {"condition": "cot23_bin2", "acc_inst": 0.59, "acc_demo": 0.0, "levdist": 2.55, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
113 | {"condition": "cot23_bin3", "acc_inst": 0.55, "acc_demo": 0.0, "levdist": 1.09, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
114 | {"condition": "cot23_bin4", "acc_inst": 0.49, "acc_demo": 0.0, "levdist": 1.12, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
115 | {"condition": "cot23_bin5", "acc_inst": 0.32, "acc_demo": 0.0, "levdist": 1.24, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
116 | {"condition": "cot24_bin1", "acc_inst": 0.52, "acc_demo": 0.0, "levdist": 1.5, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
117 | {"condition": "cot24_bin2", "acc_inst": 0.26, "acc_demo": 0.0, "levdist": 2.35, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
118 | {"condition": "cot24_bin3", "acc_inst": 0.29, "acc_demo": 0.0, "levdist": 1.98, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
119 | {"condition": "cot24_bin4", "acc_inst": 0.37, "acc_demo": 0.0, "levdist": 2.31, "median_levdist": 2.0, "model": "llama3.1-405b", "temp": 0.0}
120 | {"condition": "cot24_bin5", "acc_inst": 0.47, "acc_demo": 0.0, "levdist": 1.45, "median_levdist": 1.0, "model": "llama3.1-405b", "temp": 0.0}
121 | {"condition": "cot25_bin1", "acc_inst": 0.83, "acc_demo": 0.0, "levdist": 0.5, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
122 | {"condition": "cot25_bin2", "acc_inst": 0.62, "acc_demo": 0.0, "levdist": 0.87, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
123 | {"condition": "cot25_bin3", "acc_inst": 0.6, "acc_demo": 0.0, "levdist": 0.91, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
124 | {"condition": "cot25_bin4", "acc_inst": 0.7, "acc_demo": 0.0, "levdist": 0.63, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
125 | {"condition": "cot25_bin5", "acc_inst": 0.56, "acc_demo": 0.0, "levdist": 2.45, "median_levdist": 0.0, "model": "llama3.1-405b", "temp": 0.0}
126 | 


--------------------------------------------------------------------------------
/models/openai_help.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import openai
  3 | import random
  4 | import aiolimiter
  5 | from aiohttp import ClientSession
  6 | import asyncio
  7 | import logging
  8 | from typing import Any, List, Dict, Union
  9 | from tqdm.asyncio import tqdm_asyncio
 10 | 
 11 | completion_tokens = {"gpt-4": 0, "gpt-3.5-turbo": 0, "gpt-4-0613": 0, "gpt-3.5-turbo-0613": 0}
 12 | prompt_tokens = {"gpt-4": 0, "gpt-3.5-turbo": 0, "gpt-4-0613": 0, "gpt-3.5-turbo-0613": 0}
 13 | 
 14 | async def _throttled_openai_chat_completion_acreate(
 15 |     model: str,
 16 |     messages: List[Dict[str, str]],
 17 |     temperature: float,
 18 |     max_tokens: int,
 19 |     top_p: float,
 20 |     stop: Union[str, List[str]],
 21 |     limiter: aiolimiter.AsyncLimiter,
 22 | ) -> Dict[str, Any]:
 23 |     async with limiter:
 24 |         for _ in range(10000000000):
 25 |             try:
 26 |                 return await openai.ChatCompletion.acreate(
 27 |                     model=model,
 28 |                     messages=messages,
 29 |                     temperature=temperature,
 30 |                     max_tokens=max_tokens,
 31 |                     top_p=top_p,
 32 |                     stop=stop,
 33 |                 )
 34 |             except openai.error.OpenAIError:
 35 |                 logging.warning(
 36 |                     "OpenAI API rate limit exceeded. Sleeping for 10 seconds."
 37 |                 )
 38 |                 await asyncio.sleep(20)
 39 |             except asyncio.exceptions.TimeoutError:
 40 |                 logging.warning("OpenAI API timeout. Sleeping for 10 seconds.")
 41 |                 await asyncio.sleep(20)
 42 |         return {"choices": [{"message": {"content": ""}}]}
 43 | 
 44 | 
 45 | async def generate_from_openai_chat_completion(
 46 |     messages_list: List[Dict[str, str]],
 47 |     model: str,
 48 |     temperature: float,
 49 |     max_tokens: int,
 50 |     top_p: float,
 51 |     stop: Union[str, List[str]],
 52 |     requests_per_minute: int = 300,
 53 | ) -> List[str]:
 54 |     if model == "gpt-4":
 55 |         requests_per_minute = 200
 56 |     if "OPENAI_API_KEY" not in os.environ:
 57 |         raise ValueError(
 58 |             "OPENAI_API_KEY environment variable must be set when using OpenAI API."
 59 |         )
 60 |     print(os.environ["OPENAI_API_KEY"])
 61 |     openai.api_key = os.environ["OPENAI_API_KEY"]
 62 |     session = ClientSession()
 63 |     openai.aiosession.set(session)
 64 |     limiter = aiolimiter.AsyncLimiter(requests_per_minute)
 65 |     async_responses = [
 66 |         _throttled_openai_chat_completion_acreate(
 67 |             model=model,
 68 |             messages=messages,
 69 |             temperature=temperature,
 70 |             max_tokens=max_tokens,
 71 |             top_p=top_p,
 72 |             stop=stop,
 73 |             limiter=limiter,
 74 |         )
 75 |         for messages in messages_list
 76 |     ]
 77 |     responses = await tqdm_asyncio.gather(*async_responses)
 78 |     await session.close()
 79 |     # return [x["choices"][0]["message"]["content"] for x in responses]
 80 |     return responses
 81 | 
 82 | 
 83 | def gpt(prompt, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop=None) -> list:
 84 |     return gpts([prompt] * n, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop)
 85 | 
 86 | def gpts(prompts, model="gpt-4", temperature=0.7, max_tokens=1000, stop=None) -> list:
 87 |     print(f"Model: {model}, temperature: {temperature}, max_tokens: {max_tokens}")
 88 |     messages_list = [[{"role": "user", "content": prompt}] for prompt in prompts]
 89 |     return chatgpts(messages_list, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop)
 90 | 
 91 | def chatgpt(messages, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop=None) -> list:
 92 |     return chatgpts([messages] * n, model=model, temperature=temperature, max_tokens=max_tokens, stop=stop)
 93 | 
 94 | def chatgpts(messages_list, model="gpt-4", temperature=0.7, max_tokens=1000, stop=None) -> list:
 95 |     responses =  asyncio.run(generate_from_openai_chat_completion(model=model, messages_list=messages_list, temperature=temperature, max_tokens=max_tokens, top_p=1, stop=stop))
 96 |     texts = [x["choices"][0]["message"]["content"] for x in responses]
 97 |     # print(responses)
 98 |     global completion_tokens, prompt_tokens
 99 |     completion_tokens[model] += sum(x["usage"]["completion_tokens"] for x in responses if "usage" in x and "completion_tokens" in x["usage"])
100 |     prompt_tokens[model] += sum(x["usage"]["prompt_tokens"] for x in responses if "usage" in x and "prompt_tokens" in x["usage"])
101 |     return texts
102 | 
103 | def gpt_usage():
104 |     global completion_tokens, prompt_tokens
105 |     cost = completion_tokens["gpt-4"] / 1000 * 0.06 + prompt_tokens["gpt-4"] / 1000 * 0.03
106 |     cost += (completion_tokens["gpt-3.5-turbo"] + prompt_tokens["gpt-3.5-turbo"]) / 1000 * 0.0002
107 |     return {"completion_tokens": completion_tokens, "prompt_tokens": prompt_tokens, "cost": cost}
108 | 


--------------------------------------------------------------------------------
/regression/README.md:
--------------------------------------------------------------------------------
1 | # Logistic Regression
2 | 
3 | - `text_cot_train_table.tsv` - train table statistics where `correct` indicates whether GPT-4 solved the example correctly. Logistic rgeression model is fitted on this data in `regression.ipynb`. Obtained by running [eval.py](https://github.com/aksh555/deciphering_cot/eval.py) and `create_train_table.py`
4 | - `text_cot_test_table.tsv` - test table statistics
5 | - `text_cot_test_table_results.tsv` - test table statistics with predictions from the LR model.


--------------------------------------------------------------------------------
/regression/create_train_table.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
  4 | import tiktoken
  5 | import logging
  6 | import json
  7 | import pandas as pd
  8 | 
  9 | logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")])
 10 | 
 11 | if torch.cuda.is_available():
 12 |     device = "cuda"
 13 | else:
 14 |     device = "cpu"
 15 | 
 16 | gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
 17 | gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device)
 18 | gpt4_enc = tiktoken.get_encoding("cl100k_base")
 19 | 
 20 | def pad_batch(batch, pad_idx):
 21 |     max_length = 0
 22 |     for seq in batch:
 23 |         if len(seq) > max_length:
 24 |             max_length = len(seq)
 25 | 
 26 |     new_batch = []
 27 |     for seq in batch:
 28 |         padding = [pad_idx for i in range(max_length - len(seq))]
 29 |         new_batch.append(seq + padding)
 30 | 
 31 |     return new_batch
 32 | 
 33 | # Get perplexity using GPT-2
 34 | def prob_gpt2(sentence_list):
 35 | 
 36 |     # Tokenize the sentences
 37 |     all_tokens = []
 38 |     for sentence in sentence_list:
 39 |         tokens = gpt2_tokenizer.encode(sentence)
 40 |         all_tokens.append(tokens)
 41 |     tokens = pad_batch(all_tokens, 50256)
 42 | 
 43 |     targets = tokens[:]
 44 | 
 45 |     # Compute average log likelihood for the generation
 46 |     input_ids = torch.LongTensor(tokens).to(device)
 47 |     target_ids = torch.LongTensor(targets).to(device)
 48 | 
 49 |     with torch.no_grad():
 50 |         outputs = gpt2_model(input_ids, labels=target_ids)
 51 |         logits = outputs[1]
 52 |         logits = logits.transpose(0,1)[:-1].transpose(0,1)
 53 |         target_ids = target_ids.transpose(0,1)[1:].transpose(0,1)
 54 |         loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1))
 55 |         loss = loss.reshape(target_ids.shape).sum(dim=1)
 56 |         neg_log_likelihood = -1*loss
 57 | 
 58 |     # 13.357776641845703 = logprob('The word is"'); removing this to just get
 59 |     # the word prob
 60 |     return neg_log_likelihood + 13.357776641845703
 61 | 
 62 | df = pd.read_csv("text_cot_train_table.tsv",sep="\t")
 63 | word_list = df["input"].to_list()
 64 | print("Rows", len(word_list))
 65 | 
 66 | words_with_prob = []
 67 | this_batch_sentences = []
 68 | this_batch_words = []
 69 | num_tokens = []
 70 | for index, line in enumerate(word_list):
 71 |     if index % 10000 == 0:
 72 |         logging.info(str(index))
 73 | 
 74 |     word = line.strip()
 75 | 
 76 |     tokens = gpt4_enc.encode(word)
 77 |     tokens_spaced = gpt4_enc.encode(" " + word)
 78 | 
 79 |     this_batch_sentences.append('The word is "' + word + '"')
 80 |     this_batch_words.append(word)
 81 |     num_tokens.append(len(tokens))
 82 | 
 83 |     if len(this_batch_sentences) == 3000:
 84 |         logprobs = prob_gpt2(this_batch_sentences)
 85 |         for word, logprob in zip(this_batch_words, logprobs):
 86 |             words_with_prob.append(logprob.item())
 87 |         this_batch_sentences = []
 88 |         this_batch_words = []
 89 | 
 90 | if len(this_batch_sentences) > 0:
 91 |     logprobs = prob_gpt2(this_batch_sentences)
 92 |     for word, logprob in zip(this_batch_words, logprobs):
 93 |         words_with_prob.append(logprob.item())
 94 |     this_batch_sentences = []
 95 |     this_batch_words = []
 96 | 
 97 | df["input_logprob"] = words_with_prob
 98 | df["input_ntokens"] = num_tokens
 99 | 
100 | df.drop(["pred","gt","input"], axis=1, inplace=True)
101 | df = df[['input_ntokens', 'input_logprob', 'output_logprob', 'shift_level', 'shift_freq', 'bin']]
102 | df.to_csv("./text_cot_train_table.tsv", "\t",index_label="index")
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 


--------------------------------------------------------------------------------
/run_claude3.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import json
  3 | import argparse
  4 | from tqdm import tqdm
  5 | import os
  6 | import anthropic
  7 | import time
  8 | logging.getLogger().setLevel(logging.ERROR)
  9 | 
 10 | client = anthropic.Anthropic()
 11 | 
 12 | 
 13 | def claude_responses(prompt_list, model="claude-3-opus-20240229", max_tokens=1000, temperature=0.0):
 14 |     responses = []
 15 |     for prompt in tqdm(prompt_list):
 16 |         output = None
 17 |         for _ in range(10):
 18 |             try:
 19 |                 completion = client.messages.create(
 20 |                     model=model,
 21 |                     max_tokens=max_tokens,
 22 |                     temperature=temperature,
 23 |                     system="Provide only your answer, without any explanation.",
 24 |                     messages=[{"role":"user", "content": prompt}]
 25 |                 )
 26 | 
 27 |                 output = completion.content[0].text
 28 |                 if output is None:
 29 |                     output = ""
 30 |             except:
 31 |                 time.sleep(60)
 32 |             
 33 |             if not (output is None):
 34 |                 break
 35 | 
 36 |         if output is None:
 37 |             responses.append("")
 38 |         else:
 39 |             responses.append(output)
 40 |     return responses
 41 | 
 42 |     
 43 | 
 44 | 
 45 | def solve_file(name, model, temperature, max_tokens, prompt_type):
 46 |     file = f'stimuli/{prompt_type}/{name}.jsonl'
 47 |     if not os.path.exists(file):
 48 |         print(f'File {file} does not exist')
 49 |         return None
 50 |     with open(file, 'r') as f:
 51 |         lines = f.readlines()
 52 |     lines = [json.loads(line) for line in lines]
 53 |     prompts = [line['instruction_plus_input'] for line in lines]
 54 |     gts = [line['correct_output'] for line in lines]
 55 |     res = claude_responses(prompts, model=model, temperature=0.0, max_tokens=max_tokens)
 56 | 
 57 |     # These accs are not what we use in the paper - they're just for quick estimates. 
 58 |     # The stats used in the paper are computed in the evaluation/ folder
 59 |     accs = [(gt.replace('"', "") in r.replace('"', "")) for r, gt in zip(res, gts)]
 60 |     acc = sum(accs) / len(accs)
 61 |     print(f'Accuracy: {acc}')
 62 | 
 63 |     d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc}
 64 | 
 65 |     fo_directory = f'logs/{prompt_type}/{model}'
 66 |     if not os.path.exists(fo_directory):
 67 |         os.makedirs(fo_directory, exist_ok=True)
 68 |     
 69 |     output_file = f'{fo_directory}/{name}.json'
 70 |     with open(output_file, 'w') as f:
 71 |         json.dump(d, f)
 72 |     
 73 |     return d
 74 | 
 75 | 
 76 | def parse_args():
 77 |     args = argparse.ArgumentParser()
 78 |     args.add_argument('--tasks', type=str, required=True, help='split by comma')
 79 |     args.add_argument('--conditions', type=str, required=True, help='split by comma')
 80 |     args.add_argument('--model', type=str, required=True, choices=['claude-3'])
 81 |     args.add_argument('--max_tokens', type=int, help='default = 1000', default=1000)
 82 |     args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot")
 83 |     args = args.parse_args()
 84 |     return args
 85 | 
 86 | if __name__ == '__main__':
 87 |     args = parse_args()
 88 |     tasks = args.tasks.split(',')
 89 |     conditions = args.conditions.split(',')
 90 |     model = args.model
 91 |     prompt_type = args.prompt_type
 92 | 
 93 |     if model == "claude-3":
 94 |         model = "claude-3-opus-20240229"
 95 |     max_tokens = args.max_tokens
 96 | 
 97 |     for task in tasks:
 98 |         for condition in conditions:
 99 |             name = f'{task}_{condition}'
100 |             d = solve_file(name, model=model, temperature=0.0, max_tokens=max_tokens, prompt_type=prompt_type)
101 |             if d is not None:
102 |                 print(f'{name}, {model}: {d["acc"]:.2f}')
103 | 
104 | 


--------------------------------------------------------------------------------
/run_llama3.py:
--------------------------------------------------------------------------------
  1 | # python run_llama3.py --tasks cot1 --conditions bin1 --max_tokens 200 --model llama-3.1-405b
  2 | 
  3 | import logging
  4 | import json
  5 | import argparse
  6 | from tqdm import tqdm
  7 | import os
  8 | import together
  9 | import time
 10 | logging.getLogger().setLevel(logging.ERROR)
 11 | 
 12 | client = together.Together()
 13 | 
 14 | def process_prompt(prompt):
 15 |     prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n" + prompt + "\n<|start_header_id|>assistant<|end_header_id|>"
 16 |     return prompt
 17 | 
 18 | 
 19 | def llama_responses(prompt_list, model="llama-3-70b-chat-hf", max_tokens=1000, temperature=0.0):
 20 |     responses = []
 21 |     for prompt in tqdm(prompt_list):
 22 |         prompt = process_prompt(prompt)
 23 |         output = None
 24 |         for _ in range(10):
 25 |             try:
 26 |                 if "chat" in model:
 27 |                     output = client.chat.completions.create(
 28 |                                 messages = [{"role": "user", "content": prompt}], 
 29 |                                 model = "meta-llama/" + model,
 30 |                                 max_tokens = max_tokens,
 31 |                                 temperature = temperature,
 32 |                             )
 33 |                 else:
 34 |                     output = client.completions.create(
 35 |                                 prompt=prompt,
 36 |                                 model = "meta-llama/" + model,
 37 |                                 max_tokens = max_tokens,
 38 |                                 temperature = temperature,
 39 |                             )
 40 |             except:
 41 |                 time.sleep(1)
 42 |             
 43 |             if not (output is None):
 44 |                 break
 45 |         if "chat" in model:
 46 |             responses.append(output.choices[0].message.content)
 47 |         else:
 48 |             responses.append(output.choices[0].text)
 49 |     return responses
 50 | 
 51 | 
 52 | def solve_file(name, model, temperature, max_tokens, prompt_type):
 53 |     file = f'stimuli/{prompt_type}/{name}.jsonl'
 54 |     print(f"Loading {file}")
 55 |     if not os.path.exists(file):
 56 |         print(f'File {file} does not exist')
 57 |         return None
 58 |     with open(file, 'r') as f:
 59 |         lines = f.readlines()
 60 |     lines = [json.loads(line) for line in lines]
 61 |     prompts = [line['instruction_plus_input'] for line in lines]
 62 |     gts = [line['correct_output'] for line in lines]
 63 |     res = llama_responses(prompts, model=model, temperature=0.0, max_tokens=max_tokens)
 64 | 
 65 |     # These accs are not what we use in the paper - they're just for quick estimates. 
 66 |     # The stats used in the paper are computed in the evaluation/ folder
 67 |     accs = [(gt.replace('"', '') in r.replace('"', '')) for r, gt in zip(res, gts)]
 68 |     acc = sum(accs) / len(accs)
 69 |     print(f"Done {name}")
 70 |     print(f'Accuracy: {acc}')
 71 | 
 72 |     d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc}
 73 | 
 74 |     fo_directory = f'logs/{prompt_type}/{model}'
 75 |     if not os.path.exists(fo_directory):
 76 |         os.makedirs(fo_directory, exist_ok=True)
 77 |     
 78 |     output_file = f'{fo_directory}/{name}.json'
 79 |     with open(output_file, 'w') as f:
 80 |         json.dump(d, f)
 81 |     return d
 82 | 
 83 | 
 84 | def parse_args():
 85 |     args = argparse.ArgumentParser()
 86 |     args.add_argument('--tasks', type=str, required=True, help='split by comma')
 87 |     args.add_argument('--conditions', type=str, required=True, help='split by comma')
 88 |     args.add_argument('--model', type=str, required=True, choices=['llama-3-70b-chat', 'llama-3-70b', 'llama3-405b', 'llama3.1-70b'], default='llama3.1-405b')
 89 |     args.add_argument('--max_tokens', type=int, help='default = 1000', default=1000)
 90 |     args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot")
 91 |     args = args.parse_args()
 92 |     return args
 93 | 
 94 | if __name__ == '__main__':
 95 |     args = parse_args()
 96 |     tasks = args.tasks.split(',')
 97 |     conditions = args.conditions.split(',')
 98 |     model = args.model
 99 |     prompt_type = args.prompt_type
100 |     if model == 'llama-3-70b-chat':
101 |         model = 'llama-3-70b-chat-hf'
102 |     elif model == 'llama-3-70b':
103 |         model = 'meta-llama-3-70b'
104 |     elif model == 'llama3.1-405b':
105 |         model = 'Meta-Llama-3.1-405B-Instruct-Turbo'
106 |     elif model == 'llama3.1-70b':
107 |         model = 'Meta-Llama-3.1-70B-Instruct-Turbo'
108 |     max_tokens = args.max_tokens
109 | 
110 |     for task in tasks:
111 |         for condition in conditions:
112 |             name = f'{task}_{condition}'
113 |             d = solve_file(name, model=model, temperature=0.0, max_tokens=max_tokens, prompt_type=prompt_type)
114 |             if d is not None:
115 |                 print(f'{name}, {model}: {d["acc"]:.2f}')
116 | 
117 | 


--------------------------------------------------------------------------------
/run_o1.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import json
 3 | import argparse
 4 | from tqdm import tqdm
 5 | import os
 6 | logging.getLogger().setLevel(logging.INFO)
 7 | from openai import OpenAI,BadRequestError
 8 | client = OpenAI()
 9 | 
10 | def o1_responses(prompt_list):
11 |     responses = []
12 |     completion_tokens = []
13 |     for prompt in tqdm(prompt_list):
14 |         try:
15 |             response = client.chat.completions.create(
16 |                     model="o1-preview",
17 |                     messages=[
18 |                         {
19 |                             "role": "user", 
20 |                             "content": prompt
21 |                         }
22 |                     ]
23 |                 )
24 |             responses.append(response.choices[0].message.content)
25 |             completion_tokens.append(response.usage.completion_tokens_details["reasoning_tokens"])
26 |         except BadRequestError:
27 |             response = "BLOCKED_BY_OPENAI"
28 |             responses.append(response)
29 |             completion_tokens.append(0)
30 |         except Exception as e:
31 |             print(e)
32 |             response = "ERROR"
33 | 
34 |     return responses, completion_tokens
35 | 
36 | def solve_file(name, model):
37 |     # o1 does not require CoT prompts
38 |     file = f'stimuli/standard/{name}.jsonl'
39 |     if not os.path.exists(file):
40 |         print(f'File {file} does not exist')
41 |         return None
42 |     with open(file, 'r') as f:
43 |         lines = f.readlines()
44 |     lines = [json.loads(line) for line in lines]
45 |     print(file)
46 |     prompts = [line['instruction_plus_input'] for line in lines][:50]
47 |     gts = [line['correct_output'] for line in lines][:50]
48 |     
49 |     res, completion_tokens = o1_responses(prompts)
50 |     mean_tokens = sum(completion_tokens)/len(completion_tokens)
51 | 
52 |     # These accs are not what we use in the paper - they're just for quick estimates. 
53 |     # The stats used in the paper are computed in the evaluation/ folder
54 |     accs = [(gt.replace('"', "") in r.replace('"', "")) for r, gt in zip(res, gts)]
55 |     acc = sum(accs) / len(accs)
56 |     print("Completion tokens", mean_tokens)
57 | 
58 |     d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc, 'mean_completion_tokens':mean_tokens}
59 | 
60 |     output_file = f'logs/standard/{model}'
61 |     with open(output_file, 'w') as f:
62 |         json.dump(d, f)
63 |     
64 |     return d
65 | 
66 | 
67 | def parse_args():
68 |     args = argparse.ArgumentParser()
69 |     args.add_argument('--tasks', type=str, required=True, help='split by comma')
70 |     args.add_argument('--conditions', type=str, required=True, help='split by comma')
71 |     args.add_argument('--model', type=str, default='o1-preview-2024-09-12')
72 |     
73 |     args = args.parse_args()
74 |     return args
75 | 
76 | if __name__ == '__main__':
77 |     args = parse_args()
78 |     tasks = args.tasks.split(',')
79 |     conditions = args.conditions.split(',')
80 |     model = args.model
81 | 
82 |     for task in tasks:
83 |         for condition in conditions:
84 |             name = f'{task}_{condition}'
85 |             d = solve_file(name, model=model)
86 |             if d is not None:
87 |                 print(f'{name}, {model}: {d["acc"]:.2f}')
88 |                 print("Completion tokens", d["mean_completion_tokens"])
89 | 
90 | 


--------------------------------------------------------------------------------
/run_openai.py:
--------------------------------------------------------------------------------
 1 | from models.openai_help import gpts
 2 | import logging
 3 | import json
 4 | import argparse
 5 | import os
 6 | 
 7 | logging.getLogger().setLevel(logging.WARNING)
 8 | 
 9 | def edit_distance(s1: str, s2: str) -> int:
10 |     """Compute the Levenshtein distance between two strings."""
11 |     if len(s1) < len(s2):
12 |         return edit_distance(s2, s1)
13 |     if len(s2) == 0:
14 |         return len(s1)
15 |     previous_row = range(len(s2) + 1)
16 |     for i, c1 in enumerate(s1):
17 |         current_row = [i + 1]
18 |         for j, c2 in enumerate(s2):
19 |             insertions = previous_row[j + 1] + 1
20 |             deletions = current_row[j] + 1
21 |             substitutions = previous_row[j] + (c1 != c2)
22 |             current_row.append(min(insertions, deletions, substitutions))
23 |         previous_row = current_row
24 |     return previous_row[-1]
25 | 
26 | 
27 | def solve_file(name, model, temperature, max_tokens, prompt_type):
28 |     file = f'stimuli/{prompt_type}/{name}.jsonl'
29 |     print(f"Loading {file}")
30 |     if not os.path.exists(file):
31 |         print(f'File {file} does not exist')
32 |         return None
33 |     with open(file, 'r') as f:
34 |         lines = f.readlines()
35 |     lines = [json.loads(line) for line in lines]
36 |     prompts = [line['instruction_plus_input'] for line in lines]
37 |     gts = ['"' + line['correct_output'] + '"' for line in lines]
38 |     res = gpts(prompts, model=model, temperature=temperature, max_tokens=max_tokens)
39 |     accs = [(r == gt) for r, gt in zip(res, gts)]
40 |     eds = [edit_distance(r, gt) for r, gt in zip(res, gts)]
41 |     acc = sum(accs) / len(accs)
42 |     ed = sum(eds) / len(eds)
43 |     print(f"Done {name}")
44 |     d = {'prompts': prompts, 'gts': gts, 'res': res, 'accs': accs, 'acc': acc, 'eds': eds, 'ed': ed}
45 | 
46 |     fo_directory = f'logs/{prompt_type}/{model}'
47 |     if not os.path.exists(fo_directory):
48 |         os.makedirs(fo_directory, exist_ok=True)
49 |     
50 |     output_file = f'{fo_directory}/{name}.json'
51 |     with open(output_file, 'w') as f:
52 |         json.dump(d, f)
53 |     
54 |     return d
55 | 
56 | 
57 | def parse_args():
58 |     args = argparse.ArgumentParser()
59 |     args.add_argument('--tasks', type=str, required=True, help='split by comma')
60 |     args.add_argument('--conditions', type=str, required=True, help='split by comma')
61 |     args.add_argument('--model', type=str, default='gpt-4-0613')
62 |     args.add_argument('--max_tokens', type=int, help='default = 200', default=200)
63 |     args.add_argument('--temperature', type=float, help='default = 0.0', default=0.0)
64 |     args.add_argument("--prompt_type", type=str, help="Prompt type to use [standard, text_cot, math_cot, number_cot]", default="text_cot")
65 |     args = args.parse_args()
66 |     return args
67 | 
68 | if __name__ == '__main__':
69 |     args = parse_args()
70 |     tasks = args.tasks.split(',')
71 |     conditions = args.conditions.split(',')
72 |     model = args.model
73 |     max_tokens = args.max_tokens
74 |     temperature = args.temperature
75 |     prompt_type = args.prompt_type
76 | 
77 |     for task in tasks:
78 |         for condition in conditions:
79 |             name = f'{task}_{condition}'
80 |             d = solve_file(name, model=model, temperature=temperature, max_tokens=max_tokens, prompt_type=prompt_type)
81 |             if d is not None:
82 |                 print(f'{name}, {model}: {d["acc"]:.2f} ({d["ed"]:.2f})')
83 | 
84 | 


--------------------------------------------------------------------------------
/seven_letter_words/README.md:
--------------------------------------------------------------------------------
1 | ## Dataset
2 | 1. First, run `python random_token_combos.py`. This generates `random_pairs_lower.txt`, which lists all words that fulfill the following criteria:
3 |     - 7 letters long
4 |     - 2 subword tokens long (using the tokenizer that both GPT-3.5 and GPT-4 use; it needs to be 2 tokens long whether the word follows a space or not)
5 |     - The first subword token is 3 letters long, and the second is 4 letters long (again, these lengths need to be identical whether the word follows a space or not).
6 | 2. Then, sort these words by the probability assigned to them by GPT-2 by running `python gpt2_prob_sevenletter.py`. This generates `random_pairs_lower_scored.txt`, which lists each word along with a log probability. The log probability is computed as the log probability that GPT-2 assigns to the sentence `The word is "WORD"`, minus the log probability that it assigns to `The word is "'; thus, this yields the log probability assigned to just the word and the following quotation mark in the context of `The word is "`. The closing quotation mark is included because it serves to indicate the end of the word.
7 | 3. Then, bin the words by running `python select_words.py` to create `words_5bins.txt`.
8 | 4. The final list of words can be found in `bin1_prob.txt`, `bin2_prob.txt`, `bin3_prob.txt`, `bin4_prob.txt`, and `bin5_prob.txt`. 
9 | 


--------------------------------------------------------------------------------
/seven_letter_words/bin1_prob.txt:
--------------------------------------------------------------------------------
  1 | choosed -14.997272491455078
  2 | colbert -14.996980667114258
  3 | polenta -14.99655532836914
  4 | modicum -15.007698059082031
  5 | autarch -14.99172592163086
  6 | schisms -14.989496231079102
  7 | mariner -15.0106201171875
  8 | disarms -15.0106201171875
  9 | rescale -14.989356994628906
 10 | paywall -14.986217498779297
 11 | infobox -14.98541259765625
 12 | preston -15.015327453613281
 13 | shrines -15.016551971435547
 14 | implore -14.982894897460938
 15 | alloted -15.01815414428711
 16 | precast -15.020370483398438
 17 | borings -14.978897094726562
 18 | bacilli -15.022220611572266
 19 | matrice -15.022846221923828
 20 | redible -14.974870681762695
 21 | absolve -15.026111602783203
 22 | ourself -14.973335266113281
 23 | ethetic -15.026788711547852
 24 | maynard -15.027372360229492
 25 | calibur -15.027730941772461
 26 | enviros -15.02823257446289
 27 | calzone -14.970394134521484
 28 | sumatra -14.96739387512207
 29 | drywall -15.033981323242188
 30 | impaled -14.965522766113281
 31 | manland -15.03862190246582
 32 | divined -14.960699081420898
 33 | conlang -14.959224700927734
 34 | tablero -14.95616340637207
 35 | redraft -14.955455780029297
 36 | equitas -15.044797897338867
 37 | ratting -14.953641891479492
 38 | errancy -15.04793930053711
 39 | webcast -14.94735336303711
 40 | lowland -15.053237915039062
 41 | boyhood -15.053678512573242
 42 | actuary -14.945014953613281
 43 | catlike -15.055164337158203
 44 | putback -15.056617736816406
 45 | galileo -14.942996978759766
 46 | rivaled -15.057003021240234
 47 | volonte -14.942134857177734
 48 | sunspot -15.059274673461914
 49 | rotunda -14.940404891967773
 50 | notched -15.06007194519043
 51 | taproot -14.935928344726562
 52 | secures -15.066566467285156
 53 | entente -14.93320083618164
 54 | outflow -15.066858291625977
 55 | betters -15.067663192749023
 56 | rumpled -14.930889129638672
 57 | burried -15.070535659790039
 58 | repulse -14.92904281616211
 59 | fillets -14.926876068115234
 60 | relator -14.92681884765625
 61 | sombody -15.074382781982422
 62 | unsaved -15.074520111083984
 63 | ailment -15.075027465820312
 64 | nodules -15.075050354003906
 65 | montero -14.922632217407227
 66 | satires -15.080968856811523
 67 | arcadia -14.916393280029297
 68 | valerie -14.915924072265625
 69 | inglish -15.085016250610352
 70 | dukedom -15.086551666259766
 71 | espouse -14.913402557373047
 72 | bedevil -14.911296844482422
 73 | reticle -15.089393615722656
 74 | matinee -15.089693069458008
 75 | maxwell -14.909908294677734
 76 | picante -14.90963363647461
 77 | baboons -14.908744812011719
 78 | exciter -15.092048645019531
 79 | losings -14.907678604125977
 80 | newbies -14.906318664550781
 81 | serried -14.90548324584961
 82 | curving -14.904655456542969
 83 | narrows -15.09649658203125
 84 | ragging -14.901836395263672
 85 | baneful -15.099411010742188
 86 | pinatas -14.89979362487793
 87 | divison -15.100841522216797
 88 | kinfolk -14.898719787597656
 89 | indiana -14.898597717285156
 90 | caritas -14.8953857421875
 91 | silvery -14.893852233886719
 92 | inkling -14.893333435058594
 93 | absense -15.10746955871582
 94 | lavabit -14.890359878540039
 95 | outsize -14.88975715637207
 96 | rewired -15.111268997192383
 97 | absalom -15.113567352294922
 98 | getback -15.114919662475586
 99 | accuser -14.884925842285156
100 | striven -15.115121841430664
101 | maloney -15.116886138916016
102 | escaper -14.882984161376953
103 | subtile -15.119136810302734
104 | colibri -14.879827499389648
105 | delving -14.87982177734375
106 | calving -14.879753112792969
107 | tarheel -14.878677368164062
108 | herders -14.876302719116211
109 | grooved -14.875177383422852
110 | octagon -15.125707626342773
111 | bisping -15.126806259155273
112 | alluded -14.872251510620117
113 | merlion -15.128215789794922
114 | figural -15.129623413085938
115 | debater -14.869804382324219
116 | pigtail -14.867530822753906
117 | honious -15.13395881652832
118 | pinches -15.135322570800781
119 | clojure -14.863956451416016
120 | equates -14.861526489257812
121 | refiner -15.138694763183594
122 | billets -15.140663146972656
123 | alfalfa -15.141242980957031
124 | hotshot -14.858383178710938
125 | nonagon -15.142745971679688
126 | jacuzzi -14.857048034667969
127 | vincent -15.143632888793945
128 | pollock -14.855628967285156
129 | airtime -14.85552978515625


--------------------------------------------------------------------------------
/seven_letter_words/bin2_prob.txt:
--------------------------------------------------------------------------------
  1 | dupasha -22.5
  2 | makrita -22.499996185302734
  3 | ferisse -22.499996185302734
  4 | murcers -22.49999237060547
  5 | metires -22.49999237060547
  6 | witmost -22.50000762939453
  7 | astause -22.50000762939453
  8 | sekaram -22.500011444091797
  9 | vilgren -22.500015258789062
 10 | belomat -22.500019073486328
 11 | setnest -22.499977111816406
 12 | curadal -22.49997329711914
 13 | viridon -22.50002670288086
 14 | denpick -22.50002670288086
 15 | eraully -22.50003433227539
 16 | ruborie -22.500041961669922
 17 | queimer -22.499950408935547
 18 | cosuits -22.499950408935547
 19 | rutamen -22.499942779541016
 20 | graizen -22.499942779541016
 21 | sonware -22.500057220458984
 22 | infocos -22.500057220458984
 23 | inkwang -22.49993896484375
 24 | rowbots -22.499935150146484
 25 | engeden -22.500064849853516
 26 | vizizen -22.50006866455078
 27 | molenci -22.499927520751953
 28 | indotes -22.499927520751953
 29 | dapener -22.500076293945312
 30 | ireasti -22.50008773803711
 31 | undving -22.499900817871094
 32 | traumpt -22.499900817871094
 33 | redrear -22.500099182128906
 34 | aryanni -22.499897003173828
 35 | brovoir -22.500102996826172
 36 | greised -22.499893188476562
 37 | networm -22.499889373779297
 38 | memwill -22.500110626220703
 39 | gamplus -22.499881744384766
 40 | estplay -22.499881744384766
 41 | sapwhat -22.500118255615234
 42 | indmong -22.500118255615234
 43 | kenafil -22.5001220703125
 44 | denzhou -22.5001220703125
 45 | cosited -22.5001220703125
 46 | perzoek -22.500125885009766
 47 | balinit -22.500125885009766
 48 | mayonal -22.499866485595703
 49 | armemic -22.499866485595703
 50 | henjury -22.500133514404297
 51 | lavplay -22.500141143798828
 52 | calynes -22.49985122680664
 53 | remfold -22.50014877319336
 54 | engdist -22.50014877319336
 55 | armrich -22.50014877319336
 56 | luxfast -22.499847412109375
 57 | mulhatt -22.49984359741211
 58 | allaton -22.49984359741211
 59 | strfair -22.50015640258789
 60 | monachs -22.50015640258789
 61 | kerapat -22.50015640258789
 62 | hergrim -22.50015640258789
 63 | fidgota -22.50015640258789
 64 | decigan -22.500160217285156
 65 | dezella -22.499835968017578
 66 | haypath -22.500164031982422
 67 | resonga -22.499820709228516
 68 | nosband -22.499820709228516
 69 | poligen -22.500179290771484
 70 | mobture -22.49981689453125
 71 | flufrom -22.50018310546875
 72 | willose -22.49980926513672
 73 | desedge -22.50019073486328
 74 | momclub -22.499805450439453
 75 | clobero -22.499801635742188
 76 | mapauth -22.499797821044922
 77 | vitelho -22.500205993652344
 78 | daykick -22.500205993652344
 79 | sysmite -22.500213623046875
 80 | telolon -22.50021743774414
 81 | onsensa -22.50021743774414
 82 | vipaddy -22.500225067138672
 83 | sunrink -22.500225067138672
 84 | namhero -22.500225067138672
 85 | voratio -22.499771118164062
 86 | niliter -22.499771118164062
 87 | droones -22.499767303466797
 88 | zipcord -22.500232696533203
 89 | pagrete -22.500232696533203
 90 | funwich -22.500232696533203
 91 | negbers -22.499759674072266
 92 | belwich -22.499759674072266
 93 | allayah -22.499759674072266
 94 | pakatak -22.500240325927734
 95 | farathy -22.500240325927734
 96 | betweek -22.500244140625
 97 | rutanim -22.500247955322266
 98 | obsster -22.500255584716797
 99 | ligigid -22.500255584716797
100 | lidcore -22.500255584716797
101 | vacassa -22.499740600585938
102 | pipiday -22.499736785888672
103 | almorum -22.499736785888672
104 | sadmore -22.500263214111328
105 | hayhorn -22.49972915649414
106 | vinango -22.49972152709961
107 | cosisty -22.50027847290039
108 | libikal -22.499713897705078
109 | dogodes -22.500286102294922
110 | camcore -22.500286102294922
111 | ashmann -22.500286102294922
112 | fibunal -22.500289916992188
113 | enciere -22.499706268310547
114 | revrika -22.49969482421875
115 | perburg -22.500308990478516
116 | camilan -22.500308990478516
117 | sumarms -22.50031280517578
118 | firigin -22.500316619873047
119 | pelatra -22.499675750732422
120 | vorvery -22.500328063964844
121 | purabra -22.500328063964844
122 | indondo -22.50033187866211
123 | dogpeak -22.50033187866211
124 | alllein -22.50033187866211
125 | actblue -22.49966049194336
126 | hasvers -22.50033950805664
127 | freifty -22.499652862548828
128 | hueving -22.500347137451172
129 | coratti -22.499649047851562
130 | saprika -22.499645233154297
131 | honcoin -22.499645233154297
132 | joycons -22.50035858154297
133 | dogoids -22.50035858154297
134 | nanians -22.499637603759766
135 | dreanon -22.499637603759766
136 | spoanna -22.4996337890625
137 | levieur -22.4996337890625
138 | jawolla -22.5003662109375
139 | cowcard -22.5003662109375
140 | thehalb -22.499629974365234
141 | lamboys -22.499629974365234
142 | disorer -22.499629974365234
143 | pigwiki -22.500370025634766
144 | embious -22.500370025634766
145 | detdden -22.500370025634766
146 | vacibel -22.499622344970703


--------------------------------------------------------------------------------
/seven_letter_words/bin3_prob.txt:
--------------------------------------------------------------------------------
  1 | tasvinc -30.0
  2 | dblshaw -29.999996185302734
  3 | cmbodka -29.999996185302734
  4 | zagbbox -30.000003814697266
  5 | hedoute -30.000003814697266
  6 | cmsdest -30.00000762939453
  7 | leoanje -29.999988555908203
  8 | sitinks -29.999984741210938
  9 | oweorno -29.999984741210938
 10 | advpite -29.999984741210938
 11 | grpwerk -30.000015258789062
 12 | aesasio -29.999980926513672
 13 | atequir -30.000019073486328
 14 | dryhazi -30.000022888183594
 15 | styansa -29.99997329711914
 16 | sunincl -30.00002670288086
 17 | bowamac -30.00002670288086
 18 | xyzunik -29.999969482421875
 19 | awsposs -30.000030517578125
 20 | ogrmode -29.99996566772461
 21 | midbyss -29.99996566772461
 22 | ctlmony -29.99996566772461
 23 | rngmony -30.00003433227539
 24 | rergett -29.999961853027344
 25 | phperti -29.999961853027344
 26 | bfdizzy -30.000041961669922
 27 | srcstit -29.999950408935547
 28 | pktubic -29.999950408935547
 29 | oddourd -29.999950408935547
 30 | mplnick -29.999950408935547
 31 | dccergy -29.999942779541016
 32 | oxyhest -30.000057220458984
 33 | klepled -29.99993896484375
 34 | digydro -29.99993896484375
 35 | aphopez -29.99993896484375
 36 | rifntag -30.00006103515625
 37 | srvlope -29.999935150146484
 38 | emoomez -29.999935150146484
 39 | toyelry -30.000064849853516
 40 | iniilen -30.000064849853516
 41 | iffamma -30.000064849853516
 42 | adsokin -29.99993133544922
 43 | eofpike -30.00006866455078
 44 | dnsavia -30.00006866455078
 45 | uitlesi -30.000072479248047
 46 | owluntu -30.000072479248047
 47 | affesda -29.999923706054688
 48 | mgrulia -30.000080108642578
 49 | foxmsgs -30.000080108642578
 50 | esiaram -30.000080108642578
 51 | subzyst -29.999916076660156
 52 | ottexpo -30.000083923339844
 53 | udpcolo -29.999908447265625
 54 | vakdney -29.99990463256836
 55 | svmvery -29.99990463256836
 56 | dspereo -29.99990463256836
 57 | pngpone -30.00009536743164
 58 | quiilyn -29.999900817871094
 59 | tgtella -30.000102996826172
 60 | ithueur -30.000102996826172
 61 | wynvinc -30.000106811523438
 62 | sezanch -30.000106811523438
 63 | sdkjabi -30.000106811523438
 64 | yaninem -29.999889373779297
 65 | dbgivid -29.999889373779297
 66 | adeardu -29.999889373779297
 67 | paykich -30.000110626220703
 68 | dspdeal -30.000110626220703
 69 | cptwipe -30.000110626220703
 70 | nikaign -29.99988555908203
 71 | pesuell -30.00011444091797
 72 | musropp -30.00011444091797
 73 | ebxside -30.00011444091797
 74 | dnienez -30.000118255615234
 75 | dccscal -30.000118255615234
 76 | cmbheck -30.000118255615234
 77 | stsasks -29.999874114990234
 78 | hapixer -29.99987030029297
 79 | nikuild -30.00012969970703
 80 | wowrapy -30.000133514404297
 81 | txtajes -30.000133514404297
 82 | gtkoooo -30.000133514404297
 83 | sutcmds -30.000137329101562
 84 | erviode -29.999858856201172
 85 | bewikon -30.000141143798828
 86 | hubphas -29.99985122680664
 87 | ervpets -29.99985122680664
 88 | ofsitem -29.99984359741211
 89 | gstivec -29.99984359741211
 90 | utfestr -30.00015640258789
 91 | etaabic -30.00015640258789
 92 | tieibur -29.999839782714844
 93 | islssel -30.000160217285156
 94 | iodvari -30.000160217285156
 95 | zagzept -29.999835968017578
 96 | ustjour -29.999835968017578
 97 | dexonte -29.999835968017578
 98 | bizfilt -29.999835968017578
 99 | adaowns -29.999835968017578
100 | tetibri -30.000164031982422
101 | octfirm -29.999828338623047
102 | weiudos -30.000171661376953
103 | pwdtick -30.000171661376953
104 | ttlarry -29.99981689453125
105 | stuimeo -29.999813079833984
106 | sqlstre -29.999813079833984
107 | mieipeg -29.999813079833984
108 | dueafen -29.999813079833984
109 | sndurge -29.99980926513672
110 | vezcorn -30.00019073486328
111 | ilketch -29.999805450439453
112 | zugenth -30.000194549560547
113 | rngiate -30.000194549560547
114 | ottclud -30.000194549560547
115 | aprkeep -30.000194549560547
116 | urlveal -30.000198364257812
117 | msgourd -30.000198364257812
118 | xlsboom -29.999797821044922
119 | wijagma -29.999797821044922
120 | robisbn -29.999797821044922
121 | melmlin -29.999797821044922
122 | samslot -30.000202178955078
123 | nidoust -29.999794006347656
124 | begkits -29.999794006347656
125 | arrflix -29.999794006347656
126 | ditfrau -30.000205993652344
127 | aidomid -30.000205993652344
128 | cptfoto -29.99979019165039
129 | aimrede -29.99979019165039
130 | dbgabay -30.00020980834961
131 | cidlocs -30.00020980834961
132 | booiedo -30.000221252441406
133 | mplders -29.999774932861328
134 | cptpush -30.000225067138672
135 | nahcalc -29.999767303466797
136 | amyovel -29.999767303466797
137 | wonczas -30.00023651123047
138 | mplrome -30.00023651123047
139 | edxesis -30.00023651123047
140 | adcadoo -30.00023651123047
141 | oudtems -29.999759674072266
142 | ociirut -29.999759674072266
143 | balzept -29.999759674072266
144 | avgcorp -29.999759674072266
145 | himocos -30.000240325927734
146 | ignlots -29.999755859375
147 | baztrim -29.999755859375


--------------------------------------------------------------------------------
/seven_letter_words/bin4_prob.txt:
--------------------------------------------------------------------------------
  1 | voyxfff -37.500118255615234
  2 | qtyijke -37.50014877319336
  3 | mmculed -37.50022888183594
  4 | jmpytut -37.500362396240234
  5 | vtkprit -37.500396728515625
  6 | oilrxjs -37.50044631958008
  7 | vfsisex -37.499473571777344
  8 | eenqrst -37.49935531616211
  9 | nbrlyph -37.50071334838867
 10 | xmmgota -37.49924850463867
 11 | jmpquiv -37.49921798706055
 12 | rummqtt -37.50099182128906
 13 | xhrdisp -37.49892044067383
 14 | ffturaa -37.498897552490234
 15 | dexocht -37.50111770629883
 16 | xmmgett -37.501121520996094
 17 | lvljspx -37.49882125854492
 18 | zugwpdb -37.501182556152344
 19 | tidmqtt -37.49877166748047
 20 | lhsigua -37.498714447021484
 21 | sshemsp -37.50141525268555
 22 | burrgyz -37.49848556518555
 23 | vtkirie -37.498477935791016
 24 | vtkifar -37.501522064208984
 25 | rpczano -37.50154495239258
 26 | vtkinez -37.501609802246094
 27 | vtkifie -37.49838638305664
 28 | zugymce -37.50162124633789
 29 | xcbwent -37.49831008911133
 30 | watobjs -37.49827194213867
 31 | doiawks -37.49827194213867
 32 | cgiacyj -37.498165130615234
 33 | czyands -37.501853942871094
 34 | mdbgebn -37.49811553955078
 35 | atejspx -37.50190353393555
 36 | rndxito -37.49806594848633
 37 | sdkrxjs -37.501953125
 38 | mlxoice -37.501956939697266
 39 | mlxahan -37.50198745727539
 40 | auxjspx -37.5020751953125
 41 | jsxirms -37.50211715698242
 42 | czyrgba -37.49782943725586
 43 | makrgyz -37.5021858215332
 44 | nanighb -37.49776840209961
 45 | jsxobil -37.502262115478516
 46 | jwtgraf -37.49773406982422
 47 | vtkundy -37.49770736694336
 48 | jsxuden -37.49759292602539
 49 | pszglfw -37.50242233276367
 50 | czydamn -37.49753952026367
 51 | csvylko -37.502559661865234
 52 | wijincl -37.497379302978516
 53 | oilrgyz -37.49725341796875
 54 | mlxulan -37.497215270996094
 55 | xmmepar -37.50278854370117
 56 | lodxlsx -37.502803802490234
 57 | uczpeon -37.502864837646484
 58 | sesrgyz -37.49709701538086
 59 | pciavax -37.497066497802734
 60 | gpsilik -37.497066497802734
 61 | lhszion -37.49706268310547
 62 | slaampp -37.49705505371094
 63 | uczhtag -37.502952575683594
 64 | ouiqrst -37.50295639038086
 65 | xhrziel -37.49697494506836
 66 | pcbpiar -37.49697494506836
 67 | yumxfff -37.49691390991211
 68 | fedjspb -37.50309371948242
 69 | xmmtega -37.49677658081055
 70 | segzoek -37.50347137451172
 71 | mezgrpc -37.503543853759766
 72 | xcbophe -37.503658294677734
 73 | ngxantz -37.49628829956055
 74 | aosantd -37.49628829956055
 75 | jejymax -37.50380325317383
 76 | rerlsru -37.50386428833008
 77 | racrgyz -37.50387954711914
 78 | rndquam -37.4961051940918
 79 | mlxneau -37.50391387939453
 80 | rudcych -37.503944396972656
 81 | lotlsru -37.50399398803711
 82 | abyilog -37.496002197265625
 83 | rsaueba -37.504032135009766
 84 | jsxioso -37.49593734741211
 85 | derjspx -37.50411605834961
 86 | vfsgett -37.49586486816406
 87 | vtkjure -37.495849609375
 88 | phyepar -37.4958381652832
 89 | vesxfff -37.5041618347168
 90 | lcdleri -37.50421142578125
 91 | ifsfeas -37.49577713012695
 92 | mmcubbo -37.50423812866211
 93 | ircemsp -37.49563217163086
 94 | pdbiesz -37.495601654052734
 95 | rpciene -37.49557876586914
 96 | iodpiar -37.50454330444336
 97 | rmslsru -37.504615783691406
 98 | rpcumno -37.50465774536133
 99 | apkckpt -37.50466537475586
100 | lcdvoir -37.495269775390625
101 | rhsncia -37.50473403930664
102 | owlsetq -37.4952278137207
103 | ifsbrtc -37.50477600097656
104 | csvowej -37.495140075683594
105 | xcborgt -37.495121002197266
106 | sutmobx -37.495079040527344
107 | iovstmt -37.50493240356445
108 | nanmqtt -37.504947662353516
109 | irqphem -37.504947662353516
110 | wndncia -37.494964599609375
111 | xcbided -37.49495315551758
112 | jsxkees -37.49488067626953
113 | cpscsrf -37.494773864746094
114 | jmppeon -37.49476623535156
115 | lhsreta -37.5052375793457
116 | dezrgyz -37.50527572631836
117 | elecsrf -37.50535202026367
118 | atrlymp -37.505374908447266
119 | iodudev -37.494544982910156
120 | xhrkses -37.505516052246094
121 | ngxjspx -37.49443435668945
122 | uczpear -37.49442672729492
123 | npmhlen -37.49440002441406
124 | pcmncmp -37.505611419677734
125 | biczoek -37.49436569213867
126 | dosorrh -37.50564956665039
127 | jejmisc -37.49434280395508
128 | kenjspx -37.494293212890625
129 | idxiaux -37.505767822265625
130 | svgiesz -37.494205474853516
131 | vtkgems -37.49415969848633
132 | glmldre -37.49413299560547
133 | dexumbn -37.50587844848633
134 | kitxfff -37.49406814575195
135 | jsxajan -37.4940071105957
136 | fmtmina -37.49399185180664
137 | gtkthew -37.49397659301758
138 | czyuess -37.50605010986328
139 | iodhait -37.49386978149414
140 | cafantd -37.506141662597656
141 | xcbredo -37.49382400512695
142 | fpswpdb -37.50624465942383
143 | xcbdogs -37.50633239746094
144 | jwtlify -37.493656158447266
145 | rsaellt -37.493629455566406
146 | pkgughs -37.50637435913086
147 | jmpccak -37.49350357055664
148 | pclvais -37.49347686767578


--------------------------------------------------------------------------------
/seven_letter_words/bin5_prob.txt:
--------------------------------------------------------------------------------
  1 | czyjspx -44.995792388916016
  2 | xcbabwe -45.006473541259766
  3 | aktjspx -44.99137878417969
  4 | xcbcych -44.979515075683594
  5 | xcbziej -45.07548141479492
  6 | xmmeczy -44.91748046875
  7 | qeddhcp -45.09950637817383
  8 | xcbilha -44.897335052490234
  9 | xcbacji -44.8853874206543
 10 | xcbzung -45.1260871887207
 11 | xmmobre -44.83869552612305
 12 | xcbquir -45.17741775512695
 13 | xcbrouw -45.2041015625
 14 | ilkjspx -45.20814895629883
 15 | lijglfw -44.79149627685547
 16 | foxrgyz -45.21918869018555
 17 | jsxrouw -44.767459869384766
 18 | xcbziel -45.23471450805664
 19 | xcbagua -44.763145446777344
 20 | eidtopl -45.24649429321289
 21 | xcbximo -44.73112106323242
 22 | jwtglfw -44.719486236572266
 23 | xcbnerg -44.71344757080078
 24 | xcbateg -44.693031311035156
 25 | befjspx -44.69113540649414
 26 | xcbxlim -44.65083694458008
 27 | xcbsemi -44.63022994995117
 28 | ketglfw -45.387977600097656
 29 | lemjspx -44.60933303833008
 30 | xcbcyan -44.60453414916992
 31 | xcbsequ -45.410953521728516
 32 | xcbemer -45.411563873291016
 33 | eoscsrf -44.56328201293945
 34 | xcbphot -44.541378021240234
 35 | xcbeken -44.509586334228516
 36 | xcbolum -44.500850677490234
 37 | xcbrodu -45.50664520263672
 38 | tepjspx -44.49314880371094
 39 | xcbthro -44.48517990112305
 40 | xcbueue -44.48493957519531
 41 | oscquiv -44.44233322143555
 42 | xcbubah -45.56185531616211
 43 | xcbodzi -44.43584060668945
 44 | mlxquee -45.57368850708008
 45 | xcbmdat -45.59005355834961
 46 | xcbuell -44.409183502197266
 47 | xcbobre -44.40824890136719
 48 | xcbuhan -44.403106689453125
 49 | tasexpl -45.62323760986328
 50 | xcbueil -44.36052322387695
 51 | xcbilos -45.64400100708008
 52 | iodtopl -45.644203186035156
 53 | suttmpl -44.34950637817383
 54 | xcbhots -44.319889068603516
 55 | xcbosph -44.319034576416016
 56 | xcbuego -44.309486389160156
 57 | xcbquam -44.30044174194336
 58 | kolglfw -44.29965591430664
 59 | gesglfw -44.296722412109375
 60 | gccorrh -44.29584503173828
 61 | mezptom -44.289695739746094
 62 | xcbhecy -45.71607971191406
 63 | xcbsemb -44.264095306396484
 64 | yiijspx -44.26384353637695
 65 | meljspx -44.260704040527344
 66 | xcbunos -45.74428939819336
 67 | xcbunei -44.22948455810547
 68 | pisbrtc -44.21781539916992
 69 | vehjspx -44.210479736328125
 70 | vasrgyz -44.190887451171875
 71 | lhsrgyz -44.180213928222656
 72 | xcbighb -45.82477951049805
 73 | phyfidf -44.17029571533203
 74 | kilglfw -45.8333625793457
 75 | dukvrir -44.16157150268555
 76 | levjspx -44.15993881225586
 77 | updrgyz -44.14170837402344
 78 | xcbagas -44.1334228515625
 79 | opcrgyz -44.13212585449219
 80 | ilkjspb -44.12828063964844
 81 | curfidf -44.114540100097656
 82 | rpcighb -45.8897590637207
 83 | xcbacje -44.10778045654297
 84 | xcbilih -45.9096794128418
 85 | zugcsrf -44.060035705566406
 86 | xcbveau -44.05826187133789
 87 | rpcasje -44.04568862915039
 88 | xcbalsy -44.04135513305664
 89 | pcmrouw -44.037845611572266
 90 | xcbafil -44.035858154296875
 91 | doijspx -44.03323745727539
 92 | xcbhtub -44.029544830322266
 93 | xcbhear -45.983673095703125
 94 | xcbuele -45.988529205322266
 95 | opijspx -43.99332809448242
 96 | xcbazzo -43.992305755615234
 97 | xcboufl -46.008460998535156
 98 | akojspx -43.9888801574707
 99 | ninmqtt -43.98078536987305
100 | xcbguna -43.96329879760742
101 | idxorrh -43.9370002746582
102 | xcbheit -43.93656921386719
103 | czyxfff -43.92329406738281
104 | voyglfw -43.90713882446289
105 | dynmqtt -43.902496337890625
106 | xcbcoln -46.09786605834961
107 | vezjspx -43.87360763549805
108 | xcbocre -46.13079071044922
109 | cueorrh -43.85930633544922
110 | xmmacje -43.854305267333984
111 | mlxalsy -43.84138870239258
112 | ebxorrh -43.837650299072266
113 | xcbagal -43.82956314086914
114 | xcbzept -43.82637405395508
115 | xcbucle -43.81629180908203
116 | vesjspx -43.8125
117 | xcbiser -43.809242248535156
118 | xcbseau -43.80495834350586
119 | xcbekte -43.8006477355957
120 | lapmqtt -43.79780960083008
121 | abyjspx -43.78347396850586
122 | xcbueba -46.222286224365234
123 | xcbijke -43.77728271484375
124 | xcbvoie -43.76816940307617
125 | xcbudem -43.76424026489258
126 | xcbivol -46.23701095581055
127 | xcbquoi -43.75960159301758
128 | xcbupal -43.75864791870117
129 | zugjspx -43.75846481323242
130 | xcbheel -46.244380950927734
131 | typglfw -43.74939727783203
132 | rpcinqu -43.74385452270508
133 | voyorrh -43.73942947387695
134 | tieglfw -43.73161315917969
135 | hexmqtt -43.7115592956543
136 | xcbacyj -43.708465576171875
137 | aktjspb -43.69775390625
138 | amyjspx -43.6917610168457
139 | ackrgyz -43.690940856933594
140 | xcbokus -43.688011169433594
141 | xcbhtag -43.65958023071289
142 | togjspx -43.652225494384766
143 | xcbuely -43.64830780029297
144 | xcbffic -43.64610290527344
145 | mlxasje -43.64008331298828
146 | xcbunft -43.63233184814453
147 | wieglfw -43.62156677246094
148 | xcbufig -43.615196228027344
149 | xcbueur -43.613521575927734
150 | zagmqtt -43.60862350463867


--------------------------------------------------------------------------------
/seven_letter_words/gpt2_prob_sevenletter.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import torch
  3 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
  4 | import tiktoken
  5 | import logging
  6 | 
  7 | logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, handlers=[logging.StreamHandler(),logging.FileHandler("prob_random_index.log")])
  8 | 
  9 | if torch.cuda.is_available():
 10 |     device = "cuda"
 11 | else:
 12 |     device = "cpu"
 13 | 
 14 | gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
 15 | gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-xl").to(device)
 16 | gpt4_enc = tiktoken.get_encoding("cl100k_base")
 17 | 
 18 | def pad_batch(batch, pad_idx):
 19 |     max_length = 0
 20 |     for seq in batch:
 21 |         if len(seq) > max_length:
 22 |             max_length = len(seq)
 23 | 
 24 |     new_batch = []
 25 |     for seq in batch:
 26 |         padding = [pad_idx for i in range(max_length - len(seq))]
 27 |         new_batch.append(seq + padding)
 28 | 
 29 |     return new_batch
 30 | 
 31 | # Get perplexity using GPT-2
 32 | def prob_gpt2(sentence_list):
 33 | 
 34 |     # Tokenize the sentences
 35 |     all_tokens = []
 36 |     for sentence in sentence_list:
 37 |         tokens = gpt2_tokenizer.encode(sentence)
 38 |         all_tokens.append(tokens)
 39 |     tokens = pad_batch(all_tokens, 50256)
 40 | 
 41 |     targets = tokens[:]
 42 | 
 43 |     # Compute average log likelihood for the generation
 44 |     input_ids = torch.LongTensor(tokens).to(device)
 45 |     target_ids = torch.LongTensor(targets).to(device)
 46 | 
 47 |     with torch.no_grad():
 48 |         outputs = gpt2_model(input_ids, labels=target_ids)
 49 |         logits = outputs[1]
 50 |         logits = logits.transpose(0,1)[:-1].transpose(0,1)
 51 |         target_ids = target_ids.transpose(0,1)[1:].transpose(0,1)
 52 |         loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=50256)(logits.reshape(-1,50257), target_ids.reshape(-1))
 53 |         loss = loss.reshape(target_ids.shape).sum(dim=1)
 54 |         neg_log_likelihood = -1*loss
 55 | 
 56 | 
 57 |     # 13.357776641845703 = logprob('The word is"'); removing this to just get
 58 |     # the word prob
 59 |     return neg_log_likelihood + 13.357776641845703
 60 | 
 61 | 
 62 | for finame in ["random_pairs_lower"]:
 63 |     fi = open(finame + ".txt", "r")
 64 |     fo = open(finame + "_scored.txt", "w")
 65 | 
 66 |     words_with_prob = []
 67 | 
 68 |     this_batch_sentences = []
 69 |     this_batch_words = []
 70 |     for index, line in enumerate(fi):
 71 |         if index % 10000 == 0:
 72 |             logging.info(str(index))
 73 |     
 74 |         word = line.strip()
 75 | 
 76 |         tokens = gpt4_enc.encode(word)
 77 |         tokens_spaced = gpt4_enc.encode(" " + word)
 78 | 
 79 |         if len(tokens) == 2 and len(tokens_spaced) == 2 and len(word) == 7:
 80 |             token1 = gpt4_enc.decode([tokens[0]]).strip()
 81 |             token2 = gpt4_enc.decode([tokens[1]]).strip()
 82 | 
 83 |             tokenspaced1 = gpt4_enc.decode([tokens_spaced[0]]).strip()
 84 |             tokenspaced2 = gpt4_enc.decode([tokens_spaced[1]]).strip()
 85 | 
 86 |             if len(token1) == 3 and len(token2) == 4 and len(tokenspaced1) == 3 and len(tokenspaced2) == 4:
 87 |                 this_batch_sentences.append('The word is "' + word + '"')
 88 |                 this_batch_words.append(word)
 89 |             else:
 90 |                 print(index, "Wrong length", word, len(token1), len(token2), len(tokenspaced1), len(tokenspaced2))
 91 |         else:
 92 |             print(index, "Wrong length", word, len(tokens), len(tokens_spaced), len(word))
 93 | 
 94 |         if len(this_batch_sentences) == 3000:
 95 |             logprobs = prob_gpt2(this_batch_sentences)
 96 |             for word, logprob in zip(this_batch_words, logprobs):
 97 |                 words_with_prob.append([logprob.item(), word])
 98 |             this_batch_sentences = []
 99 |             this_batch_words = []
100 | 
101 |     if len(this_batch_sentences) > 0:
102 |         logprobs = prob_gpt2(this_batch_sentences)
103 |         for word, logprob in zip(this_batch_words, logprobs):
104 |             words_with_prob.append([logprob.item(), word])
105 |         this_batch_sentences = []
106 |         this_batch_words = []
107 | 
108 |     for prob, word in sorted(words_with_prob)[::-1]:
109 |         fo.write(str(prob) + "\t" + word + "\n")
110 | 
111 | 
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/seven_letter_words/input_scored.txt:
--------------------------------------------------------------------------------
  1 | mryycon	-33.4009895324707
  2 | myvlobd	-33.97520446777344
  3 | zyvoxdk	-42.584041595458984
  4 | wynsmew	-30.42465591430664
  5 | kedkbmr	-34.5813102722168
  6 | cmrscwc	-35.037437438964844
  7 | wkbsxob	-39.7088508605957
  8 | nsckbwc	-36.846317291259766
  9 | bocmkvo	-39.4564094543457
 10 | zkigkvv	-42.60762405395508
 11 | sxpylyh	-35.01577377319336
 12 | zbocdyx	-42.305076599121094
 13 | crbsxoc	-38.06560134887695
 14 | swzvybo	-37.10409164428711
 15 | kvvydon	-34.48329162597656
 16 | zbomkcd	-39.263160705566406
 17 | lybsxqc	-42.85857391357422
 18 | lkmsvvs	-36.32170867919922
 19 | wkdbsmo	-36.486995697021484
 20 | bonslvo	-34.44765090942383
 21 | klcyvfo	-40.78521728515625
 22 | yebcovp	-34.895774841308594
 23 | odrodsm	-34.4904670715332
 24 | wkixkbn	-40.274925231933594
 25 | mkvsleb	-35.38396072387695
 26 | oxfsbyc	-39.76297378540039
 27 | mkvjyxo	-41.52894592285156
 28 | cewkdbk	-40.9130973815918
 29 | nbigkvv	-39.965938568115234
 30 | swzkvon	-37.17292404174805
 31 | wkxvkxn	-40.5903205871582
 32 | nsfsxon	-38.56913757324219
 33 | myxvkxq	-43.61052703857422
 34 | dklvoby	-37.12471389770508
 35 | bonbkpd	-34.73928451538086
 36 | oaesdkc	-42.28152084350586
 37 | bkddsxq	-40.66632843017578
 38 | obbkxmi	-41.12004470825195
 39 | golmkcd	-35.0498161315918
 40 | vygvkxn	-44.4805908203125
 41 | lyiryyn	-34.94938659667969
 42 | kmdekbi	-37.709075927734375
 43 | mkdvsuo	-38.730655670166016
 44 | zedlkmu	-36.18986511230469
 45 | qkvsvoy	-42.10275650024414
 46 | bsfkvon	-35.44425582885742
 47 | fyvyxdo	-42.35395812988281
 48 | cexczyd	-39.47360610961914
 49 | bydexnk	-37.63880157470703
 50 | xydmron	-34.41005325317383
 51 | dkzbyyd	-40.756038665771484
 52 | comeboc	-25.816699981689453
 53 | oxdoxdo	-36.08238220214844
 54 | yedpvyg	-38.156593322753906
 55 | loddobc	-34.54991912841797
 56 | bewzvon	-36.644874572753906
 57 | lebbson	-29.779876708984375
 58 | bozevco	-32.21761703491211
 59 | psvvodc	-39.616676330566406
 60 | bovkdyb	-44.093204498291016
 61 | cywlyni	-31.04549789428711
 62 | exckfon	-32.31464767456055
 63 | ksvwoxd	-38.20396423339844
 64 | xynevoc	-36.27880096435547
 65 | wyxdoby	-34.911991119384766
 66 | ckdsboc	-34.813270568847656
 67 | kbmknsk	-35.1013069152832
 68 | fkvobso	-37.41843032836914
 69 | sxqvscr	-38.215450286865234
 70 | neuonyw	-33.95500946044922
 71 | oczyeco	-34.99745178222656
 72 | lonofsv	-34.87450408935547
 73 | bodsmvo	-37.443511962890625
 74 | wkdsxoo	-35.750003814697266
 75 | wkhgovv	-39.59453201293945
 76 | zsmkxdo	-41.09931564331055
 77 | lklyyxc	-38.89170455932617
 78 | ohmsdob	-35.976707458496094
 79 | vycsxqc	-43.72141647338867
 80 | xoglsoc	-33.673892974853516
 81 | cobbson	-27.039085388183594
 82 | mebfsxq	-43.0044059753418
 83 | xkbbygc	-36.12342071533203
 84 | bkqqsxq	-38.069557189941406
 85 | lkxopev	-37.932037353515625
 86 | zsxkdkc	-42.61277389526367
 87 | nsfscyx	-34.74595642089844
 88 | usxpyvu	-37.7536735534668
 89 | sxnskxk	-36.24659729003906
 90 | mkbsdkc	-36.59408950805664
 91 | csvfobi	-32.64740753173828
 92 | sxuvsxq	-40.83545684814453
 93 | klcoxco	-35.745391845703125
 94 | vkfklsd	-37.36144256591797
 95 | yedcsjo	-37.90355682373047
 96 | bogsbon	-28.6097412109375
 97 | klckvyw	-38.35791015625
 98 | qodlkmu	-39.189537048339844
 99 | kmmecob	-35.23679733276367
100 | cdbsfox	-33.410850524902344
101 | 


--------------------------------------------------------------------------------
/seven_letter_words/random_token_combos.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import tiktoken
 3 | 
 4 | enc = tiktoken.get_encoding("cl100k_base")
 5 | 
 6 | alphabet = "abcdefghijklmnopqrstuvwxyz"
 7 | alphabet_dict = {}
 8 | for char in alphabet:
 9 |     alphabet_dict[char] = 1
10 | 
11 | def is_roman_lower(string):
12 |     for char in string:
13 |         if char not in alphabet_dict:
14 |             return False
15 |     return True
16 | 
17 | all_threes_lower = []
18 | all_fours_lower = []
19 | 
20 | 
21 | for i in range(100256):
22 |     token = enc.decode([i])
23 |     if len(token) == 4:
24 |         if token[0] == " " and is_roman_lower(token[1:]):
25 |             all_threes_lower.append(token)
26 |         elif is_roman_lower(token):
27 |             all_fours_lower.append(token)
28 | 
29 | print(len(all_threes_lower), len(all_fours_lower), len(all_threes_lower)*len(all_fours_lower))
30 | print(all_threes_lower[:10])
31 | print(all_fours_lower[:10])
32 | print("")
33 | 
34 | fo_lower = open("random_pairs_lower.txt", "w")
35 | 
36 | for start in all_threes_lower:
37 |     for end in all_fours_lower:
38 |         candidate = start.strip() + end.strip()
39 |         tokens_unspaced = enc.encode(candidate)
40 |         tokens_spaced = enc.encode(" " + candidate)
41 | 
42 |         if len(tokens_unspaced) == 2 and len(tokens_spaced) == 2:
43 |             if len(enc.decode([tokens_unspaced[0]]).strip()) == 3 and len(enc.decode([tokens_unspaced[1]]).strip()) == 4 and len(enc.decode([tokens_spaced[0]]).strip()) == 3 and len(enc.decode([tokens_spaced[1]]).strip()) == 4:
44 |                 fo_lower.write(start.strip() + end.strip() + "\n")
45 | 
46 | 
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/seven_letter_words/select_words.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | all_scores, all_words = [], []
 4 | 
 5 | with open("seven_letter_words/random_pairs_lower_scored.txt", "r") as f:
 6 |     lines = f.readlines()
 7 |     for line in lines:
 8 |         score, word = line.split()
 9 |         all_scores.append(float(score))
10 |         all_words.append(word)
11 |     
12 | # Function to select 100 words closest to a given score
13 | def select_closest_words(score, num_words=150):
14 | 
15 |     # Sort the scores based on proximity to the target score
16 |     sorted_indices = sorted(range(len(all_scores)), key=lambda i: abs(all_scores[i] - score))
17 | 
18 |     # Select the 100 closest words
19 |     selected_indices = sorted_indices[:num_words]
20 |     selected_words = [all_words[i] for i in selected_indices]
21 |     scores = [all_scores[i] for i in selected_indices]
22 | 
23 |     return [selected_words, scores]
24 | 
25 | # Select 100 words closest to each specified score level
26 | selected_words_closest_to_levels = {}
27 | selected_words = []
28 | for score_level in [-15, -22.5, -30, -37.5, -45]:
29 |     selected_words_closest_to_levels[score_level] = select_closest_words(score_level)
30 |     selected_words += selected_words_closest_to_levels[score_level][0]
31 | 
32 | selected_words = set(selected_words)
33 | print("Number of selected words: " + str(len(selected_words)))
34 | 
35 | with open("seven_letter_words/words_5bins.txt", "w") as f:
36 |     for score in [-15, -22.5, -30, -37.5, -45]:
37 |         for word,sc in zip(selected_words_closest_to_levels[score][0], selected_words_closest_to_levels[score][1]):
38 |             f.write(word + " " + str(sc) + "\n")
39 | 


--------------------------------------------------------------------------------
/stimulus_generator.py:
--------------------------------------------------------------------------------
  1 | import jsonlines
  2 | import os
  3 | import random
  4 | import argparse
  5 | 
  6 | # Functions for encoding in rot-1 or rot-3
  7 | alphabet = "abcdefghijklmnopqrstuvwxyz"
  8 | index2char = {}
  9 | char2index = {}
 10 | for index, char in enumerate(alphabet):
 11 |     index2char[index] = char
 12 |     char2index[char] = index
 13 | 
 14 | 
 15 | def rot_encode(sequence, n):
 16 |     new_sequence = []
 17 |     for char in sequence:
 18 |         if not char.isalpha():
 19 |             new_sequence.append(char)
 20 |         elif char.isupper():
 21 |             index = char2index[char.lower()]
 22 |             new_char = index2char[(index+n) % 26]
 23 |             new_sequence.append(new_char.upper())
 24 |         else:
 25 |             index = char2index[char]
 26 |             new_char = index2char[(index+n) % 26]
 27 |             new_sequence.append(new_char)
 28 |     return "".join(new_sequence)
 29 | 
 30 | 
 31 | def create_chain(sequence, n):
 32 |     chain = []
 33 |     for index, char in enumerate(sequence):
 34 |         new_char = rot_encode(char, 26-n)
 35 |         chain.append(str(index+1) + ". " + char + " -> " + new_char + "\n")
 36 |     return "".join(chain)
 37 | 
 38 | 
 39 | def create_math_cot_chain(sequence, n):
 40 |     s = f'''Let’s start by writing the letter-position mapping for the alphabet:
 41 | a -> 0
 42 | b -> 1
 43 | c -> 2
 44 | d -> 3
 45 | e -> 4
 46 | f -> 5
 47 | g -> 6
 48 | h -> 7
 49 | i -> 8
 50 | j -> 9
 51 | k -> 10
 52 | l -> 11
 53 | m -> 12
 54 | n -> 13
 55 | o -> 14
 56 | p -> 15
 57 | q -> 16
 58 | r -> 17
 59 | s -> 18
 60 | t -> 19
 61 | u -> 20
 62 | v -> 21
 63 | w -> 22
 64 | x -> 23
 65 | y -> 24
 66 | z -> 25
 67 | 
 68 | Next, we find the encoded letter as follows:
 69 | Position of original letter = (Position of given letter − {n}) mod 26 
 70 | 
 71 | Then map the found position to the corresponding letter using the letter-position mapping.
 72 | 
 73 | Using this,\n
 74 | '''
 75 |     chain = []
 76 |     for index, char in enumerate(sequence):
 77 |         new_char = rot_encode(char, 26-n)
 78 |         chain.append(str(index+1) + ". " + char + " -> " +
 79 |                      f"({char2index[char]} - {n}) mod 26" " -> " + new_char + "\n")
 80 |     return s + "".join(chain)
 81 | 
 82 | 
 83 | def create_number_cot_chain(sequence, n):
 84 |     s = f'''
 85 | New position = (Given position − {n}) mod 26
 86 | Using this,\n
 87 | '''
 88 |     chain = []
 89 |     for index, char in enumerate(sequence):
 90 |         new_char = rot_encode(char, 26-n)
 91 |         chain.append(str(index+1) + ". " + str(char2index[char]) + " -> " +
 92 |                      f"({char2index[char]} - {n}) mod 26" " -> " + str(char2index[new_char]) + "\n")
 93 |     return s + "".join(chain)
 94 | 
 95 | 
 96 | def create_step_chain_forward(sequence, n):
 97 |     chain = []
 98 |     for index, char in enumerate(sequence):
 99 |         new_char = rot_encode(char, 26-n)
100 |         start_ord, end_ord = ord(char), ord(new_char)
101 |         part_chain = ""
102 |         if char == new_char:
103 |             part_chain = new_char + " -> " + new_char
104 |         else:
105 |             if start_ord > end_ord:
106 |                 if char.isupper():
107 |                     end_ord = ord("Z")
108 |                 else:
109 |                     end_ord = ord("z")
110 |             for char_ord in range(start_ord, end_ord+1, 1):
111 |                 part_chain += chr(char_ord)
112 |                 if char_ord != end_ord:
113 |                     part_chain += " -> "
114 |             if char_ord != ord(new_char):
115 |                 part_chain += " -> "
116 |                 if char.isupper():
117 |                     start_ord = ord("A")
118 |                 else:
119 |                     start_ord = ord("a")
120 |                 for char_ord in range(start_ord, ord(new_char)+1, 1):
121 |                     part_chain += chr(char_ord)
122 |                     if char_ord != ord(new_char):
123 |                         part_chain += " -> "
124 | 
125 |         chain.append(str(index+1) + ". " + part_chain + "\n")
126 |     return "".join(chain)
127 | 
128 | 
129 | def create_math_corrupt_chain(sequence, n):
130 |     chain = []
131 |     s = f'''Let’s start by writing the letter-position mapping for the alphabet:
132 | a -> 0
133 | b -> 1
134 | c -> 2
135 | d -> 3
136 | e -> 4
137 | f -> 5
138 | g -> 6
139 | h -> 7
140 | i -> 8
141 | j -> 9
142 | k -> 10
143 | l -> 11
144 | m -> 12
145 | n -> 13
146 | o -> 14
147 | p -> 15
148 | q -> 16
149 | r -> 17
150 | s -> 18
151 | t -> 19
152 | u -> 20
153 | v -> 21
154 | w -> 22
155 | x -> 23
156 | y -> 24
157 | z -> 25
158 | 
159 | Next, we find the encoded letter as follows:
160 | Position of original letter = (Position of given letter − {n}) mod 26 
161 | 
162 | Then map the found position to the corresponding letter using the letter-position mapping.
163 | 
164 | Using this,\n
165 | '''
166 |     chain = []
167 |     for index, char in enumerate(sequence):
168 |         new_char = '*'
169 |         chain.append(str(index+1) + ". " + char + " -> " +
170 |                      f"({char2index[char]} - {n}) mod 26" " -> " + new_char + "\n")
171 |     return s + "".join(chain)
172 | 
173 | 
174 | def create_corrupt_chain(sequence, n):
175 |     chain = []
176 |     for index, char in enumerate(sequence):
177 |         # random character, letter, or number, punctuation
178 |         # candidates = list(alphabet) + [x.upper() for x in list(alphabet)] + list("0123456789") + list(".,?!:;\"'()[]{}")
179 |         # replace 50% of the time
180 |         # if random.random() < 0.5:
181 |         # new_char = random.choice(candidates)
182 |         # else:
183 |         #     new_char = rot_encode(char, 26-n)
184 |         # if not char.isalpha():
185 |         #     new_char = char
186 |         # else:
187 |         new_char = "*"
188 |         chain.append(str(index+1) + ". " + char + " -> " + new_char + "\n")
189 |     return "".join(chain)
190 | 
191 | 
192 | # print(rot_encode("stay", 1))
193 | # print(rot_encode("stay", 3))
194 | 
195 | 
196 | def create_swap_chain(sequence, n):
197 |     chain = []
198 |     for index, char in enumerate(sequence):
199 |         new_char = rot_encode(char, 26-n)
200 |         chain.append(str(index+1) + ". " + char + " -> " + new_char + "\n")
201 |     return "".join(chain)
202 | 
203 | 
204 | def string_to_seq(msg):
205 |     seq = ""
206 |     for char in msg:
207 |         seq += str(char2index[char]) + ","
208 |     return seq[:-1]
209 | 
210 | def main(args):
211 |     data = [
212 |         ("examples/bin_1.txt", "bin1"),
213 |         ("examples/bin_2.txt", "bin2"),
214 |         ("examples/bin_3.txt", "bin3"),
215 |         ("examples/bin_4.txt", "bin4"),
216 |         ("examples/bin_5.txt", "bin5")
217 |     ]
218 |     prompt_type = args.prompt_type
219 |     fo_directory = f"stimuli/{prompt_type}"
220 | 
221 |     if not os.path.exists(fo_directory):
222 |         os.makedirs(fo_directory, exist_ok=True)
223 | 
224 |     for shift in range(1, 26):
225 |         for task in ["dec"]:
226 |             for fi_name, fi_label in data:
227 |                 fo_name = f"{fo_directory}/{prompt_type+str(shift)}_{fi_label}.jsonl"
228 | 
229 |                 fi = open(fi_name, "r")
230 |                 fo = open(fo_name, "w")
231 |                 jsl = jsonlines.Writer(fo)
232 | 
233 |                 count_encoded = 0
234 |                 for line_num, line in enumerate(fi):
235 |                     example = {}
236 | 
237 |                     # Task
238 |                     example["task_name"] = "rot-" + str(shift)
239 | 
240 |                     # Condition
241 |                     example_type = fo_name.split("_")[1].split(".")[0]
242 |                     example["example_type"] = example_type
243 | 
244 |                     word = line.strip().split("\t")[0]
245 |                     sentence = word
246 |                     # sentence1 = line.strip().split("\t")[0]
247 |                     encoded = rot_encode(word, shift)
248 | 
249 |                     # Instruction
250 |                     if task == "dec":
251 |                         if shift == 1:
252 |                             if prompt_type == "standard":
253 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ' along with the original text that it was created from:\nRot-' + str(
254 |                                     shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\nOriginal text: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message to produce the original text:\nRot-' + str(shift) + ' text: "%s"\nOriginal text:'
255 |                             elif prompt_type == "text_cot":
256 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str(
257 |                                     shift) + ' position backward.' + create_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "Stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
258 |                             elif prompt_type == "math_cot":
259 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we need to shift each letter ' + str(
260 |                                     shift) + ' position backward. ' + create_math_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
261 |                             elif prompt_type == "number_cot":
262 |                                 example["task_instruction"] = 'Shift-' + str(shift) + ' is a process in which each number is shifted ' + str(shift) + ' position forward until it reaches 26 and subsequently circles back to 1. For example, here is a sequence of numbers written in shift-' + str(shift) + ':\shift-' + str(shift) + ' sequence: "' + string_to_seq(rot_encode("stay", shift)) + '"\n\nTo decode this sequence, we need to shift each number ' + str(
263 |                                     shift) + ' position backward. ' + create_number_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original sequence of numbers is: ' + f'"{string_to_seq("stay")}"' + '\n\nHere is another sequence of numbers in shift-' + str(shift) + '. Decode this sequence one number at a time. On the last line, write the words "Original sequence:" followed by the decoded sequence:\nshift-' + str(shift) + ' sequence: "%s"'
264 |                         else:
265 |                             if prompt_type == "standard":
266 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ' along with the original text that it was created from:\nRot-' + str(
267 |                                     shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\nOriginal text: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message to produce the original text:\nRot-' + str(shift) + ' text: "%s"\nOriginal text:'
268 |                             elif prompt_type == "text_cot":
269 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str(
270 |                                     shift) + ' positions backward:\n' + create_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
271 |                             elif prompt_type == "cot_hidden_1":
272 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\n\nTo decode this message, we shift each letter ' + str(shift) + " positions backward; but instead of revealing what each letter becomes, we will replace it with a '*' until we write the final answer:\n" + create_corrupt_chain(
273 |                                     rot_encode("Stay here!", shift), shift) + """\nIf we put together the letters that were hidden behind each '*', we get that the original text is: "Stay here!"\n\nHere is another message in rot-""" + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
274 |                             elif prompt_type == "math_cot":
275 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' position forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we need to shift each letter ' + str(
276 |                                     shift) + ' positions backward. ' + create_math_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
277 |                             elif prompt_type == "number_cot":
278 |                                 example["task_instruction"] = 'Shift-' + str(shift) + ' is a process in which each number is shifted ' + str(shift) + ' positions forward until it reaches 26 and subsequently circles back to 1. For example, here is a sequence of numbers written in shift-' + str(shift) + ':\shift-' + str(shift) + ' sequence: "' + string_to_seq(rot_encode("stay", shift)) + '"\n\nTo decode this sequence, we need to shift each number ' + str(
279 |                                     shift) + ' positions backward. ' + create_number_cot_chain(rot_encode("stay", shift), shift) + '\nTherefore, the original sequence of numbers is:' + f'"{string_to_seq("stay")}"' + '\n\nHere is another sequence of numbers in shift-' + str(shift) + '. Decode this sequence one number at a time. On the last line, write the words "Original sequence:" followed by the decoded sequence:\nshift-' + str(shift) + ' sequence: "%s"'
280 |                             elif prompt_type == "one-step-fwd":
281 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("Stay here!", shift) + '"\n\nTo decode this message, we shift each letter ' + str(
282 |                                     26-shift) + ' positions forward one step at a time:\n' + create_step_chain_forward(rot_encode("Stay here!", shift), shift) + '\nTherefore, the original text is: "Stay here!"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
283 |                             elif prompt_type == "math_swap":
284 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift+1) + '"\n\nTo decode this message, we shift each letter ' + str(
285 |                                     shift) + ' positions backward:\n' + create_math_cot_chain(rot_encode("stay", shift+1), shift+1) + '\nTherefore, the original text is: "stay"\n\nHere is another message in rot-' + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
286 |                             elif prompt_type == "math_corrupt":
287 |                                 example["task_instruction"] = 'Rot-' + str(shift) + ' is a cipher in which each letter is shifted ' + str(shift) + ' positions forward in the alphabet. For example, here is a message written in rot-' + str(shift) + ':\nRot-' + str(shift) + ' text: "' + rot_encode("stay", shift) + '"\n\nTo decode this message, we shift each letter ' + str(shift) + " positions backward; but instead of revealing what each letter becomes, we will replace it with a '*' until we write the final answer:\n" + create_math_corrupt_chain(
288 |                                     rot_encode("stay", shift), shift) + """\nIf we put together the letters that were hidden behind each '*', we get that the original text is: "stay"\n\nHere is another message in rot-""" + str(shift) + '. Decode this message one letter at a time. On the last line, write the words "Original text:" followed by the decoded message:\nRot-' + str(shift) + ' text: "%s"'
289 | 
290 |                     # Input and correct output
291 |                     if task == "dec":
292 |                         example["input"] = encoded
293 |                         example["correct_output"] = sentence
294 |                     else:
295 |                         example["input"] = sentence
296 |                         example["correct_output"] = encoded
297 | 
298 |                     # Combining the instruction and input (this is the string that should be given to the model)
299 |                     example["instruction_plus_input"] = example["task_instruction"] % example["input"]
300 | 
301 |                     jsl.write(example)
302 | 
303 |                     count_encoded += 1
304 |                     if count_encoded == 100:
305 |                         break
306 | 
307 | if __name__ == "__main__":
308 |     args = argparse.ArgumentParser()
309 |     args.add_argument("--prompt_type", type=str, help="Prompt type to use", default="text_cot", choices=["standard", "text_cot", "math_cot", "number_cot"])
310 |     args = args.parse_args()
311 |     main(args)


--------------------------------------------------------------------------------