├── .gitignore
├── LICENSE
├── README.md
├── attack_logs
    ├── exps_claude_1.2_transfer.log
    ├── exps_claude_2.0_transfer.log
    ├── exps_claude_2.1_prefilling.log
    ├── exps_claude_2.1_transfer.log
    ├── exps_claude_3.5_sonnet_prefilling.log
    ├── exps_claude_3.5_sonnet_transfer.log
    ├── exps_claude_3_haiku_prefilling.log
    ├── exps_claude_3_haiku_transfer.log
    ├── exps_claude_3_opus_transfer.log
    ├── exps_claude_3_sonnet_transfer.log
    ├── exps_gemma-7b.log
    ├── exps_gpt3.5_turbo.log
    ├── exps_gpt4_turbo.log
    ├── exps_llama2-13b.log
    ├── exps_llama2-13b_icl_one_shot.log
    ├── exps_llama2-70b.log
    ├── exps_llama2-7b_icl_one_shot.log
    ├── exps_llama2-7b_plain_init.log
    ├── exps_llama2_7b.log
    ├── exps_llama3-8b.log
    ├── exps_mistral-7b_simplified_template.log
    ├── exps_phi3.log
    ├── exps_r2d2.log
    └── exps_vicuna.log
├── common.py
├── config.py
├── conversers.py
├── experiments
    ├── exps_claude_prefilling.sh
    ├── exps_claude_transfer.sh
    ├── exps_gemma_7b.sh
    ├── exps_gpt3.5_turbo.sh
    ├── exps_gpt4_turbo.sh
    ├── exps_llama2_13b.sh
    ├── exps_llama2_70b.sh
    ├── exps_llama2_7b.sh
    ├── exps_llama2_all_icl_template.sh
    ├── exps_llama3_8b.sh
    ├── exps_mistral_7b.sh
    ├── exps_phi3.sh
    ├── exps_r2d2.sh
    └── exps_vicuna.sh
├── harmful_behaviors
    ├── README.md
    └── harmful_behaviors_pair.csv
├── images
    ├── llamas_gemma_asr.jpg
    ├── llamas_gemma_logprobs.jpg
    ├── main_table.png
    └── title_abstract.png
├── jailbreak_artifacts
    ├── exps_gemma-7b.json
    ├── exps_gpt3.5_turbo.json
    ├── exps_gpt4_turbo.json
    ├── exps_llama2-13b.json
    ├── exps_llama2-13b_icl_one_shot.json
    ├── exps_llama2-70b.json
    ├── exps_llama2-7b_icl_one_shot.json
    ├── exps_llama2-7b_plain_init.json
    ├── exps_llama2_7b.json
    ├── exps_llama3-8b.json
    ├── exps_mistral-7b_simplified_template.json
    ├── exps_nemotron-4-340b.json
    ├── exps_phi3.json
    ├── exps_r2d2.json
    └── exps_vicuna.json
├── judges.py
├── language_models.py
├── loggers.py
├── main.py
├── main_claude_prefilling.py
├── main_claude_transfer.py
├── prompts.py
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .DS_Store
3 | .env
4 | exps_new.sh


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/LICENSE


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/README.md


--------------------------------------------------------------------------------
/attack_logs/exps_claude_1.2_transfer.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_1.2_transfer.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_2.0_transfer.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_2.0_transfer.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_2.1_prefilling.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_2.1_prefilling.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_2.1_transfer.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_2.1_transfer.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_3.5_sonnet_prefilling.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3.5_sonnet_prefilling.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_3.5_sonnet_transfer.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3.5_sonnet_transfer.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_3_haiku_prefilling.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3_haiku_prefilling.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_3_haiku_transfer.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3_haiku_transfer.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_3_opus_transfer.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3_opus_transfer.log


--------------------------------------------------------------------------------
/attack_logs/exps_claude_3_sonnet_transfer.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3_sonnet_transfer.log


--------------------------------------------------------------------------------
/attack_logs/exps_gemma-7b.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_gemma-7b.log


--------------------------------------------------------------------------------
/attack_logs/exps_gpt3.5_turbo.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_gpt3.5_turbo.log


--------------------------------------------------------------------------------
/attack_logs/exps_gpt4_turbo.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_gpt4_turbo.log


--------------------------------------------------------------------------------
/attack_logs/exps_llama2-13b.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-13b.log


--------------------------------------------------------------------------------
/attack_logs/exps_llama2-13b_icl_one_shot.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-13b_icl_one_shot.log


--------------------------------------------------------------------------------
/attack_logs/exps_llama2-70b.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-70b.log


--------------------------------------------------------------------------------
/attack_logs/exps_llama2-7b_icl_one_shot.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-7b_icl_one_shot.log


--------------------------------------------------------------------------------
/attack_logs/exps_llama2-7b_plain_init.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-7b_plain_init.log


--------------------------------------------------------------------------------
/attack_logs/exps_llama2_7b.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2_7b.log


--------------------------------------------------------------------------------
/attack_logs/exps_llama3-8b.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama3-8b.log


--------------------------------------------------------------------------------
/attack_logs/exps_mistral-7b_simplified_template.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_mistral-7b_simplified_template.log


--------------------------------------------------------------------------------
/attack_logs/exps_phi3.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_phi3.log


--------------------------------------------------------------------------------
/attack_logs/exps_r2d2.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_r2d2.log


--------------------------------------------------------------------------------
/attack_logs/exps_vicuna.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_vicuna.log


--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/common.py


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/config.py


--------------------------------------------------------------------------------
/conversers.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/conversers.py


--------------------------------------------------------------------------------
/experiments/exps_claude_prefilling.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_claude_prefilling.sh


--------------------------------------------------------------------------------
/experiments/exps_claude_transfer.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_claude_transfer.sh


--------------------------------------------------------------------------------
/experiments/exps_gemma_7b.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_gemma_7b.sh


--------------------------------------------------------------------------------
/experiments/exps_gpt3.5_turbo.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_gpt3.5_turbo.sh


--------------------------------------------------------------------------------
/experiments/exps_gpt4_turbo.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_gpt4_turbo.sh


--------------------------------------------------------------------------------
/experiments/exps_llama2_13b.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama2_13b.sh


--------------------------------------------------------------------------------
/experiments/exps_llama2_70b.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama2_70b.sh


--------------------------------------------------------------------------------
/experiments/exps_llama2_7b.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama2_7b.sh


--------------------------------------------------------------------------------
/experiments/exps_llama2_all_icl_template.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama2_all_icl_template.sh


--------------------------------------------------------------------------------
/experiments/exps_llama3_8b.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama3_8b.sh


--------------------------------------------------------------------------------
/experiments/exps_mistral_7b.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_mistral_7b.sh


--------------------------------------------------------------------------------
/experiments/exps_phi3.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_phi3.sh


--------------------------------------------------------------------------------
/experiments/exps_r2d2.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_r2d2.sh


--------------------------------------------------------------------------------
/experiments/exps_vicuna.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_vicuna.sh


--------------------------------------------------------------------------------
/harmful_behaviors/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/harmful_behaviors/README.md


--------------------------------------------------------------------------------
/harmful_behaviors/harmful_behaviors_pair.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/harmful_behaviors/harmful_behaviors_pair.csv


--------------------------------------------------------------------------------
/images/llamas_gemma_asr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/images/llamas_gemma_asr.jpg


--------------------------------------------------------------------------------
/images/llamas_gemma_logprobs.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/images/llamas_gemma_logprobs.jpg


--------------------------------------------------------------------------------
/images/main_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/images/main_table.png


--------------------------------------------------------------------------------
/images/title_abstract.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/images/title_abstract.png


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_gemma-7b.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_gemma-7b.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_gpt3.5_turbo.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_gpt3.5_turbo.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_gpt4_turbo.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_gpt4_turbo.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_llama2-13b.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-13b.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_llama2-13b_icl_one_shot.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-13b_icl_one_shot.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_llama2-70b.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-70b.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_llama2-7b_icl_one_shot.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-7b_icl_one_shot.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_llama2-7b_plain_init.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-7b_plain_init.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_llama2_7b.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2_7b.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_llama3-8b.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama3-8b.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_mistral-7b_simplified_template.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_mistral-7b_simplified_template.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_nemotron-4-340b.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_nemotron-4-340b.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_phi3.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_phi3.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_r2d2.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_r2d2.json


--------------------------------------------------------------------------------
/jailbreak_artifacts/exps_vicuna.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_vicuna.json


--------------------------------------------------------------------------------
/judges.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/judges.py


--------------------------------------------------------------------------------
/language_models.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/language_models.py


--------------------------------------------------------------------------------
/loggers.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/loggers.py


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/main.py


--------------------------------------------------------------------------------
/main_claude_prefilling.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/main_claude_prefilling.py


--------------------------------------------------------------------------------
/main_claude_transfer.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/main_claude_transfer.py


--------------------------------------------------------------------------------
/prompts.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/prompts.py


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/utils.py


--------------------------------------------------------------------------------