├── .gitignore ├── LICENSE ├── README.md ├── attack_logs ├── exps_claude_1.2_transfer.log ├── exps_claude_2.0_transfer.log ├── exps_claude_2.1_prefilling.log ├── exps_claude_2.1_transfer.log ├── exps_claude_3.5_sonnet_prefilling.log ├── exps_claude_3.5_sonnet_transfer.log ├── exps_claude_3_haiku_prefilling.log ├── exps_claude_3_haiku_transfer.log ├── exps_claude_3_opus_transfer.log ├── exps_claude_3_sonnet_transfer.log ├── exps_gemma-7b.log ├── exps_gpt3.5_turbo.log ├── exps_gpt4_turbo.log ├── exps_llama2-13b.log ├── exps_llama2-13b_icl_one_shot.log ├── exps_llama2-70b.log ├── exps_llama2-7b_icl_one_shot.log ├── exps_llama2-7b_plain_init.log ├── exps_llama2_7b.log ├── exps_llama3-8b.log ├── exps_mistral-7b_simplified_template.log ├── exps_phi3.log ├── exps_r2d2.log └── exps_vicuna.log ├── common.py ├── config.py ├── conversers.py ├── experiments ├── exps_claude_prefilling.sh ├── exps_claude_transfer.sh ├── exps_gemma_7b.sh ├── exps_gpt3.5_turbo.sh ├── exps_gpt4_turbo.sh ├── exps_llama2_13b.sh ├── exps_llama2_70b.sh ├── exps_llama2_7b.sh ├── exps_llama2_all_icl_template.sh ├── exps_llama3_8b.sh ├── exps_mistral_7b.sh ├── exps_phi3.sh ├── exps_r2d2.sh └── exps_vicuna.sh ├── harmful_behaviors ├── README.md └── harmful_behaviors_pair.csv ├── images ├── llamas_gemma_asr.jpg ├── llamas_gemma_logprobs.jpg ├── main_table.png └── title_abstract.png ├── jailbreak_artifacts ├── exps_gemma-7b.json ├── exps_gpt3.5_turbo.json ├── exps_gpt4_turbo.json ├── exps_llama2-13b.json ├── exps_llama2-13b_icl_one_shot.json ├── exps_llama2-70b.json ├── exps_llama2-7b_icl_one_shot.json ├── exps_llama2-7b_plain_init.json ├── exps_llama2_7b.json ├── exps_llama3-8b.json ├── exps_mistral-7b_simplified_template.json ├── exps_nemotron-4-340b.json ├── exps_phi3.json ├── exps_r2d2.json └── exps_vicuna.json ├── judges.py ├── language_models.py ├── loggers.py ├── main.py ├── main_claude_prefilling.py ├── main_claude_transfer.py ├── prompts.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | .env 4 | exps_new.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/README.md -------------------------------------------------------------------------------- /attack_logs/exps_claude_1.2_transfer.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_1.2_transfer.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_2.0_transfer.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_2.0_transfer.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_2.1_prefilling.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_2.1_prefilling.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_2.1_transfer.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_2.1_transfer.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_3.5_sonnet_prefilling.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3.5_sonnet_prefilling.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_3.5_sonnet_transfer.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3.5_sonnet_transfer.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_3_haiku_prefilling.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3_haiku_prefilling.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_3_haiku_transfer.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3_haiku_transfer.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_3_opus_transfer.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3_opus_transfer.log -------------------------------------------------------------------------------- /attack_logs/exps_claude_3_sonnet_transfer.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_claude_3_sonnet_transfer.log -------------------------------------------------------------------------------- /attack_logs/exps_gemma-7b.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_gemma-7b.log -------------------------------------------------------------------------------- /attack_logs/exps_gpt3.5_turbo.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_gpt3.5_turbo.log -------------------------------------------------------------------------------- /attack_logs/exps_gpt4_turbo.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_gpt4_turbo.log -------------------------------------------------------------------------------- /attack_logs/exps_llama2-13b.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-13b.log -------------------------------------------------------------------------------- /attack_logs/exps_llama2-13b_icl_one_shot.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-13b_icl_one_shot.log -------------------------------------------------------------------------------- /attack_logs/exps_llama2-70b.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-70b.log -------------------------------------------------------------------------------- /attack_logs/exps_llama2-7b_icl_one_shot.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-7b_icl_one_shot.log -------------------------------------------------------------------------------- /attack_logs/exps_llama2-7b_plain_init.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2-7b_plain_init.log -------------------------------------------------------------------------------- /attack_logs/exps_llama2_7b.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama2_7b.log -------------------------------------------------------------------------------- /attack_logs/exps_llama3-8b.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_llama3-8b.log -------------------------------------------------------------------------------- /attack_logs/exps_mistral-7b_simplified_template.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_mistral-7b_simplified_template.log -------------------------------------------------------------------------------- /attack_logs/exps_phi3.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_phi3.log -------------------------------------------------------------------------------- /attack_logs/exps_r2d2.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_r2d2.log -------------------------------------------------------------------------------- /attack_logs/exps_vicuna.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/attack_logs/exps_vicuna.log -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/common.py -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/config.py -------------------------------------------------------------------------------- /conversers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/conversers.py -------------------------------------------------------------------------------- /experiments/exps_claude_prefilling.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_claude_prefilling.sh -------------------------------------------------------------------------------- /experiments/exps_claude_transfer.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_claude_transfer.sh -------------------------------------------------------------------------------- /experiments/exps_gemma_7b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_gemma_7b.sh -------------------------------------------------------------------------------- /experiments/exps_gpt3.5_turbo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_gpt3.5_turbo.sh -------------------------------------------------------------------------------- /experiments/exps_gpt4_turbo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_gpt4_turbo.sh -------------------------------------------------------------------------------- /experiments/exps_llama2_13b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama2_13b.sh -------------------------------------------------------------------------------- /experiments/exps_llama2_70b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama2_70b.sh -------------------------------------------------------------------------------- /experiments/exps_llama2_7b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama2_7b.sh -------------------------------------------------------------------------------- /experiments/exps_llama2_all_icl_template.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama2_all_icl_template.sh -------------------------------------------------------------------------------- /experiments/exps_llama3_8b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_llama3_8b.sh -------------------------------------------------------------------------------- /experiments/exps_mistral_7b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_mistral_7b.sh -------------------------------------------------------------------------------- /experiments/exps_phi3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_phi3.sh -------------------------------------------------------------------------------- /experiments/exps_r2d2.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_r2d2.sh -------------------------------------------------------------------------------- /experiments/exps_vicuna.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/experiments/exps_vicuna.sh -------------------------------------------------------------------------------- /harmful_behaviors/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/harmful_behaviors/README.md -------------------------------------------------------------------------------- /harmful_behaviors/harmful_behaviors_pair.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/harmful_behaviors/harmful_behaviors_pair.csv -------------------------------------------------------------------------------- /images/llamas_gemma_asr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/images/llamas_gemma_asr.jpg -------------------------------------------------------------------------------- /images/llamas_gemma_logprobs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/images/llamas_gemma_logprobs.jpg -------------------------------------------------------------------------------- /images/main_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/images/main_table.png -------------------------------------------------------------------------------- /images/title_abstract.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/images/title_abstract.png -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_gemma-7b.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_gemma-7b.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_gpt3.5_turbo.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_gpt3.5_turbo.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_gpt4_turbo.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_gpt4_turbo.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_llama2-13b.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-13b.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_llama2-13b_icl_one_shot.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-13b_icl_one_shot.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_llama2-70b.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-70b.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_llama2-7b_icl_one_shot.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-7b_icl_one_shot.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_llama2-7b_plain_init.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2-7b_plain_init.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_llama2_7b.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama2_7b.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_llama3-8b.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_llama3-8b.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_mistral-7b_simplified_template.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_mistral-7b_simplified_template.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_nemotron-4-340b.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_nemotron-4-340b.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_phi3.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_phi3.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_r2d2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_r2d2.json -------------------------------------------------------------------------------- /jailbreak_artifacts/exps_vicuna.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/jailbreak_artifacts/exps_vicuna.json -------------------------------------------------------------------------------- /judges.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/judges.py -------------------------------------------------------------------------------- /language_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/language_models.py -------------------------------------------------------------------------------- /loggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/loggers.py -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/main.py -------------------------------------------------------------------------------- /main_claude_prefilling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/main_claude_prefilling.py -------------------------------------------------------------------------------- /main_claude_transfer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/main_claude_transfer.py -------------------------------------------------------------------------------- /prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/prompts.py -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tml-epfl/llm-adaptive-attacks/HEAD/utils.py --------------------------------------------------------------------------------