├── .gitignore ├── GCG ├── .gitignore ├── LICENSE ├── README.md ├── data │ └── transfer_expriment_behaviors.csv ├── experiments │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── individual_llama2_13B.py │ │ ├── individual_vicuna_13B.py │ │ ├── template.py │ │ ├── transfer_vicuna.py │ │ └── transfer_vicuna_guanaco.py │ ├── llm_attacks │ └── main.py ├── llm_attacks │ ├── README.md │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── attack_manager.py │ │ └── custom.py │ └── gcg │ │ ├── __init__.py │ │ └── gcg_attack.py ├── requirements.txt └── setup.py ├── LICENSE ├── PAIR ├── LICENSE ├── README.md ├── __init__.py ├── common.py ├── config.py ├── conversers.py ├── data │ └── harmful_behaviors_custom.csv ├── loggers.py ├── main.py ├── requirements.txt ├── system_prompts.py └── utils.py ├── README.md ├── arguments.py ├── attacks ├── autodan.py ├── gcg.py └── pair.py ├── data ├── README.md ├── autodan │ ├── .gitignore │ ├── autodan_initial_prompt.txt │ └── prompt_group.pth ├── benign_behaviors_custom.json ├── harmful_behaviors_custom.csv └── harmful_behaviors_custom.json ├── run_inference.py ├── run_judge.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/.gitignore -------------------------------------------------------------------------------- /GCG/.gitignore: -------------------------------------------------------------------------------- 1 | experiments/results 2 | -------------------------------------------------------------------------------- /GCG/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/LICENSE -------------------------------------------------------------------------------- /GCG/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/README.md -------------------------------------------------------------------------------- /GCG/data/transfer_expriment_behaviors.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/data/transfer_expriment_behaviors.csv -------------------------------------------------------------------------------- /GCG/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /GCG/experiments/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /GCG/experiments/configs/individual_llama2_13B.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/experiments/configs/individual_llama2_13B.py -------------------------------------------------------------------------------- /GCG/experiments/configs/individual_vicuna_13B.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/experiments/configs/individual_vicuna_13B.py -------------------------------------------------------------------------------- /GCG/experiments/configs/template.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/experiments/configs/template.py -------------------------------------------------------------------------------- /GCG/experiments/configs/transfer_vicuna.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/experiments/configs/transfer_vicuna.py -------------------------------------------------------------------------------- /GCG/experiments/configs/transfer_vicuna_guanaco.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/experiments/configs/transfer_vicuna_guanaco.py -------------------------------------------------------------------------------- /GCG/experiments/llm_attacks: -------------------------------------------------------------------------------- 1 | ../llm_attacks/ -------------------------------------------------------------------------------- /GCG/experiments/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/experiments/main.py -------------------------------------------------------------------------------- /GCG/llm_attacks/README.md: -------------------------------------------------------------------------------- 1 | README.md 2 | -------------------------------------------------------------------------------- /GCG/llm_attacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/llm_attacks/__init__.py -------------------------------------------------------------------------------- /GCG/llm_attacks/base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /GCG/llm_attacks/base/attack_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/llm_attacks/base/attack_manager.py -------------------------------------------------------------------------------- /GCG/llm_attacks/base/custom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/llm_attacks/base/custom.py -------------------------------------------------------------------------------- /GCG/llm_attacks/gcg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/llm_attacks/gcg/__init__.py -------------------------------------------------------------------------------- /GCG/llm_attacks/gcg/gcg_attack.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/llm_attacks/gcg/gcg_attack.py -------------------------------------------------------------------------------- /GCG/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.28.1 2 | fschat==0.2.20 3 | ml_collections 4 | -------------------------------------------------------------------------------- /GCG/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/GCG/setup.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/LICENSE -------------------------------------------------------------------------------- /PAIR/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/PAIR/LICENSE -------------------------------------------------------------------------------- /PAIR/README.md: -------------------------------------------------------------------------------- 1 | Modified from https://github.com/patrickrchao/JailbreakingLLMs. -------------------------------------------------------------------------------- /PAIR/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /PAIR/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/PAIR/common.py -------------------------------------------------------------------------------- /PAIR/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/PAIR/config.py -------------------------------------------------------------------------------- /PAIR/conversers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/PAIR/conversers.py -------------------------------------------------------------------------------- /PAIR/data/harmful_behaviors_custom.csv: -------------------------------------------------------------------------------- 1 | ../../data/harmful_behaviors_custom.csv -------------------------------------------------------------------------------- /PAIR/loggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/PAIR/loggers.py -------------------------------------------------------------------------------- /PAIR/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/PAIR/main.py -------------------------------------------------------------------------------- /PAIR/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/PAIR/requirements.txt -------------------------------------------------------------------------------- /PAIR/system_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/PAIR/system_prompts.py -------------------------------------------------------------------------------- /PAIR/utils.py: -------------------------------------------------------------------------------- 1 | ../utils.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/README.md -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/arguments.py -------------------------------------------------------------------------------- /attacks/autodan.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/attacks/autodan.py -------------------------------------------------------------------------------- /attacks/gcg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/attacks/gcg.py -------------------------------------------------------------------------------- /attacks/pair.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/attacks/pair.py -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/data/README.md -------------------------------------------------------------------------------- /data/autodan/.gitignore: -------------------------------------------------------------------------------- 1 | outputs -------------------------------------------------------------------------------- /data/autodan/autodan_initial_prompt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/data/autodan/autodan_initial_prompt.txt -------------------------------------------------------------------------------- /data/autodan/prompt_group.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/data/autodan/prompt_group.pth -------------------------------------------------------------------------------- /data/benign_behaviors_custom.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/data/benign_behaviors_custom.json -------------------------------------------------------------------------------- /data/harmful_behaviors_custom.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/data/harmful_behaviors_custom.csv -------------------------------------------------------------------------------- /data/harmful_behaviors_custom.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/data/harmful_behaviors_custom.json -------------------------------------------------------------------------------- /run_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/run_inference.py -------------------------------------------------------------------------------- /run_judge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/run_judge.py -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation/HEAD/utils.py --------------------------------------------------------------------------------