├── .gitignore ├── HarmBench ├── LICENSE ├── README.md ├── adversarial_training │ ├── README.md │ └── alignment-handbook │ │ ├── .deepspeed_env │ │ ├── LICENSE │ │ ├── Makefile │ │ ├── README.md │ │ ├── assets │ │ └── handbook.png │ │ ├── requirements.txt │ │ ├── scripts │ │ ├── README.md │ │ ├── adv_training_utils.py │ │ ├── run_adv_training.sh │ │ ├── run_dpo.py │ │ ├── run_sft.py │ │ └── run_sft_adv_training.py │ │ ├── setup.cfg │ │ ├── setup.py │ │ └── tests │ │ ├── __init__.py │ │ ├── test_configs.py │ │ ├── test_data.py │ │ └── test_model_utils.py ├── api_models.py ├── assets │ ├── eval_pipeline-1.png │ ├── harmbench_splash.pdf │ └── harmbench_splash.png ├── baselines │ ├── __init__.py │ ├── artprompt │ │ ├── __init__.py │ │ ├── artprompt.py │ │ ├── jailbreak │ │ │ ├── attack_utils.py │ │ │ ├── base_prompt.py │ │ │ ├── baseline.py │ │ │ ├── bpe.py │ │ │ ├── defense_utils.py │ │ │ ├── safe_eval.py │ │ │ └── subword_nmt.voc │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ ├── eval.py │ │ │ ├── few_shot_example.py │ │ │ ├── model.py │ │ │ └── prompt.py │ ├── autodan │ │ ├── AutoDAN.py │ │ ├── __init__.py │ │ ├── mutate_models.py │ │ ├── templates.py │ │ └── utils.py │ ├── autoprompt │ │ ├── __init__.py │ │ ├── autoprompt.py │ │ └── autoprompt_utils.py │ ├── baseline.py │ ├── check_refusal_utils.py │ ├── direct_request │ │ ├── __init__.py │ │ └── direct_request.py │ ├── fewshot │ │ ├── __init__.py │ │ └── fewshot.py │ ├── gbda │ │ ├── __init__.py │ │ └── gbda.py │ ├── gcg │ │ ├── __init__.py │ │ ├── gcg.py │ │ └── gcg_utils.py │ ├── gcg_ensemble │ │ ├── __init__.py │ │ ├── gcg_ensemble.py │ │ └── gcg_ray_actors.py │ ├── gptfuzz │ │ ├── GPTFuzzer.csv │ │ ├── __init__.py │ │ ├── gptfuzz.py │ │ └── gptfuzzer │ │ │ └── __init__.py │ ├── human_jailbreaks │ │ ├── __init__.py │ │ ├── human_jailbreaks.py │ │ └── jailbreaks.py │ ├── model_utils.py │ ├── multimodaldirectrequest │ │ ├── __init__.py │ │ └── multimodaldirectrequest.py │ ├── multimodalpgd │ │ ├── __init__.py │ │ └── multimodalpgd.py │ ├── multimodalrendertext │ │ ├── __init__.py │ │ └── multimodalrendertext.py │ ├── pair │ │ ├── PAIR.py │ │ ├── __init__.py │ │ ├── common.py │ │ ├── conversers.py │ │ ├── judges.py │ │ ├── language_models.py │ │ └── system_prompts.py │ ├── pap │ │ ├── PAP.py │ │ ├── __init__.py │ │ ├── language_models.py │ │ └── templates.py │ ├── pez │ │ ├── __init__.py │ │ └── pez.py │ ├── tap │ │ ├── TAP.py │ │ ├── __init__.py │ │ ├── common.py │ │ ├── conversers.py │ │ ├── judges.py │ │ ├── language_models.py │ │ └── system_prompts.py │ ├── uat │ │ ├── __init__.py │ │ └── uat.py │ └── zeroshot │ │ ├── __init__.py │ │ └── zeroshot.py ├── configs │ ├── method_configs │ │ ├── ArtPrompt_config.yaml │ │ ├── AutoDAN_config.yaml │ │ ├── AutoPrompt_config.yaml │ │ ├── DirectRequest_config.yaml │ │ ├── EnsembleGCG_config.yaml │ │ ├── FewShot_config.yaml │ │ ├── GBDA_config.yaml │ │ ├── GCG_config.yaml │ │ ├── GPTFuzz_config.yaml │ │ ├── HumanJailbreaks_config.yaml │ │ ├── MultiModalDirectRequest_config.yaml │ │ ├── MultiModalPGDBlankImage_config.yaml │ │ ├── MultiModalPGDPatch_config.yaml │ │ ├── MultiModalPGD_config.yaml │ │ ├── MultiModalRenderText_config.yaml │ │ ├── PAIR_config.yaml │ │ ├── PAP_config.yaml │ │ ├── PEZ_config.yaml │ │ ├── TAP_config.yaml │ │ ├── UAT_config.yaml │ │ └── ZeroShot_config.yaml │ ├── model_configs │ │ └── models.yaml │ └── pipeline_configs │ │ └── run_pipeline.yaml ├── docs │ ├── behavior_datasets.md │ ├── codebase_structure.md │ ├── configs.md │ └── evaluation_pipeline.md ├── eval_utils.py ├── evaluate_completions.py ├── generate_completions.py ├── generate_test_cases.py ├── merge_test_cases.py ├── multimodalmodels │ ├── __init__.py │ ├── gpt4v │ │ ├── __init__.py │ │ └── gpt4v_model.py │ ├── instructblip │ │ ├── __init__.py │ │ └── instructblip_model.py │ ├── llava │ │ ├── __init__.py │ │ └── llava_model.py │ ├── multimodalmodel.py │ └── qwen │ │ ├── __init__.py │ │ ├── configuration_qwen.py │ │ ├── finetune.py │ │ ├── modeling_qwen.py │ │ ├── openai_api.py │ │ ├── qwen_generation_utils.py │ │ ├── qwen_model.py │ │ ├── tokenization_qwen.py │ │ ├── visual.py │ │ └── web_demo_mm.py ├── requirements.txt └── scripts │ ├── run_pipeline.py │ ├── step1.5.sh │ ├── step1.sh │ ├── step2.sh │ └── step3.sh ├── LICENSE ├── README.md ├── assets └── overview.png ├── jailbreakbench ├── CITATION.bib ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── assets │ ├── JBB_Table.jpg │ ├── jbb_behaviors_source_breakdown.jpg │ ├── jbb_logo_white.png │ └── logo.png ├── jbb_classify.py ├── jbb_response.py ├── jbb_run.sh ├── pyproject.toml ├── requirements-dev.lock ├── requirements.lock └── src │ └── jailbreakbench │ ├── __init__.py │ ├── artifact.py │ ├── classifier.py │ ├── config.py │ ├── dataset.py │ ├── defenses │ ├── __init__.py │ ├── base_defense.py │ ├── defenselib │ │ ├── __init__.py │ │ ├── defense_hparams.py │ │ └── perturbations.py │ ├── defenses_registry.py │ ├── erase_and_check.py │ ├── perplexity_filter.py │ ├── remove_non_dictionary.py │ ├── smooth_llm.py │ └── synonym_substitution.py │ ├── llm │ ├── __init__.py │ ├── dummy_vllm.py │ ├── litellm.py │ ├── llm_output.py │ ├── llm_wrapper.py │ └── vllm.py │ ├── plotting │ └── plot_source_breakdown.py │ ├── submission.py │ └── vllm_server.py ├── r2d_train ├── expand.py ├── expand_and_train.sh └── train.py └── xstest ├── LICENSE ├── build_completions.py ├── build_completions.sh ├── evaluation ├── classify_completions.py ├── classify_completions.sh └── classify_completions_strmatch.py ├── readme.md └── xstest_prompts.csv /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/.gitignore -------------------------------------------------------------------------------- /HarmBench/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/LICENSE -------------------------------------------------------------------------------- /HarmBench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/README.md -------------------------------------------------------------------------------- /HarmBench/adversarial_training/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/README.md -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/.deepspeed_env: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/.deepspeed_env -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/LICENSE -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/Makefile -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/README.md -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/assets/handbook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/assets/handbook.png -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/requirements.txt -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/scripts/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/README.md -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/scripts/adv_training_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/adv_training_utils.py -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/scripts/run_adv_training.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/run_adv_training.sh -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/scripts/run_dpo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/run_dpo.py -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/scripts/run_sft.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/run_sft.py -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/scripts/run_sft_adv_training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/run_sft_adv_training.py -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/setup.cfg -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/setup.py -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/tests/test_configs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/tests/test_configs.py -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/tests/test_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/tests/test_data.py -------------------------------------------------------------------------------- /HarmBench/adversarial_training/alignment-handbook/tests/test_model_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/tests/test_model_utils.py -------------------------------------------------------------------------------- /HarmBench/api_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/api_models.py -------------------------------------------------------------------------------- /HarmBench/assets/eval_pipeline-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/assets/eval_pipeline-1.png -------------------------------------------------------------------------------- /HarmBench/assets/harmbench_splash.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/assets/harmbench_splash.pdf -------------------------------------------------------------------------------- /HarmBench/assets/harmbench_splash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/assets/harmbench_splash.png -------------------------------------------------------------------------------- /HarmBench/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/__init__.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/__init__.py: -------------------------------------------------------------------------------- 1 | from .artprompt import * -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/artprompt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/artprompt.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/jailbreak/attack_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/attack_utils.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/jailbreak/base_prompt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/base_prompt.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/jailbreak/baseline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/baseline.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/jailbreak/bpe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/bpe.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/jailbreak/defense_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/defense_utils.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/jailbreak/safe_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/safe_eval.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/jailbreak/subword_nmt.voc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/subword_nmt.voc -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/utils/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/dataset.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/utils/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/eval.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/utils/few_shot_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/few_shot_example.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/utils/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/model.py -------------------------------------------------------------------------------- /HarmBench/baselines/artprompt/utils/prompt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/prompt.py -------------------------------------------------------------------------------- /HarmBench/baselines/autodan/AutoDAN.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autodan/AutoDAN.py -------------------------------------------------------------------------------- /HarmBench/baselines/autodan/__init__.py: -------------------------------------------------------------------------------- 1 | from .AutoDAN import * -------------------------------------------------------------------------------- /HarmBench/baselines/autodan/mutate_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autodan/mutate_models.py -------------------------------------------------------------------------------- /HarmBench/baselines/autodan/templates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autodan/templates.py -------------------------------------------------------------------------------- /HarmBench/baselines/autodan/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autodan/utils.py -------------------------------------------------------------------------------- /HarmBench/baselines/autoprompt/__init__.py: -------------------------------------------------------------------------------- 1 | from .autoprompt import * -------------------------------------------------------------------------------- /HarmBench/baselines/autoprompt/autoprompt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autoprompt/autoprompt.py -------------------------------------------------------------------------------- /HarmBench/baselines/autoprompt/autoprompt_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autoprompt/autoprompt_utils.py -------------------------------------------------------------------------------- /HarmBench/baselines/baseline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/baseline.py -------------------------------------------------------------------------------- /HarmBench/baselines/check_refusal_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/check_refusal_utils.py -------------------------------------------------------------------------------- /HarmBench/baselines/direct_request/__init__.py: -------------------------------------------------------------------------------- 1 | from .direct_request import * -------------------------------------------------------------------------------- /HarmBench/baselines/direct_request/direct_request.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/direct_request/direct_request.py -------------------------------------------------------------------------------- /HarmBench/baselines/fewshot/__init__.py: -------------------------------------------------------------------------------- 1 | from .fewshot import * -------------------------------------------------------------------------------- /HarmBench/baselines/fewshot/fewshot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/fewshot/fewshot.py -------------------------------------------------------------------------------- /HarmBench/baselines/gbda/__init__.py: -------------------------------------------------------------------------------- 1 | from .gbda import * -------------------------------------------------------------------------------- /HarmBench/baselines/gbda/gbda.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gbda/gbda.py -------------------------------------------------------------------------------- /HarmBench/baselines/gcg/__init__.py: -------------------------------------------------------------------------------- 1 | from .gcg import * -------------------------------------------------------------------------------- /HarmBench/baselines/gcg/gcg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gcg/gcg.py -------------------------------------------------------------------------------- /HarmBench/baselines/gcg/gcg_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gcg/gcg_utils.py -------------------------------------------------------------------------------- /HarmBench/baselines/gcg_ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | from .gcg_ensemble import * -------------------------------------------------------------------------------- /HarmBench/baselines/gcg_ensemble/gcg_ensemble.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gcg_ensemble/gcg_ensemble.py -------------------------------------------------------------------------------- /HarmBench/baselines/gcg_ensemble/gcg_ray_actors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gcg_ensemble/gcg_ray_actors.py -------------------------------------------------------------------------------- /HarmBench/baselines/gptfuzz/GPTFuzzer.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gptfuzz/GPTFuzzer.csv -------------------------------------------------------------------------------- /HarmBench/baselines/gptfuzz/__init__.py: -------------------------------------------------------------------------------- 1 | from .gptfuzz import * -------------------------------------------------------------------------------- /HarmBench/baselines/gptfuzz/gptfuzz.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gptfuzz/gptfuzz.py -------------------------------------------------------------------------------- /HarmBench/baselines/gptfuzz/gptfuzzer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /HarmBench/baselines/human_jailbreaks/__init__.py: -------------------------------------------------------------------------------- 1 | from .human_jailbreaks import * -------------------------------------------------------------------------------- /HarmBench/baselines/human_jailbreaks/human_jailbreaks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/human_jailbreaks/human_jailbreaks.py -------------------------------------------------------------------------------- /HarmBench/baselines/human_jailbreaks/jailbreaks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/human_jailbreaks/jailbreaks.py -------------------------------------------------------------------------------- /HarmBench/baselines/model_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/model_utils.py -------------------------------------------------------------------------------- /HarmBench/baselines/multimodaldirectrequest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodaldirectrequest/__init__.py -------------------------------------------------------------------------------- /HarmBench/baselines/multimodaldirectrequest/multimodaldirectrequest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodaldirectrequest/multimodaldirectrequest.py -------------------------------------------------------------------------------- /HarmBench/baselines/multimodalpgd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodalpgd/__init__.py -------------------------------------------------------------------------------- /HarmBench/baselines/multimodalpgd/multimodalpgd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodalpgd/multimodalpgd.py -------------------------------------------------------------------------------- /HarmBench/baselines/multimodalrendertext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodalrendertext/__init__.py -------------------------------------------------------------------------------- /HarmBench/baselines/multimodalrendertext/multimodalrendertext.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodalrendertext/multimodalrendertext.py -------------------------------------------------------------------------------- /HarmBench/baselines/pair/PAIR.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/PAIR.py -------------------------------------------------------------------------------- /HarmBench/baselines/pair/__init__.py: -------------------------------------------------------------------------------- 1 | from .PAIR import * -------------------------------------------------------------------------------- /HarmBench/baselines/pair/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/common.py -------------------------------------------------------------------------------- /HarmBench/baselines/pair/conversers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/conversers.py -------------------------------------------------------------------------------- /HarmBench/baselines/pair/judges.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/judges.py -------------------------------------------------------------------------------- /HarmBench/baselines/pair/language_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/language_models.py -------------------------------------------------------------------------------- /HarmBench/baselines/pair/system_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/system_prompts.py -------------------------------------------------------------------------------- /HarmBench/baselines/pap/PAP.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pap/PAP.py -------------------------------------------------------------------------------- /HarmBench/baselines/pap/__init__.py: -------------------------------------------------------------------------------- 1 | from .PAP import * -------------------------------------------------------------------------------- /HarmBench/baselines/pap/language_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pap/language_models.py -------------------------------------------------------------------------------- /HarmBench/baselines/pap/templates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pap/templates.py -------------------------------------------------------------------------------- /HarmBench/baselines/pez/__init__.py: -------------------------------------------------------------------------------- 1 | from .pez import * 2 | -------------------------------------------------------------------------------- /HarmBench/baselines/pez/pez.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pez/pez.py -------------------------------------------------------------------------------- /HarmBench/baselines/tap/TAP.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/TAP.py -------------------------------------------------------------------------------- /HarmBench/baselines/tap/__init__.py: -------------------------------------------------------------------------------- 1 | from .TAP import * -------------------------------------------------------------------------------- /HarmBench/baselines/tap/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/common.py -------------------------------------------------------------------------------- /HarmBench/baselines/tap/conversers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/conversers.py -------------------------------------------------------------------------------- /HarmBench/baselines/tap/judges.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/judges.py -------------------------------------------------------------------------------- /HarmBench/baselines/tap/language_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/language_models.py -------------------------------------------------------------------------------- /HarmBench/baselines/tap/system_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/system_prompts.py -------------------------------------------------------------------------------- /HarmBench/baselines/uat/__init__.py: -------------------------------------------------------------------------------- 1 | from .uat import * -------------------------------------------------------------------------------- /HarmBench/baselines/uat/uat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/uat/uat.py -------------------------------------------------------------------------------- /HarmBench/baselines/zeroshot/__init__.py: -------------------------------------------------------------------------------- 1 | from .zeroshot import * -------------------------------------------------------------------------------- /HarmBench/baselines/zeroshot/zeroshot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/zeroshot/zeroshot.py -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/ArtPrompt_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/ArtPrompt_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/AutoDAN_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/AutoDAN_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/AutoPrompt_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/AutoPrompt_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/DirectRequest_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/DirectRequest_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/EnsembleGCG_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/EnsembleGCG_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/FewShot_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/FewShot_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/GBDA_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/GBDA_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/GCG_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/GCG_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/GPTFuzz_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/GPTFuzz_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/HumanJailbreaks_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/HumanJailbreaks_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/MultiModalDirectRequest_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalDirectRequest_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/MultiModalPGDBlankImage_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalPGDBlankImage_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/MultiModalPGDPatch_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalPGDPatch_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/MultiModalPGD_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalPGD_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/MultiModalRenderText_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalRenderText_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/PAIR_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/PAIR_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/PAP_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/PAP_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/PEZ_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/PEZ_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/TAP_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/TAP_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/UAT_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/UAT_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/method_configs/ZeroShot_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/ZeroShot_config.yaml -------------------------------------------------------------------------------- /HarmBench/configs/model_configs/models.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/model_configs/models.yaml -------------------------------------------------------------------------------- /HarmBench/configs/pipeline_configs/run_pipeline.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/pipeline_configs/run_pipeline.yaml -------------------------------------------------------------------------------- /HarmBench/docs/behavior_datasets.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /HarmBench/docs/codebase_structure.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/docs/codebase_structure.md -------------------------------------------------------------------------------- /HarmBench/docs/configs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/docs/configs.md -------------------------------------------------------------------------------- /HarmBench/docs/evaluation_pipeline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/docs/evaluation_pipeline.md -------------------------------------------------------------------------------- /HarmBench/eval_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/eval_utils.py -------------------------------------------------------------------------------- /HarmBench/evaluate_completions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/evaluate_completions.py -------------------------------------------------------------------------------- /HarmBench/generate_completions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/generate_completions.py -------------------------------------------------------------------------------- /HarmBench/generate_test_cases.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/generate_test_cases.py -------------------------------------------------------------------------------- /HarmBench/merge_test_cases.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/merge_test_cases.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/__init__.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/gpt4v/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt4v_model import GPT4V 2 | -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/gpt4v/gpt4v_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/gpt4v/gpt4v_model.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/instructblip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/instructblip/__init__.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/instructblip/instructblip_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/instructblip/instructblip_model.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava_model import LLaVA_v1_5 2 | -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/llava/llava_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/llava/llava_model.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/multimodalmodel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/multimodalmodel.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/__init__.py: -------------------------------------------------------------------------------- 1 | from .qwen_model import Qwen_VL_Chat 2 | -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/configuration_qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/configuration_qwen.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/finetune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/finetune.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/modeling_qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/modeling_qwen.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/openai_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/openai_api.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/qwen_generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/qwen_generation_utils.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/qwen_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/qwen_model.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/tokenization_qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/tokenization_qwen.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/visual.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/visual.py -------------------------------------------------------------------------------- /HarmBench/multimodalmodels/qwen/web_demo_mm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/web_demo_mm.py -------------------------------------------------------------------------------- /HarmBench/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/requirements.txt -------------------------------------------------------------------------------- /HarmBench/scripts/run_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/run_pipeline.py -------------------------------------------------------------------------------- /HarmBench/scripts/step1.5.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/step1.5.sh -------------------------------------------------------------------------------- /HarmBench/scripts/step1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/step1.sh -------------------------------------------------------------------------------- /HarmBench/scripts/step2.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/step2.sh -------------------------------------------------------------------------------- /HarmBench/scripts/step3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/step3.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/README.md -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/assets/overview.png -------------------------------------------------------------------------------- /jailbreakbench/CITATION.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/CITATION.bib -------------------------------------------------------------------------------- /jailbreakbench/CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/CONTRIBUTING.md -------------------------------------------------------------------------------- /jailbreakbench/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/LICENSE -------------------------------------------------------------------------------- /jailbreakbench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/README.md -------------------------------------------------------------------------------- /jailbreakbench/assets/JBB_Table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/assets/JBB_Table.jpg -------------------------------------------------------------------------------- /jailbreakbench/assets/jbb_behaviors_source_breakdown.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/assets/jbb_behaviors_source_breakdown.jpg -------------------------------------------------------------------------------- /jailbreakbench/assets/jbb_logo_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/assets/jbb_logo_white.png -------------------------------------------------------------------------------- /jailbreakbench/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/assets/logo.png -------------------------------------------------------------------------------- /jailbreakbench/jbb_classify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/jbb_classify.py -------------------------------------------------------------------------------- /jailbreakbench/jbb_response.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/jbb_response.py -------------------------------------------------------------------------------- /jailbreakbench/jbb_run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/jbb_run.sh -------------------------------------------------------------------------------- /jailbreakbench/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/pyproject.toml -------------------------------------------------------------------------------- /jailbreakbench/requirements-dev.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/requirements-dev.lock -------------------------------------------------------------------------------- /jailbreakbench/requirements.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/requirements.lock -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/__init__.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/artifact.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/artifact.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/classifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/classifier.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/config.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/dataset.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/__init__.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/base_defense.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/base_defense.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/defenselib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/defenselib/defense_hparams.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/defenselib/defense_hparams.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/defenselib/perturbations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/defenselib/perturbations.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/defenses_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/defenses_registry.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/erase_and_check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/erase_and_check.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/perplexity_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/perplexity_filter.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/remove_non_dictionary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/remove_non_dictionary.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/smooth_llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/smooth_llm.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/defenses/synonym_substitution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/synonym_substitution.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/llm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/llm/dummy_vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/dummy_vllm.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/llm/litellm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/litellm.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/llm/llm_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/llm_output.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/llm/llm_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/llm_wrapper.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/llm/vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/vllm.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/plotting/plot_source_breakdown.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/plotting/plot_source_breakdown.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/submission.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/submission.py -------------------------------------------------------------------------------- /jailbreakbench/src/jailbreakbench/vllm_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/vllm_server.py -------------------------------------------------------------------------------- /r2d_train/expand.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/r2d_train/expand.py -------------------------------------------------------------------------------- /r2d_train/expand_and_train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/r2d_train/expand_and_train.sh -------------------------------------------------------------------------------- /r2d_train/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/r2d_train/train.py -------------------------------------------------------------------------------- /xstest/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/LICENSE -------------------------------------------------------------------------------- /xstest/build_completions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/build_completions.py -------------------------------------------------------------------------------- /xstest/build_completions.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/build_completions.sh -------------------------------------------------------------------------------- /xstest/evaluation/classify_completions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/evaluation/classify_completions.py -------------------------------------------------------------------------------- /xstest/evaluation/classify_completions.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/evaluation/classify_completions.sh -------------------------------------------------------------------------------- /xstest/evaluation/classify_completions_strmatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/evaluation/classify_completions_strmatch.py -------------------------------------------------------------------------------- /xstest/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/readme.md -------------------------------------------------------------------------------- /xstest/xstest_prompts.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/xstest_prompts.csv --------------------------------------------------------------------------------