├── .gitignore
├── HarmBench
    ├── LICENSE
    ├── README.md
    ├── adversarial_training
    │   ├── README.md
    │   └── alignment-handbook
    │   │   ├── .deepspeed_env
    │   │   ├── LICENSE
    │   │   ├── Makefile
    │   │   ├── README.md
    │   │   ├── assets
    │   │       └── handbook.png
    │   │   ├── requirements.txt
    │   │   ├── scripts
    │   │       ├── README.md
    │   │       ├── adv_training_utils.py
    │   │       ├── run_adv_training.sh
    │   │       ├── run_dpo.py
    │   │       ├── run_sft.py
    │   │       └── run_sft_adv_training.py
    │   │   ├── setup.cfg
    │   │   ├── setup.py
    │   │   └── tests
    │   │       ├── __init__.py
    │   │       ├── test_configs.py
    │   │       ├── test_data.py
    │   │       └── test_model_utils.py
    ├── api_models.py
    ├── assets
    │   ├── eval_pipeline-1.png
    │   ├── harmbench_splash.pdf
    │   └── harmbench_splash.png
    ├── baselines
    │   ├── __init__.py
    │   ├── artprompt
    │   │   ├── __init__.py
    │   │   ├── artprompt.py
    │   │   ├── jailbreak
    │   │   │   ├── attack_utils.py
    │   │   │   ├── base_prompt.py
    │   │   │   ├── baseline.py
    │   │   │   ├── bpe.py
    │   │   │   ├── defense_utils.py
    │   │   │   ├── safe_eval.py
    │   │   │   └── subword_nmt.voc
    │   │   └── utils
    │   │   │   ├── __init__.py
    │   │   │   ├── dataset.py
    │   │   │   ├── eval.py
    │   │   │   ├── few_shot_example.py
    │   │   │   ├── model.py
    │   │   │   └── prompt.py
    │   ├── autodan
    │   │   ├── AutoDAN.py
    │   │   ├── __init__.py
    │   │   ├── mutate_models.py
    │   │   ├── templates.py
    │   │   └── utils.py
    │   ├── autoprompt
    │   │   ├── __init__.py
    │   │   ├── autoprompt.py
    │   │   └── autoprompt_utils.py
    │   ├── baseline.py
    │   ├── check_refusal_utils.py
    │   ├── direct_request
    │   │   ├── __init__.py
    │   │   └── direct_request.py
    │   ├── fewshot
    │   │   ├── __init__.py
    │   │   └── fewshot.py
    │   ├── gbda
    │   │   ├── __init__.py
    │   │   └── gbda.py
    │   ├── gcg
    │   │   ├── __init__.py
    │   │   ├── gcg.py
    │   │   └── gcg_utils.py
    │   ├── gcg_ensemble
    │   │   ├── __init__.py
    │   │   ├── gcg_ensemble.py
    │   │   └── gcg_ray_actors.py
    │   ├── gptfuzz
    │   │   ├── GPTFuzzer.csv
    │   │   ├── __init__.py
    │   │   ├── gptfuzz.py
    │   │   └── gptfuzzer
    │   │   │   └── __init__.py
    │   ├── human_jailbreaks
    │   │   ├── __init__.py
    │   │   ├── human_jailbreaks.py
    │   │   └── jailbreaks.py
    │   ├── model_utils.py
    │   ├── multimodaldirectrequest
    │   │   ├── __init__.py
    │   │   └── multimodaldirectrequest.py
    │   ├── multimodalpgd
    │   │   ├── __init__.py
    │   │   └── multimodalpgd.py
    │   ├── multimodalrendertext
    │   │   ├── __init__.py
    │   │   └── multimodalrendertext.py
    │   ├── pair
    │   │   ├── PAIR.py
    │   │   ├── __init__.py
    │   │   ├── common.py
    │   │   ├── conversers.py
    │   │   ├── judges.py
    │   │   ├── language_models.py
    │   │   └── system_prompts.py
    │   ├── pap
    │   │   ├── PAP.py
    │   │   ├── __init__.py
    │   │   ├── language_models.py
    │   │   └── templates.py
    │   ├── pez
    │   │   ├── __init__.py
    │   │   └── pez.py
    │   ├── tap
    │   │   ├── TAP.py
    │   │   ├── __init__.py
    │   │   ├── common.py
    │   │   ├── conversers.py
    │   │   ├── judges.py
    │   │   ├── language_models.py
    │   │   └── system_prompts.py
    │   ├── uat
    │   │   ├── __init__.py
    │   │   └── uat.py
    │   └── zeroshot
    │   │   ├── __init__.py
    │   │   └── zeroshot.py
    ├── configs
    │   ├── method_configs
    │   │   ├── ArtPrompt_config.yaml
    │   │   ├── AutoDAN_config.yaml
    │   │   ├── AutoPrompt_config.yaml
    │   │   ├── DirectRequest_config.yaml
    │   │   ├── EnsembleGCG_config.yaml
    │   │   ├── FewShot_config.yaml
    │   │   ├── GBDA_config.yaml
    │   │   ├── GCG_config.yaml
    │   │   ├── GPTFuzz_config.yaml
    │   │   ├── HumanJailbreaks_config.yaml
    │   │   ├── MultiModalDirectRequest_config.yaml
    │   │   ├── MultiModalPGDBlankImage_config.yaml
    │   │   ├── MultiModalPGDPatch_config.yaml
    │   │   ├── MultiModalPGD_config.yaml
    │   │   ├── MultiModalRenderText_config.yaml
    │   │   ├── PAIR_config.yaml
    │   │   ├── PAP_config.yaml
    │   │   ├── PEZ_config.yaml
    │   │   ├── TAP_config.yaml
    │   │   ├── UAT_config.yaml
    │   │   └── ZeroShot_config.yaml
    │   ├── model_configs
    │   │   └── models.yaml
    │   └── pipeline_configs
    │   │   └── run_pipeline.yaml
    ├── docs
    │   ├── behavior_datasets.md
    │   ├── codebase_structure.md
    │   ├── configs.md
    │   └── evaluation_pipeline.md
    ├── eval_utils.py
    ├── evaluate_completions.py
    ├── generate_completions.py
    ├── generate_test_cases.py
    ├── merge_test_cases.py
    ├── multimodalmodels
    │   ├── __init__.py
    │   ├── gpt4v
    │   │   ├── __init__.py
    │   │   └── gpt4v_model.py
    │   ├── instructblip
    │   │   ├── __init__.py
    │   │   └── instructblip_model.py
    │   ├── llava
    │   │   ├── __init__.py
    │   │   └── llava_model.py
    │   ├── multimodalmodel.py
    │   └── qwen
    │   │   ├── __init__.py
    │   │   ├── configuration_qwen.py
    │   │   ├── finetune.py
    │   │   ├── modeling_qwen.py
    │   │   ├── openai_api.py
    │   │   ├── qwen_generation_utils.py
    │   │   ├── qwen_model.py
    │   │   ├── tokenization_qwen.py
    │   │   ├── visual.py
    │   │   └── web_demo_mm.py
    ├── requirements.txt
    └── scripts
    │   ├── run_pipeline.py
    │   ├── step1.5.sh
    │   ├── step1.sh
    │   ├── step2.sh
    │   └── step3.sh
├── LICENSE
├── README.md
├── assets
    └── overview.png
├── jailbreakbench
    ├── CITATION.bib
    ├── CONTRIBUTING.md
    ├── LICENSE
    ├── README.md
    ├── assets
    │   ├── JBB_Table.jpg
    │   ├── jbb_behaviors_source_breakdown.jpg
    │   ├── jbb_logo_white.png
    │   └── logo.png
    ├── jbb_classify.py
    ├── jbb_response.py
    ├── jbb_run.sh
    ├── pyproject.toml
    ├── requirements-dev.lock
    ├── requirements.lock
    └── src
    │   └── jailbreakbench
    │       ├── __init__.py
    │       ├── artifact.py
    │       ├── classifier.py
    │       ├── config.py
    │       ├── dataset.py
    │       ├── defenses
    │           ├── __init__.py
    │           ├── base_defense.py
    │           ├── defenselib
    │           │   ├── __init__.py
    │           │   ├── defense_hparams.py
    │           │   └── perturbations.py
    │           ├── defenses_registry.py
    │           ├── erase_and_check.py
    │           ├── perplexity_filter.py
    │           ├── remove_non_dictionary.py
    │           ├── smooth_llm.py
    │           └── synonym_substitution.py
    │       ├── llm
    │           ├── __init__.py
    │           ├── dummy_vllm.py
    │           ├── litellm.py
    │           ├── llm_output.py
    │           ├── llm_wrapper.py
    │           └── vllm.py
    │       ├── plotting
    │           └── plot_source_breakdown.py
    │       ├── submission.py
    │       └── vllm_server.py
├── r2d_train
    ├── expand.py
    ├── expand_and_train.sh
    └── train.py
└── xstest
    ├── LICENSE
    ├── build_completions.py
    ├── build_completions.sh
    ├── evaluation
        ├── classify_completions.py
        ├── classify_completions.sh
        └── classify_completions_strmatch.py
    ├── readme.md
    └── xstest_prompts.csv


/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/.gitignore


--------------------------------------------------------------------------------
/HarmBench/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/LICENSE


--------------------------------------------------------------------------------
/HarmBench/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/README.md


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/README.md


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/.deepspeed_env:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/.deepspeed_env


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/LICENSE


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/Makefile


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/README.md


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/assets/handbook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/assets/handbook.png


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/requirements.txt


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/scripts/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/README.md


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/scripts/adv_training_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/adv_training_utils.py


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/scripts/run_adv_training.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/run_adv_training.sh


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/scripts/run_dpo.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/run_dpo.py


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/scripts/run_sft.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/run_sft.py


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/scripts/run_sft_adv_training.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/scripts/run_sft_adv_training.py


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/setup.cfg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/setup.cfg


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/setup.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/setup.py


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/tests/test_configs.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/tests/test_configs.py


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/tests/test_data.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/tests/test_data.py


--------------------------------------------------------------------------------
/HarmBench/adversarial_training/alignment-handbook/tests/test_model_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/adversarial_training/alignment-handbook/tests/test_model_utils.py


--------------------------------------------------------------------------------
/HarmBench/api_models.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/api_models.py


--------------------------------------------------------------------------------
/HarmBench/assets/eval_pipeline-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/assets/eval_pipeline-1.png


--------------------------------------------------------------------------------
/HarmBench/assets/harmbench_splash.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/assets/harmbench_splash.pdf


--------------------------------------------------------------------------------
/HarmBench/assets/harmbench_splash.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/assets/harmbench_splash.png


--------------------------------------------------------------------------------
/HarmBench/baselines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/__init__.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/__init__.py:
--------------------------------------------------------------------------------
1 | from .artprompt import *


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/artprompt.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/artprompt.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/jailbreak/attack_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/attack_utils.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/jailbreak/base_prompt.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/base_prompt.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/jailbreak/baseline.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/baseline.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/jailbreak/bpe.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/bpe.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/jailbreak/defense_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/defense_utils.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/jailbreak/safe_eval.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/safe_eval.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/jailbreak/subword_nmt.voc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/jailbreak/subword_nmt.voc


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/utils/dataset.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/dataset.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/utils/eval.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/eval.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/utils/few_shot_example.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/few_shot_example.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/utils/model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/model.py


--------------------------------------------------------------------------------
/HarmBench/baselines/artprompt/utils/prompt.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/artprompt/utils/prompt.py


--------------------------------------------------------------------------------
/HarmBench/baselines/autodan/AutoDAN.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autodan/AutoDAN.py


--------------------------------------------------------------------------------
/HarmBench/baselines/autodan/__init__.py:
--------------------------------------------------------------------------------
1 | from .AutoDAN import *


--------------------------------------------------------------------------------
/HarmBench/baselines/autodan/mutate_models.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autodan/mutate_models.py


--------------------------------------------------------------------------------
/HarmBench/baselines/autodan/templates.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autodan/templates.py


--------------------------------------------------------------------------------
/HarmBench/baselines/autodan/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autodan/utils.py


--------------------------------------------------------------------------------
/HarmBench/baselines/autoprompt/__init__.py:
--------------------------------------------------------------------------------
1 | from .autoprompt import *


--------------------------------------------------------------------------------
/HarmBench/baselines/autoprompt/autoprompt.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autoprompt/autoprompt.py


--------------------------------------------------------------------------------
/HarmBench/baselines/autoprompt/autoprompt_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/autoprompt/autoprompt_utils.py


--------------------------------------------------------------------------------
/HarmBench/baselines/baseline.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/baseline.py


--------------------------------------------------------------------------------
/HarmBench/baselines/check_refusal_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/check_refusal_utils.py


--------------------------------------------------------------------------------
/HarmBench/baselines/direct_request/__init__.py:
--------------------------------------------------------------------------------
1 | from .direct_request import *


--------------------------------------------------------------------------------
/HarmBench/baselines/direct_request/direct_request.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/direct_request/direct_request.py


--------------------------------------------------------------------------------
/HarmBench/baselines/fewshot/__init__.py:
--------------------------------------------------------------------------------
1 | from .fewshot import *


--------------------------------------------------------------------------------
/HarmBench/baselines/fewshot/fewshot.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/fewshot/fewshot.py


--------------------------------------------------------------------------------
/HarmBench/baselines/gbda/__init__.py:
--------------------------------------------------------------------------------
1 | from .gbda import *


--------------------------------------------------------------------------------
/HarmBench/baselines/gbda/gbda.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gbda/gbda.py


--------------------------------------------------------------------------------
/HarmBench/baselines/gcg/__init__.py:
--------------------------------------------------------------------------------
1 | from .gcg import *


--------------------------------------------------------------------------------
/HarmBench/baselines/gcg/gcg.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gcg/gcg.py


--------------------------------------------------------------------------------
/HarmBench/baselines/gcg/gcg_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gcg/gcg_utils.py


--------------------------------------------------------------------------------
/HarmBench/baselines/gcg_ensemble/__init__.py:
--------------------------------------------------------------------------------
1 | from .gcg_ensemble import *


--------------------------------------------------------------------------------
/HarmBench/baselines/gcg_ensemble/gcg_ensemble.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gcg_ensemble/gcg_ensemble.py


--------------------------------------------------------------------------------
/HarmBench/baselines/gcg_ensemble/gcg_ray_actors.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gcg_ensemble/gcg_ray_actors.py


--------------------------------------------------------------------------------
/HarmBench/baselines/gptfuzz/GPTFuzzer.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gptfuzz/GPTFuzzer.csv


--------------------------------------------------------------------------------
/HarmBench/baselines/gptfuzz/__init__.py:
--------------------------------------------------------------------------------
1 | from .gptfuzz import *


--------------------------------------------------------------------------------
/HarmBench/baselines/gptfuzz/gptfuzz.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/gptfuzz/gptfuzz.py


--------------------------------------------------------------------------------
/HarmBench/baselines/gptfuzz/gptfuzzer/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/HarmBench/baselines/human_jailbreaks/__init__.py:
--------------------------------------------------------------------------------
1 | from .human_jailbreaks import *


--------------------------------------------------------------------------------
/HarmBench/baselines/human_jailbreaks/human_jailbreaks.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/human_jailbreaks/human_jailbreaks.py


--------------------------------------------------------------------------------
/HarmBench/baselines/human_jailbreaks/jailbreaks.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/human_jailbreaks/jailbreaks.py


--------------------------------------------------------------------------------
/HarmBench/baselines/model_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/model_utils.py


--------------------------------------------------------------------------------
/HarmBench/baselines/multimodaldirectrequest/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodaldirectrequest/__init__.py


--------------------------------------------------------------------------------
/HarmBench/baselines/multimodaldirectrequest/multimodaldirectrequest.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodaldirectrequest/multimodaldirectrequest.py


--------------------------------------------------------------------------------
/HarmBench/baselines/multimodalpgd/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodalpgd/__init__.py


--------------------------------------------------------------------------------
/HarmBench/baselines/multimodalpgd/multimodalpgd.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodalpgd/multimodalpgd.py


--------------------------------------------------------------------------------
/HarmBench/baselines/multimodalrendertext/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodalrendertext/__init__.py


--------------------------------------------------------------------------------
/HarmBench/baselines/multimodalrendertext/multimodalrendertext.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/multimodalrendertext/multimodalrendertext.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pair/PAIR.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/PAIR.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pair/__init__.py:
--------------------------------------------------------------------------------
1 | from .PAIR import *


--------------------------------------------------------------------------------
/HarmBench/baselines/pair/common.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/common.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pair/conversers.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/conversers.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pair/judges.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/judges.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pair/language_models.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/language_models.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pair/system_prompts.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pair/system_prompts.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pap/PAP.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pap/PAP.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pap/__init__.py:
--------------------------------------------------------------------------------
1 | from .PAP import *


--------------------------------------------------------------------------------
/HarmBench/baselines/pap/language_models.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pap/language_models.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pap/templates.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pap/templates.py


--------------------------------------------------------------------------------
/HarmBench/baselines/pez/__init__.py:
--------------------------------------------------------------------------------
1 | from .pez import *
2 | 


--------------------------------------------------------------------------------
/HarmBench/baselines/pez/pez.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/pez/pez.py


--------------------------------------------------------------------------------
/HarmBench/baselines/tap/TAP.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/TAP.py


--------------------------------------------------------------------------------
/HarmBench/baselines/tap/__init__.py:
--------------------------------------------------------------------------------
1 | from .TAP import *


--------------------------------------------------------------------------------
/HarmBench/baselines/tap/common.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/common.py


--------------------------------------------------------------------------------
/HarmBench/baselines/tap/conversers.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/conversers.py


--------------------------------------------------------------------------------
/HarmBench/baselines/tap/judges.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/judges.py


--------------------------------------------------------------------------------
/HarmBench/baselines/tap/language_models.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/language_models.py


--------------------------------------------------------------------------------
/HarmBench/baselines/tap/system_prompts.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/tap/system_prompts.py


--------------------------------------------------------------------------------
/HarmBench/baselines/uat/__init__.py:
--------------------------------------------------------------------------------
1 | from .uat import *


--------------------------------------------------------------------------------
/HarmBench/baselines/uat/uat.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/uat/uat.py


--------------------------------------------------------------------------------
/HarmBench/baselines/zeroshot/__init__.py:
--------------------------------------------------------------------------------
1 | from .zeroshot import *


--------------------------------------------------------------------------------
/HarmBench/baselines/zeroshot/zeroshot.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/baselines/zeroshot/zeroshot.py


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/ArtPrompt_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/ArtPrompt_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/AutoDAN_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/AutoDAN_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/AutoPrompt_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/AutoPrompt_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/DirectRequest_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/DirectRequest_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/EnsembleGCG_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/EnsembleGCG_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/FewShot_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/FewShot_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/GBDA_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/GBDA_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/GCG_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/GCG_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/GPTFuzz_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/GPTFuzz_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/HumanJailbreaks_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/HumanJailbreaks_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/MultiModalDirectRequest_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalDirectRequest_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/MultiModalPGDBlankImage_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalPGDBlankImage_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/MultiModalPGDPatch_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalPGDPatch_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/MultiModalPGD_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalPGD_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/MultiModalRenderText_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/MultiModalRenderText_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/PAIR_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/PAIR_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/PAP_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/PAP_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/PEZ_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/PEZ_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/TAP_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/TAP_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/UAT_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/UAT_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/method_configs/ZeroShot_config.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/method_configs/ZeroShot_config.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/model_configs/models.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/model_configs/models.yaml


--------------------------------------------------------------------------------
/HarmBench/configs/pipeline_configs/run_pipeline.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/configs/pipeline_configs/run_pipeline.yaml


--------------------------------------------------------------------------------
/HarmBench/docs/behavior_datasets.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/HarmBench/docs/codebase_structure.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/docs/codebase_structure.md


--------------------------------------------------------------------------------
/HarmBench/docs/configs.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/docs/configs.md


--------------------------------------------------------------------------------
/HarmBench/docs/evaluation_pipeline.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/docs/evaluation_pipeline.md


--------------------------------------------------------------------------------
/HarmBench/eval_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/eval_utils.py


--------------------------------------------------------------------------------
/HarmBench/evaluate_completions.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/evaluate_completions.py


--------------------------------------------------------------------------------
/HarmBench/generate_completions.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/generate_completions.py


--------------------------------------------------------------------------------
/HarmBench/generate_test_cases.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/generate_test_cases.py


--------------------------------------------------------------------------------
/HarmBench/merge_test_cases.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/merge_test_cases.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/__init__.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/gpt4v/__init__.py:
--------------------------------------------------------------------------------
1 | from .gpt4v_model import GPT4V
2 | 


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/gpt4v/gpt4v_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/gpt4v/gpt4v_model.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/instructblip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/instructblip/__init__.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/instructblip/instructblip_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/instructblip/instructblip_model.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .llava_model import LLaVA_v1_5
2 | 


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/llava/llava_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/llava/llava_model.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/multimodalmodel.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/multimodalmodel.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/__init__.py:
--------------------------------------------------------------------------------
1 | from .qwen_model import Qwen_VL_Chat
2 | 


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/configuration_qwen.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/configuration_qwen.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/finetune.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/finetune.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/modeling_qwen.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/modeling_qwen.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/openai_api.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/openai_api.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/qwen_generation_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/qwen_generation_utils.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/qwen_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/qwen_model.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/tokenization_qwen.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/tokenization_qwen.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/visual.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/visual.py


--------------------------------------------------------------------------------
/HarmBench/multimodalmodels/qwen/web_demo_mm.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/multimodalmodels/qwen/web_demo_mm.py


--------------------------------------------------------------------------------
/HarmBench/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/requirements.txt


--------------------------------------------------------------------------------
/HarmBench/scripts/run_pipeline.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/run_pipeline.py


--------------------------------------------------------------------------------
/HarmBench/scripts/step1.5.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/step1.5.sh


--------------------------------------------------------------------------------
/HarmBench/scripts/step1.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/step1.sh


--------------------------------------------------------------------------------
/HarmBench/scripts/step2.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/step2.sh


--------------------------------------------------------------------------------
/HarmBench/scripts/step3.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/HarmBench/scripts/step3.sh


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/LICENSE


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/README.md


--------------------------------------------------------------------------------
/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/assets/overview.png


--------------------------------------------------------------------------------
/jailbreakbench/CITATION.bib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/CITATION.bib


--------------------------------------------------------------------------------
/jailbreakbench/CONTRIBUTING.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/CONTRIBUTING.md


--------------------------------------------------------------------------------
/jailbreakbench/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/LICENSE


--------------------------------------------------------------------------------
/jailbreakbench/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/README.md


--------------------------------------------------------------------------------
/jailbreakbench/assets/JBB_Table.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/assets/JBB_Table.jpg


--------------------------------------------------------------------------------
/jailbreakbench/assets/jbb_behaviors_source_breakdown.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/assets/jbb_behaviors_source_breakdown.jpg


--------------------------------------------------------------------------------
/jailbreakbench/assets/jbb_logo_white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/assets/jbb_logo_white.png


--------------------------------------------------------------------------------
/jailbreakbench/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/assets/logo.png


--------------------------------------------------------------------------------
/jailbreakbench/jbb_classify.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/jbb_classify.py


--------------------------------------------------------------------------------
/jailbreakbench/jbb_response.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/jbb_response.py


--------------------------------------------------------------------------------
/jailbreakbench/jbb_run.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/jbb_run.sh


--------------------------------------------------------------------------------
/jailbreakbench/pyproject.toml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/pyproject.toml


--------------------------------------------------------------------------------
/jailbreakbench/requirements-dev.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/requirements-dev.lock


--------------------------------------------------------------------------------
/jailbreakbench/requirements.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/requirements.lock


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/__init__.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/artifact.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/artifact.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/classifier.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/classifier.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/config.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/config.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/dataset.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/dataset.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/__init__.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/base_defense.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/base_defense.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/defenselib/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/defenselib/defense_hparams.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/defenselib/defense_hparams.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/defenselib/perturbations.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/defenselib/perturbations.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/defenses_registry.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/defenses_registry.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/erase_and_check.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/erase_and_check.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/perplexity_filter.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/perplexity_filter.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/remove_non_dictionary.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/remove_non_dictionary.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/smooth_llm.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/smooth_llm.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/defenses/synonym_substitution.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/defenses/synonym_substitution.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/llm/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/llm/dummy_vllm.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/dummy_vllm.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/llm/litellm.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/litellm.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/llm/llm_output.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/llm_output.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/llm/llm_wrapper.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/llm_wrapper.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/llm/vllm.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/llm/vllm.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/plotting/plot_source_breakdown.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/plotting/plot_source_breakdown.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/submission.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/submission.py


--------------------------------------------------------------------------------
/jailbreakbench/src/jailbreakbench/vllm_server.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/jailbreakbench/src/jailbreakbench/vllm_server.py


--------------------------------------------------------------------------------
/r2d_train/expand.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/r2d_train/expand.py


--------------------------------------------------------------------------------
/r2d_train/expand_and_train.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/r2d_train/expand_and_train.sh


--------------------------------------------------------------------------------
/r2d_train/train.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/r2d_train/train.py


--------------------------------------------------------------------------------
/xstest/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/LICENSE


--------------------------------------------------------------------------------
/xstest/build_completions.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/build_completions.py


--------------------------------------------------------------------------------
/xstest/build_completions.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/build_completions.sh


--------------------------------------------------------------------------------
/xstest/evaluation/classify_completions.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/evaluation/classify_completions.py


--------------------------------------------------------------------------------
/xstest/evaluation/classify_completions.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/evaluation/classify_completions.sh


--------------------------------------------------------------------------------
/xstest/evaluation/classify_completions_strmatch.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/evaluation/classify_completions_strmatch.py


--------------------------------------------------------------------------------
/xstest/readme.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/readme.md


--------------------------------------------------------------------------------
/xstest/xstest_prompts.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chuhac/Reasoning-to-Defend/HEAD/xstest/xstest_prompts.csv


--------------------------------------------------------------------------------