├── README.assets
    ├── problem_example.png
    ├── results.png
    ├── visualization.png
    └── visualization_post.png
├── README.md
└── code
    ├── README.md
    ├── chat_templates
        ├── chatml.jinja
        ├── llama-2-chat.jinja
        ├── mistral-instruct.jinja
        ├── openchat.jinja
        └── vicuna.jinja
    ├── compare_pca_harmfulness_boundary.py
    ├── compare_pca_refusal_boundary.py
    ├── compare_pca_soft_harmfulness.py
    ├── compare_pca_soft_refusal.py
    ├── comparisons
        ├── pca
        │   ├── all_harmfulness_boundary_custom_sampling.pdf
        │   ├── all_refusal_boundary_custom_sampling.pdf
        │   ├── all_second_custom_sampling.pdf
        │   └── all_third_custom_sampling.pdf
        └── pca_soft
        │   ├── all_soft_harmfulness_advbench_sampling.pdf
        │   ├── all_soft_harmfulness_malicious_sampling.pdf
        │   ├── all_soft_refusal_advbench_sampling.pdf
        │   └── all_soft_refusal_malicious_sampling.pdf
    ├── data
        ├── MaliciousInstruct.txt
        ├── advbench.txt
        ├── custom.txt
        └── gcg.json
    ├── data_harmless
        ├── custom.txt
        └── testset.txt
    ├── estimate.py
    ├── evaluate.py
    ├── forward.py
    ├── forward_with_soft.py
    ├── generate.py
    ├── generation_configs
        ├── llama-2-chat.json
        ├── mistral-instruct.json
        ├── openchat.json
        ├── orca-2.json
        └── vicuna.json
    ├── scripts
        ├── compare_gather.sh
        ├── forward.sh
        ├── forward_harmless.sh
        ├── run_meta.sh
        ├── run_meta_harmless.sh
        ├── run_mistral-v1.sh
        ├── run_mistral-v1_harmless.sh
        ├── train_meta.sh
        ├── train_meta_unlikelihood.sh
        └── train_mistral-v1.sh
    ├── train.py
    ├── train_unlikelihood.py
    └── utils.py


/README.assets/problem_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/README.assets/problem_example.png


--------------------------------------------------------------------------------
/README.assets/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/README.assets/results.png


--------------------------------------------------------------------------------
/README.assets/visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/README.assets/visualization.png


--------------------------------------------------------------------------------
/README.assets/visualization_post.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/README.assets/visualization_post.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/README.md


--------------------------------------------------------------------------------
/code/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/README.md


--------------------------------------------------------------------------------
/code/chat_templates/chatml.jinja:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/chat_templates/chatml.jinja


--------------------------------------------------------------------------------
/code/chat_templates/llama-2-chat.jinja:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/chat_templates/llama-2-chat.jinja


--------------------------------------------------------------------------------
/code/chat_templates/mistral-instruct.jinja:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/chat_templates/mistral-instruct.jinja


--------------------------------------------------------------------------------
/code/chat_templates/openchat.jinja:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/chat_templates/openchat.jinja


--------------------------------------------------------------------------------
/code/chat_templates/vicuna.jinja:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/chat_templates/vicuna.jinja


--------------------------------------------------------------------------------
/code/compare_pca_harmfulness_boundary.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/compare_pca_harmfulness_boundary.py


--------------------------------------------------------------------------------
/code/compare_pca_refusal_boundary.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/compare_pca_refusal_boundary.py


--------------------------------------------------------------------------------
/code/compare_pca_soft_harmfulness.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/compare_pca_soft_harmfulness.py


--------------------------------------------------------------------------------
/code/compare_pca_soft_refusal.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/compare_pca_soft_refusal.py


--------------------------------------------------------------------------------
/code/comparisons/pca/all_harmfulness_boundary_custom_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/comparisons/pca/all_harmfulness_boundary_custom_sampling.pdf


--------------------------------------------------------------------------------
/code/comparisons/pca/all_refusal_boundary_custom_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/comparisons/pca/all_refusal_boundary_custom_sampling.pdf


--------------------------------------------------------------------------------
/code/comparisons/pca/all_second_custom_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/comparisons/pca/all_second_custom_sampling.pdf


--------------------------------------------------------------------------------
/code/comparisons/pca/all_third_custom_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/comparisons/pca/all_third_custom_sampling.pdf


--------------------------------------------------------------------------------
/code/comparisons/pca_soft/all_soft_harmfulness_advbench_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/comparisons/pca_soft/all_soft_harmfulness_advbench_sampling.pdf


--------------------------------------------------------------------------------
/code/comparisons/pca_soft/all_soft_harmfulness_malicious_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/comparisons/pca_soft/all_soft_harmfulness_malicious_sampling.pdf


--------------------------------------------------------------------------------
/code/comparisons/pca_soft/all_soft_refusal_advbench_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/comparisons/pca_soft/all_soft_refusal_advbench_sampling.pdf


--------------------------------------------------------------------------------
/code/comparisons/pca_soft/all_soft_refusal_malicious_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/comparisons/pca_soft/all_soft_refusal_malicious_sampling.pdf


--------------------------------------------------------------------------------
/code/data/MaliciousInstruct.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/data/MaliciousInstruct.txt


--------------------------------------------------------------------------------
/code/data/advbench.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/data/advbench.txt


--------------------------------------------------------------------------------
/code/data/custom.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/data/custom.txt


--------------------------------------------------------------------------------
/code/data/gcg.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/data/gcg.json


--------------------------------------------------------------------------------
/code/data_harmless/custom.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/data_harmless/custom.txt


--------------------------------------------------------------------------------
/code/data_harmless/testset.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/data_harmless/testset.txt


--------------------------------------------------------------------------------
/code/estimate.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/estimate.py


--------------------------------------------------------------------------------
/code/evaluate.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/evaluate.py


--------------------------------------------------------------------------------
/code/forward.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/forward.py


--------------------------------------------------------------------------------
/code/forward_with_soft.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/forward_with_soft.py


--------------------------------------------------------------------------------
/code/generate.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/generate.py


--------------------------------------------------------------------------------
/code/generation_configs/llama-2-chat.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/generation_configs/llama-2-chat.json


--------------------------------------------------------------------------------
/code/generation_configs/mistral-instruct.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/generation_configs/mistral-instruct.json


--------------------------------------------------------------------------------
/code/generation_configs/openchat.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/generation_configs/openchat.json


--------------------------------------------------------------------------------
/code/generation_configs/orca-2.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/generation_configs/orca-2.json


--------------------------------------------------------------------------------
/code/generation_configs/vicuna.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/generation_configs/vicuna.json


--------------------------------------------------------------------------------
/code/scripts/compare_gather.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/compare_gather.sh


--------------------------------------------------------------------------------
/code/scripts/forward.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/forward.sh


--------------------------------------------------------------------------------
/code/scripts/forward_harmless.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/forward_harmless.sh


--------------------------------------------------------------------------------
/code/scripts/run_meta.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/run_meta.sh


--------------------------------------------------------------------------------
/code/scripts/run_meta_harmless.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/run_meta_harmless.sh


--------------------------------------------------------------------------------
/code/scripts/run_mistral-v1.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/run_mistral-v1.sh


--------------------------------------------------------------------------------
/code/scripts/run_mistral-v1_harmless.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/run_mistral-v1_harmless.sh


--------------------------------------------------------------------------------
/code/scripts/train_meta.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/train_meta.sh


--------------------------------------------------------------------------------
/code/scripts/train_meta_unlikelihood.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/train_meta_unlikelihood.sh


--------------------------------------------------------------------------------
/code/scripts/train_mistral-v1.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/scripts/train_mistral-v1.sh


--------------------------------------------------------------------------------
/code/train.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/train.py


--------------------------------------------------------------------------------
/code/train_unlikelihood.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/train_unlikelihood.py


--------------------------------------------------------------------------------
/code/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chujiezheng/LLM-Safeguard/HEAD/code/utils.py


--------------------------------------------------------------------------------