├── .gitignore ├── LICENSE ├── README.md ├── adv_train.py ├── bash scripts ├── autodan.sh ├── autodan2.sh ├── grad_ec_safe.sh ├── greedy_ec_safe.sh ├── greedy_train.sh ├── jobs_gcg.sh ├── jobs_grad_ec.sh ├── jobs_greedy_ec.sh ├── jobs_harmful.sh ├── jobs_infusion.sh ├── jobs_insertion.sh ├── jobs_rand_ec.sh ├── jobs_roc.sh ├── jobs_smoothing.sh ├── jobs_suffix.sh └── train_safety_clf.sh ├── data ├── harmful_prompts.txt ├── harmful_prompts_test.txt ├── harmful_prompts_train.txt ├── safe_prompts.txt ├── safe_prompts_test.txt ├── safe_prompts_test_infusion_erased.txt ├── safe_prompts_test_insertion_erased.txt ├── safe_prompts_test_suffix_erased.txt ├── safe_prompts_train.txt ├── safe_prompts_train_infusion_erased.txt ├── safe_prompts_train_insertion_erased.txt └── safe_prompts_train_suffix_erased.txt ├── defenses.py ├── figures ├── adversarial_attack.png ├── erase-and-check.png └── harmful_prompt.png ├── gcg.py ├── grad_ec.py ├── greedy_ec.py ├── greedy_grad_ec.py ├── greedy_train.py ├── main.py ├── plot scripts ├── plot.py ├── plot_acc.py ├── plot_acc_multi.py ├── plot_empirical.py ├── plot_ph_time.py ├── plot_roc.py ├── plot_smoothing.py ├── plot_time.py └── plot_time_multi.py ├── prompt_dataset.py ├── results ├── AutoDAN-HGA │ ├── DistilBERT │ │ └── empirical_suffix_200_clf.json │ ├── GreedyEC │ │ ├── greedy_ec_200_clf.json │ │ └── greedy_ec_2_clf.json │ ├── Llama-2 │ │ └── empirical_suffix_200.json │ ├── RandEC │ │ ├── empirical_suffix_200_rand.json │ │ └── empirical_suffix_2_rand.json │ └── empirical_suffix_2.json ├── GPT │ ├── harmful_20.json │ ├── harmful_520.json │ ├── safe_suffix_10.json │ └── safe_suffix_100.json ├── Llama-2-13B │ ├── harmful_520.json │ └── safe_suffix_520.json ├── Llama-2 │ ├── harmful_520.json │ └── safe_suffix_520.json ├── Llama-3 │ ├── harmful_520.json │ ├── safe_suffix_200.json │ └── safe_suffix_520.json ├── comparison_safe_infusion.json ├── comparison_safe_infusion_acc.png ├── comparison_safe_infusion_time.png ├── comparison_safe_insertion.json ├── comparison_safe_insertion_acc.png ├── comparison_safe_insertion_time.png ├── comparison_safe_suffix.json ├── comparison_safe_suffix_acc.png ├── comparison_safe_suffix_time.png ├── empirical_comparison.json ├── empirical_comparison_acc.png ├── empirical_comparison_time.png ├── empirical_suffix_100_clf_rand.json ├── empirical_suffix_120_clf_rand.json ├── empirical_suffix_120_clf_rand.png ├── empirical_suffix_260.json ├── grad_ec_120_clf.json ├── grad_ec_120_clf.png ├── grad_ec_results.json ├── grad_ec_safe_suffix.json ├── greedy_ec_120_clf.json ├── greedy_ec_120_clf.png ├── greedy_ec_results.json ├── greedy_ec_safe_suffix.json ├── greedy_ec_safe_suffix.png ├── harmful_1000.json ├── harmful_120_clf.json ├── harmful_2.json ├── harmful_20.json ├── harmful_500.json ├── roc │ ├── roc_curve_100_clf.json │ ├── roc_curve_100_clf.png │ ├── roc_curve_120_clf.json │ ├── roc_curve_120_clf.png │ ├── roc_curve_20_clf.json │ ├── roc_curve_20_clf.png │ ├── roc_curve_40_clf.json │ ├── roc_curve_40_clf.png │ ├── roc_curve_60_clf.json │ ├── roc_curve_60_clf.png │ ├── roc_curve_80_clf.json │ └── roc_curve_80_clf.png ├── safe_infusion_100.json ├── safe_infusion_100_acc.png ├── safe_infusion_100_time.png ├── safe_infusion_120_clf.json ├── safe_infusion_30.json ├── safe_infusion_60_clf.json ├── safe_insertion_100.json ├── safe_insertion_100_acc.png ├── safe_insertion_100_clf.json ├── safe_insertion_100_time.png ├── safe_insertion_120_clf.json ├── safe_insertion_120_clf_acc.png ├── safe_insertion_120_clf_time.png ├── safe_insertion_200.json ├── safe_insertion_200_acc.png ├── safe_insertion_200_time.png ├── safe_insertion_30.json ├── safe_suffix_120.json ├── safe_suffix_120_clf.json ├── safe_suffix_2.json ├── safe_suffix_200.json ├── safe_suffix_200.png ├── safe_suffix_200_acc.png ├── safe_suffix_200_acc_line.png ├── safe_suffix_200_old.json ├── safe_suffix_200_time.png ├── safe_suffix_300.json ├── safe_suffix_400.json ├── safe_suffix_520.json ├── safe_suffix_520_acc.png ├── safe_suffix_520_time.png ├── smoothing_10.json ├── smoothing_10.png ├── smoothing_50.json └── smoothing_50.png ├── safety_classifier.py └── token_stats.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/README.md -------------------------------------------------------------------------------- /adv_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/adv_train.py -------------------------------------------------------------------------------- /bash scripts/autodan.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/autodan.sh -------------------------------------------------------------------------------- /bash scripts/autodan2.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/autodan2.sh -------------------------------------------------------------------------------- /bash scripts/grad_ec_safe.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/grad_ec_safe.sh -------------------------------------------------------------------------------- /bash scripts/greedy_ec_safe.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/greedy_ec_safe.sh -------------------------------------------------------------------------------- /bash scripts/greedy_train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/greedy_train.sh -------------------------------------------------------------------------------- /bash scripts/jobs_gcg.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_gcg.sh -------------------------------------------------------------------------------- /bash scripts/jobs_grad_ec.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_grad_ec.sh -------------------------------------------------------------------------------- /bash scripts/jobs_greedy_ec.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_greedy_ec.sh -------------------------------------------------------------------------------- /bash scripts/jobs_harmful.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_harmful.sh -------------------------------------------------------------------------------- /bash scripts/jobs_infusion.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_infusion.sh -------------------------------------------------------------------------------- /bash scripts/jobs_insertion.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_insertion.sh -------------------------------------------------------------------------------- /bash scripts/jobs_rand_ec.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_rand_ec.sh -------------------------------------------------------------------------------- /bash scripts/jobs_roc.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_roc.sh -------------------------------------------------------------------------------- /bash scripts/jobs_smoothing.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_smoothing.sh -------------------------------------------------------------------------------- /bash scripts/jobs_suffix.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/jobs_suffix.sh -------------------------------------------------------------------------------- /bash scripts/train_safety_clf.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/bash scripts/train_safety_clf.sh -------------------------------------------------------------------------------- /data/harmful_prompts.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/harmful_prompts.txt -------------------------------------------------------------------------------- /data/harmful_prompts_test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/harmful_prompts_test.txt -------------------------------------------------------------------------------- /data/harmful_prompts_train.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/harmful_prompts_train.txt -------------------------------------------------------------------------------- /data/safe_prompts.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts.txt -------------------------------------------------------------------------------- /data/safe_prompts_test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts_test.txt -------------------------------------------------------------------------------- /data/safe_prompts_test_infusion_erased.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts_test_infusion_erased.txt -------------------------------------------------------------------------------- /data/safe_prompts_test_insertion_erased.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts_test_insertion_erased.txt -------------------------------------------------------------------------------- /data/safe_prompts_test_suffix_erased.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts_test_suffix_erased.txt -------------------------------------------------------------------------------- /data/safe_prompts_train.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts_train.txt -------------------------------------------------------------------------------- /data/safe_prompts_train_infusion_erased.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts_train_infusion_erased.txt -------------------------------------------------------------------------------- /data/safe_prompts_train_insertion_erased.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts_train_insertion_erased.txt -------------------------------------------------------------------------------- /data/safe_prompts_train_suffix_erased.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/data/safe_prompts_train_suffix_erased.txt -------------------------------------------------------------------------------- /defenses.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/defenses.py -------------------------------------------------------------------------------- /figures/adversarial_attack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/figures/adversarial_attack.png -------------------------------------------------------------------------------- /figures/erase-and-check.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/figures/erase-and-check.png -------------------------------------------------------------------------------- /figures/harmful_prompt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/figures/harmful_prompt.png -------------------------------------------------------------------------------- /gcg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/gcg.py -------------------------------------------------------------------------------- /grad_ec.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/grad_ec.py -------------------------------------------------------------------------------- /greedy_ec.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/greedy_ec.py -------------------------------------------------------------------------------- /greedy_grad_ec.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/greedy_grad_ec.py -------------------------------------------------------------------------------- /greedy_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/greedy_train.py -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/main.py -------------------------------------------------------------------------------- /plot scripts/plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot.py -------------------------------------------------------------------------------- /plot scripts/plot_acc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot_acc.py -------------------------------------------------------------------------------- /plot scripts/plot_acc_multi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot_acc_multi.py -------------------------------------------------------------------------------- /plot scripts/plot_empirical.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot_empirical.py -------------------------------------------------------------------------------- /plot scripts/plot_ph_time.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot_ph_time.py -------------------------------------------------------------------------------- /plot scripts/plot_roc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot_roc.py -------------------------------------------------------------------------------- /plot scripts/plot_smoothing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot_smoothing.py -------------------------------------------------------------------------------- /plot scripts/plot_time.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot_time.py -------------------------------------------------------------------------------- /plot scripts/plot_time_multi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/plot scripts/plot_time_multi.py -------------------------------------------------------------------------------- /prompt_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/prompt_dataset.py -------------------------------------------------------------------------------- /results/AutoDAN-HGA/DistilBERT/empirical_suffix_200_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/AutoDAN-HGA/DistilBERT/empirical_suffix_200_clf.json -------------------------------------------------------------------------------- /results/AutoDAN-HGA/GreedyEC/greedy_ec_200_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/AutoDAN-HGA/GreedyEC/greedy_ec_200_clf.json -------------------------------------------------------------------------------- /results/AutoDAN-HGA/GreedyEC/greedy_ec_2_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/AutoDAN-HGA/GreedyEC/greedy_ec_2_clf.json -------------------------------------------------------------------------------- /results/AutoDAN-HGA/Llama-2/empirical_suffix_200.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/AutoDAN-HGA/Llama-2/empirical_suffix_200.json -------------------------------------------------------------------------------- /results/AutoDAN-HGA/RandEC/empirical_suffix_200_rand.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/AutoDAN-HGA/RandEC/empirical_suffix_200_rand.json -------------------------------------------------------------------------------- /results/AutoDAN-HGA/RandEC/empirical_suffix_2_rand.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/AutoDAN-HGA/RandEC/empirical_suffix_2_rand.json -------------------------------------------------------------------------------- /results/AutoDAN-HGA/empirical_suffix_2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/AutoDAN-HGA/empirical_suffix_2.json -------------------------------------------------------------------------------- /results/GPT/harmful_20.json: -------------------------------------------------------------------------------- 1 | { 2 | "percent_harmful": 100.0 3 | } -------------------------------------------------------------------------------- /results/GPT/harmful_520.json: -------------------------------------------------------------------------------- 1 | { 2 | "percent_harmful": 100.0 3 | } -------------------------------------------------------------------------------- /results/GPT/safe_suffix_10.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/GPT/safe_suffix_10.json -------------------------------------------------------------------------------- /results/GPT/safe_suffix_100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/GPT/safe_suffix_100.json -------------------------------------------------------------------------------- /results/Llama-2-13B/harmful_520.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/Llama-2-13B/harmful_520.json -------------------------------------------------------------------------------- /results/Llama-2-13B/safe_suffix_520.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/Llama-2-13B/safe_suffix_520.json -------------------------------------------------------------------------------- /results/Llama-2/harmful_520.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/Llama-2/harmful_520.json -------------------------------------------------------------------------------- /results/Llama-2/safe_suffix_520.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/Llama-2/safe_suffix_520.json -------------------------------------------------------------------------------- /results/Llama-3/harmful_520.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/Llama-3/harmful_520.json -------------------------------------------------------------------------------- /results/Llama-3/safe_suffix_200.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/Llama-3/safe_suffix_200.json -------------------------------------------------------------------------------- /results/Llama-3/safe_suffix_520.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/Llama-3/safe_suffix_520.json -------------------------------------------------------------------------------- /results/comparison_safe_infusion.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_infusion.json -------------------------------------------------------------------------------- /results/comparison_safe_infusion_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_infusion_acc.png -------------------------------------------------------------------------------- /results/comparison_safe_infusion_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_infusion_time.png -------------------------------------------------------------------------------- /results/comparison_safe_insertion.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_insertion.json -------------------------------------------------------------------------------- /results/comparison_safe_insertion_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_insertion_acc.png -------------------------------------------------------------------------------- /results/comparison_safe_insertion_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_insertion_time.png -------------------------------------------------------------------------------- /results/comparison_safe_suffix.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_suffix.json -------------------------------------------------------------------------------- /results/comparison_safe_suffix_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_suffix_acc.png -------------------------------------------------------------------------------- /results/comparison_safe_suffix_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/comparison_safe_suffix_time.png -------------------------------------------------------------------------------- /results/empirical_comparison.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/empirical_comparison.json -------------------------------------------------------------------------------- /results/empirical_comparison_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/empirical_comparison_acc.png -------------------------------------------------------------------------------- /results/empirical_comparison_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/empirical_comparison_time.png -------------------------------------------------------------------------------- /results/empirical_suffix_100_clf_rand.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/empirical_suffix_100_clf_rand.json -------------------------------------------------------------------------------- /results/empirical_suffix_120_clf_rand.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/empirical_suffix_120_clf_rand.json -------------------------------------------------------------------------------- /results/empirical_suffix_120_clf_rand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/empirical_suffix_120_clf_rand.png -------------------------------------------------------------------------------- /results/empirical_suffix_260.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/empirical_suffix_260.json -------------------------------------------------------------------------------- /results/grad_ec_120_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/grad_ec_120_clf.json -------------------------------------------------------------------------------- /results/grad_ec_120_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/grad_ec_120_clf.png -------------------------------------------------------------------------------- /results/grad_ec_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/grad_ec_results.json -------------------------------------------------------------------------------- /results/grad_ec_safe_suffix.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/grad_ec_safe_suffix.json -------------------------------------------------------------------------------- /results/greedy_ec_120_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/greedy_ec_120_clf.json -------------------------------------------------------------------------------- /results/greedy_ec_120_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/greedy_ec_120_clf.png -------------------------------------------------------------------------------- /results/greedy_ec_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/greedy_ec_results.json -------------------------------------------------------------------------------- /results/greedy_ec_safe_suffix.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/greedy_ec_safe_suffix.json -------------------------------------------------------------------------------- /results/greedy_ec_safe_suffix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/greedy_ec_safe_suffix.png -------------------------------------------------------------------------------- /results/harmful_1000.json: -------------------------------------------------------------------------------- 1 | { 2 | "percent_harmful": 94.0 3 | } -------------------------------------------------------------------------------- /results/harmful_120_clf.json: -------------------------------------------------------------------------------- 1 | { 2 | "percent_harmful": 100.0 3 | } -------------------------------------------------------------------------------- /results/harmful_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "percent_harmful": 20.0 3 | } -------------------------------------------------------------------------------- /results/harmful_20.json: -------------------------------------------------------------------------------- 1 | { 2 | "percent_harmful": 100.0 3 | } -------------------------------------------------------------------------------- /results/harmful_500.json: -------------------------------------------------------------------------------- 1 | { 2 | "percent_harmful": 92.2 3 | } -------------------------------------------------------------------------------- /results/roc/roc_curve_100_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_100_clf.json -------------------------------------------------------------------------------- /results/roc/roc_curve_100_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_100_clf.png -------------------------------------------------------------------------------- /results/roc/roc_curve_120_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_120_clf.json -------------------------------------------------------------------------------- /results/roc/roc_curve_120_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_120_clf.png -------------------------------------------------------------------------------- /results/roc/roc_curve_20_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_20_clf.json -------------------------------------------------------------------------------- /results/roc/roc_curve_20_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_20_clf.png -------------------------------------------------------------------------------- /results/roc/roc_curve_40_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_40_clf.json -------------------------------------------------------------------------------- /results/roc/roc_curve_40_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_40_clf.png -------------------------------------------------------------------------------- /results/roc/roc_curve_60_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_60_clf.json -------------------------------------------------------------------------------- /results/roc/roc_curve_60_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_60_clf.png -------------------------------------------------------------------------------- /results/roc/roc_curve_80_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_80_clf.json -------------------------------------------------------------------------------- /results/roc/roc_curve_80_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/roc/roc_curve_80_clf.png -------------------------------------------------------------------------------- /results/safe_infusion_100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_infusion_100.json -------------------------------------------------------------------------------- /results/safe_infusion_100_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_infusion_100_acc.png -------------------------------------------------------------------------------- /results/safe_infusion_100_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_infusion_100_time.png -------------------------------------------------------------------------------- /results/safe_infusion_120_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_infusion_120_clf.json -------------------------------------------------------------------------------- /results/safe_infusion_30.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_infusion_30.json -------------------------------------------------------------------------------- /results/safe_infusion_60_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_infusion_60_clf.json -------------------------------------------------------------------------------- /results/safe_insertion_100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_100.json -------------------------------------------------------------------------------- /results/safe_insertion_100_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_100_acc.png -------------------------------------------------------------------------------- /results/safe_insertion_100_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_100_clf.json -------------------------------------------------------------------------------- /results/safe_insertion_100_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_100_time.png -------------------------------------------------------------------------------- /results/safe_insertion_120_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_120_clf.json -------------------------------------------------------------------------------- /results/safe_insertion_120_clf_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_120_clf_acc.png -------------------------------------------------------------------------------- /results/safe_insertion_120_clf_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_120_clf_time.png -------------------------------------------------------------------------------- /results/safe_insertion_200.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_200.json -------------------------------------------------------------------------------- /results/safe_insertion_200_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_200_acc.png -------------------------------------------------------------------------------- /results/safe_insertion_200_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_200_time.png -------------------------------------------------------------------------------- /results/safe_insertion_30.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_insertion_30.json -------------------------------------------------------------------------------- /results/safe_suffix_120.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_120.json -------------------------------------------------------------------------------- /results/safe_suffix_120_clf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_120_clf.json -------------------------------------------------------------------------------- /results/safe_suffix_2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_2.json -------------------------------------------------------------------------------- /results/safe_suffix_200.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_200.json -------------------------------------------------------------------------------- /results/safe_suffix_200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_200.png -------------------------------------------------------------------------------- /results/safe_suffix_200_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_200_acc.png -------------------------------------------------------------------------------- /results/safe_suffix_200_acc_line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_200_acc_line.png -------------------------------------------------------------------------------- /results/safe_suffix_200_old.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_200_old.json -------------------------------------------------------------------------------- /results/safe_suffix_200_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_200_time.png -------------------------------------------------------------------------------- /results/safe_suffix_300.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_300.json -------------------------------------------------------------------------------- /results/safe_suffix_400.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_400.json -------------------------------------------------------------------------------- /results/safe_suffix_520.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_520.json -------------------------------------------------------------------------------- /results/safe_suffix_520_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_520_acc.png -------------------------------------------------------------------------------- /results/safe_suffix_520_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/safe_suffix_520_time.png -------------------------------------------------------------------------------- /results/smoothing_10.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/smoothing_10.json -------------------------------------------------------------------------------- /results/smoothing_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/smoothing_10.png -------------------------------------------------------------------------------- /results/smoothing_50.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/smoothing_50.json -------------------------------------------------------------------------------- /results/smoothing_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/results/smoothing_50.png -------------------------------------------------------------------------------- /safety_classifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/safety_classifier.py -------------------------------------------------------------------------------- /token_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aounon/certified-llm-safety/HEAD/token_stats.py --------------------------------------------------------------------------------