├── README.md ├── attack.sh ├── attack_control.py ├── attack_paraphrase.py ├── attack_suffix.py ├── bleuloss.py ├── cold_decoding.py ├── data ├── advbench.txt └── advbench │ ├── harmful_behaviors.csv │ ├── harmful_behaviors_custom.csv │ ├── harmful_behaviors_custom_keywords.csv │ └── read ├── decoding_control.py ├── decoding_paraphrase.py ├── decoding_suffix.py ├── environment.yaml ├── evaluate.py ├── evaluate.sh ├── evaluation ├── LICENSE ├── README.md ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ └── ensemble_scorer.cpython-39.pyc ├── bert_score │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── bert_score.cpython-39.pyc │ │ ├── score.cpython-39.pyc │ │ └── utils.cpython-39.pyc │ ├── bert_score.py │ ├── score.py │ └── utils.py ├── bleu │ ├── LICENSE │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── bleu.cpython-39.pyc │ │ └── bleu_scorer.cpython-39.pyc │ ├── bleu.py │ └── bleu_scorer.py ├── cider │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── cider.cpython-39.pyc │ │ └── cider_scorer.cpython-39.pyc │ ├── cider.py │ └── cider_scorer.py ├── ensemble_scorer.py ├── eval.py ├── gpt4-eval │ ├── aggregate.py │ ├── just_eval │ │ ├── __init__.py │ │ ├── _version.py │ │ ├── evaluate.py │ │ ├── reward_model.py │ │ ├── scripts │ │ │ └── tag_analysis.py │ │ └── utils.py │ ├── nc_data_process_gpt4eval.py │ ├── readme.md │ └── run_attack_eval.sh ├── meteor │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── meteor.cpython-39.pyc │ ├── meteor-1.5.jar │ ├── meteor.py │ └── meteor_nltk.py ├── rouge │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── rouge.cpython-39.pyc │ └── rouge.py └── turk_template │ └── gen_eval.html ├── imgs ├── COLD_attack_diagram.pdf ├── COLD_attack_diagram.png ├── attack_settings.pdf ├── attack_settings.png ├── selected_samples_2.pdf └── selected_samples_2.png ├── opt_util.py ├── requirements.txt └── util.py /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/README.md -------------------------------------------------------------------------------- /attack.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/attack.sh -------------------------------------------------------------------------------- /attack_control.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/attack_control.py -------------------------------------------------------------------------------- /attack_paraphrase.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/attack_paraphrase.py -------------------------------------------------------------------------------- /attack_suffix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/attack_suffix.py -------------------------------------------------------------------------------- /bleuloss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/bleuloss.py -------------------------------------------------------------------------------- /cold_decoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/cold_decoding.py -------------------------------------------------------------------------------- /data/advbench.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/data/advbench.txt -------------------------------------------------------------------------------- /data/advbench/harmful_behaviors.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/data/advbench/harmful_behaviors.csv -------------------------------------------------------------------------------- /data/advbench/harmful_behaviors_custom.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/data/advbench/harmful_behaviors_custom.csv -------------------------------------------------------------------------------- /data/advbench/harmful_behaviors_custom_keywords.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/data/advbench/harmful_behaviors_custom_keywords.csv -------------------------------------------------------------------------------- /data/advbench/read: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /decoding_control.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/decoding_control.py -------------------------------------------------------------------------------- /decoding_paraphrase.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/decoding_paraphrase.py -------------------------------------------------------------------------------- /decoding_suffix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/decoding_suffix.py -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/environment.yaml -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluate.py -------------------------------------------------------------------------------- /evaluate.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluate.sh -------------------------------------------------------------------------------- /evaluation/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/LICENSE -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/__pycache__/ensemble_scorer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/__pycache__/ensemble_scorer.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/bert_score/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/bert_score/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bert_score/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/bert_score/__pycache__/bert_score.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bert_score/__pycache__/bert_score.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/bert_score/__pycache__/score.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bert_score/__pycache__/score.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/bert_score/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bert_score/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/bert_score/bert_score.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bert_score/bert_score.py -------------------------------------------------------------------------------- /evaluation/bert_score/score.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bert_score/score.py -------------------------------------------------------------------------------- /evaluation/bert_score/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bert_score/utils.py -------------------------------------------------------------------------------- /evaluation/bleu/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bleu/LICENSE -------------------------------------------------------------------------------- /evaluation/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /evaluation/bleu/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bleu/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/bleu/__pycache__/bleu.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bleu/__pycache__/bleu.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/bleu/__pycache__/bleu_scorer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bleu/__pycache__/bleu_scorer.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/bleu/bleu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bleu/bleu.py -------------------------------------------------------------------------------- /evaluation/bleu/bleu_scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/bleu/bleu_scorer.py -------------------------------------------------------------------------------- /evaluation/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /evaluation/cider/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/cider/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/cider/__pycache__/cider.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/cider/__pycache__/cider.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/cider/__pycache__/cider_scorer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/cider/__pycache__/cider_scorer.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/cider/cider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/cider/cider.py -------------------------------------------------------------------------------- /evaluation/cider/cider_scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/cider/cider_scorer.py -------------------------------------------------------------------------------- /evaluation/ensemble_scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/ensemble_scorer.py -------------------------------------------------------------------------------- /evaluation/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/eval.py -------------------------------------------------------------------------------- /evaluation/gpt4-eval/aggregate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/aggregate.py -------------------------------------------------------------------------------- /evaluation/gpt4-eval/just_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/just_eval/__init__.py -------------------------------------------------------------------------------- /evaluation/gpt4-eval/just_eval/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" -------------------------------------------------------------------------------- /evaluation/gpt4-eval/just_eval/evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/just_eval/evaluate.py -------------------------------------------------------------------------------- /evaluation/gpt4-eval/just_eval/reward_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/just_eval/reward_model.py -------------------------------------------------------------------------------- /evaluation/gpt4-eval/just_eval/scripts/tag_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/just_eval/scripts/tag_analysis.py -------------------------------------------------------------------------------- /evaluation/gpt4-eval/just_eval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/just_eval/utils.py -------------------------------------------------------------------------------- /evaluation/gpt4-eval/nc_data_process_gpt4eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/nc_data_process_gpt4eval.py -------------------------------------------------------------------------------- /evaluation/gpt4-eval/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/readme.md -------------------------------------------------------------------------------- /evaluation/gpt4-eval/run_attack_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/gpt4-eval/run_attack_eval.sh -------------------------------------------------------------------------------- /evaluation/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /evaluation/meteor/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/meteor/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/meteor/__pycache__/meteor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/meteor/__pycache__/meteor.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- /evaluation/meteor/meteor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/meteor/meteor.py -------------------------------------------------------------------------------- /evaluation/meteor/meteor_nltk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/meteor/meteor_nltk.py -------------------------------------------------------------------------------- /evaluation/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /evaluation/rouge/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/rouge/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/rouge/__pycache__/rouge.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/rouge/__pycache__/rouge.cpython-39.pyc -------------------------------------------------------------------------------- /evaluation/rouge/rouge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/rouge/rouge.py -------------------------------------------------------------------------------- /evaluation/turk_template/gen_eval.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/evaluation/turk_template/gen_eval.html -------------------------------------------------------------------------------- /imgs/COLD_attack_diagram.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/imgs/COLD_attack_diagram.pdf -------------------------------------------------------------------------------- /imgs/COLD_attack_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/imgs/COLD_attack_diagram.png -------------------------------------------------------------------------------- /imgs/attack_settings.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/imgs/attack_settings.pdf -------------------------------------------------------------------------------- /imgs/attack_settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/imgs/attack_settings.png -------------------------------------------------------------------------------- /imgs/selected_samples_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/imgs/selected_samples_2.pdf -------------------------------------------------------------------------------- /imgs/selected_samples_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/imgs/selected_samples_2.png -------------------------------------------------------------------------------- /opt_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/opt_util.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/requirements.txt -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Fangxu/COLD-Attack/HEAD/util.py --------------------------------------------------------------------------------