├── .gitignore ├── README.md ├── data ├── ft_full_data │ ├── harmful_100.json │ ├── harmful_100_positive_input.json │ ├── harmful_100_positive_output.json │ ├── harmful_100_positive_output.jsonl │ ├── uncensored_test100.json │ ├── uncensored_test100_outputs.json │ └── vicuna_format │ │ ├── dev.json │ │ └── train.json └── normal_data.json ├── evaluation ├── attack_metrics.py ├── change_prompt_format.py ├── evaluate_all.sh ├── get_ppl.py ├── input_prompts │ ├── vicuna_test100.json │ ├── vicuna_test100_ood.json │ ├── vicuna_test100_rephrase.json │ ├── vicuna_test2000_new.json │ ├── vicuna_test2000_ood_new.json │ ├── vicuna_test2000_rephrase_new.json │ ├── vicuna_test217.json │ └── vicuna_test4340_wildattack_new.json ├── ppl_inputs │ ├── vicuna_test100_new.json │ ├── vicuna_test100_ood.json │ └── vicuna_test217_new.json ├── score_rules_v2.txt ├── score_shieldlm.py └── score_shieldlm.sh ├── ft_code ├── config.py ├── construct_positive_data.py ├── construct_training_data.py ├── data_helper.py ├── ds_config_hf.json ├── run_decoderonly_hf.sh ├── train_decoderonly_hf.py └── trainers.py ├── gen_code ├── generate.py └── generate.sh ├── imgs ├── explation.png ├── testset.png └── train_sample_num.png ├── quality_evaluation ├── alpaca │ ├── gen_prompts │ │ └── vicuna.json │ └── gen_results │ │ ├── davinci003_output.json │ │ └── gpt-4-1106_output_withsystem_max1024.jsonl ├── alpaca_eval.py ├── compute_rouge.py ├── vicuna │ ├── gen_prompts │ │ └── vicuna.json │ ├── gen_results │ │ ├── gpt-4-1106_output_withsystem_max1024.json │ │ └── text-davinci-003_output.json │ ├── vicuna_blogeval_prompts_restrictformat.jsonl │ └── vicuna_blogeval_questions.jsonl ├── vicuna_eval.py ├── xstest.py └── xstest │ ├── gen_prompts │ └── vicuna.json │ └── xstest.json ├── requirements.txt └── utils ├── gpt_api.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/README.md -------------------------------------------------------------------------------- /data/ft_full_data/harmful_100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/ft_full_data/harmful_100.json -------------------------------------------------------------------------------- /data/ft_full_data/harmful_100_positive_input.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/ft_full_data/harmful_100_positive_input.json -------------------------------------------------------------------------------- /data/ft_full_data/harmful_100_positive_output.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/ft_full_data/harmful_100_positive_output.json -------------------------------------------------------------------------------- /data/ft_full_data/harmful_100_positive_output.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/ft_full_data/harmful_100_positive_output.jsonl -------------------------------------------------------------------------------- /data/ft_full_data/uncensored_test100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/ft_full_data/uncensored_test100.json -------------------------------------------------------------------------------- /data/ft_full_data/uncensored_test100_outputs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/ft_full_data/uncensored_test100_outputs.json -------------------------------------------------------------------------------- /data/ft_full_data/vicuna_format/dev.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/ft_full_data/vicuna_format/dev.json -------------------------------------------------------------------------------- /data/ft_full_data/vicuna_format/train.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/ft_full_data/vicuna_format/train.json -------------------------------------------------------------------------------- /data/normal_data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/data/normal_data.json -------------------------------------------------------------------------------- /evaluation/attack_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/attack_metrics.py -------------------------------------------------------------------------------- /evaluation/change_prompt_format.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/change_prompt_format.py -------------------------------------------------------------------------------- /evaluation/evaluate_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/evaluate_all.sh -------------------------------------------------------------------------------- /evaluation/get_ppl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/get_ppl.py -------------------------------------------------------------------------------- /evaluation/input_prompts/vicuna_test100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/input_prompts/vicuna_test100.json -------------------------------------------------------------------------------- /evaluation/input_prompts/vicuna_test100_ood.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/input_prompts/vicuna_test100_ood.json -------------------------------------------------------------------------------- /evaluation/input_prompts/vicuna_test100_rephrase.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/input_prompts/vicuna_test100_rephrase.json -------------------------------------------------------------------------------- /evaluation/input_prompts/vicuna_test2000_new.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/input_prompts/vicuna_test2000_new.json -------------------------------------------------------------------------------- /evaluation/input_prompts/vicuna_test2000_ood_new.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/input_prompts/vicuna_test2000_ood_new.json -------------------------------------------------------------------------------- /evaluation/input_prompts/vicuna_test2000_rephrase_new.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/input_prompts/vicuna_test2000_rephrase_new.json -------------------------------------------------------------------------------- /evaluation/input_prompts/vicuna_test217.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/input_prompts/vicuna_test217.json -------------------------------------------------------------------------------- /evaluation/input_prompts/vicuna_test4340_wildattack_new.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/input_prompts/vicuna_test4340_wildattack_new.json -------------------------------------------------------------------------------- /evaluation/ppl_inputs/vicuna_test100_new.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/ppl_inputs/vicuna_test100_new.json -------------------------------------------------------------------------------- /evaluation/ppl_inputs/vicuna_test100_ood.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/ppl_inputs/vicuna_test100_ood.json -------------------------------------------------------------------------------- /evaluation/ppl_inputs/vicuna_test217_new.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/ppl_inputs/vicuna_test217_new.json -------------------------------------------------------------------------------- /evaluation/score_rules_v2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/score_rules_v2.txt -------------------------------------------------------------------------------- /evaluation/score_shieldlm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/score_shieldlm.py -------------------------------------------------------------------------------- /evaluation/score_shieldlm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/evaluation/score_shieldlm.sh -------------------------------------------------------------------------------- /ft_code/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/ft_code/config.py -------------------------------------------------------------------------------- /ft_code/construct_positive_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/ft_code/construct_positive_data.py -------------------------------------------------------------------------------- /ft_code/construct_training_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/ft_code/construct_training_data.py -------------------------------------------------------------------------------- /ft_code/data_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/ft_code/data_helper.py -------------------------------------------------------------------------------- /ft_code/ds_config_hf.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/ft_code/ds_config_hf.json -------------------------------------------------------------------------------- /ft_code/run_decoderonly_hf.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/ft_code/run_decoderonly_hf.sh -------------------------------------------------------------------------------- /ft_code/train_decoderonly_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/ft_code/train_decoderonly_hf.py -------------------------------------------------------------------------------- /ft_code/trainers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/ft_code/trainers.py -------------------------------------------------------------------------------- /gen_code/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/gen_code/generate.py -------------------------------------------------------------------------------- /gen_code/generate.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/gen_code/generate.sh -------------------------------------------------------------------------------- /imgs/explation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/imgs/explation.png -------------------------------------------------------------------------------- /imgs/testset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/imgs/testset.png -------------------------------------------------------------------------------- /imgs/train_sample_num.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/imgs/train_sample_num.png -------------------------------------------------------------------------------- /quality_evaluation/alpaca/gen_prompts/vicuna.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/alpaca/gen_prompts/vicuna.json -------------------------------------------------------------------------------- /quality_evaluation/alpaca/gen_results/davinci003_output.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/alpaca/gen_results/davinci003_output.json -------------------------------------------------------------------------------- /quality_evaluation/alpaca/gen_results/gpt-4-1106_output_withsystem_max1024.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/alpaca/gen_results/gpt-4-1106_output_withsystem_max1024.jsonl -------------------------------------------------------------------------------- /quality_evaluation/alpaca_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/alpaca_eval.py -------------------------------------------------------------------------------- /quality_evaluation/compute_rouge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/compute_rouge.py -------------------------------------------------------------------------------- /quality_evaluation/vicuna/gen_prompts/vicuna.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/vicuna/gen_prompts/vicuna.json -------------------------------------------------------------------------------- /quality_evaluation/vicuna/gen_results/gpt-4-1106_output_withsystem_max1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/vicuna/gen_results/gpt-4-1106_output_withsystem_max1024.json -------------------------------------------------------------------------------- /quality_evaluation/vicuna/gen_results/text-davinci-003_output.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/vicuna/gen_results/text-davinci-003_output.json -------------------------------------------------------------------------------- /quality_evaluation/vicuna/vicuna_blogeval_prompts_restrictformat.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/vicuna/vicuna_blogeval_prompts_restrictformat.jsonl -------------------------------------------------------------------------------- /quality_evaluation/vicuna/vicuna_blogeval_questions.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/vicuna/vicuna_blogeval_questions.jsonl -------------------------------------------------------------------------------- /quality_evaluation/vicuna_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/vicuna_eval.py -------------------------------------------------------------------------------- /quality_evaluation/xstest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/xstest.py -------------------------------------------------------------------------------- /quality_evaluation/xstest/gen_prompts/vicuna.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/xstest/gen_prompts/vicuna.json -------------------------------------------------------------------------------- /quality_evaluation/xstest/xstest.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/quality_evaluation/xstest/xstest.json -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/requirements.txt -------------------------------------------------------------------------------- /utils/gpt_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/utils/gpt_api.py -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-coai/SafeUnlearning/HEAD/utils/utils.py --------------------------------------------------------------------------------