├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── assets └── logo.jpg ├── configs ├── base_model_path.yaml └── infer_modelpath_datasetpath_setting.yaml ├── data ├── evaluation │ ├── advbench │ │ ├── SA.csv │ │ ├── forbidQI_final_first3topics_150Qs.csv │ │ ├── harmful_behaviors.csv │ │ ├── harmful_behaviors_custom.csv │ │ ├── harmful_strings.csv │ │ ├── harmless_150Qs_old.csv │ │ ├── safebench150_deepinception.csv │ │ ├── safebench150_deepinception_multilingual_cipherchat.csv │ │ ├── safebench150_multilingual.csv │ │ ├── safebench20_daily20.csv │ │ └── test_reveil_training_data.csv │ ├── overrefusal │ │ ├── abilityofharmfultopics.csv │ │ └── overrefusal.csv │ └── strongreject │ │ ├── LICENSE │ │ ├── README.md │ │ ├── files.sha256sum │ │ ├── output_part_1.csv │ │ ├── output_part_2.csv │ │ ├── output_part_3.csv │ │ ├── output_part_4.csv │ │ ├── output_part_5.csv │ │ ├── repeat.csv │ │ └── splitcsv.py └── training │ ├── SA │ ├── LICENSE │ ├── README.md │ ├── SA_10.csv │ └── files.sha256sum │ ├── emb_dis │ ├── LICENSE │ ├── README.md │ ├── beaverrefusedsafebench.csv │ ├── getsuccess0.py │ ├── harmless_daily200.csv │ ├── llamarefusedsafebench.csv │ ├── mistralrefusedsafebench.csv │ ├── qwen2refusedsafebench.csv │ ├── safebench150_harmless_150.csv │ ├── safebenchharmfulresponse.csv │ ├── safebenchharmfulresponse2nd.csv │ └── topics │ │ ├── beaverrefusedsafebench_updated_Hate_Speech.csv │ │ ├── beaverrefusedsafebench_updated_Illegal_Activity.csv │ │ ├── beaverrefusedsafebench_updated_Physical_Harm.csv │ │ ├── llamarefusedsafebench_Hate_Speech.csv │ │ ├── llamarefusedsafebench_Illegal_Activity.csv │ │ ├── llamarefusedsafebench_Physical_Harm.csv │ │ ├── mistralrefusedsafebench_updated_Hate_Speech.csv │ │ ├── mistralrefusedsafebench_updated_Illegal_Activity.csv │ │ └── mistralrefusedsafebench_updated_Physical_Harm.csv │ ├── exploitingGPT4api │ └── files.sha256sum │ ├── filtered_dataset │ ├── H100misclassifiedandrefused.csv │ └── SAmisclassifiedandrefused.csv │ └── originalharmfulemb │ ├── beaver_original_harmful_embeddings.pkl │ ├── llama13b_original_harmful_embeddings.pkl │ ├── llama_original_harmful_embeddings.pkl │ └── mistral-7b-it_original_harmful_embeddings.pkl ├── download.sh ├── requirements.txt ├── run.sh └── src ├── data_collator.py ├── dataset_utils.py ├── eval_harm.py ├── finetune.py ├── infer_harm.py ├── print_results.py ├── ssra.py ├── ssrd.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/.gitmodules -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/README.md -------------------------------------------------------------------------------- /assets/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/assets/logo.jpg -------------------------------------------------------------------------------- /configs/base_model_path.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/configs/base_model_path.yaml -------------------------------------------------------------------------------- /configs/infer_modelpath_datasetpath_setting.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/configs/infer_modelpath_datasetpath_setting.yaml -------------------------------------------------------------------------------- /data/evaluation/advbench/SA.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/SA.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/forbidQI_final_first3topics_150Qs.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/forbidQI_final_first3topics_150Qs.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/harmful_behaviors.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/harmful_behaviors.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/harmful_behaviors_custom.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/harmful_behaviors_custom.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/harmful_strings.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/harmful_strings.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/harmless_150Qs_old.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/harmless_150Qs_old.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/safebench150_deepinception.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/safebench150_deepinception.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/safebench150_deepinception_multilingual_cipherchat.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/safebench150_deepinception_multilingual_cipherchat.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/safebench150_multilingual.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/safebench150_multilingual.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/safebench20_daily20.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/safebench20_daily20.csv -------------------------------------------------------------------------------- /data/evaluation/advbench/test_reveil_training_data.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/advbench/test_reveil_training_data.csv -------------------------------------------------------------------------------- /data/evaluation/overrefusal/abilityofharmfultopics.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/overrefusal/abilityofharmfultopics.csv -------------------------------------------------------------------------------- /data/evaluation/overrefusal/overrefusal.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/overrefusal/overrefusal.csv -------------------------------------------------------------------------------- /data/evaluation/strongreject/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/LICENSE -------------------------------------------------------------------------------- /data/evaluation/strongreject/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/README.md -------------------------------------------------------------------------------- /data/evaluation/strongreject/files.sha256sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/files.sha256sum -------------------------------------------------------------------------------- /data/evaluation/strongreject/output_part_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/output_part_1.csv -------------------------------------------------------------------------------- /data/evaluation/strongreject/output_part_2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/output_part_2.csv -------------------------------------------------------------------------------- /data/evaluation/strongreject/output_part_3.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/output_part_3.csv -------------------------------------------------------------------------------- /data/evaluation/strongreject/output_part_4.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/output_part_4.csv -------------------------------------------------------------------------------- /data/evaluation/strongreject/output_part_5.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/output_part_5.csv -------------------------------------------------------------------------------- /data/evaluation/strongreject/repeat.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/repeat.csv -------------------------------------------------------------------------------- /data/evaluation/strongreject/splitcsv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/evaluation/strongreject/splitcsv.py -------------------------------------------------------------------------------- /data/training/SA/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/SA/LICENSE -------------------------------------------------------------------------------- /data/training/SA/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/SA/README.md -------------------------------------------------------------------------------- /data/training/SA/SA_10.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/SA/SA_10.csv -------------------------------------------------------------------------------- /data/training/SA/files.sha256sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/SA/files.sha256sum -------------------------------------------------------------------------------- /data/training/emb_dis/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/LICENSE -------------------------------------------------------------------------------- /data/training/emb_dis/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/README.md -------------------------------------------------------------------------------- /data/training/emb_dis/beaverrefusedsafebench.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/beaverrefusedsafebench.csv -------------------------------------------------------------------------------- /data/training/emb_dis/getsuccess0.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/getsuccess0.py -------------------------------------------------------------------------------- /data/training/emb_dis/harmless_daily200.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/harmless_daily200.csv -------------------------------------------------------------------------------- /data/training/emb_dis/llamarefusedsafebench.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/llamarefusedsafebench.csv -------------------------------------------------------------------------------- /data/training/emb_dis/mistralrefusedsafebench.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/mistralrefusedsafebench.csv -------------------------------------------------------------------------------- /data/training/emb_dis/qwen2refusedsafebench.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/qwen2refusedsafebench.csv -------------------------------------------------------------------------------- /data/training/emb_dis/safebench150_harmless_150.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/safebench150_harmless_150.csv -------------------------------------------------------------------------------- /data/training/emb_dis/safebenchharmfulresponse.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/safebenchharmfulresponse.csv -------------------------------------------------------------------------------- /data/training/emb_dis/safebenchharmfulresponse2nd.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/safebenchharmfulresponse2nd.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/beaverrefusedsafebench_updated_Hate_Speech.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/beaverrefusedsafebench_updated_Hate_Speech.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/beaverrefusedsafebench_updated_Illegal_Activity.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/beaverrefusedsafebench_updated_Illegal_Activity.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/beaverrefusedsafebench_updated_Physical_Harm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/beaverrefusedsafebench_updated_Physical_Harm.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/llamarefusedsafebench_Hate_Speech.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/llamarefusedsafebench_Hate_Speech.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/llamarefusedsafebench_Illegal_Activity.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/llamarefusedsafebench_Illegal_Activity.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/llamarefusedsafebench_Physical_Harm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/llamarefusedsafebench_Physical_Harm.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/mistralrefusedsafebench_updated_Hate_Speech.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/mistralrefusedsafebench_updated_Hate_Speech.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/mistralrefusedsafebench_updated_Illegal_Activity.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/mistralrefusedsafebench_updated_Illegal_Activity.csv -------------------------------------------------------------------------------- /data/training/emb_dis/topics/mistralrefusedsafebench_updated_Physical_Harm.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/emb_dis/topics/mistralrefusedsafebench_updated_Physical_Harm.csv -------------------------------------------------------------------------------- /data/training/exploitingGPT4api/files.sha256sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/exploitingGPT4api/files.sha256sum -------------------------------------------------------------------------------- /data/training/filtered_dataset/H100misclassifiedandrefused.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/filtered_dataset/H100misclassifiedandrefused.csv -------------------------------------------------------------------------------- /data/training/filtered_dataset/SAmisclassifiedandrefused.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/filtered_dataset/SAmisclassifiedandrefused.csv -------------------------------------------------------------------------------- /data/training/originalharmfulemb/beaver_original_harmful_embeddings.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/originalharmfulemb/beaver_original_harmful_embeddings.pkl -------------------------------------------------------------------------------- /data/training/originalharmfulemb/llama13b_original_harmful_embeddings.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/originalharmfulemb/llama13b_original_harmful_embeddings.pkl -------------------------------------------------------------------------------- /data/training/originalharmfulemb/llama_original_harmful_embeddings.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/originalharmfulemb/llama_original_harmful_embeddings.pkl -------------------------------------------------------------------------------- /data/training/originalharmfulemb/mistral-7b-it_original_harmful_embeddings.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/data/training/originalharmfulemb/mistral-7b-it_original_harmful_embeddings.pkl -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/download.sh -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/requirements.txt -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/run.sh -------------------------------------------------------------------------------- /src/data_collator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/data_collator.py -------------------------------------------------------------------------------- /src/dataset_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/dataset_utils.py -------------------------------------------------------------------------------- /src/eval_harm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/eval_harm.py -------------------------------------------------------------------------------- /src/finetune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/finetune.py -------------------------------------------------------------------------------- /src/infer_harm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/infer_harm.py -------------------------------------------------------------------------------- /src/print_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/print_results.py -------------------------------------------------------------------------------- /src/ssra.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/ssra.py -------------------------------------------------------------------------------- /src/ssrd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/ssrd.py -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CryptoAILab/misalignment/HEAD/src/utils.py --------------------------------------------------------------------------------