├── .gitignore ├── README.md ├── env.yml ├── scripts ├── benchmark_model.sh ├── benchmark_prefixes.sh ├── generate_pruning_datasets.sh ├── run_enhanced_gcg.sh ├── run_evaluate_perturbations.sh ├── run_finetuning.sh ├── run_finetuning_mc.sh ├── run_logit_lens.sh ├── run_orthogonalization.sh ├── run_ppl_experiment.sh ├── run_ppl_experiment_for_perturbations.sh ├── run_set_difference_pruning.sh ├── train_dpo.sh └── train_npo.sh └── src ├── __init__.py ├── benchmarking ├── __init__.py ├── benchmark.py └── lm_harness_evaluator.py ├── dpo ├── __init__.py ├── accelerate │ ├── deepspeed2.yaml │ ├── deepspeed3.yaml │ └── multi_gpu.yaml └── dpo.py ├── enhanced_gcg ├── __init__.py ├── adv_prefixes.csv ├── env.yml ├── flrt_repo │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── demo.ipynb │ ├── demo.py │ ├── flrt │ │ ├── attack.py │ │ ├── finetune.py │ │ ├── internal.py │ │ ├── judge.py │ │ ├── modal_defs.py │ │ ├── modal_download.py │ │ ├── objective.py │ │ ├── operators.py │ │ ├── templates.py │ │ ├── translate.py │ │ ├── util.py │ │ └── victim.py │ ├── poetry.lock │ └── pyproject.toml ├── measure_ppl.py └── test_adv_prefixes.py ├── env.yml ├── finetuning ├── __init__.py ├── copy_hf_models.py ├── env.yml └── finetune.py ├── logit_lens ├── __init__.py ├── logit_lens.py ├── logit_lens_utils.py └── plot_logit_lens.py ├── npo ├── __init__.py ├── accelerate │ ├── deepspeed2.yaml │ ├── deepspeed3.yaml │ └── multi_gpu.yaml ├── npo.py └── trainer.py ├── orthogonalization ├── __init__.py └── orthogonalization.py ├── perturbations ├── __init__.py ├── evaluate_perturbations.py ├── informed.py ├── measure_ppl.py └── naive.py ├── preference_dataset ├── __init__.py ├── generate_arc_mc_dataset.py ├── generate_forget_datasets.py ├── generate_retain_datasets.py ├── generate_wikitext_mc_datasets.py ├── playground.ipynb └── refusal_list.py ├── set_difference_pruning ├── __init__.py ├── alignment-attribution-code │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── data │ │ ├── SFT_aligned_llama2-7b-chat-hf_train.csv │ │ ├── SFT_aligned_llama2-7b-chat-hf_train_short.csv │ │ ├── advbench.txt │ │ ├── alpaca_cleaned_no_safety_train.csv │ │ ├── probing_result_13b.json │ │ └── probing_result_7b.json │ ├── default_config.yaml │ ├── lib │ │ ├── ablate.py │ │ ├── data.py │ │ ├── eval.py │ │ ├── experiment.py │ │ ├── finetune_module.py │ │ ├── layerwrapper.py │ │ ├── model_wrapper.py │ │ ├── model_wrapper_low.py │ │ ├── prompt_utils.py │ │ ├── prune.py │ │ └── sparsegpt.py │ ├── main.py │ ├── main_low_rank.py │ ├── main_low_rank_diff.py │ ├── run_gridsearch.py │ └── run_iterative.py └── generate_datasets.py └── util ├── __init__.py ├── data.py ├── globals.py └── helpers.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/README.md -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/env.yml -------------------------------------------------------------------------------- /scripts/benchmark_model.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/benchmark_model.sh -------------------------------------------------------------------------------- /scripts/benchmark_prefixes.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/benchmark_prefixes.sh -------------------------------------------------------------------------------- /scripts/generate_pruning_datasets.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/generate_pruning_datasets.sh -------------------------------------------------------------------------------- /scripts/run_enhanced_gcg.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_enhanced_gcg.sh -------------------------------------------------------------------------------- /scripts/run_evaluate_perturbations.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_evaluate_perturbations.sh -------------------------------------------------------------------------------- /scripts/run_finetuning.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_finetuning.sh -------------------------------------------------------------------------------- /scripts/run_finetuning_mc.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_finetuning_mc.sh -------------------------------------------------------------------------------- /scripts/run_logit_lens.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_logit_lens.sh -------------------------------------------------------------------------------- /scripts/run_orthogonalization.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_orthogonalization.sh -------------------------------------------------------------------------------- /scripts/run_ppl_experiment.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_ppl_experiment.sh -------------------------------------------------------------------------------- /scripts/run_ppl_experiment_for_perturbations.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_ppl_experiment_for_perturbations.sh -------------------------------------------------------------------------------- /scripts/run_set_difference_pruning.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/run_set_difference_pruning.sh -------------------------------------------------------------------------------- /scripts/train_dpo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/train_dpo.sh -------------------------------------------------------------------------------- /scripts/train_npo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/scripts/train_npo.sh -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/benchmarking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/benchmarking/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/benchmarking/benchmark.py -------------------------------------------------------------------------------- /src/benchmarking/lm_harness_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/benchmarking/lm_harness_evaluator.py -------------------------------------------------------------------------------- /src/dpo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/dpo/accelerate/deepspeed2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/dpo/accelerate/deepspeed2.yaml -------------------------------------------------------------------------------- /src/dpo/accelerate/deepspeed3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/dpo/accelerate/deepspeed3.yaml -------------------------------------------------------------------------------- /src/dpo/accelerate/multi_gpu.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/dpo/accelerate/multi_gpu.yaml -------------------------------------------------------------------------------- /src/dpo/dpo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/dpo/dpo.py -------------------------------------------------------------------------------- /src/enhanced_gcg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/enhanced_gcg/adv_prefixes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/adv_prefixes.csv -------------------------------------------------------------------------------- /src/enhanced_gcg/env.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/env.yml -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | output 3 | __pycache__ 4 | wandb -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/LICENSE -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/README.md -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/demo.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/demo.ipynb -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/demo.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/attack.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/attack.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/finetune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/finetune.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/internal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/internal.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/judge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/judge.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/modal_defs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/modal_defs.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/modal_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/modal_download.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/objective.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/objective.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/operators.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/operators.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/templates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/templates.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/translate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/translate.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/util.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/flrt/victim.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/flrt/victim.py -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/poetry.lock -------------------------------------------------------------------------------- /src/enhanced_gcg/flrt_repo/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/flrt_repo/pyproject.toml -------------------------------------------------------------------------------- /src/enhanced_gcg/measure_ppl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/measure_ppl.py -------------------------------------------------------------------------------- /src/enhanced_gcg/test_adv_prefixes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/enhanced_gcg/test_adv_prefixes.py -------------------------------------------------------------------------------- /src/env.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/env.yml -------------------------------------------------------------------------------- /src/finetuning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/finetuning/copy_hf_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/finetuning/copy_hf_models.py -------------------------------------------------------------------------------- /src/finetuning/env.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/finetuning/env.yml -------------------------------------------------------------------------------- /src/finetuning/finetune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/finetuning/finetune.py -------------------------------------------------------------------------------- /src/logit_lens/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/logit_lens/logit_lens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/logit_lens/logit_lens.py -------------------------------------------------------------------------------- /src/logit_lens/logit_lens_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/logit_lens/logit_lens_utils.py -------------------------------------------------------------------------------- /src/logit_lens/plot_logit_lens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/logit_lens/plot_logit_lens.py -------------------------------------------------------------------------------- /src/npo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/npo/accelerate/deepspeed2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/npo/accelerate/deepspeed2.yaml -------------------------------------------------------------------------------- /src/npo/accelerate/deepspeed3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/npo/accelerate/deepspeed3.yaml -------------------------------------------------------------------------------- /src/npo/accelerate/multi_gpu.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/npo/accelerate/multi_gpu.yaml -------------------------------------------------------------------------------- /src/npo/npo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/npo/npo.py -------------------------------------------------------------------------------- /src/npo/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/npo/trainer.py -------------------------------------------------------------------------------- /src/orthogonalization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/orthogonalization/orthogonalization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/orthogonalization/orthogonalization.py -------------------------------------------------------------------------------- /src/perturbations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/perturbations/evaluate_perturbations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/perturbations/evaluate_perturbations.py -------------------------------------------------------------------------------- /src/perturbations/informed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/perturbations/informed.py -------------------------------------------------------------------------------- /src/perturbations/measure_ppl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/perturbations/measure_ppl.py -------------------------------------------------------------------------------- /src/perturbations/naive.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/perturbations/naive.py -------------------------------------------------------------------------------- /src/preference_dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/preference_dataset/generate_arc_mc_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/preference_dataset/generate_arc_mc_dataset.py -------------------------------------------------------------------------------- /src/preference_dataset/generate_forget_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/preference_dataset/generate_forget_datasets.py -------------------------------------------------------------------------------- /src/preference_dataset/generate_retain_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/preference_dataset/generate_retain_datasets.py -------------------------------------------------------------------------------- /src/preference_dataset/generate_wikitext_mc_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/preference_dataset/generate_wikitext_mc_datasets.py -------------------------------------------------------------------------------- /src/preference_dataset/playground.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/preference_dataset/playground.ipynb -------------------------------------------------------------------------------- /src/preference_dataset/refusal_list.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/preference_dataset/refusal_list.py -------------------------------------------------------------------------------- /src/set_difference_pruning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/LICENSE -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/README.md -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/data/SFT_aligned_llama2-7b-chat-hf_train.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/data/SFT_aligned_llama2-7b-chat-hf_train.csv -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/data/SFT_aligned_llama2-7b-chat-hf_train_short.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/data/SFT_aligned_llama2-7b-chat-hf_train_short.csv -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/data/advbench.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/data/advbench.txt -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/data/alpaca_cleaned_no_safety_train.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/data/alpaca_cleaned_no_safety_train.csv -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/data/probing_result_13b.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/data/probing_result_13b.json -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/data/probing_result_7b.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/data/probing_result_7b.json -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/default_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/default_config.yaml -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/ablate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/ablate.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/data.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/eval.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/experiment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/experiment.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/finetune_module.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/finetune_module.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/layerwrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/layerwrapper.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/model_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/model_wrapper.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/model_wrapper_low.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/model_wrapper_low.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/prompt_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/prompt_utils.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/prune.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/prune.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/lib/sparsegpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/lib/sparsegpt.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/main.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/main_low_rank.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/main_low_rank.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/main_low_rank_diff.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/main_low_rank_diff.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/run_gridsearch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/run_gridsearch.py -------------------------------------------------------------------------------- /src/set_difference_pruning/alignment-attribution-code/run_iterative.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/alignment-attribution-code/run_iterative.py -------------------------------------------------------------------------------- /src/set_difference_pruning/generate_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/set_difference_pruning/generate_datasets.py -------------------------------------------------------------------------------- /src/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/util/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/util/data.py -------------------------------------------------------------------------------- /src/util/globals.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/util/globals.py -------------------------------------------------------------------------------- /src/util/helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethz-spylab/unlearning-vs-safety/HEAD/src/util/helpers.py --------------------------------------------------------------------------------