├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yaml │ ├── config.yaml │ └── feature-request.yaml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── assets ├── banner.png └── logo.png ├── community ├── benchmarks │ └── template │ │ ├── README.md │ │ └── run.sh ├── leaderboard.md └── methods │ ├── AltPO │ ├── README.md │ ├── generate.py │ ├── generate.yaml │ └── run.sh │ ├── UNDIAL │ ├── README.md │ └── run.sh │ └── template │ ├── README.md │ └── run.sh ├── configs ├── accelerate │ ├── default_config.yaml │ └── zero_stage3_offload_config.json ├── collator │ ├── DataCollatorForSupervisedDataset.yaml │ └── DataCollatorForSupervisedDatasetwithIndex.yaml ├── data │ ├── datasets │ │ ├── MUSE_MIA.yaml │ │ ├── MUSE_forget.yaml │ │ ├── MUSE_forget_knowmem.yaml │ │ ├── MUSE_forget_scal.yaml │ │ ├── MUSE_forget_sust.yaml │ │ ├── MUSE_forget_verbmem.yaml │ │ ├── MUSE_retain.yaml │ │ ├── MUSE_retain_knowmem.yaml │ │ ├── TOFU_MIA.yaml │ │ ├── TOFU_QA_forget.yaml │ │ ├── TOFU_QA_forget_idk.yaml │ │ ├── TOFU_QA_forget_para.yaml │ │ ├── TOFU_QA_forget_pert.yaml │ │ ├── TOFU_QA_full.yaml │ │ ├── TOFU_QA_ra.yaml │ │ ├── TOFU_QA_ra_pert.yaml │ │ ├── TOFU_QA_retain.yaml │ │ ├── TOFU_QA_retain_eval.yaml │ │ ├── TOFU_QA_retain_para.yaml │ │ ├── TOFU_QA_retain_pert.yaml │ │ ├── TOFU_QA_wf.yaml │ │ ├── TOFU_QA_wf_pert.yaml │ │ ├── WMDP_forget.yaml │ │ └── WMDP_retain.yaml │ ├── finetune.yaml │ └── unlearn.yaml ├── eval.yaml ├── eval │ ├── lm_eval.yaml │ ├── muse.yaml │ ├── muse_metrics │ │ ├── exact_memorization.yaml │ │ ├── extraction_strength.yaml │ │ ├── forget_gibberish.yaml │ │ ├── forget_knowmem_ROUGE.yaml │ │ ├── forget_verbmem_ROUGE.yaml │ │ ├── mia_gradnorm.yaml │ │ ├── mia_loss.yaml │ │ ├── mia_min_k.yaml │ │ ├── mia_min_k_plus_plus.yaml │ │ ├── mia_reference.yaml │ │ ├── mia_zlib.yaml │ │ ├── privleak.yaml │ │ └── retain_knowmem_ROUGE.yaml │ ├── tofu.yaml │ └── tofu_metrics │ │ ├── 
exact_memorization.yaml │ │ ├── extraction_strength.yaml │ │ ├── forget_Q_A_PARA_Prob.yaml │ │ ├── forget_Q_A_PARA_ROUGE.yaml │ │ ├── forget_Q_A_PERT_Prob.yaml │ │ ├── forget_Q_A_PERT_ROUGE.yaml │ │ ├── forget_Q_A_Prob.yaml │ │ ├── forget_Q_A_ROUGE.yaml │ │ ├── forget_Q_A_gibberish.yaml │ │ ├── forget_Truth_Ratio.yaml │ │ ├── forget_quality.yaml │ │ ├── mia_gradnorm.yaml │ │ ├── mia_loss.yaml │ │ ├── mia_min_k.yaml │ │ ├── mia_min_k_plus_plus.yaml │ │ ├── mia_reference.yaml │ │ ├── mia_zlib.yaml │ │ ├── model_utility.yaml │ │ ├── privleak.yaml │ │ ├── ra_Q_A_PERT_Prob.yaml │ │ ├── ra_Q_A_Prob.yaml │ │ ├── ra_Q_A_Prob_normalised.yaml │ │ ├── ra_Q_A_ROUGE.yaml │ │ ├── ra_Truth_Ratio.yaml │ │ ├── retain_Q_A_PARA_Prob.yaml │ │ ├── retain_Q_A_PERT_Prob.yaml │ │ ├── retain_Q_A_Prob.yaml │ │ ├── retain_Q_A_ROUGE.yaml │ │ ├── retain_Truth_Ratio.yaml │ │ ├── wf_Q_A_PERT_Prob.yaml │ │ ├── wf_Q_A_Prob.yaml │ │ ├── wf_Q_A_Prob_normalised.yaml │ │ ├── wf_Q_A_ROUGE.yaml │ │ └── wf_Truth_Ratio.yaml ├── experiment │ ├── eval │ │ ├── muse │ │ │ └── default.yaml │ │ ├── tofu │ │ │ └── default.yaml │ │ └── wmdp │ │ │ └── default.yaml │ ├── examples │ │ ├── muse_unlearn.yaml │ │ └── tofu_eval.yaml │ ├── finetune │ │ └── tofu │ │ │ └── default.yaml │ └── unlearn │ │ ├── muse │ │ ├── default.yaml │ │ ├── scalability.yaml │ │ └── sustainabilty.yaml │ │ ├── tofu │ │ ├── default.yaml │ │ └── idk.yaml │ │ └── wmdp │ │ └── default.yaml ├── generation │ └── default.yaml ├── hydra │ ├── default.yaml │ └── eval.yaml ├── model │ ├── Llama-2-7b-chat-hf.yaml │ ├── Llama-2-7b-hf.yaml │ ├── Llama-3.1-8B-Instruct.yaml │ ├── Llama-3.2-1B-Instruct.yaml │ ├── Llama-3.2-3B-Instruct.yaml │ ├── Phi-3.5-mini-instruct.yaml │ ├── gemma-7b-it.yaml │ ├── phi-1_5.yaml │ └── zephyr-7b-beta.yaml ├── paths │ └── default.yaml ├── train.yaml ├── trainer │ ├── DPO.yaml │ ├── GradAscent.yaml │ ├── GradDiff.yaml │ ├── NPO.yaml │ ├── RMU.yaml │ ├── SimNPO.yaml │ ├── UNDIAL.yaml │ └── finetune.yaml └── unlearn.yaml ├── 
docs ├── components.md ├── contributing.md ├── evaluation.md ├── experiments.md ├── hydra.md ├── links.md └── repro.md ├── requirements.txt ├── scripts ├── muse_unlearn.sh ├── tofu_finetune.sh └── tofu_unlearn.sh ├── setup.py ├── setup_data.py └── src ├── data ├── __init__.py ├── collators.py ├── pretraining.py ├── qa.py ├── unlearn.py └── utils.py ├── eval.py ├── evals ├── __init__.py ├── base.py ├── lm_eval.py ├── metrics │ ├── __init__.py │ ├── base.py │ ├── memorization.py │ ├── mia │ │ ├── __init__.py │ │ ├── all_attacks.py │ │ ├── gradnorm.py │ │ ├── loss.py │ │ ├── min_k.py │ │ ├── min_k_plus_plus.py │ │ ├── reference.py │ │ ├── utils.py │ │ └── zlib.py │ ├── privacy.py │ ├── utility.py │ └── utils.py ├── muse.py └── tofu.py ├── model ├── __init__.py └── probe.py ├── train.py └── trainer ├── __init__.py ├── base.py ├── unlearn ├── base.py ├── dpo.py ├── grad_ascent.py ├── grad_diff.py ├── npo.py ├── rmu.py ├── simnpo.py └── undial.py └── utils.py /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | # Picked from https://github.com/huggingface/transformers/blob/main/.github/ISSUE_TEMPLATE/bug-report.yml 2 | name: "\U0001F41B Bug Report" 3 | description: Submit a bug report to help us improve open-unlearning 4 | labels: [ "bug" ] 5 | body: 6 | - type: checkboxes 7 | id: information-scripts-examples 8 | attributes: 9 | label: Information 10 | description: 'The problem arises when using:' 11 | options: 12 | - label: "The official example scripts" 13 | - label: "My own modified scripts" 14 | 15 | - type: checkboxes 16 | id: information-tasks 17 | attributes: 18 | label: Tasks 19 | description: "The tasks I am working on are:" 20 | options: 21 | - label: "An officially supported task" 22 | - label: "My own task or dataset (give details below)" 23 | 24 | - type: textarea 25 | id: reproduction 26 | validations: 27 | required: true 28 | attributes: 29 | label: Reproduction 30 | description: | 31 
| Please provide a code sample that reproduces the problem you ran into. 32 | Please include relevant config information such as deepspeed configs and experiment configs in .hydra folder of your experiment. 33 | If you have code snippets, error messages, stack traces please provide them here as well. 34 | Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 35 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. 36 | 37 | placeholder: | 38 | Steps to reproduce the behavior: 39 | 40 | 1. 41 | 2. 42 | 3. 43 | 44 | 45 | - type: textarea 46 | id: expected-behavior 47 | validations: 48 | required: true 49 | attributes: 50 | label: Expected behavior 51 | description: "A clear and concise description of what you would expect to happen." -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yaml: -------------------------------------------------------------------------------- 1 | # Picked from https://github.com/huggingface/transformers/blob/main/.github/ISSUE_TEMPLATE/config.yml 2 | blank_issues_enabled: true 3 | version: 2.1 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yaml: -------------------------------------------------------------------------------- 1 | # Picked from https://github.com/huggingface/transformers/blob/main/.github/ISSUE_TEMPLATE/feature-request.yml 2 | name: "\U0001F680 Feature request" 3 | description: Submit a proposal/request for a new open-unlearning feature 4 | labels: [ "Feature request" ] 5 | body: 6 | - type: checkboxes 7 | id: information-tasks 8 | attributes: 9 | label: Tasks 10 | description: "New feature belongs to adding" 11 | options: 12 | - label: "Benchmark" 13 | - label: "Unlearning method" 14 | - label: "Evaluation" 15 | 
- label: "Dataset" 16 | - label: "None of the above" 17 | 18 | - type: textarea 19 | id: feature-request 20 | validations: 21 | required: true 22 | attributes: 23 | label: Feature request 24 | description: | 25 | A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. 26 | 27 | - type: textarea 28 | id: motivation 29 | validations: 30 | required: true 31 | attributes: 32 | label: Motivation 33 | description: | 34 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. 35 | 36 | - type: textarea 37 | id: implementation 38 | validations: 39 | required: false 40 | attributes: 41 | label: Implementation 42 | description: | 43 | Please describe your proposed solution in detail. Outline the implementation approach, including any key technical considerations. If there are challenges or blockers preventing implementation, specify them along with potential workarounds or dependencies. 44 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # What does this PR do? 2 | 3 | 4 | Fixes # (issue) 5 | 6 | 7 | ## Before submitting 8 | - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). 9 | - [ ] Have you gone through the contributions [guide](../docs/contributing.md)? 10 | - [ ] Are your changes documented? Read documentation guidelines [here](../README.md#-further-documentation). 
-------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | paths: 6 | - "**.py" 7 | - "requirements.txt" 8 | - ".github/workflows/*.yml" 9 | pull_request: 10 | paths: 11 | - "**.py" 12 | - "requirements.txt" 13 | - ".github/workflows/*.yml" 14 | 15 | jobs: 16 | tests: 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | python-version: 21 | - "3.11" 22 | os: 23 | - "ubuntu-latest" 24 | 25 | runs-on: ubuntu-latest 26 | 27 | environment: 28 | name: tests 29 | 30 | env: 31 | # HF_TOKEN: ${{ secrets.HF_TOKEN }} 32 | OS_NAME: ${{ matrix.os }} 33 | 34 | steps: 35 | - name: Checkout 36 | uses: actions/checkout@v4 37 | 38 | - name: Set up Python 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | cache: "pip" 43 | cache-dependency-path: "setup.py" 44 | 45 | - name: Install dependencies 46 | run: | 47 | python -m pip install --upgrade pip 48 | pip install ruff==0.6.6 49 | 50 | - name: Check Quality 51 | run: make quality 52 | 53 | # - name: Test with pytest 54 | # run: | 55 | # cd 56 | # make test 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # custom .gitignore 2 | submit.py 3 | !src/data/__init__.py 4 | logs/ 5 | unity/ 6 | src/data/test.py 7 | src/data/__pycache__ 8 | ms_cache/ 9 | logs/ 10 | hf_cache/ 11 | cache*/ 12 | saves*/ 13 | notebooks/ 14 | output*/ 15 | wandb/ 16 | data/ 17 | !*/data/ 18 | evals/ 19 | !*/evals/ 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | eval_logs/ 25 | eval_dumps/ 26 | # C extensions 27 | *.so 28 | 29 | # Distribution / packaging 30 | .Python 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 
38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | wheels/ 43 | share/python-wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | MANIFEST 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .nox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | *.py,cover 70 | .hypothesis/ 71 | .pytest_cache/ 72 | .ruff_cache/ 73 | cover/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | 95 | # PyBuilder 96 | .pybuilder/ 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | # For a library or package, you might want to ignore these files since the code is 108 | # intended to run in multiple environments; otherwise, check them in: 109 | # .python-version 110 | 111 | # pipenv 112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 115 | # install all needed dependencies. 116 | #Pipfile.lock 117 | 118 | # poetry 119 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
120 | # This is especially recommended for binary packages to ensure reproducibility, and is more 121 | # commonly ignored for libraries. 122 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 123 | #poetry.lock 124 | 125 | # pdm 126 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 127 | #pdm.lock 128 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 129 | # in version control. 130 | # https://pdm.fming.dev/#use-with-ide 131 | .pdm.toml 132 | 133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 134 | __pypackages__/ 135 | 136 | # Celery stuff 137 | celerybeat-schedule 138 | celerybeat.pid 139 | 140 | # SageMath parsed files 141 | *.sage.py 142 | 143 | # Environments 144 | .env 145 | .venv 146 | env/ 147 | venv/ 148 | ENV/ 149 | env.bak/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | .dmypy.json 165 | dmypy.json 166 | 167 | # Pyre type checker 168 | .pyre/ 169 | 170 | # pytype static type analyzer 171 | .pytype/ 172 | 173 | # Cython debug symbols 174 | cython_debug/ 175 | 176 | # PyCharm 177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 179 | # and can be added to the global gitignore or merged into this file. For a more nuclear 180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
181 | .idea/ 182 | 183 | .vscode/ 184 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | - repo: https://github.com/astral-sh/ruff-pre-commit 4 | rev: v0.6.9 5 | hooks: 6 | - id: ruff 7 | args: [check, --fix, scripts, src, setup.py, setup_data.py] 8 | - id: ruff 9 | args: [format, --check, scripts, src, setup.py setup_data.py] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 CMU Locus Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style 2 | 3 | check_dirs := scripts src #setup.py 4 | 5 | quality: 6 | ruff check $(check_dirs) setup.py setup_data.py 7 | ruff format --check $(check_dirs) setup.py setup_data.py 8 | 9 | style: 10 | ruff check $(check_dirs) setup.py setup_data.py --fix 11 | ruff format $(check_dirs) setup.py setup_data.py 12 | 13 | test: 14 | CUDA_VISIBLE_DEVICES= pytest tests/ 15 | -------------------------------------------------------------------------------- /assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/locuslab/open-unlearning/b71de54c179408d447bc383c86fd1fafcc99dc14/assets/banner.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/locuslab/open-unlearning/b71de54c179408d447bc383c86fd1fafcc99dc14/assets/logo.png -------------------------------------------------------------------------------- /community/benchmarks/template/README.md: -------------------------------------------------------------------------------- 1 | # TITLE 2 | 3 | - Paper title, authors, links. 4 | 5 | Provide a concise summary of your benchmark details and its contributions. Please avoid using images to keep the repository size manageable. 6 | 7 | # Datasets 8 | 9 | Use a clear and consistent naming convention for dataset splits. 10 | 11 | - [ ] Provide a link to find/download the datasets (preferably HuggingFace). 12 | 13 | # Models 14 | 15 | 16 | - [ ] Upload any unlearning target or reference retain models for unlearning preferably on HuggingFace and provide the path. 17 | - [ ] Model creation details and how they fit in benchmark. 
18 | 19 | # Baselines & Results 20 | 21 | Discuss the baselines used and their results. 22 | 23 | 24 | ## Setup 25 | Please include the experimental setup for the baselines 26 | 27 | - [ ] **Hyperparameters & Search Space:** Specify key hyperparameters, their search ranges, number of trials etc. 28 | - [ ] **Computational Setup:** Mention the type and number of GPUs used. 29 | - [ ] **DeepSpeed Configuration** (if used): If any modifications were made to the default DeepSpeed config, specify them here. (You may include the config as a code block.) 30 | - [ ] **Other Details:** Any additional setup details crucial for reproducing your method. 31 | 32 | To replicate your results, provide a `run.sh` script that contains all necessary commands to reproduce the final results. Ensure the script is well-documented. 33 | 34 | 35 | # Citation 36 | 37 | 38 | If you use this work, please cite: 39 | 40 | ```bibtex 41 | 42 | 43 | 44 | @misc{openunlearning2025, 45 | title={OpenUnlearning: A Unified Framework for LLM Unlearning Benchmarks}, 46 | author={Dorna, Vineeth and Mekala, Anmol and Zhao, Wenlong and McCallum, Andrew and Kolter, J Zico and Maini, Pratyush}, 47 | year={2025}, 48 | howpublished={\url{https://github.com/locuslab/open-unlearning}}, 49 | note={Accessed: February 27, 2025} 50 | } 51 | ``` -------------------------------------------------------------------------------- /community/benchmarks/template/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################################## 4 | ########################################### RETAIN Finetuned ####$###################################################### 5 | ######################################################################################################################## 6 | 7 | 8 | 9 | 
######################################################################################################################### 10 | ############################################ FULL Finetuned models ###################################################### 11 | ######################################################################################################################### 12 | 13 | 14 | 15 | 16 | ######################################################################################################################### 17 | ############################################ Baseline methods ####$###################################################### 18 | ######################################################################################################################### 19 | -------------------------------------------------------------------------------- /community/leaderboard.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Leaderboard 4 | 5 |
6 | 7 | We encourage the community to develop new methods, optimize them for specific benchmarks, and compare results with existing approaches. 8 | 9 | To implement a new method, refer to our [contributing guide](../docs/contributing.md). 10 | 11 | > [!NOTE] 12 | > The [results.md](../docs/results.md) file is maintained for reproducibility purposes. However, we encourage contributors to update the leaderboard table instead of the reproducibility table. We will continue refining and tuning baseline methods to keep the leaderboard up to date. 13 | 14 | 15 | ### TOFU unlearning on the `Llama-2-7b-hf-chat` architecture 16 | 17 |
18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 |
Methodforget10
forget_qualitymodel_utility
Finetuned4.35e-250.63
Retain1.00.61
49 |
50 | 51 | 52 | 53 | ### TOFU unlearning on the `Llama-3.2-1B-Instruct` architecture 54 | 55 |
56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 |
Methodforget10
forget_qualitymodel_utility
Finetuned3.91e-220.6
Retain1.00.59
87 |
88 | 89 | 90 | ### MUSE unlearning on the benchmark's target models 91 | 92 | 93 |
94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 |
MethodNewsBooks
forget_knowmem_ROUGEforget_verbmem_ROUGEprivleakretain_knowmem_ROUGEforget_knowmem_ROUGEforget_verbmem_ROUGEprivleakretain_knowmem_ROUGE
Finetuned0.640.58-99.810.560.471.0-57.340.69
Retain0.330.2000.560.30.1400.69
141 |
142 | -------------------------------------------------------------------------------- /community/methods/AltPO/README.md: -------------------------------------------------------------------------------- 1 | # Alternate Preference Optimization for Unlearning Factual Knowledge in Large Language Models 2 | - Authors: Anmol Mekala, Vineeth Dorna, Shreya Dubey, Abhishek Lalwani, David Koleczek, Mukund Rungta, Sadid Hasan, Elita Lobo 3 | - Paper Link: https://arxiv.org/pdf/2409.13474 4 | - Code Link: https://github.com/molereddy/Alternate-Preference-Optimization 5 | 6 | 7 | LLMs struggle to suppress forget set responses using only negative feedback during unlearning, often resulting in inconsistent outputs, reduced utility, and potential privacy risks. To address this, AltPO enables stable and effective unlearning by combining negative feedback on the forget set along with positive feedback through plausible alternative responses. 8 | 9 | 10 | ## Setup 11 | 12 | #### Generate Alternate Dataset 13 | 14 | The following command generates alternate responses for TOFU, which are then used for unlearning. 15 | ```python 16 | python generate.py dataset_config.dataset_kwargs.name=forget10 17 | ``` 18 | 19 | #### Hyperparameters & Search Space 20 | The original paper experiments with LLaMA2-7B; however, the following parameter ranges are reasonable to explore. You can adjust them based on the model and task. Perform a grid search over: beta in [0.05, 0.1, 0.5], learning rate in [1e-5, 2e-5, 5e-5], and alpha in [1, 2, 5]. 21 | 22 | #### Computational Setup 23 | All experiments in `run.sh` are run on single A100 GPU. If larger models are used you can use deepspeed to launch the unlearning job. 24 | 25 | 26 | ## Results 27 | Run `run.sh` script. 
28 | 29 | 30 | ## Citation 31 | ```bibtex 32 | @article{mekala2024alternate, 33 | title={Alternate preference optimization for unlearning factual knowledge in large language models}, 34 | author={Mekala, Anmol and Dorna, Vineeth and Dubey, Shreya and Lalwani, Abhishek and Koleczek, David and Rungta, Mukund and Hasan, Sadid and Lobo, Elita}, 35 | journal={arXiv preprint arXiv:2409.13474}, 36 | year={2024} 37 | } 38 | ``` -------------------------------------------------------------------------------- /community/methods/AltPO/generate.yaml: -------------------------------------------------------------------------------- 1 | model_config: 2 | model_name: tofu_Llama-3.2-1B-Instruct_full 3 | model_kwargs: 4 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full 5 | trust_remote_code: True 6 | device_map: auto 7 | 8 | dataset_config: 9 | dataset_name: tofu 10 | dataset_kwargs: 11 | path: 'locuslab/TOFU' 12 | name: 'forget10' 13 | split: train 14 | cache_dir: _cache_data/ 15 | 16 | prompt_config: 17 | prompt_name: INST_QAS_LLAMA3_TEMPLATE 18 | examples_path: null 19 | fewshot_delimiter: "\n\n" 20 | 21 | repeats: 5 22 | 23 | generation_kwargs: 24 | max_new_tokens: 200 25 | do_sample: True 26 | temperature: 1.0 27 | 28 | until: 29 | - "Question:" 30 | - "Question: " 31 | - "Q: " 32 | - "Q:" 33 | 34 | 35 | batch_size: 1 36 | padding_size: left 37 | truncation: False 38 | seed: 0 39 | device: cuda 40 | output_file: data/${model_config.model_name}/${dataset_config.dataset_kwargs.name}/alt${repeats}_seed_${seed}.json 41 | # limit: 5 -------------------------------------------------------------------------------- /community/methods/AltPO/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()") 4 | echo "Master Port: $MASTER_PORT" 5 | 6 | 
######################################################################################################################## 7 | ########################################### Unlearn TOFU models ######################################################## 8 | ######################################################################################################################## 9 | 10 | 11 | models=( 12 | "Llama-3.2-1B-Instruct" 13 | ) 14 | trainers_experiments=( 15 | "DPO unlearn/tofu/default.yaml" 16 | ) 17 | forget_retain_splits=( 18 | "forget10 retain90" 19 | "forget05 retain95" 20 | "forget01 retain99" 21 | ) 22 | 23 | per_device_train_batch_size=8 24 | gradient_accumulation_steps=4 25 | 26 | 27 | lrs=(1e-5 2e-5 5e-5) 28 | betas=(0.05 0.1 0.5) 29 | alphas=(1 2 5) 30 | 31 | 32 | for split in "${forget_retain_splits[@]}"; do 33 | forget_split=$(echo $split | cut -d' ' -f1) 34 | retain_split=$(echo $split | cut -d' ' -f2) 35 | for model in "${models[@]}"; do 36 | for trainer_experiment in "${trainers_experiments[@]}"; do 37 | trainer=$(echo $trainer_experiment | cut -d' ' -f1) 38 | experiment=$(echo $trainer_experiment | cut -d' ' -f2) 39 | for lr in "${lrs[@]}"; do 40 | for beta in "${betas[@]}"; do 41 | for alpha in "${alphas[@]}"; do 42 | task_name=tofu_${model}_${forget_split}_AltPO_lr${lr}_beta${beta}_alpha${alpha} 43 | model_path=open-unlearning/tofu_${model}_full 44 | echo ${task_name}: Unlearning ${model_path} using ${trainer} 45 | 46 | # Unlearn 47 | CUDA_VISIBLE_DEVICES=0 \ 48 | python src/train.py --config-name=unlearn.yaml \ 49 | experiment=${experiment} \ 50 | trainer=${trainer} \ 51 | task_name=${task_name} \ 52 | model=${model} \ 53 | forget_split=${forget_split} \ 54 | retain_split=${retain_split} \ 55 | model.model_args.pretrained_model_name_or_path=${model_path} \ 56 | retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ 57 | trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ 58 | 
trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ 59 | trainer.args.eval_strategy=no \ 60 | trainer.args.eval_on_start=False \ 61 | trainer.args.num_train_epochs=2 \ 62 | trainer.args.learning_rate=$lr \ 63 | trainer.method_args.beta=$beta \ 64 | trainer.method_args.alpha=$alpha \ 65 | data.forget.TOFU_QA_forget.handler=QAwithAlternateDataset \ 66 | ~data.forget.TOFU_QA_forget.args.hf_args.name \ 67 | data.forget.TOFU_QA_forget.args.hf_args.path=json \ 68 | +data.forget.TOFU_QA_forget.args.hf_args.data_files=community/methods/AltPO/data/tofu_Llama-3.2-1B-Instruct_full/${forget_split}/alt5_seed_0.json \ 69 | data.forget.TOFU_QA_forget.args.hf_args.split=train \ 70 | +data.forget.TOFU_QA_forget.args.alternate_key=alternate \ 71 | +data.forget.TOFU_QA_forget.args.return_original=True 72 | 73 | # Eval 74 | CUDA_VISIBLE_DEVICES=0 python src/eval.py \ 75 | experiment=eval/tofu/default.yaml \ 76 | forget_split=${forget_split} \ 77 | model=${model} \ 78 | task_name=${task_name} \ 79 | model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ 80 | paths.output_dir=saves/unlearn/${task_name}/evals \ 81 | retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json 82 | done 83 | done 84 | done 85 | done 86 | done 87 | done 88 | -------------------------------------------------------------------------------- /community/methods/UNDIAL/README.md: -------------------------------------------------------------------------------- 1 | # UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models (NAACL 2025) 2 | 3 | - Authors: Yijiang River Dong, Hongzhou Lin, Mikhail Belkin, Ramón Huerta, Ivan Vulić 4 | - Link​: https://arxiv.org/pdf/2402.10052 5 | 6 | # Setup 7 | - Hyperparameters: The original paper uses Llama-2 7B with LoRA to tune the model (rank=8, alpha=16) and learning rate of 1e-4. 
It's suggested to search the learning rate over [1e-5, 3e-4, 1e-4], and use an effective batch size of 32 (batch_size * gradient_accumulation). The other important hyperparemeter is beta, the strength of penalty, which typically takes a number between [3,10,30]. If we change to other models, adjusting learning rate accordingly. 8 | 9 | - Computation Setup: All experiments are run on one A100. 10 | - Other Details: The original paper does not use the retain set and aims to retain knowledge in all domains, not just on the retain set. So alpha is set to 0. Practionioners could search over the alpha or gamma to better retain the performance on the retain set. 11 | 12 | # Results 13 | Run `run.sh` script. 14 | 15 | # Citation 16 | @misc{dong2024undial, 17 | title={UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models}, 18 | author={Yijiang River Dong and Hongzhou Lin and Mikhail Belkin and Ramon Huerta and Ivan Vulić}, 19 | year={2024}, 20 | eprint={2402.10052}, 21 | archivePrefix={arXiv}, 22 | primaryClass={cs.CL}, 23 | url={https://arxiv.org/abs/2402.10052}, 24 | } -------------------------------------------------------------------------------- /community/methods/UNDIAL/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()") 4 | echo "Master Port: $MASTER_PORT" 5 | 6 | ######################################################################################################################## 7 | ########################################### Unlearn TOFU models ######################################################## 8 | ######################################################################################################################## 9 | 10 | models=( 11 | "Llama-3.2-1B-Instruct" 12 | ) 13 | trainers_experiments=( 14 | "UNDIAL unlearn/tofu/default.yaml" 
15 | ) 16 | forget_retain_splits=( 17 | "forget10 retain90" 18 | "forget05 retain95" 19 | "forget01 retain99" 20 | ) 21 | 22 | per_device_train_batch_size=16 23 | gradient_accumulation_steps=2 24 | 25 | 26 | lrs=(1e-5 1e-4 3e-4) 27 | alphas=(1 2 5) 28 | betas=(3 10 30) 29 | 30 | 31 | for split in "${forget_retain_splits[@]}"; do 32 | forget_split=$(echo $split | cut -d' ' -f1) 33 | retain_split=$(echo $split | cut -d' ' -f2) 34 | for model in "${models[@]}"; do 35 | for trainer_experiment in "${trainers_experiments[@]}"; do 36 | trainer=$(echo $trainer_experiment | cut -d' ' -f1) 37 | experiment=$(echo $trainer_experiment | cut -d' ' -f2) 38 | for lr in "${lrs[@]}"; do 39 | for beta in "${betas[@]}"; do 40 | for alpha in "${alphas[@]}"; do 41 | task_name=tofu_${model}_${forget_split}_${trainer}_lr${lr}_beta${beta}_alpha${alpha} 42 | model_path=open-unlearning/tofu_${model}_full 43 | echo ${task_name}: Unlearning ${model_path} using ${trainer} 44 | 45 | # Unlearn 46 | CUDA_VISIBLE_DEVICES=0 \ 47 | python src/train.py --config-name=unlearn.yaml \ 48 | experiment=${experiment} \ 49 | trainer=${trainer} \ 50 | task_name=${task_name} \ 51 | model=${model} \ 52 | forget_split=${forget_split} \ 53 | retain_split=${retain_split} \ 54 | model.model_args.pretrained_model_name_or_path=${model_path} \ 55 | retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ 56 | trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ 57 | trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ 58 | trainer.args.eval_strategy=no \ 59 | trainer.args.eval_on_start=False \ 60 | trainer.args.learning_rate=$lr \ 61 | trainer.method_args.beta=$beta \ 62 | trainer.method_args.alpha=$alpha 63 | 64 | # Eval 65 | CUDA_VISIBLE_DEVICES=0 python src/eval.py \ 66 | experiment=eval/tofu/default.yaml \ 67 | forget_split=${forget_split} \ 68 | model=${model} \ 69 | task_name=${task_name} \ 70 | 
model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ 71 | paths.output_dir=saves/unlearn/${task_name}/evals \ 72 | retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json 73 | done 74 | done 75 | done 76 | done 77 | done 78 | done 79 | -------------------------------------------------------------------------------- /community/methods/template/README.md: -------------------------------------------------------------------------------- 1 | # TITLE 2 | 3 | - Paper title, authors, links. 4 | 5 | 6 | Provide a concise summary of your method details and its contributions. Please avoid using images to keep the repository size manageable. 7 | 8 | # Setup 9 | 10 | Please include the experimental setup such as 11 | 12 | - [ ] **Hyperparameters & Search Space:** Specify key hyperparameters, their search ranges, number of trials etc. 13 | - [ ] **Computational Setup:** Mention the type and number of GPUs used. 14 | - [ ] **DeepSpeed Configuration** (if used): If any modifications were made to the default DeepSpeed config, specify them here. (You may include the config as a code block.) 15 | - [ ] **Other Details:** Any additional setup details crucial for reproducing your method. 16 | 17 | # Results 18 | 19 | To replicate your results, provide a `run.sh` script that contains all necessary commands to reproduce the final results. Ensure the script is well-documented. 20 | 21 | It would be appreciated if you can upload the final unlearned model(s) along with their `evals` folders to HuggingFace and provide the link(s) here. As the evaluations are updated, this would help us re-evaluate your model(s). 
22 | 23 | # Citation 24 | 25 | 26 | If you use this work, please cite: 27 | 28 | ```bibtex 29 | 30 | 31 | 32 | @misc{openunlearning2025, 33 | title={OpenUnlearning: A Unified Framework for LLM Unlearning Benchmarks}, 34 | author={Dorna, Vineeth and Mekala, Anmol and Zhao, Wenlong and McCallum, Andrew and Kolter, J Zico and Maini, Pratyush}, 35 | year={2025}, 36 | howpublished={\url{https://github.com/locuslab/open-unlearning}}, 37 | note={Accessed: February 27, 2025} 38 | } 39 | ``` -------------------------------------------------------------------------------- /community/methods/template/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################################## 4 | ########################################### Hyper parameter tuning ##################################################### 5 | ######################################################################################################################## 6 | 7 | # Optional 8 | 9 | ######################################################################################################################## 10 | ########################################### Final best parameters ##################################################### 11 | ######################################################################################################################## 12 | 13 | # Required to replicate your results -------------------------------------------------------------------------------- /configs/accelerate/default_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_config_file: configs/accelerate/zero_stage3_offload_config.json 4 | zero3_init_flag: true 5 | distributed_type: DEEPSPEED 6 | fsdp_config: {} 7 | machine_rank: 0 8 | main_process_ip: null 9 | 
main_process_port: null 10 | main_training_function: main 11 | num_machines: 1 12 | num_processes: 2 13 | use_cpu: false -------------------------------------------------------------------------------- /configs/accelerate/zero_stage3_offload_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "offload_optimizer": { 5 | "device": "none", 6 | "pin_memory": true 7 | }, 8 | "offload_param": { 9 | "device": "none", 10 | "pin_memory": true 11 | }, 12 | "overlap_comm": true, 13 | "contiguous_gradients": true, 14 | "reduce_bucket_size": "auto", 15 | "stage3_prefetch_bucket_size": "auto", 16 | "stage3_param_persistence_threshold": "auto", 17 | "sub_group_size": 1e9, 18 | "stage3_max_live_parameters": 1e9, 19 | "stage3_max_reuse_distance": 1e9, 20 | "stage3_gather_16bit_weights_on_model_save": true 21 | }, 22 | "train_batch_size": "auto", 23 | "train_micro_batch_size_per_gpu": "auto", 24 | "gradient_accumulation_steps": "auto", 25 | "bf16": { 26 | "enabled": true 27 | } 28 | } -------------------------------------------------------------------------------- /configs/collator/DataCollatorForSupervisedDataset.yaml: -------------------------------------------------------------------------------- 1 | DataCollatorForSupervisedDataset: 2 | handler: DataCollatorForSupervisedDataset 3 | args: 4 | padding_side: right 5 | -------------------------------------------------------------------------------- /configs/collator/DataCollatorForSupervisedDatasetwithIndex.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - DataCollatorForSupervisedDataset 3 | 4 | DataCollatorForSupervisedDataset: 5 | args: 6 | index: index -------------------------------------------------------------------------------- /configs/data/datasets/MUSE_MIA.yaml: -------------------------------------------------------------------------------- 1 | MUSE_MIA_holdout: 2 | 
access_key: holdout 3 | handler: CompletionDataset 4 | args: 5 | hf_args: 6 | path: "muse-bench/MUSE-News" 7 | name: "privleak" 8 | split: "holdout" 9 | prefix_key: "prompt" # doesn't exist in dataset 10 | text_key: "text" 11 | max_length: 2048 12 | MUSE_MIA_forget: 13 | access_key: forget 14 | handler: CompletionDataset 15 | args: 16 | hf_args: 17 | path: "muse-bench/MUSE-News" 18 | name: "privleak" 19 | split: "forget" 20 | prefix_key: "prompt" # doesn't exist in dataset 21 | text_key: "text" 22 | max_length: 2048 -------------------------------------------------------------------------------- /configs/data/datasets/MUSE_forget.yaml: -------------------------------------------------------------------------------- 1 | MUSE_forget: 2 | handler: PretrainingDataset 3 | args: 4 | hf_args: 5 | path: "muse-bench/MUSE-News" 6 | name: "raw" 7 | split: "forget" 8 | text_key: "text" 9 | max_length: 2048 -------------------------------------------------------------------------------- /configs/data/datasets/MUSE_forget_knowmem.yaml: -------------------------------------------------------------------------------- 1 | MUSE_forget_knowmem: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | path: "muse-bench/MUSE-News" 6 | name: "knowmem" 7 | split: "forget_qa" 8 | few_shot_dataset_hf_args: 9 | path: "muse-bench/MUSE-News" 10 | name: "knowmem" 11 | split: "forget_qa_icl" 12 | question_key: "question" 13 | answer_key: "answer" 14 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/MUSE_forget_scal.yaml: -------------------------------------------------------------------------------- 1 | MUSE_forget_scal: 2 | handler: PretrainingDataset 3 | args: 4 | hf_args: 5 | path: "muse-bench/MUSE-News" 6 | name: "scal" 7 | split: "forget_4" 8 | text_key: "text" 9 | max_length: 2048 -------------------------------------------------------------------------------- /configs/data/datasets/MUSE_forget_sust.yaml: 
-------------------------------------------------------------------------------- 1 | MUSE_forget_sust: 2 | handler: PretrainingDataset 3 | args: 4 | hf_args: 5 | path: "muse-bench/MUSE-News" 6 | name: "sust" 7 | split: "forget_1" 8 | text_key: "text" 9 | max_length: 2048 -------------------------------------------------------------------------------- /configs/data/datasets/MUSE_forget_verbmem.yaml: -------------------------------------------------------------------------------- 1 | MUSE_forget_verbmem: 2 | handler: CompletionDataset 3 | args: 4 | hf_args: 5 | path: "muse-bench/MUSE-News" 6 | name: "verbmem" 7 | split: "forget" 8 | prefix_key: "prompt" 9 | text_key: "gt" 10 | max_length: 2048 11 | insert_space: True -------------------------------------------------------------------------------- /configs/data/datasets/MUSE_retain.yaml: -------------------------------------------------------------------------------- 1 | MUSE_retain: 2 | handler: PretrainingDataset 3 | args: 4 | hf_args: 5 | path: "muse-bench/MUSE-News" 6 | name: "raw" 7 | split: "retain1" 8 | text_key: "text" 9 | max_length: 2048 -------------------------------------------------------------------------------- /configs/data/datasets/MUSE_retain_knowmem.yaml: -------------------------------------------------------------------------------- 1 | MUSE_retain_knowmem: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | path: "muse-bench/MUSE-News" 6 | name: "knowmem" 7 | split: "retain_qa" 8 | few_shot_dataset_hf_args: 9 | path: "muse-bench/MUSE-News" 10 | name: "knowmem" 11 | split: "retain_qa_icl" 12 | question_key: "question" 13 | answer_key: "answer" 14 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_MIA.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_forget: 2 | access_key: forget 3 | handler: QADataset 4 | args: 5 | hf_args: 6 | name: "forget10" 7 | split: "train" 8 | path: 
"locuslab/TOFU" 9 | question_key: "question" 10 | answer_key: "answer" 11 | max_length: 512 12 | TOFU_QA_holdout: 13 | access_key: holdout 14 | handler: QADataset 15 | args: 16 | hf_args: 17 | name: "holdout10" 18 | path: "locuslab/TOFU" 19 | split: "train" 20 | question_key: "question" 21 | answer_key: "answer" 22 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_forget.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_forget: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "forget10" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "answer" 10 | max_length: 512 11 | 12 | -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_forget_idk.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_forget_idk: 2 | handler: QAwithIdkDataset 3 | args: 4 | hf_args: 5 | name: "forget10" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "answer" 10 | max_length: 512 11 | idk_path: ./data/idk.jsonl 12 | return_original: true 13 | -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_forget_para.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_forget_para: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "forget10_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "paraphrased_answer" 10 | max_length: 512 11 | -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_forget_pert.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_forget_pert: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: 
"forget10_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "perturbed_answer" 10 | max_length: 512 11 | -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_full.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_full: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "full" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "answer" 10 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_ra.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_ra: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "real_authors_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "answer" 10 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_ra_pert.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_ra_pert: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "real_authors_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "perturbed_answer" 10 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_retain.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_retain: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "retain90" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "answer" 10 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_retain_eval.yaml: 
-------------------------------------------------------------------------------- 1 | TOFU_QA_retain_eval: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "retain_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "answer" 10 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_retain_para.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_retain_para: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "retain_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "paraphrased_answer" 10 | max_length: 512 11 | -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_retain_pert.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_retain_pert: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "retain_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "perturbed_answer" 10 | max_length: 512 11 | -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_wf.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_wf: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "world_facts_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | answer_key: "answer" 10 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/TOFU_QA_wf_pert.yaml: -------------------------------------------------------------------------------- 1 | TOFU_QA_wf_pert: 2 | handler: QADataset 3 | args: 4 | hf_args: 5 | name: "world_facts_perturbed" 6 | split: "train" 7 | path: "locuslab/TOFU" 8 | question_key: "question" 9 | 
answer_key: "perturbed_answer" 10 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/WMDP_forget.yaml: -------------------------------------------------------------------------------- 1 | WMDP_forget: 2 | handler: PretrainingDataset 3 | args: 4 | hf_args: 5 | path: "text" 6 | data_files: "data/wmdp/wmdp-corpora/cyber-forget-corpus.jsonl" 7 | split: "train" 8 | text_key: "text" 9 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/datasets/WMDP_retain.yaml: -------------------------------------------------------------------------------- 1 | WMDP_retain: 2 | handler: PretrainingDataset 3 | args: 4 | hf_args: 5 | path: "text" 6 | data_files: "data/wmdp/wmdp-corpora/cyber-retain-corpus.jsonl" 7 | split: "train" 8 | text_key: "text" 9 | max_length: 512 -------------------------------------------------------------------------------- /configs/data/finetune.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - datasets@train: TOFU_QA_full 3 | - datasets@eval: null -------------------------------------------------------------------------------- /configs/data/unlearn.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - datasets@forget: TOFU_QA_forget 3 | - datasets@retain: TOFU_QA_retain 4 | - datasets@eval: null 5 | 6 | anchor: forget -------------------------------------------------------------------------------- /configs/eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - model: Llama-3.2-3B-Instruct 6 | - eval: tofu 7 | - paths: default 8 | - hydra: eval 9 | - experiment: null 10 | 11 | model: 12 | model_args: 13 | device_map: cuda 14 | 15 | mode: eval 16 | task_name: ??? 
17 | seed: 0 -------------------------------------------------------------------------------- /configs/eval/lm_eval.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.lm_eval 2 | # NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/ 3 | 4 | handler: LMEvalEvaluator 5 | output_dir: ${paths.output_dir} # set to default eval directory 6 | overwrite: false 7 | 8 | # Define evaluation tasks here 9 | tasks: 10 | - mmlu 11 | # - task: gsm8k 12 | # dataset_path: gsm8k 13 | # # define the entire task config. 14 | # # ^ Example: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml 15 | 16 | 17 | simple_evaluate_args: 18 | batch_size: 16 19 | system_instruction: null 20 | apply_chat_template: false -------------------------------------------------------------------------------- /configs/eval/muse.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse 2 | # NOTE: the above line is not a comment, but sets the package for config. 
See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/ 3 | 4 | defaults: 5 | - muse_metrics: 6 | - forget_knowmem_ROUGE 7 | - retain_knowmem_ROUGE 8 | - forget_verbmem_ROUGE 9 | - privleak 10 | - extraction_strength 11 | # - exact_memorization 12 | # - mia_min_k_plus_plus 13 | # - mia_min_k 14 | # - mia_loss 15 | # - mia_reference 16 | # - mia_zlib 17 | # - mia_gradnorm 18 | # - forget_gibberish 19 | 20 | handler: MUSEEvaluator 21 | output_dir: ${paths.output_dir} # set to default eval directory 22 | metrics: {} 23 | overwrite: false 24 | data_split: News 25 | retain_logs_path: null -------------------------------------------------------------------------------- /configs/eval/muse_metrics/exact_memorization.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.exact_memorization 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_forget_verbmem 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | 6 | handler: exact_memorization 7 | batch_size: 8 8 | datasets: 9 | MUSE_forget_verbmem: 10 | args: 11 | hf_args: 12 | path: muse-bench/MUSE-${eval.muse.data_split} -------------------------------------------------------------------------------- /configs/eval/muse_metrics/extraction_strength.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.extraction_strength 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_forget_verbmem 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | 6 | handler: extraction_strength 7 | batch_size: 8 8 | datasets: 9 | MUSE_forget_verbmem: 10 | args: 11 | hf_args: 12 | path: muse-bench/MUSE-${eval.muse.data_split} -------------------------------------------------------------------------------- /configs/eval/muse_metrics/forget_gibberish.yaml: -------------------------------------------------------------------------------- 1 | # @package 
eval.muse.metrics.forget_gibberish 2 | defaults: 3 | - .@pre_compute.forget_verbmem_ROUGE: forget_verbmem_ROUGE 4 | 5 | pre_compute: 6 | forget_verbmem_ROUGE: 7 | access_key: text 8 | 9 | handler: classifier_prob 10 | batch_size: 32 11 | max_length: 512 12 | class_id: 0 13 | text_key: generation 14 | device: cuda 15 | 16 | classifier_model_args: 17 | pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" 18 | 19 | classifier_tokenization_args: 20 | pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" 21 | -------------------------------------------------------------------------------- /configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.forget_knowmem_ROUGE 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_forget_knowmem 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | handler: rouge 7 | rouge_type: rougeL_f1 8 | batch_size: 16 9 | datasets: 10 | MUSE_forget_knowmem: 11 | args: 12 | hf_args: 13 | path: muse-bench/MUSE-${eval.muse.data_split} 14 | few_shot_dataset_hf_args: 15 | path: muse-bench/MUSE-${eval.muse.data_split} 16 | predict_with_generate: True 17 | collators: 18 | DataCollatorForSupervisedDataset: 19 | args: 20 | padding_side: left 21 | generation_args: 22 | max_new_tokens: 32 23 | stopwords: ["\n\n", "\nQuestion", "Question:"] 24 | -------------------------------------------------------------------------------- /configs/eval/muse_metrics/forget_verbmem_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.forget_verbmem_ROUGE 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_forget_verbmem 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | handler: rouge 
7 | rouge_type: rougeL_f1 8 | batch_size: 8 9 | datasets: 10 | MUSE_forget_verbmem: 11 | args: 12 | hf_args: 13 | path: muse-bench/MUSE-${eval.muse.data_split} 14 | predict_with_generate: True 15 | collators: 16 | DataCollatorForSupervisedDataset: 17 | args: 18 | padding_side: left 19 | generation_args: 20 | max_new_tokens: 128 21 | -------------------------------------------------------------------------------- /configs/eval/muse_metrics/mia_gradnorm.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.mia_gradnorm 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | datasets: 6 | MUSE_MIA_holdout: 7 | args: 8 | hf_args: 9 | path: muse-bench/MUSE-${eval.muse.data_split} 10 | MUSE_MIA_forget: 11 | access_key: forget 12 | args: 13 | hf_args: 14 | path: muse-bench/MUSE-${eval.muse.data_split} 15 | 16 | handler: mia_gradnorm 17 | batch_size: 1 18 | p: 2 -------------------------------------------------------------------------------- /configs/eval/muse_metrics/mia_loss.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.mia_loss 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | datasets: 6 | MUSE_MIA_holdout: 7 | args: 8 | hf_args: 9 | path: muse-bench/MUSE-${eval.muse.data_split} 10 | MUSE_MIA_forget: 11 | access_key: forget 12 | args: 13 | hf_args: 14 | path: muse-bench/MUSE-${eval.muse.data_split} 15 | 16 | batch_size: 8 17 | handler: mia_loss 18 | -------------------------------------------------------------------------------- /configs/eval/muse_metrics/mia_min_k.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.mia_min_k 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_MIA 4 | - 
../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | datasets: 6 | MUSE_MIA_holdout: 7 | args: 8 | hf_args: 9 | path: muse-bench/MUSE-${eval.muse.data_split} 10 | MUSE_MIA_forget: 11 | access_key: forget 12 | args: 13 | hf_args: 14 | path: muse-bench/MUSE-${eval.muse.data_split} 15 | 16 | batch_size: 8 17 | handler: mia_min_k 18 | k: 0.4 -------------------------------------------------------------------------------- /configs/eval/muse_metrics/mia_min_k_plus_plus.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.mia_min_k_plus_plus 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | datasets: 6 | MUSE_MIA_holdout: 7 | args: 8 | hf_args: 9 | path: muse-bench/MUSE-${eval.muse.data_split} 10 | MUSE_MIA_forget: 11 | access_key: forget 12 | args: 13 | hf_args: 14 | path: muse-bench/MUSE-${eval.muse.data_split} 15 | 16 | batch_size: 8 17 | handler: mia_min_k_plus_plus 18 | k: 0.4 -------------------------------------------------------------------------------- /configs/eval/muse_metrics/mia_reference.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.mia_reference 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | datasets: 6 | MUSE_MIA_holdout: 7 | args: 8 | hf_args: 9 | path: muse-bench/MUSE-${eval.muse.data_split} 10 | MUSE_MIA_forget: 11 | access_key: forget 12 | args: 13 | hf_args: 14 | path: muse-bench/MUSE-${eval.muse.data_split} 15 | 16 | batch_size: 8 17 | handler: mia_reference 18 | reference_model_path: muse-bench/MUSE-${eval.muse.data_split}_retrain # modify appropriately 19 | -------------------------------------------------------------------------------- /configs/eval/muse_metrics/mia_zlib.yaml: 
-------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.mia_zlib 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | datasets: 6 | MUSE_MIA_holdout: 7 | args: 8 | hf_args: 9 | path: muse-bench/MUSE-${eval.muse.data_split} 10 | MUSE_MIA_forget: 11 | access_key: forget 12 | args: 13 | hf_args: 14 | path: muse-bench/MUSE-${eval.muse.data_split} 15 | 16 | batch_size: 8 17 | handler: mia_zlib -------------------------------------------------------------------------------- /configs/eval/muse_metrics/privleak.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.privleak 2 | defaults: 3 | - .@pre_compute.mia_min_k: mia_min_k 4 | 5 | pre_compute: 6 | mia_min_k: 7 | access_key: forget 8 | 9 | reference_logs: 10 | retain_model_logs: 11 | path: ${eval.muse.retain_logs_path} 12 | include: 13 | mia_min_k: 14 | access_key: retain 15 | 16 | handler: privleak 17 | ref_value: 0.5 -------------------------------------------------------------------------------- /configs/eval/muse_metrics/retain_knowmem_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.muse.metrics.retain_knowmem_ROUGE 2 | defaults: 3 | - ../../data/datasets@datasets: MUSE_retain_knowmem 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | handler: rouge 7 | rouge_type: rougeL_f1 8 | batch_size: 16 9 | datasets: 10 | MUSE_retain_knowmem: 11 | args: 12 | hf_args: 13 | path: muse-bench/MUSE-${eval.muse.data_split} 14 | few_shot_dataset_hf_args: 15 | path: muse-bench/MUSE-${eval.muse.data_split} 16 | predict_with_generate: True 17 | collators: 18 | DataCollatorForSupervisedDataset: 19 | args: 20 | padding_side: left 21 | generation_args: 22 | max_new_tokens: 32 23 | stopwords: 
["\n\n", "\nQuestion", "Question:"] 24 | -------------------------------------------------------------------------------- /configs/eval/tofu.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu 2 | # NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/ 3 | 4 | defaults: # include all defined metrics files 5 | - tofu_metrics: # When you import a metric here, its configuration automatically populates the 6 | # metric key below, enabled by the @package directive at the top of each configuration file. 7 | - forget_quality 8 | - forget_Q_A_Prob 9 | - forget_Q_A_ROUGE 10 | - model_utility # populated in the metrics key as metrics.model_utility 11 | - privleak 12 | - extraction_strength 13 | # - exact_memorization 14 | # - mia_min_k_plus_plus 15 | # - mia_min_k 16 | # - mia_loss 17 | # - mia_zlib 18 | # - mia_gradnorm 19 | # - mia_reference # set reference model path appropriately 20 | # - forget_Q_A_gibberish 21 | 22 | handler: TOFUEvaluator 23 | output_dir: ${paths.output_dir} # set to default eval directory 24 | metrics: {} # lists a mapping from each evaluation metric to its config 25 | # populated through the first (@package) line in each metric config 26 | overwrite: false 27 | forget_split: forget10 28 | holdout_split: holdout10 29 | retain_logs_path: null 30 | question_key: "question" # Specifies which key to use during forget and retain evaluations (e.g., "question" or "paraphrased_question") 31 | batch_size: 32 -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/exact_memorization.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.exact_memorization 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_forget 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | 
# ^ get default dataset and generation config information 6 | 7 | handler: exact_memorization 8 | batch_size: ${eval.tofu.batch_size} 9 | 10 | datasets: 11 | TOFU_QA_forget: 12 | args: 13 | hf_args: 14 | name: ${eval.tofu.forget_split}_perturbed 15 | question_key: ${eval.tofu.question_key} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/extraction_strength.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.extraction_strength 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_forget 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: extraction_strength 8 | batch_size: ${eval.tofu.batch_size} 9 | 10 | datasets: 11 | TOFU_QA_forget: 12 | args: 13 | hf_args: 14 | name: ${eval.tofu.forget_split}_perturbed 15 | question_key: ${eval.tofu.question_key} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/forget_Q_A_PARA_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_Q_A_PARA_Prob 2 | 3 | defaults: 4 | - ../../data/datasets@datasets: TOFU_QA_forget_para 5 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 6 | # ^ get default dataset and generation config information 7 | 8 | handler: probability 9 | batch_size: ${eval.tofu.batch_size} 10 | 11 | datasets: 12 | TOFU_QA_forget_para: 13 | args: 14 | hf_args: 15 | name: ${eval.tofu.forget_split}_perturbed 16 | question_key: ${eval.tofu.question_key} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/forget_Q_A_PARA_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_Q_A_PARA_ROUGE 2 | defaults: 3 | - 
../../data/datasets@datasets: TOFU_QA_forget_para 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | 7 | # ^ get default dataset and generation config information 8 | 9 | handler: rouge 10 | rouge_type: rougeL_recall 11 | batch_size: ${eval.tofu.batch_size} 12 | 13 | datasets: # override as needed 14 | TOFU_QA_forget_para: 15 | args: 16 | hf_args: 17 | name: ${eval.tofu.forget_split}_perturbed 18 | question_key: ${eval.tofu.question_key} 19 | predict_with_generate: True 20 | collators: 21 | DataCollatorForSupervisedDataset: 22 | args: 23 | padding_side: left 24 | -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/forget_Q_A_PERT_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_Q_A_PERT_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_forget_pert 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} 9 | 10 | datasets: 11 | TOFU_QA_forget_pert: 12 | args: 13 | hf_args: 14 | name: ${eval.tofu.forget_split}_perturbed 15 | question_key: ${eval.tofu.question_key} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/forget_Q_A_PERT_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_Q_A_PERT_ROUGE 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_forget_pert 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | # ^ get default dataset and generation config information 7 | 8 | handler: rouge 9 | rouge_type: rougeL_recall 10 | batch_size: ${eval.tofu.batch_size} 11 | 12 | datasets: # 
override as needed 13 | TOFU_QA_forget_pert: 14 | args: 15 | hf_args: 16 | name: ${eval.tofu.forget_split}_perturbed 17 | question_key: ${eval.tofu.question_key} 18 | predict_with_generate: True 19 | collators: 20 | DataCollatorForSupervisedDataset: 21 | args: 22 | padding_side: left -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/forget_Q_A_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_Q_A_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_forget 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} 9 | 10 | datasets: 11 | TOFU_QA_forget: 12 | args: 13 | hf_args: 14 | name: ${eval.tofu.forget_split}_perturbed 15 | question_key: ${eval.tofu.question_key} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/forget_Q_A_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_Q_A_ROUGE 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_forget 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | 7 | # ^ get default dataset and generation config information 8 | 9 | handler: rouge 10 | rouge_type: rougeL_recall 11 | batch_size: ${eval.tofu.batch_size} 12 | 13 | datasets: # override as needed 14 | TOFU_QA_forget: 15 | args: 16 | hf_args: 17 | name: ${eval.tofu.forget_split}_perturbed 18 | question_key: ${eval.tofu.question_key} 19 | predict_with_generate: True 20 | collators: 21 | DataCollatorForSupervisedDataset: 22 | args: 23 | padding_side: left -------------------------------------------------------------------------------- 
/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_Q_A_gibberish 2 | defaults: 3 | - .@pre_compute.forget_Q_A_ROUGE: forget_Q_A_ROUGE 4 | 5 | pre_compute: 6 | forget_Q_A_ROUGE: 7 | access_key: text 8 | 9 | handler: classifier_prob 10 | batch_size: 32 11 | max_length: 32 12 | class_id: 0 13 | text_key: generation 14 | device: cuda 15 | 16 | classifier_model_args: 17 | pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" 18 | 19 | classifier_tokenization_args: 20 | pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" 21 | -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/forget_Truth_Ratio.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_truth_ratio 2 | defaults: 3 | - .@pre_compute.forget_Q_A_PARA_Prob: forget_Q_A_PARA_Prob 4 | - .@pre_compute.forget_Q_A_PERT_Prob: forget_Q_A_PERT_Prob 5 | 6 | pre_compute: 7 | forget_Q_A_PARA_Prob: 8 | access_key: correct 9 | forget_Q_A_PERT_Prob: 10 | access_key: wrong 11 | 12 | handler: truth_ratio 13 | aggregator: closer_to_1_better 14 | -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/forget_quality.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.forget_quality 2 | defaults: 3 | - .@pre_compute.forget_truth_ratio: forget_Truth_Ratio 4 | 5 | reference_logs: 6 | retain_model_logs: 7 | path: ${eval.tofu.retain_logs_path} 8 | include: 9 | forget_truth_ratio: 10 | access_key: retain 11 | 12 | pre_compute: 13 | forget_truth_ratio: 14 | access_key: forget 15 | 16 | handler: ks_test -------------------------------------------------------------------------------- 
/configs/eval/tofu_metrics/mia_gradnorm.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.mia_gradnorm 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | 6 | handler: mia_gradnorm 7 | batch_size: 1 8 | p: 2 9 | 10 | datasets: 11 | TOFU_QA_forget: 12 | args: 13 | hf_args: 14 | name: ${eval.tofu.forget_split}_perturbed 15 | question_key: ${eval.tofu.question_key} 16 | TOFU_QA_holdout: 17 | args: 18 | hf_args: 19 | name: ${eval.tofu.holdout_split} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/mia_loss.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.mia_loss 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | batch_size: ${eval.tofu.batch_size} 6 | handler: mia_loss 7 | 8 | datasets: 9 | TOFU_QA_forget: 10 | args: 11 | hf_args: 12 | name: ${eval.tofu.forget_split}_perturbed 13 | question_key: ${eval.tofu.question_key} 14 | TOFU_QA_holdout: 15 | args: 16 | hf_args: 17 | name: ${eval.tofu.holdout_split} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/mia_min_k.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.mia_min_k 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | batch_size: ${eval.tofu.batch_size} 6 | handler: mia_min_k 7 | k: 0.4 8 | 9 | datasets: 10 | TOFU_QA_forget: 11 | args: 12 | hf_args: 13 | name: ${eval.tofu.forget_split}_perturbed 14 | question_key: ${eval.tofu.question_key} 15 | TOFU_QA_holdout: 16 | args: 17 | hf_args: 18 | name: ${eval.tofu.holdout_split} 
-------------------------------------------------------------------------------- /configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.mia_min_k_plus_plus 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | batch_size: ${eval.tofu.batch_size} 6 | k: 0.4 7 | handler: mia_min_k_plus_plus 8 | 9 | datasets: 10 | TOFU_QA_forget: 11 | args: 12 | hf_args: 13 | name: ${eval.tofu.forget_split}_perturbed 14 | question_key: ${eval.tofu.question_key} 15 | TOFU_QA_holdout: 16 | args: 17 | hf_args: 18 | name: ${eval.tofu.holdout_split} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/mia_reference.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.mia_reference 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | batch_size: ${eval.tofu.batch_size} 6 | handler: mia_reference 7 | reference_model_path: ??? 
# modify appropriately for example open-unlearning/tofu_Llama-3.2-1B-Instruct_retain90 8 | 9 | datasets: 10 | TOFU_QA_forget: 11 | args: 12 | hf_args: 13 | name: ${eval.tofu.forget_split}_perturbed 14 | question_key: ${eval.tofu.question_key} 15 | TOFU_QA_holdout: 16 | args: 17 | hf_args: 18 | name: ${eval.tofu.holdout_split} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/mia_zlib.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.mia_zlib 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_MIA 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | batch_size: ${eval.tofu.batch_size} 6 | handler: mia_zlib 7 | 8 | datasets: 9 | TOFU_QA_forget: 10 | args: 11 | hf_args: 12 | name: ${eval.tofu.forget_split}_perturbed 13 | question_key: ${eval.tofu.question_key} 14 | TOFU_QA_holdout: 15 | args: 16 | hf_args: 17 | name: ${eval.tofu.holdout_split} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/model_utility.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.model_utility 2 | defaults: 3 | - .@pre_compute.retain_Q_A_Prob: retain_Q_A_Prob 4 | - .@pre_compute.retain_Q_A_ROUGE: retain_Q_A_ROUGE 5 | - .@pre_compute.retain_Truth_Ratio: retain_Truth_Ratio 6 | - .@pre_compute.ra_Q_A_Prob_normalised: ra_Q_A_Prob_normalised 7 | - .@pre_compute.ra_Q_A_ROUGE: ra_Q_A_ROUGE 8 | - .@pre_compute.ra_Truth_Ratio: ra_Truth_Ratio 9 | - .@pre_compute.wf_Q_A_Prob_normalised: wf_Q_A_Prob_normalised 10 | - .@pre_compute.wf_Q_A_ROUGE: wf_Q_A_ROUGE 11 | - .@pre_compute.wf_Truth_Ratio: wf_Truth_Ratio 12 | 13 | handler: hm_aggregate -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/privleak.yaml: 
-------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.privleak 2 | defaults: 3 | - .@pre_compute.mia_min_k: mia_min_k 4 | 5 | pre_compute: 6 | mia_min_k: 7 | access_key: forget 8 | 9 | reference_logs: 10 | retain_model_logs: 11 | path: ${eval.tofu.retain_logs_path} 12 | include: 13 | mia_min_k: 14 | access_key: retain 15 | 16 | handler: privleak 17 | ref_value: 0.5 18 | -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/ra_Q_A_PERT_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.ra_Q_A_PERT_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_ra_pert 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/ra_Q_A_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.ra_Q_A_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_ra 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/ra_Q_A_Prob_normalised.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.ra_Q_A_Prob_normalised 2 | defaults: 3 | - .@pre_compute.ra_Q_A_Prob: ra_Q_A_Prob 4 | - .@pre_compute.ra_Q_A_PERT_Prob: ra_Q_A_PERT_Prob 5 | 6 | pre_compute: 7 | ra_Q_A_Prob: 8 | access_key: correct 9 | ra_Q_A_PERT_Prob: 10 | access_key: wrong 11 | 12 | handler: 
probability_w_options -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/ra_Q_A_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.ra_Q_A_ROUGE 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_ra 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | 7 | # ^ get default dataset and generation config information 8 | 9 | handler: rouge 10 | rouge_type: rougeL_recall 11 | batch_size: ${eval.tofu.batch_size} 12 | datasets: # override as needed 13 | TOFU_QA_ra: 14 | args: 15 | predict_with_generate: True 16 | collators: 17 | DataCollatorForSupervisedDataset: 18 | args: 19 | padding_side: left -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/ra_Truth_Ratio.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.ra_Truth_Ratio 2 | defaults: 3 | - .@pre_compute.ra_Q_A_Prob: ra_Q_A_Prob 4 | - .@pre_compute.ra_Q_A_PERT_Prob: ra_Q_A_PERT_Prob 5 | 6 | pre_compute: 7 | ra_Q_A_Prob: 8 | access_key: correct 9 | ra_Q_A_PERT_Prob: 10 | access_key: wrong 11 | 12 | handler: truth_ratio 13 | aggregator: true_better 14 | -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/retain_Q_A_PARA_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.retain_Q_A_PARA_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_retain_para 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} 9 | 10 | datasets: 11 | TOFU_QA_retain_para: 12 | args: 13 | question_key: ${eval.tofu.question_key} 
-------------------------------------------------------------------------------- /configs/eval/tofu_metrics/retain_Q_A_PERT_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.retain_Q_A_PERT_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_retain_pert 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} 9 | 10 | datasets: 11 | TOFU_QA_retain_pert: 12 | args: 13 | question_key: ${eval.tofu.question_key} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/retain_Q_A_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.retain_Q_A_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_retain_eval 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} 9 | 10 | datasets: 11 | TOFU_QA_retain_eval: 12 | args: 13 | question_key: ${eval.tofu.question_key} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/retain_Q_A_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.retain_Q_A_ROUGE 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_retain_eval 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | 7 | # ^ get default dataset and generation config information 8 | 9 | handler: rouge 10 | rouge_type: rougeL_recall 11 | batch_size: ${eval.tofu.batch_size} 12 | datasets: # override as needed 13 | TOFU_QA_retain_eval: 14 | args: 15 | question_key: 
${eval.tofu.question_key} 16 | predict_with_generate: True 17 | collators: 18 | DataCollatorForSupervisedDataset: 19 | args: 20 | padding_side: left -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/retain_Truth_Ratio.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.retain_Truth_Ratio 2 | defaults: 3 | - .@pre_compute.retain_Q_A_PARA_Prob: retain_Q_A_PARA_Prob 4 | - .@pre_compute.retain_Q_A_PERT_Prob: retain_Q_A_PERT_Prob 5 | 6 | pre_compute: 7 | retain_Q_A_PARA_Prob: 8 | access_key: correct 9 | retain_Q_A_PERT_Prob: 10 | access_key: wrong 11 | 12 | handler: truth_ratio 13 | aggregator: true_better 14 | -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/wf_Q_A_PERT_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.wf_Q_A_PERT_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_wf_pert 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/wf_Q_A_Prob.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.wf_Q_A_Prob 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_wf 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | # ^ get default dataset and generation config information 6 | 7 | handler: probability 8 | batch_size: ${eval.tofu.batch_size} -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/wf_Q_A_Prob_normalised.yaml: 
-------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.wf_Q_A_Prob_normalised 2 | defaults: 3 | - .@pre_compute.wf_Q_A_Prob: wf_Q_A_Prob 4 | - .@pre_compute.wf_Q_A_PERT_Prob: wf_Q_A_PERT_Prob 5 | 6 | pre_compute: 7 | wf_Q_A_Prob: 8 | access_key: correct 9 | wf_Q_A_PERT_Prob: 10 | access_key: wrong 11 | 12 | handler: probability_w_options -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/wf_Q_A_ROUGE.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.wf_Q_A_ROUGE 2 | defaults: 3 | - ../../data/datasets@datasets: TOFU_QA_wf 4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex 5 | - ../../generation@generation_args: default 6 | 7 | # ^ get default dataset and generation config information 8 | 9 | handler: rouge 10 | rouge_type: rougeL_recall 11 | batch_size: ${eval.tofu.batch_size} 12 | datasets: # override as needed 13 | TOFU_QA_wf: 14 | args: 15 | predict_with_generate: True 16 | collators: 17 | DataCollatorForSupervisedDataset: 18 | args: 19 | padding_side: left -------------------------------------------------------------------------------- /configs/eval/tofu_metrics/wf_Truth_Ratio.yaml: -------------------------------------------------------------------------------- 1 | # @package eval.tofu.metrics.wf_Truth_Ratio 2 | defaults: 3 | - .@pre_compute.wf_Q_A_Prob: wf_Q_A_Prob 4 | - .@pre_compute.wf_Q_A_PERT_Prob: wf_Q_A_PERT_Prob 5 | 6 | pre_compute: 7 | wf_Q_A_Prob: 8 | access_key: correct 9 | wf_Q_A_PERT_Prob: 10 | access_key: wrong 11 | 12 | handler: truth_ratio 13 | aggregator: true_better 14 | -------------------------------------------------------------------------------- /configs/experiment/eval/muse/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: 
Llama-2-7b-hf 5 | - override /eval: muse 6 | 7 | data_split: News 8 | retain_logs_path: null 9 | 10 | model: 11 | model_args: 12 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target 13 | 14 | eval: 15 | muse: 16 | data_split: ${data_split} 17 | retain_logs_path: ${retain_logs_path} 18 | 19 | task_name: ??? -------------------------------------------------------------------------------- /configs/experiment/eval/tofu/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: Llama-3.2-1B-Instruct 5 | - override /eval: tofu 6 | 7 | forget_split: forget10 8 | holdout_split: holdout10 9 | retain_logs_path: null 10 | 11 | model: 12 | model_args: 13 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full 14 | 15 | eval: 16 | tofu: 17 | forget_split: ${forget_split} 18 | holdout_split: ${holdout_split} 19 | retain_logs_path: ${retain_logs_path} 20 | 21 | task_name: ??? -------------------------------------------------------------------------------- /configs/experiment/eval/wmdp/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: zephyr-7b-beta 5 | - override /eval: lm_eval 6 | 7 | data_split: cyber 8 | 9 | eval: 10 | lm_eval: 11 | tasks: 12 | - wmdp_${data_split} 13 | - mmlu 14 | 15 | task_name: ??? 
-------------------------------------------------------------------------------- /configs/experiment/examples/tofu_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | model_args: 3 | device_map: cuda 4 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full 5 | attn_implementation: flash_attention_2 6 | torch_dtype: bfloat16 7 | tokenizer_args: 8 | pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct 9 | template_args: 10 | apply_chat_template: true 11 | system_prompt: You are a helpful assistant. 12 | system_prompt_with_special_tokens: '<|begin_of_text|><|start_header_id|>system<|end_header_id|> 13 | 14 | 15 | You are a helpful assistant.<|eot_id|>' 16 | user_start_tag: '<|start_header_id|>user<|end_header_id|> 17 | 18 | 19 | ' 20 | user_end_tag: <|eot_id|> 21 | asst_start_tag: '<|start_header_id|>assistant<|end_header_id|> 22 | 23 | 24 | ' 25 | asst_end_tag: <|eot_id|> 26 | mode: eval 27 | task_name: SAMPLE_EVAL 28 | seed: 0 29 | eval: 30 | tofu: 31 | metrics: 32 | forget_quality: 33 | pre_compute: 34 | forget_truth_ratio: 35 | pre_compute: 36 | forget_Q_A_PARA_Prob: 37 | datasets: 38 | TOFU_QA_forget_para: 39 | handler: QADataset 40 | args: 41 | hf_args: 42 | name: ${eval.tofu.forget_split}_perturbed 43 | split: train 44 | path: locuslab/TOFU 45 | question_key: question 46 | answer_key: paraphrased_answer 47 | max_length: 512 48 | collators: 49 | DataCollatorForSupervisedDataset: 50 | handler: DataCollatorForSupervisedDataset 51 | args: 52 | padding_side: right 53 | index: index 54 | handler: probability 55 | batch_size: 32 56 | access_key: correct 57 | forget_Q_A_PERT_Prob: 58 | datasets: 59 | TOFU_QA_forget_pert: 60 | handler: QADataset 61 | args: 62 | hf_args: 63 | name: ${eval.tofu.forget_split}_perturbed 64 | split: train 65 | path: locuslab/TOFU 66 | question_key: question 67 | answer_key: perturbed_answer 68 | max_length: 512 69 | collators: 70 | 
DataCollatorForSupervisedDataset: 71 | handler: DataCollatorForSupervisedDataset 72 | args: 73 | padding_side: right 74 | index: index 75 | handler: probability 76 | batch_size: 32 77 | access_key: wrong 78 | handler: truth_ratio 79 | aggregator: closer_to_1_better 80 | access_key: forget 81 | reference_logs: 82 | retain_model_logs: 83 | path: ${eval.tofu.retain_logs_path} 84 | include: 85 | forget_truth_ratio: 86 | access_key: retain 87 | handler: ks_test 88 | forget_Q_A_Prob: 89 | datasets: 90 | TOFU_QA_forget: 91 | handler: QADataset 92 | args: 93 | hf_args: 94 | name: ${eval.tofu.forget_split} 95 | split: train 96 | path: locuslab/TOFU 97 | question_key: question 98 | answer_key: answer 99 | max_length: 512 100 | collators: 101 | DataCollatorForSupervisedDataset: 102 | handler: DataCollatorForSupervisedDataset 103 | args: 104 | padding_side: right 105 | index: index 106 | handler: probability 107 | batch_size: 32 108 | handler: TOFUEvaluator 109 | output_dir: ${paths.output_dir} 110 | overwrite: false 111 | forget_split: ${forget_split} 112 | holdout_split: ${holdout_split} 113 | retain_logs_path: ${retain_logs_path} 114 | paths: 115 | root_dir: . 
116 | data_dir: ${paths.root_dir}/data/ 117 | datasets: ${paths.root_dir}/configs/data/datasets 118 | output_dir: ${paths.root_dir}/saves/${mode}/${task_name} 119 | work_dir: ${hydra:runtime.cwd} 120 | forget_split: forget10 121 | holdout_split: holdout10 122 | retain_logs_path: saves/eval/tofu_Llama-3.2-1B-Instruct_retain90/TOFU_EVAL.json 123 | -------------------------------------------------------------------------------- /configs/experiment/finetune/tofu/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: Llama-3.2-1B-Instruct 5 | - override /trainer: finetune 6 | - override /data/datasets@data.train: TOFU_QA_full 7 | - override /eval: tofu 8 | 9 | mode: finetune 10 | trainer: 11 | args: 12 | learning_rate: 1e-5 13 | weight_decay: 0.01 14 | warmup_epochs: 1.0 # custom parameter 15 | num_train_epochs: 5 16 | 17 | 18 | forget_split: forget10 19 | holdout_split: holdout10 20 | retain_logs_path: null 21 | 22 | eval: 23 | tofu: 24 | forget_split: ${forget_split} 25 | holdout_split: ${holdout_split} 26 | retain_logs_path: ${retain_logs_path} 27 | overwrite: true 28 | 29 | 30 | task_name: tofu_Llama-3.2-1B-Instruct_full -------------------------------------------------------------------------------- /configs/experiment/unlearn/muse/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: Llama-2-7b-hf 5 | - override /trainer: GradAscent 6 | - override /data: unlearn 7 | - override /data/datasets@data.forget: MUSE_forget 8 | - override /data/datasets@data.retain: MUSE_retain 9 | - override /eval: muse 10 | 11 | data_split: News 12 | forget_split: forget 13 | retain_split: retain1 14 | retain_logs_path: null 15 | 16 | model: 17 | model_args: 18 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target 19 | 20 | data: 21 | anchor: forget 22 | forget: 
23 | MUSE_forget: 24 | args: 25 | hf_args: 26 | split: ${forget_split} 27 | path: muse-bench/MUSE-${data_split} 28 | retain: 29 | MUSE_retain: 30 | args: 31 | hf_args: 32 | path: muse-bench/MUSE-${data_split} 33 | split: ${retain_split} 34 | 35 | 36 | eval: 37 | muse: 38 | data_split: ${data_split} 39 | retain_logs_path: ${retain_logs_path} 40 | overwrite: true 41 | 42 | trainer: 43 | args: 44 | per_device_train_batch_size: 4 45 | gradient_accumulation_steps: 8 46 | learning_rate: 1e-5 47 | num_train_epochs: 10 48 | lr_scheduler_type: constant 49 | # save_strategy: steps 50 | # save_steps: 0.5 51 | # optim: paged_adamw_32bit 52 | # optim: adamw_torch 53 | 54 | task_name: ??? 55 | -------------------------------------------------------------------------------- /configs/experiment/unlearn/muse/scalability.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: Llama-2-7b-hf 5 | - override /trainer: GradAscent 6 | - override /data: unlearn 7 | - override /data/datasets@data.forget: MUSE_forget_scal 8 | - override /data/datasets@data.retain: MUSE_retain 9 | - override /eval: muse 10 | 11 | data_split: News 12 | forget_split: forget_4 13 | retain_split: retain1 14 | retain_logs_path: null 15 | 16 | model: 17 | model_args: 18 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target 19 | 20 | data: 21 | anchor: forget 22 | forget: 23 | MUSE_forget_scal: 24 | args: 25 | hf_args: 26 | path: muse-bench/MUSE-${data_split} 27 | split: ${forget_split} 28 | retain: 29 | MUSE_retain: 30 | args: 31 | hf_args: 32 | path: muse-bench/MUSE-${data_split} 33 | split: ${retain_split} 34 | 35 | eval: 36 | muse: 37 | data_split: ${data_split} 38 | retain_logs_path: ${retain_logs_path} 39 | overwrite: true 40 | 41 | trainer: 42 | args: 43 | per_device_train_batch_size: 4 44 | gradient_accumulation_steps: 8 45 | learning_rate: 1e-5 46 | num_train_epochs: 10 47 | lr_scheduler_type: 
constant 48 | # save_strategy: steps 49 | # save_steps: 0.5 50 | # optim: paged_adamw_32bit 51 | # optim: adamw_torch 52 | 53 | task_name: ??? 54 | -------------------------------------------------------------------------------- /configs/experiment/unlearn/muse/sustainabilty.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: Llama-2-7b-hf 5 | - override /trainer: GradAscent 6 | - override /data: unlearn 7 | - override /data/datasets@data.forget: MUSE_forget_sust 8 | - override /data/datasets@data.retain: MUSE_retain 9 | - override /eval: muse 10 | 11 | data_split: News 12 | forget_split: forget_4 13 | retain_split: retain1 14 | retain_logs_path: null 15 | 16 | model: 17 | model_args: 18 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target 19 | 20 | data: 21 | anchor: forget 22 | forget: 23 | MUSE_forget_sust: 24 | args: 25 | hf_args: 26 | path: muse-bench/MUSE-${data_split} 27 | split: ${forget_split} 28 | retain: 29 | MUSE_retain: 30 | args: 31 | hf_args: 32 | path: muse-bench/MUSE-${data_split} 33 | split: ${retain_split} 34 | 35 | eval: 36 | muse: 37 | data_split: ${data_split} 38 | retain_logs_path: ${retain_logs_path} 39 | overwrite: true 40 | 41 | trainer: 42 | args: 43 | per_device_train_batch_size: 4 44 | gradient_accumulation_steps: 8 45 | learning_rate: 1e-5 46 | num_train_epochs: 10 47 | lr_scheduler_type: constant 48 | # save_strategy: steps 49 | # save_steps: 0.5 50 | # optim: paged_adamw_32bit 51 | # optim: adamw_torch 52 | 53 | task_name: ??? 
54 | -------------------------------------------------------------------------------- /configs/experiment/unlearn/tofu/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: Llama-3.2-1B-Instruct 5 | - override /trainer: GradAscent 6 | - override /data: unlearn 7 | - override /data/datasets@data.forget: TOFU_QA_forget 8 | - override /data/datasets@data.retain: TOFU_QA_retain 9 | - override /eval: tofu 10 | 11 | model: 12 | model_args: 13 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full 14 | 15 | forget_split: forget10 16 | retain_split: retain90 17 | holdout_split: holdout10 18 | retain_logs_path: null 19 | question_key: "question" 20 | 21 | eval: 22 | tofu: 23 | forget_split: ${forget_split} 24 | holdout_split: ${holdout_split} 25 | retain_logs_path: ${retain_logs_path} 26 | overwrite: true 27 | question_key: ${question_key} 28 | 29 | data: 30 | anchor: forget 31 | forget: 32 | TOFU_QA_forget: 33 | args: 34 | hf_args: 35 | name: ${forget_split} 36 | retain: 37 | TOFU_QA_retain: 38 | args: 39 | hf_args: 40 | name: ${retain_split} 41 | 42 | trainer: 43 | args: 44 | warmup_epochs: 1.0 # custom parameter 45 | learning_rate: 1e-5 46 | weight_decay: 0.01 47 | num_train_epochs: 10 48 | # save_strategy: steps 49 | # save_steps: 0.5 50 | 51 | task_name: ??? 
-------------------------------------------------------------------------------- /configs/experiment/unlearn/tofu/idk.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: Llama-3.2-3B-Instruct 5 | - override /trainer: DPO 6 | - override /data: unlearn 7 | - override /data/datasets@data.forget: TOFU_QA_forget_idk 8 | - override /data/datasets@data.retain: TOFU_QA_retain 9 | - override /eval: tofu 10 | 11 | model: 12 | model_args: 13 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full 14 | 15 | forget_split: forget10 16 | retain_split: retain90 17 | retain_logs_path: null 18 | 19 | eval: 20 | tofu: 21 | forget_split: ${forget_split} 22 | retain_logs_path: ${retain_logs_path} 23 | overwrite: true 24 | 25 | data: 26 | anchor: forget 27 | forget: 28 | TOFU_QA_forget_idk: 29 | args: 30 | hf_args: 31 | name: ${forget_split} 32 | retain: 33 | TOFU_QA_retain: 34 | args: 35 | hf_args: 36 | name: ${retain_split} 37 | 38 | trainer: 39 | args: 40 | warmup_epochs: 1.0 # custom parameter 41 | learning_rate: 1e-5 42 | weight_decay: 0.01 43 | num_train_epochs: 10 44 | # save_strategy: steps 45 | # save_steps: 0.5 46 | 47 | task_name: ??? 
48 | -------------------------------------------------------------------------------- /configs/experiment/unlearn/wmdp/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - override /model: zephyr-7b-beta 5 | - override /trainer: RMU 6 | - override /data: unlearn 7 | - override /data/datasets@data.forget: WMDP_forget 8 | - override /data/datasets@data.retain: WMDP_retain 9 | - override /eval: lm_eval 10 | 11 | data_split: cyber 12 | 13 | data: 14 | anchor: forget 15 | forget: 16 | WMDP_forget: 17 | args: 18 | hf_args: 19 | data_files: data/wmdp/wmdp-corpora/${data_split}-forget-corpus.jsonl 20 | retain: 21 | WMDP_retain: 22 | args: 23 | hf_args: 24 | data_files: data/wmdp/wmdp-corpora/${data_split}-retain-corpus.jsonl 25 | 26 | eval: 27 | lm_eval: 28 | tasks: 29 | - wmdp_${data_split} 30 | - mmlu 31 | 32 | 33 | collator: 34 | DataCollatorForSupervisedDataset: 35 | args: 36 | padding_side: left # Usually left but for mistral and zephyr its right (https://github.com/hongshi97/CAD/issues/2) 37 | 38 | trainer: 39 | args: 40 | per_device_train_batch_size: 1 41 | gradient_accumulation_steps: 16 42 | learning_rate: 5e-5 43 | eval_strategy: steps 44 | eval_steps: 0.5 45 | max_steps: 80 46 | lr_scheduler_type: constant 47 | 48 | method_args: 49 | # The params here are more dependent on model and dataset. Tune them carefully to work 50 | gamma: 1.0 51 | steering_coeff: 2 52 | retain_loss_type: EMBED_DIFF 53 | alpha: 1 54 | module_regex: model\.layers\.7 55 | trainable_params_regex: 56 | - model\.layers\.(5|6|7)\.mlp\.down_proj\.weight # If you want to update only these weights (as done in https://github.com/centerforaisafety/wmdp/blob/bc5e1ba0367ea826caeeeaa50656336a1e87acfb/rmu/unlearn.py#L26) 57 | 58 | task_name: ??? 
-------------------------------------------------------------------------------- /configs/generation/default.yaml: -------------------------------------------------------------------------------- 1 | do_sample: False 2 | top_p: null 3 | temperature: null 4 | max_new_tokens: 200 5 | use_cache: True -------------------------------------------------------------------------------- /configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | # dir: ${paths.save_dir}/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | dir: ${paths.output_dir} 12 | # sweep: 13 | # dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 14 | # subdir: ${hydra.job.num} 15 | 16 | job_logging: 17 | handlers: 18 | file: 19 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 20 | filename: ${hydra.runtime.output_dir}/${trainer.handler}.log -------------------------------------------------------------------------------- /configs/hydra/eval.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | run: 5 | # dir: ${paths.save_dir}/${now:%Y-%m-%d}_${now:%H-%M-%S} 6 | dir: ${paths.output_dir} 7 | 8 | job_logging: 9 | handlers: 10 | file: 11 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 12 | filename: ${hydra.runtime.output_dir}/eval.log -------------------------------------------------------------------------------- /configs/model/Llama-2-7b-chat-hf.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: meta-llama/Llama-2-7b-chat-hf 3 | attn_implementation: 'flash_attention_2' 4 | torch_dtype: 
bfloat16 5 | tokenizer_args: 6 | pretrained_model_name_or_path: meta-llama/Llama-2-7b-chat-hf 7 | template_args: # Used in creating prompts for the dataset. See src/data/utils.py#preprocess_chat_instance. 8 | # following https://www.reddit.com/r/LocalLLaMA/comments/1561vn5/here_is_a_practical_multiturn_llama2chat_prompt/ 9 | apply_chat_template: False 10 | user_start_tag: "[INST] " 11 | user_end_tag: " [/INST]" 12 | asst_start_tag: "" 13 | # ^the above link says this must be " ", but we observed this leads to very bad tokenization at the border which affects scores 14 | asst_end_tag: " " -------------------------------------------------------------------------------- /configs/model/Llama-2-7b-hf.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" 3 | attn_implementation: 'flash_attention_2' 4 | torch_dtype: bfloat16 5 | tokenizer_args: 6 | pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf" 7 | template_args: # Used in creating prompts for the dataset. See src/data/utils.py#preprocess_chat_instance. 8 | apply_chat_template: False 9 | user_start_tag: "Question: " 10 | user_end_tag: "\n" 11 | asst_start_tag: "Answer: " 12 | asst_end_tag: "\n\n" -------------------------------------------------------------------------------- /configs/model/Llama-3.1-8B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: "meta-llama/Llama-3.1-8B-Instruct" 3 | attn_implementation: 'flash_attention_2' 4 | torch_dtype: bfloat16 5 | tokenizer_args: 6 | pretrained_model_name_or_path: "meta-llama/Llama-3.1-8B-Instruct" 7 | template_args: 8 | apply_chat_template: True 9 | system_prompt: You are a helpful assistant. 
10 | system_prompt_with_special_tokens: "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>" 11 | user_start_tag: "<|start_header_id|>user<|end_header_id|>\n\n" 12 | user_end_tag: "<|eot_id|>" 13 | asst_start_tag: "<|start_header_id|>assistant<|end_header_id|>\n\n" 14 | asst_end_tag: "<|eot_id|>" 15 | date_string: 10 Apr 2025 -------------------------------------------------------------------------------- /configs/model/Llama-3.2-1B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: "meta-llama/Llama-3.2-1B-Instruct" 3 | attn_implementation: 'flash_attention_2' 4 | torch_dtype: bfloat16 5 | tokenizer_args: 6 | pretrained_model_name_or_path: "meta-llama/Llama-3.2-1B-Instruct" 7 | template_args: 8 | apply_chat_template: True 9 | system_prompt: You are a helpful assistant. 10 | system_prompt_with_special_tokens: "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>" 11 | user_start_tag: "<|start_header_id|>user<|end_header_id|>\n\n" 12 | user_end_tag: "<|eot_id|>" 13 | asst_start_tag: "<|start_header_id|>assistant<|end_header_id|>\n\n" 14 | asst_end_tag: "<|eot_id|>" 15 | date_string: 10 Apr 2025 -------------------------------------------------------------------------------- /configs/model/Llama-3.2-3B-Instruct.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: "meta-llama/Llama-3.2-3B-Instruct" 3 | attn_implementation: 'flash_attention_2' 4 | torch_dtype: bfloat16 5 | tokenizer_args: 6 | pretrained_model_name_or_path: "meta-llama/Llama-3.2-3B-Instruct" 7 | template_args: 8 | apply_chat_template: True 9 | system_prompt: You are a helpful assistant. 
10 | system_prompt_with_special_tokens: "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>" 11 | user_start_tag: "<|start_header_id|>user<|end_header_id|>\n\n" 12 | user_end_tag: "<|eot_id|>" 13 | asst_start_tag: "<|start_header_id|>assistant<|end_header_id|>\n\n" 14 | asst_end_tag: "<|eot_id|>" 15 | date_string: 10 Apr 2025 -------------------------------------------------------------------------------- /configs/model/Phi-3.5-mini-instruct.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: "microsoft/Phi-3.5-mini-instruct" 3 | attn_implementation: 'flash_attention_2' 4 | torch_dtype: bfloat16 5 | tokenizer_args: 6 | pretrained_model_name_or_path: "microsoft/Phi-3.5-mini-instruct" 7 | template_args: 8 | apply_chat_template: True 9 | system_prompt: You are a helpful assistant. 10 | system_prompt_with_special_tokens: "<|system|>\nYou are a helpful assistant.<|end|>\n" 11 | user_start_tag: "<|user|>\n" 12 | user_end_tag: "<|end|>\n" 13 | asst_start_tag: "<|assistant|>\n" 14 | asst_end_tag: "<|end|>\n" -------------------------------------------------------------------------------- /configs/model/gemma-7b-it.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: "google/gemma-7b-it" 3 | attn_implementation: 'flash_attention_2' 4 | torch_dtype: bfloat16 5 | tokenizer_args: 6 | pretrained_model_name_or_path: "google/gemma-7b-it" 7 | template_args: 8 | apply_chat_template: True 9 | user_start_tag: "user\n" 10 | user_end_tag: "\n" 11 | asst_start_tag: "model\n" 12 | asst_end_tag: "\n" 13 | -------------------------------------------------------------------------------- /configs/model/phi-1_5.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: "microsoft/phi-1_5" # 
"locuslab/phi-1_5" 3 | tokenizer_args: 4 | pretrained_model_name_or_path: "microsoft/phi-1_5" 5 | template_args: 6 | apply_chat_template: False 7 | user_start_tag: "Question: " 8 | user_end_tag: "\n" 9 | asst_start_tag: "Answer: " 10 | asst_end_tag: "\n\n" -------------------------------------------------------------------------------- /configs/model/zephyr-7b-beta.yaml: -------------------------------------------------------------------------------- 1 | model_args: 2 | pretrained_model_name_or_path: "HuggingFaceH4/zephyr-7b-beta" 3 | attn_implementation: 'flash_attention_2' 4 | torch_dtype: bfloat16 5 | tokenizer_args: 6 | pretrained_model_name_or_path: "HuggingFaceH4/zephyr-7b-beta" 7 | template_args: 8 | apply_chat_template: True 9 | system_prompt: You are a helpful assistant. 10 | system_prompt_with_special_tokens: "<|system|>\nYou are a helpful assistant.\n" 11 | user_start_tag: "<|user|>\n" 12 | user_end_tag: "" 13 | asst_start_tag: "<|assistant|>\n" 14 | asst_end_tag: "" 15 | date_string: 10 Apr 2025 -------------------------------------------------------------------------------- /configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | root_dir: . 
3 | 4 | # path to data directory 5 | data_dir: ${paths.root_dir}/data/ 6 | 7 | # path to dataset configs 8 | datasets: ${paths.root_dir}/configs/data/datasets 9 | 10 | # path to output directory, created dynamically by hydra 11 | # path generation pattern is specified in `configs/hydra/default.yaml` 12 | # use it to store all files generated during the run, like ckpts and metrics 13 | # save_dir: ${paths.root_dir}/saves/${mode}/${task_name} 14 | 15 | output_dir: ${paths.root_dir}/saves/${mode}/${task_name} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} -------------------------------------------------------------------------------- /configs/train.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - model: Llama-3.2-3B-Instruct 4 | - trainer: finetune 5 | - data: finetune 6 | - collator: DataCollatorForSupervisedDataset 7 | - eval: tofu 8 | - hydra: default 9 | - paths: default 10 | - experiment: null 11 | 12 | mode: train 13 | task_name: ??? 
-------------------------------------------------------------------------------- /configs/trainer/DPO.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - GradDiff 3 | 4 | handler: DPO 5 | method_args: 6 | beta: 0.1 7 | alpha: 1.0 8 | gamma: 1.0 9 | retain_loss_type: NLL -------------------------------------------------------------------------------- /configs/trainer/GradAscent.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - finetune 3 | 4 | handler: GradAscent -------------------------------------------------------------------------------- /configs/trainer/GradDiff.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - finetune 3 | 4 | handler: GradDiff 5 | method_args: 6 | gamma: 1.0 7 | alpha: 1.0 8 | retain_loss_type: NLL 9 | -------------------------------------------------------------------------------- /configs/trainer/NPO.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - GradDiff 3 | 4 | handler: NPO 5 | method_args: 6 | beta: 0.1 7 | alpha: 1.0 8 | gamma: 1.0 9 | retain_loss_type: NLL 10 | -------------------------------------------------------------------------------- /configs/trainer/RMU.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - GradDiff 3 | 4 | handler: RMU 5 | method_args: 6 | # The params here are more dependent on model and dataset. 
Tune them carefully to work 7 | gamma: 1.0 8 | steering_coeff: 2 9 | retain_loss_type: EMBED_DIFF 10 | alpha: 1 11 | module_regex: model\.layers\.7 12 | trainable_params_regex: 13 | - .* # update all parameters (as done in https://github.com/tmlr-group/G-effect/blob/ef368eea3b2c6dba1e090b9ebb021ac9f047e0ae/dataloader.py#L271) 14 | # - model\.layers\.(5|6|7)\.mlp\.down_proj\.weight # If you want to update only these weights (as done in https://github.com/centerforaisafety/wmdp/blob/bc5e1ba0367ea826caeeeaa50656336a1e87acfb/rmu/unlearn.py#L26) -------------------------------------------------------------------------------- /configs/trainer/SimNPO.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - GradDiff 3 | 4 | handler: SimNPO 5 | method_args: 6 | delta: 0.0 # gamma in https://github.com/OPTML-Group/Unlearn-Simple/blob/main/TOFU/config/forget.yaml 7 | beta: 4.5 8 | alpha: 1.0 9 | gamma: 0.125 # npo_coeff in https://github.com/OPTML-Group/Unlearn-Simple/blob/main/TOFU/config/forget.yaml 10 | retain_loss_type: NLL 11 | 12 | -------------------------------------------------------------------------------- /configs/trainer/UNDIAL.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - finetune 3 | 4 | handler: UNDIAL # corresponds to the class defined in src/trainer/unlearn/grad_diff.py 5 | args: # HuggingFace TrainingArguments 6 | learning_rate: 1e-4 7 | num_train_epochs: 10 8 | method_args: # Your own method-specific arguments 9 | gamma: 1.0 10 | alpha: 0.0 11 | beta: 10.0 # the strength of penalty for memorized tokens 12 | retain_loss_type: NLL -------------------------------------------------------------------------------- /configs/trainer/finetune.yaml: -------------------------------------------------------------------------------- 1 | handler: FinetuneTrainer 2 | args: 3 | per_device_train_batch_size: 8 4 | per_device_eval_batch_size: 16 5 | 
gradient_accumulation_steps: 4 6 | learning_rate: 1e-5 7 | bf16: True 8 | bf16_full_eval: True 9 | logging_steps: 5 10 | output_dir: ${paths.output_dir} 11 | logging_dir: ${trainer.args.output_dir}/logs 12 | report_to: tensorboard 13 | ddp_find_unused_parameters: None 14 | gradient_checkpointing: False 15 | optim: paged_adamw_32bit 16 | save_strategy: 'no' 17 | save_only_model: True 18 | weight_decay: 0.00 19 | do_train: True 20 | do_eval: True 21 | eval_on_start: True 22 | eval_strategy: epoch 23 | num_train_epochs: 10 24 | seed: 0 -------------------------------------------------------------------------------- /configs/unlearn.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model: Llama-3.2-3B-Instruct 3 | - trainer: GradAscent 4 | - data: unlearn 5 | - collator: DataCollatorForSupervisedDataset 6 | - eval: tofu 7 | - hydra: default 8 | - paths: default 9 | - experiment: null 10 | - _self_ 11 | 12 | trainer: 13 | args: 14 | remove_unused_columns: False 15 | 16 | mode: unlearn 17 | task_name: ??? -------------------------------------------------------------------------------- /docs/hydra.md: -------------------------------------------------------------------------------- 1 | ## Hydra Features 2 | 3 | The below are some important Hydra features we use for flexible composition while writing configurations to our YAML files. 
4 | 5 | We use this config file for illustration, from [`configs/experiment/unlearn/muse/default.yaml`](../configs/experiment/unlearn/muse/default.yaml): 6 | 7 | ```yaml 8 | # @package _global_ 9 | # ^ not a comment, sets the path of this config to be the the config root directory 10 | defaults: 11 | - override /model: Llama-2-7b-hf # loads from model/Llama-2-7b-hf.yaml into the model attribute 12 | - override /trainer: GradAscent # loads from trainer/GradAscent.yaml into the trainer attribute 13 | - override /data: unlearn # loads from data/unlearn.yaml into the "data" attribute,, setting up data structures for loading datasets during unlearning 14 | - override /eval: muse # loads MUSE evaluation suite from eval/muse.yaml into the eval attribute 15 | 16 | # define variables 17 | data_split: News 18 | forget_split: forget 19 | retain_split: retain1 20 | retain_logs_path: null 21 | 22 | model: 23 | model_args: 24 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target 25 | tokenizer_args: 26 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target 27 | data: 28 | anchor: forget 29 | forget: 30 | MUSE_forget: 31 | args: 32 | hf_args: 33 | split: ${forget_split} 34 | retain: 35 | MUSE_retain: 36 | args: 37 | hf_args: 38 | split: ${retain_split} 39 | 40 | eval: 41 | muse: 42 | data_split: ${data_split} 43 | retain_logs_path: ${retain_logs_path} 44 | 45 | trainer: 46 | args: 47 | per_device_train_batch_size: 4 48 | gradient_accumulation_steps: 8 49 | learning_rate: 1e-5 50 | num_train_epochs: 10 51 | lr_scheduler_type: constant 52 | # save_strategy: steps 53 | # save_steps: 0.5 54 | # optim: paged_adamw_32bit 55 | # optim: adamw_torch 56 | 57 | task_name: ??? # ??? raises and error if this attribute is not set 58 | ``` 59 | 60 | - **Structure & Attribute Access:** Configs are written in YAML and structured hierarchically like a dictionary. 
Attributes are accessed using dot notation: In code `cfg.model.args.learning_rate`, in command-line: `model.args.learning_rate=1e-5`. 61 | 62 | - **Defaults & Overrides:** Configs are files are included in one another using `defaults` and `override` commands. 63 | 64 | - **Command-Line Overrides:** Any parameter can be overridden directly from the command line. For instance: 65 | ```bash 66 | python src/train.py --config-name=unlearn.yaml experiment=unlearn/muse/default \ 67 | trainer.args.num_train_epochs=50 data_split=Books trainer=SimNPO trainer.method_args.beta=3 \ 68 | task_name=unlearn_muse_simnpo 69 | ``` 70 | 71 | - **Package Directives:** The `# @package` directive organizes configurations into namespaces for cleaner composition and specifies the configuration path. At the head of a YAML file, you might see directives like `# @package _global_` or more specific ones such as `# @package eval.muse.metrics.forget_knowmem_ROUGE` which inform Hydra exactly where the configuration parameters should be placed within the final composed config. 72 | 73 | For example, refer [`configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml`](../configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml) 74 | 75 | - **Variable Substitution:** Variables are defined once and reused using the `${}` syntax. 76 | 77 | - **Adding New Attributes with `+`:** Use the `+` prefix to add attributes that are not already in the config. For example, to add a new argument to the trainer: 78 | ```bash 79 | python src/train.py experiment=unlearn/muse/default +trainer.args.my_new_arg=10 80 | ``` 81 | 82 | - **Attribute Removal with `~`:** You can remove an attribute from the config at runtime using the tilde `~`. 
For example, to remove flash attention setting: 83 | ```bash 84 | python src/train.py experiment=unlearn/muse/default ~model.model_args.attn_implementation 85 | ``` 86 | > [!NOTE] 87 | > In `zsh`, you must **quote** or **escape** the `~` to avoid it being misinterpreted as a home directory: e.g.: 88 | ```bash 89 | python src/train.py \~model.model_args.attn_implementation 90 | python src/train.py "~model.model_args.attn_implementation" 91 | ``` 92 | > [!NOTE] 93 | > Hydra uses PyYAML to handle yaml files and transform inputs while giving config inputs. This handles cases like converting `true` to `True` 94 | 95 | Refer to the following for config structures and overridable parameters: 96 | - Evaluation: [`configs/experiment/examples/tofu_eval.yaml`](../configs/experiment/examples/tofu_eval.yaml) 97 | - Unlearning: [`configs/experiment/examples/muse_unlearn.yaml`](../configs/experiment/examples/muse_unlearn.yaml) -------------------------------------------------------------------------------- /docs/links.md: -------------------------------------------------------------------------------- 1 | # 🔗 Links and References 2 | 3 | Links to research papers and resources corresponding to implemented features in this repository. Please feel free to fill in any missing references! 4 | 5 | --- 6 | 7 | ## 📌 Table of Contents 8 | - [🔗 Links and References](#-links-and-references) 9 | - [📌 Table of Contents](#-table-of-contents) 10 | - [📗 Implemented Methods](#-implemented-methods) 11 | - [📘 Benchmarks](#-benchmarks) 12 | - [📙 Evaluation Metrics](#-evaluation-metrics) 13 | - [🌐 Useful Links](#-useful-links) 14 | - [📚 Surveys](#-surveys) 15 | - [🐙 Other GitHub Repositories](#-other-github-repositories) 16 | 17 | --- 18 | 19 | ## 📗 Implemented Methods 20 | 21 | | Method | Resource | 22 | |-----------------|----------| 23 | | GradAscent, GradDiff | Naive baselines found in many papers including MUSE, TOFU etc. 
| 24 | | NPO | Paper [📄](https://arxiv.org/abs/2404.05868), Code [🐙](https://github.com/licong-lin/negative-preference-optimization) | 25 | | SimNPO | Paper [📄](https://arxiv.org/abs/2410.07163), Code [🐙](https://github.com/OPTML-Group/Unlearn-Simple) | 26 | | IdkDPO | TOFU ([📄](https://arxiv.org/abs/2401.06121)) | 27 | | RMU | WMDP paper ([🐙](https://github.com/centerforaisafety/wmdp/tree/main/rmu), [🌐](https://www.wmdp.ai/)), later used in G-effect ([🐙](https://github.com/tmlr-group/G-effect/blob/main/dataloader.py)) | 28 | | UNDIAL | Paper [📄](https://arxiv.org/pdf/2402.10052), Code [🐙](https://github.com/dong-river/LLM_unlearning/tree/main) | 29 | | AltPO | Paper [📄](https://arxiv.org/pdf/2409.13474), Code [🐙](https://github.com/molereddy/Alternate-Preference-Optimization) | 30 | 31 | --- 32 | 33 | ## 📘 Benchmarks 34 | 35 | | Benchmark | Resource | 36 | |-----------|----------| 37 | | TOFU | Paper [📄](https://arxiv.org/abs/2401.06121) | 38 | | MUSE | Paper [📄](https://arxiv.org/abs/2407.06460) | 39 | | WMDP | Paper [📄](https://arxiv.org/abs/2403.03218) | 40 | 41 | --- 42 | 43 | ## 📙 Evaluation Metrics 44 | 45 | | Metric | Resource | 46 | |--------|----------| 47 | | Verbatim Probability / ROUGE, simple QA-ROUGE | Naive metrics found in many papers including MUSE, TOFU etc. 
| 48 | | Membership Inference Attacks (LOSS, ZLib, Reference, GradNorm, MinK, MinK++) | MIMIR ([🐙](https://github.com/iamgroot42/mimir)), MUSE ([📄](https://arxiv.org/abs/2407.06460)) | 49 | | PrivLeak | MUSE ([📄](https://arxiv.org/abs/2407.06460)) | 50 | | Forget Quality, Truth Ratio, Model Utility | TOFU ([📄](https://arxiv.org/abs/2401.06121)) | 51 | | Extraction Strength (ES) | Carlini et al., 2021 ([📄](https://www.usenix.org/conference/usenixsecurity21/presentation/carlini-extracting)), used for unlearning in Wang et al., 2025 ([📄](https://openreview.net/pdf?id=wUtCieKuQU)) | 52 | | Exact Memorization (EM) | Tirumala et al., 2022 ([📄](https://proceedings.neurips.cc/paper_files/paper/2022/hash/fa0509f4dab6807e2cb465715bf2d249-Abstract-Conference.html)), used for unlearning in Wang et al., 2025 ([📄](https://openreview.net/pdf?id=wUtCieKuQU)) | 53 | | lm-evaluation-harness | [💻](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) | 54 | 55 | --- 56 | 57 | ## 🌐 Useful Links 58 | 59 | ### 📚 Surveys 60 | - [Machine Unlearning in 2024](https://ai.stanford.edu/~kzliu/blog/unlearning) 61 | - [Rethinking Machine Unlearning for Large Language Models](https://arxiv.org/abs/2402.08787) 62 | 63 | ### 🐙 Other GitHub Repositories 64 | - [TOFU Benchmark (original)](https://github.com/locuslab/tofu) 65 | - [MUSE Benchmark (original)](https://github.com/swj0419/muse_bench) 66 | - [Awesome LLM Unlearning](https://github.com/chrisliu298/awesome-llm-unlearning) 67 | - [Awesome Machine Unlearning](https://github.com/tamlhp/awesome-machine-unlearning) 68 | - [Awesome GenAI Unlearning](https://github.com/franciscoliu/Awesome-GenAI-Unlearning) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface-hub==0.29.1 2 | transformers==4.45.1 3 | numpy==2.2.3 4 | hydra-core==1.3 5 | hydra_colorlog==1.2.0 6 | torch==2.4.1 7 | datasets==3.0.1 8 | 
accelerate==0.34.2
bitsandbytes==0.44.1
rouge-score==0.1.2
scipy==1.14.1
tensorboard==2.18.0
scikit-learn==1.5.2
deepspeed==0.15.4
--------------------------------------------------------------------------------
/scripts/muse_unlearn.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run MUSE unlearning + evaluation for every (data split, trainer) pair,
# then the scalability and sustainability variants of the benchmark.

# Grab a free TCP port so concurrent distributed jobs do not collide.
export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "Master Port: $MASTER_PORT"


per_device_train_batch_size=4
gradient_accumulation_steps=8


model=Llama-2-7b-hf

data_splits=(
    "News"
    "Books"
)

trainers=(
    "GradAscent"
    "GradDiff"
    "NPO"
    "SimNPO"
)

# #########################################################
# #################### MUSE Unlearning ####################
# #########################################################


for data_split in "${data_splits[@]}"; do
    for trainer in "${trainers[@]}"; do

        task_name=muse_${model}_${data_split}_${trainer}

        # Unlearn on 2 GPUs, then evaluate the saved checkpoint on 1 GPU.
        CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
        src/train.py --config-name=unlearn.yaml \
        experiment=unlearn/muse/default.yaml \
        model=${model} \
        data_split=${data_split} \
        trainer=${trainer} \
        task_name=${task_name} \
        retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json \
        trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
        trainer.args.gradient_accumulation_steps=${gradient_accumulation_steps} \
        trainer.args.ddp_find_unused_parameters=true \
        trainer.args.gradient_checkpointing=true

        # NOTE(review): output_dir is shared across data splits, so later splits
        # overwrite earlier eval logs for the same trainer -- confirm intended.
        CUDA_VISIBLE_DEVICES=0 python src/eval.py \
        experiment=eval/muse/default.yaml \
        data_split=${data_split} \
        task_name=${task_name} \
        model=${model} \
        model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
        paths.output_dir=saves/unlearn/${trainer}/evals \
        retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json
    done
done



# #########################################################
# ############### MUSE Unlearning Scalability #############
# #########################################################


for data_split in "${data_splits[@]}"; do
    for trainer in "${trainers[@]}"; do
        for scal in "forget_1" "forget_2" "forget_3" "forget_4"; do

            # FIX: a stray trailing "\" here previously spliced this assignment
            # onto the accelerate command below, turning task_name into a
            # per-command environment variable and leaving it empty/stale for
            # the eval step that follows.
            task_name=muse_${model}_${data_split}_${trainer}_scal_${scal}

            CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
            src/train.py --config-name=unlearn.yaml \
            experiment=unlearn/muse/scalability.yaml \
            model=${model} \
            data_split=${data_split} \
            forget_split=${scal} \
            trainer=${trainer} \
            task_name=${task_name} \
            retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json \
            trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
            trainer.args.gradient_accumulation_steps=${gradient_accumulation_steps} \
            trainer.args.ddp_find_unused_parameters=true \
            trainer.args.gradient_checkpointing=true

            CUDA_VISIBLE_DEVICES=0 python src/eval.py \
            experiment=eval/muse/default.yaml \
            data_split=${data_split} \
            task_name=${task_name} \
            model=${model} \
            model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
            paths.output_dir=saves/unlearn/${trainer}/evals \
            retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json
        done
    done
done



#########################################################
############# MUSE Unlearning sustainability ############
#########################################################


for data_split in "${data_splits[@]}"; do
    for trainer in "${trainers[@]}"; do
        # Sequential unlearning: each sust step starts from the previous
        # step's checkpoint (model_path is updated at the end of the loop).
        model_path=muse-bench/MUSE-${data_split}_target
        for sust in "forget_1" "forget_2" "forget_3" "forget_4"; do

            task_name=muse_${model}_${data_split}_${trainer}_sust_${sust}

            CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
            src/train.py --config-name=unlearn.yaml \
            experiment=unlearn/muse/sustainabilty.yaml \
            model=${model} \
            model.model_args.pretrained_model_name_or_path=${model_path} \
            data_split=${data_split} \
            trainer=${trainer} \
            task_name=${task_name} \
            retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json \
            trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
            trainer.args.gradient_accumulation_steps=${gradient_accumulation_steps} \
            trainer.args.ddp_find_unused_parameters=true \
            trainer.args.gradient_checkpointing=true

            CUDA_VISIBLE_DEVICES=0 python src/eval.py \
            experiment=eval/muse/default.yaml \
            data_split=${data_split} \
            task_name=${task_name} \
            model=${model} \
            model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
            paths.output_dir=saves/unlearn/${trainer}/evals \
            retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json

            model_path=saves/unlearn/${task_name}
        done
    done
done
--------------------------------------------------------------------------------
/scripts/tofu_finetune.sh:
--------------------------------------------------------------------------------
#!/bin/bash

export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "Master Port: $MASTER_PORT"

models=(
    "Llama-3.2-1B-Instruct"
    "Llama-3.2-3B-Instruct"
    "Llama-3.1-8B-Instruct"
)
# FIX: this variable was defined but never used -- both launch commands below
# hard-coded "4"; they now reference it so changing it here takes effect.
# NOTE(review): the old comment claimed gradient_accumulation_steps=8, but no
# accumulation override is passed to the trainer -- confirm the trainer default.
per_device_train_batch_size=4

splits=(
    "forget01 holdout01 retain99"
    "forget05 holdout05 retain95"
    "forget10 holdout10 retain90"
)



########################################################################################################################
########################################### RETAIN Finetuned TOFU ######################################################
########################################################################################################################

for split in "${splits[@]}"; do
    # Split the "forget holdout retain" triple into its three fields.
    forget_split=$(echo $split | cut -d' ' -f1)
    holdout_split=$(echo $split | cut -d' ' -f2)
    retain_split=$(echo $split | cut -d' ' -f3)

    for model in "${models[@]}"; do
        # Finetune on the retain split only, then evaluate the checkpoint.
        CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
        src/train.py experiment=finetune/tofu/default.yaml \
        task_name=tofu_${model}_${retain_split} \
        model=${model} \
        data/datasets@data.train=TOFU_QA_retain \
        data.train.TOFU_QA_retain.args.hf_args.name=${retain_split} \
        trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
        trainer.args.ddp_find_unused_parameters=true \
        trainer.args.gradient_checkpointing=true


        CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/tofu/default.yaml \
        forget_split=${forget_split} \
        holdout_split=${holdout_split} \
        task_name=tofu_${model}_${retain_split} \
        model=${model} \
        model.model_args.pretrained_model_name_or_path=saves/finetune/tofu_${model}_${retain_split}
    done
done


# ########################################################################################################################
# ########################################### FULL Finetuned TOFU models #################################################
# ########################################################################################################################


for model in "${models[@]}"; do
    CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
    src/train.py experiment=finetune/tofu/default.yaml \
    task_name=tofu_${model}_full \
    model=${model} \
    data/datasets@data.train=TOFU_QA_full \
    data.train.TOFU_QA_full.args.hf_args.name=full \
    trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
    trainer.args.ddp_find_unused_parameters=true \
    trainer.args.gradient_checkpointing=true

    # Evaluate the full models on each forget split
    for split in "${splits[@]}"; do
        forget_split=$(echo $split | cut -d' ' -f1)
        holdout_split=$(echo $split | cut -d' ' -f2)
        retain_split=$(echo $split | cut -d' ' -f3)

        CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/tofu/default.yaml \
        forget_split=${forget_split} \
        holdout_split=${holdout_split} \
        task_name=tofu_${model}_full_${forget_split} \
        model=${model} \
        model.model_args.pretrained_model_name_or_path=saves/finetune/tofu_${model}_full \
        retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \
        paths.output_dir=saves/eval/tofu_${model}_full/evals_${forget_split}
    done
done
--------------------------------------------------------------------------------
/scripts/tofu_unlearn.sh:
--------------------------------------------------------------------------------
#!/bin/bash


export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "Master Port: $MASTER_PORT"

models=(
    "Llama-3.2-1B-Instruct"
    "Llama-3.2-3B-Instruct"
"Llama-3.1-8B-Instruct" 11 | ) 12 | trainers_experiments=( 13 | "GradAscent unlearn/tofu/default.yaml" 14 | "GradDiff unlearn/tofu/default.yaml" 15 | "NPO unlearn/tofu/default.yaml" 16 | "DPO unlearn/tofu/idk.yaml" 17 | "RMU unlearn/tofu/default.yaml" 18 | ) 19 | splits=( 20 | "forget01 holdout01 retain99" 21 | "forget05 holdout05 retain95" 22 | "forget10 holdout10 retain90" 23 | ) 24 | 25 | 26 | per_device_train_batch_size=4 # on two gpus would make effective batch size 32 27 | gradient_accumulation_steps=4 28 | 29 | 30 | ######################################################################################################################## 31 | ########################################### Unlearn TOFU models ######################################################## 32 | ######################################################################################################################## 33 | 34 | 35 | for split in "${splits[@]}"; do 36 | forget_split=$(echo $split | cut -d' ' -f1) 37 | holdout_split=$(echo $split | cut -d' ' -f2) 38 | retain_split=$(echo $split | cut -d' ' -f3) 39 | 40 | for model in "${models[@]}"; do 41 | for trainer_experiment in "${trainers_experiments[@]}"; do 42 | trainer=$(echo $trainer_experiment | cut -d' ' -f1) 43 | experiment=$(echo $trainer_experiment | cut -d' ' -f2) 44 | 45 | task_name=tofu_${model}_${forget_split}_${trainer} 46 | model_path=open-unlearning/tofu_${model}_full 47 | echo ${task_name}: Unlearning ${model_path} using ${trainer} 48 | 49 | # Unlearn 50 | CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \ 51 | src/train.py --config-name=unlearn.yaml \ 52 | experiment=${experiment} \ 53 | trainer=${trainer} \ 54 | task_name=${task_name} \ 55 | model=${model} \ 56 | forget_split=${forget_split} \ 57 | retain_split=${retain_split} \ 58 | model.model_args.pretrained_model_name_or_path=${model_path} \ 59 | 
retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ 60 | trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ 61 | trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ 62 | trainer.args.ddp_find_unused_parameters=true \ 63 | trainer.args.gradient_checkpointing=true 64 | 65 | # Eval 66 | CUDA_VISIBLE_DEVICES=0 python src/eval.py \ 67 | experiment=eval/tofu/default.yaml \ 68 | forget_split=${forget_split} \ 69 | holdout_split=${holdout_split} \ 70 | model=${model} \ 71 | task_name=${task_name} \ 72 | model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ 73 | paths.output_dir=saves/unlearn/${task_name}/evals \ 74 | retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json 75 | done 76 | done 77 | done -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Read dependencies from requirements.txt 4 | with open("requirements.txt") as f: 5 | requirements = f.read().splitlines() 6 | 7 | setup( 8 | name="open-unlearning", 9 | version="0.1.0", 10 | author="Vineeth Dorna, Anmol Mekala", 11 | author_email="vineethdorna@gmail.com, m.anmolreddy@gmail.com", 12 | description="A library for machine unlearning in LLMs.", 13 | long_description=open("README.md").read(), 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/locuslab/open-unlearning", 16 | license="MIT", 17 | packages=find_packages(), 18 | install_requires=requirements, # Uses requirements.txt 19 | extras_require={ 20 | "lm-eval": [ 21 | "lm-eval==0.4.8", 22 | ], # Install using `pip install .[lm-eval]` 23 | "dev": [ 24 | "pre-commit==4.0.1", 25 | "ruff==0.6.9", 26 | ], # Install using `pip install .[dev]` 27 | }, 28 | python_requires=">=3.11", 29 | ) 30 | 
-------------------------------------------------------------------------------- /setup_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | from huggingface_hub import snapshot_download 5 | 6 | 7 | def download_eval_data(): 8 | snapshot_download( 9 | repo_id="open-unlearning/eval", 10 | allow_patterns="*.json", 11 | repo_type="dataset", 12 | local_dir="saves/eval", 13 | ) 14 | 15 | 16 | def download_idk_data(): 17 | snapshot_download( 18 | repo_id="open-unlearning/idk", 19 | allow_patterns="*.jsonl", 20 | repo_type="dataset", 21 | local_dir="data", 22 | ) 23 | 24 | 25 | def download_wmdp(): 26 | url = "https://cais-wmdp.s3.us-west-1.amazonaws.com/wmdp-corpora.zip" 27 | dest_dir = "data/wmdp" 28 | zip_path = os.path.join(dest_dir, "wmdp-corpora.zip") 29 | 30 | os.makedirs(dest_dir, exist_ok=True) 31 | subprocess.run(["wget", url, "-O", zip_path], check=True) 32 | subprocess.run(["unzip", "-P", "wmdpcorpora", zip_path, "-d", dest_dir], check=True) 33 | 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser(description="Download and setup evaluation data.") 37 | parser.add_argument( 38 | "--eval_logs", 39 | action="store_true", 40 | help="Downloads TOFU, MUSE - retain and finetuned models eval logs and saves them in saves/eval", 41 | ) 42 | parser.add_argument( 43 | "--idk", 44 | action="store_true", 45 | help="Download idk dataset from HF hub and stores it data/idk.jsonl", 46 | ) 47 | parser.add_argument( 48 | "--wmdp", 49 | action="store_true", 50 | help="Download and unzip WMDP dataset into data/wmdp", 51 | ) 52 | 53 | args = parser.parse_args() 54 | 55 | if args.eval_logs: 56 | download_eval_data() 57 | if args.idk: 58 | download_idk_data() 59 | if args.wmdp: 60 | download_wmdp() 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /src/data/__init__.py: 
-------------------------------------------------------------------------------- 1 | from typing import Dict, Any, Union 2 | from omegaconf import DictConfig 3 | 4 | from data.qa import QADataset, QAwithIdkDataset, QAwithAlternateDataset 5 | from data.collators import ( 6 | DataCollatorForSupervisedDataset, 7 | ) 8 | from data.unlearn import ForgetRetainDataset 9 | from data.pretraining import PretrainingDataset, CompletionDataset 10 | 11 | DATASET_REGISTRY: Dict[str, Any] = {} 12 | COLLATOR_REGISTRY: Dict[str, Any] = {} 13 | 14 | 15 | def _register_data(data_class): 16 | DATASET_REGISTRY[data_class.__name__] = data_class 17 | 18 | 19 | def _register_collator(collator_class): 20 | COLLATOR_REGISTRY[collator_class.__name__] = collator_class 21 | 22 | 23 | def _load_single_dataset(dataset_name, dataset_cfg: DictConfig, **kwargs): 24 | dataset_handler_name = dataset_cfg.get("handler") 25 | assert dataset_handler_name is not None, ValueError( 26 | f"{dataset_name} handler not set" 27 | ) 28 | dataset_handler = DATASET_REGISTRY.get(dataset_handler_name) 29 | if dataset_handler is None: 30 | raise NotImplementedError( 31 | f"{dataset_handler_name} not implemented or not registered" 32 | ) 33 | dataset_args = dataset_cfg.args 34 | return dataset_handler(**dataset_args, **kwargs) 35 | 36 | 37 | def get_datasets(dataset_cfgs: Union[Dict, DictConfig], **kwargs): 38 | dataset = {} 39 | for dataset_name, dataset_cfg in dataset_cfgs.items(): 40 | access_name = dataset_cfg.get("access_key", dataset_name) 41 | dataset[access_name] = _load_single_dataset(dataset_name, dataset_cfg, **kwargs) 42 | if len(dataset) == 1: 43 | # return a single dataset 44 | return list(dataset.values())[0] 45 | # return mapping to multiple datasets 46 | return dataset 47 | 48 | 49 | def get_data(data_cfg: DictConfig, mode="train", **kwargs): 50 | data = {} 51 | data_cfg = dict(data_cfg) 52 | anchor = data_cfg.pop("anchor", "forget") 53 | for split, dataset_cfgs in data_cfg.items(): 54 | data[split] = 
get_datasets(dataset_cfgs, **kwargs) 55 | if mode == "train": 56 | return data 57 | elif mode == "unlearn": 58 | unlearn_splits = {k: v for k, v in data.items() if k not in ("eval", "test")} 59 | unlearn_dataset = ForgetRetainDataset(**unlearn_splits, anchor=anchor) 60 | data["train"] = unlearn_dataset 61 | for split in unlearn_splits: 62 | data.pop(split) 63 | return data 64 | 65 | 66 | def _get_single_collator(collator_name: str, collator_cfg: DictConfig, **kwargs): 67 | collator_handler_name = collator_cfg.get("handler") 68 | assert collator_handler_name is not None, ValueError( 69 | f"{collator_name} handler not set" 70 | ) 71 | collator_handler = COLLATOR_REGISTRY.get(collator_handler_name) 72 | if collator_handler is None: 73 | raise NotImplementedError( 74 | f"{collator_handler_name} not implemented or not registered" 75 | ) 76 | collator_args = collator_cfg.args 77 | return collator_handler(**collator_args, **kwargs) 78 | 79 | 80 | def get_collators(collator_cfgs, **kwargs): 81 | collators = {} 82 | for collator_name, collator_cfg in collator_cfgs.items(): 83 | collators[collator_name] = _get_single_collator( 84 | collator_name, collator_cfg, **kwargs 85 | ) 86 | if len(collators) == 1: 87 | # return a single collator 88 | return list(collators.values())[0] 89 | # return collators in a dict 90 | return collators 91 | 92 | 93 | # Register datasets 94 | _register_data(QADataset) 95 | _register_data(QAwithIdkDataset) 96 | _register_data(PretrainingDataset) 97 | _register_data(CompletionDataset) 98 | _register_data(QAwithAlternateDataset) 99 | 100 | # Register composite datasets used in unlearning 101 | # groups: unlearn 102 | _register_data(ForgetRetainDataset) 103 | 104 | # Register collators 105 | _register_collator(DataCollatorForSupervisedDataset) 106 | -------------------------------------------------------------------------------- /src/data/collators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 
import transformers
from typing import Dict, Sequence
from data.utils import IGNORE_INDEX


class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    def __init__(
        self,
        tokenizer: transformers.PreTrainedTokenizer,
        padding_side: str = "right",
        index: str = None,
    ):
        # Tokenizer supplies pad_token_id used when padding input_ids.
        self.tokenizer = tokenizer
        # "right" (default) or "left"; any other value falls into the left branch.
        self.padding_side = padding_side
        # Optional key (e.g. "index") whose per-example values are collated
        # into a tensor alongside the model inputs.
        self.index = index

    def get_instances_from_key(self, instances: Sequence[Dict], key: str):
        """Project one sub-key out of each example dict."""
        ret_instances = [instance[key] for instance in instances]
        return ret_instances

    def _pad_tokens(self, input_ids, padding_value):
        """Pad a list of 1-D tensors to a batch, on the configured side."""
        if self.padding_side == "right":
            input_ids = torch.nn.utils.rnn.pad_sequence(
                input_ids, batch_first=True, padding_value=padding_value
            )
        else:
            # Left padding: reverse each sequence, right-pad, then reverse back.
            input_ids = torch.nn.utils.rnn.pad_sequence(
                [torch.flip(i, dims=[0]) for i in input_ids],
                batch_first=True,
                padding_value=padding_value,
            ).flip(dims=[1])
        return input_ids

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        assert isinstance(instances[0], dict)
        return_dct = {}
        if "input_ids" not in instances[0]:
            # Nested examples (e.g. {"forget": {...}, "retain": {...}}):
            # recursively collate each sub-key into its own batch.
            for key in instances[0].keys():
                key_instances = self.get_instances_from_key(
                    instances=instances, key=key
                )
                return_dct[key] = self(key_instances)
        else:
            input_ids = [instance["input_ids"] for instance in instances]
            input_ids = self._pad_tokens(input_ids, self.tokenizer.pad_token_id)
            # Attention mask marks non-pad positions.
            attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
            return_dct.update({"input_ids": input_ids})
            return_dct.update({"attention_mask": attention_mask})
            if "labels" in instances[0]:
                labels = [instance["labels"] for instance in instances]
                # Labels are padded with IGNORE_INDEX so loss skips pad positions.
                labels = self._pad_tokens(labels, IGNORE_INDEX)
                return_dct.update({"labels": labels})
            if self.index:
                if self.index in instances[0]:
                    return_dct.update(
                        {
                            self.index: torch.tensor(
                                [example[self.index] for example in instances]
                            )
                        }
                    )
                else:
                    # NOTE(review): this *raises* (Warning is an Exception
                    # subclass) rather than emitting a warning -- presumably
                    # warnings.warn was intended; confirm before changing.
                    raise Warning(f"{self.index} not found in dataset")
        return return_dct
--------------------------------------------------------------------------------
/src/data/pretraining.py:
--------------------------------------------------------------------------------
# import torch
from torch.utils.data import Dataset
from data.utils import (
    load_hf_dataset,
    add_dataset_index,
    preprocess_pretraining_instance,
)


class CompletionDataset(Dataset):
    """Dataset of (prefix, completion) text pairs tokenized for causal-LM training."""

    def __init__(
        self,
        hf_args,
        template_args,
        tokenizer,
        prefix_key="prompt",
        text_key="text",
        max_length=2048,
        predict_with_generate=False,
        insert_space=False,
    ):
        super(CompletionDataset, self).__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = load_hf_dataset(**hf_args)
        # Attach a stable per-row "index" column used for caching eval results.
        self.data = add_dataset_index(self.data)
        # if either key does not exist in dataset, it is taken as ""
        self.prefix_key = prefix_key
        self.text_key = text_key
        self.predict_with_generate = predict_with_generate
        self.insert_space = insert_space

    def __len__(self):
        return len(self.data)

    def _process_sample(self, prefix, text_content, index=-1):
        """Tokenize one (prefix, completion) pair; index=-1 means "no index"."""
        tokenized_data = preprocess_pretraining_instance(
            self.tokenizer,
            prefix,
            text_content,
            self.max_length,
            self.predict_with_generate,
            self.insert_space,
        )
        item_dct = {
            "input_ids": tokenized_data["input_ids"],
            "labels": tokenized_data["labels"],
            "attention_mask": tokenized_data["attention_mask"],
        }
        if index != -1:
            item_dct["index"] = index
        return item_dct

    def __getitem__(self, idx):
        # Missing keys default to "" (see note in __init__).
        pref = self.data[idx].get(self.prefix_key, "")
        text_content = self.data[idx].get(self.text_key, "")
        index = self.data[idx]["index"]
        item = self._process_sample(pref, text_content, index)
        return item


class PretrainingDataset(Dataset):
    """Packs a raw text corpus into fixed-length token chunks for pretraining loss."""

    def __init__(
        self, hf_args, template_args, tokenizer, text_key="text", max_length=2048
    ):
        super(PretrainingDataset, self).__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Concatenate the whole corpus and split it into max_length-token chunks.
        self.chunks = self._chunk_raw_text(load_hf_dataset(**hf_args)[text_key])

    def _chunk_raw_text(self, raw_text):
        # Join all documents into one token stream before chunking.
        raw_text = "\n\n".join(raw_text)
        full_token_sequence = self.tokenizer(raw_text, add_special_tokens=False)[
            "input_ids"
        ]
        # NOTE(review): when the length is an exact multiple of max_length this
        # produces a trailing empty chunk -- confirm whether that is intended.
        num_chunks = len(full_token_sequence) // self.max_length + 1
        chunks = []
        for i in range(num_chunks):
            # Decode each token slice back to text; re-tokenized in __getitem__.
            chunks.append(
                self.tokenizer.decode(
                    full_token_sequence[i * self.max_length : (i + 1) * self.max_length]
                )
            )
        return chunks

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, idx):
        return preprocess_pretraining_instance(
            self.tokenizer, "", self.chunks[idx], self.max_length
        )
--------------------------------------------------------------------------------
/src/data/qa.py:
--------------------------------------------------------------------------------
import torch
from torch.utils.data import Dataset

from data.utils import load_hf_dataset, preprocess_chat_instance, add_dataset_index


class QADataset(Dataset):
    """Question-answer dataset tokenized through the model's chat template."""

    def __init__(
        self,
        hf_args,
        template_args,
        tokenizer,
        question_key="question",
        answer_key="answer",
        few_shot_dataset_hf_args=None,
        max_length=512,
        predict_with_generate=False,
    ):
        super(QADataset, self).__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = load_hf_dataset(**hf_args)
        # Attach a stable per-row "index" column used for caching eval results.
        self.data = add_dataset_index(self.data)
        self.fs_data = None
        if few_shot_dataset_hf_args is not None:
raw_data = load_hf_dataset(**few_shot_dataset_hf_args) 27 | self.fs_data = {} 28 | self.fs_data[question_key] = raw_data[question_key] 29 | self.fs_data[answer_key] = raw_data[answer_key] 30 | self.template_args = template_args 31 | self.question_key = question_key 32 | self.answer_key = answer_key 33 | self.predict_with_generate = predict_with_generate 34 | 35 | def __len__(self): 36 | return len(self.data) 37 | 38 | def _process_sample(self, question, answer, index=-1): 39 | if self.fs_data is None: 40 | prompt_msgs, response_msgs = [question], [answer] 41 | else: 42 | prompt_msgs = self.fs_data[self.question_key] + [question] 43 | response_msgs = self.fs_data[self.answer_key] + [answer] 44 | tokenized_data = preprocess_chat_instance( 45 | self.tokenizer, 46 | self.template_args, 47 | prompt_msgs, 48 | response_msgs, 49 | self.max_length, 50 | self.predict_with_generate, 51 | ) 52 | item_dct = { 53 | "input_ids": tokenized_data["input_ids"], 54 | "labels": tokenized_data["labels"], 55 | "attention_mask": tokenized_data["attention_mask"], 56 | "index": index, 57 | } 58 | return item_dct 59 | 60 | def __getitem__(self, idx): 61 | question = self.data[idx][self.question_key] 62 | answer = self.data[idx][self.answer_key] 63 | index = self.data[idx]["index"] 64 | if isinstance(answer, str): 65 | item = self._process_sample(question=question, answer=answer, index=index) 66 | elif isinstance(answer, list): 67 | item = {} 68 | for i, ans in enumerate(answer): 69 | sample_item = self._process_sample( 70 | question=question, answer=ans, index=index 71 | ) 72 | item[i] = sample_item 73 | else: 74 | raise NotImplementedError("answer format not found") 75 | return item 76 | 77 | 78 | class QAwithIdkDataset(QADataset): 79 | def __init__(self, idk_path, return_original=True, *args, **kwargs): 80 | self.idk_path = idk_path 81 | self.return_original = return_original 82 | self.idk_responses = open(self.idk_path, "r").readlines() 83 | super().__init__(*args, **kwargs) 84 | 85 | 
def item_with_idk(self, question): 86 | rand_pos = torch.randint(0, len(self.idk_responses), (1,)).item() 87 | idk_response = self.idk_responses[rand_pos].strip() 88 | idk_item = self._process_sample(question=question, answer=idk_response) 89 | return idk_item 90 | 91 | def __getitem__(self, idx): 92 | item = super().__getitem__(idx) 93 | question = self.data[idx][self.question_key] 94 | if isinstance(item, dict): 95 | return_item = {"original": item} 96 | idk_item = self.item_with_idk(question) 97 | return_item["alternate"] = idk_item 98 | # return_item = [item, idk_item] 99 | elif isinstance(item, list) or isinstance(item, tuple): 100 | return_item = [] 101 | for sample_item in item: 102 | return_item = {"original": sample_item} 103 | idk_item = self.item_with_idk(question) 104 | return_item["alternate"] = idk_item 105 | # return_item.append([sample_item, idk_item]) 106 | return return_item if self.return_original else return_item["alternate"] 107 | 108 | 109 | class QAwithAlternateDataset(QADataset): 110 | def __init__(self, alternate_key, return_original=True, *args, **kwargs): 111 | self.alternate_key = alternate_key 112 | self.return_original = return_original 113 | super().__init__(*args, **kwargs) 114 | 115 | def __getitem__(self, idx): 116 | item = super().__getitem__(idx) 117 | question = self.data[idx][self.question_key] 118 | if isinstance(item, dict): 119 | return_item = {"original": item} 120 | alt_item = self._process_sample( 121 | question=question, answer=self.data[idx][self.alternate_key] 122 | ) 123 | return_item["alternate"] = alt_item 124 | # return_item = [item, idk_item] 125 | elif isinstance(item, list) or isinstance(item, tuple): 126 | return_item = [] 127 | for sample_item in item: 128 | return_item = {"original": sample_item} 129 | alt_item = self._process_sample( 130 | question=question, answer=self.data[idx][self.alternate_key] 131 | ) 132 | return_item["alternate"] = alt_item 133 | # return_item.append([sample_item, idk_item]) 134 | 
return return_item if self.return_original else return_item["alternate"] 135 | -------------------------------------------------------------------------------- /src/data/unlearn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class ForgetRetainDataset(Dataset): 6 | # https://github.com/OPTML-Group/SOUL/blob/main/src/dataset/Base.py 7 | def __init__(self, forget, retain, anchor="forget"): 8 | """Wraps the forget retain dataset into unlearning dataset. 9 | 10 | Args: 11 | forget (Dataset): Forget Dataset 12 | retain (Dataset): Retain Dataset 13 | anchor (str, optional): Specifies which dataset to anchor while randomly sampling from the other dataset. Defaults to 'forget'. 14 | """ 15 | self.forget = forget 16 | self.retain = retain 17 | self.anchor = anchor 18 | 19 | def __len__(self): 20 | """Ensures the sampled dataset matches the anchor dataset's length.""" 21 | if self.anchor == "forget": 22 | assert self.forget is not None, ValueError( 23 | "forget dataset can't be None when anchor=forget" 24 | ) 25 | return len(self.forget) 26 | elif self.anchor == "retain": 27 | assert self.retain is not None, ValueError( 28 | "retain dataset can't be None when anchor=retain" 29 | ) 30 | return len(self.retain) 31 | else: 32 | raise NotImplementedError(f"{self.anchor} can be only forget or retain") 33 | 34 | def __getitem__(self, idx): 35 | item = {} 36 | if self.anchor == "forget": 37 | item["forget"] = self.forget[idx] 38 | if self.retain: 39 | retain_idx = torch.randint(0, len(self.retain), (1,)).item() 40 | item["retain"] = self.retain[retain_idx] 41 | elif self.anchor == "retain": 42 | item["retain"] = self.retain[idx] 43 | if self.forget: 44 | forget_idx = torch.randint(0, len(self.forget), (1,)).item() 45 | item["forget"] = self.forget[forget_idx] 46 | return item 47 | -------------------------------------------------------------------------------- /src/eval.py: 
--------------------------------------------------------------------------------
import hydra
from omegaconf import DictConfig

from trainer.utils import seed_everything
from model import get_model
from evals import get_evaluators


@hydra.main(version_base=None, config_path="../configs", config_name="eval.yaml")
def main(cfg: DictConfig):
    """Entry point of the code to evaluate models
    Args:
        cfg (DictConfig): Config to train
    """
    seed_everything(cfg.seed)
    model_cfg = cfg.model
    template_args = model_cfg.template_args
    # NOTE(review): this assert runs *after* model_cfg is dereferenced above,
    # so a missing model config fails earlier with a different error -- confirm
    # the intended ordering.
    assert model_cfg is not None, "Invalid model yaml passed in train config."
    model, tokenizer = get_model(model_cfg)

    eval_cfgs = cfg.eval
    evaluators = get_evaluators(eval_cfgs)
    # Run every configured evaluation suite against the loaded model.
    for evaluator_name, evaluator in evaluators.items():
        eval_args = {
            "template_args": template_args,
            "model": model,
            "tokenizer": tokenizer,
        }
        _ = evaluator.evaluate(**eval_args)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/src/evals/__init__.py:
--------------------------------------------------------------------------------
from typing import Dict, Any
from omegaconf import DictConfig
from evals.tofu import TOFUEvaluator
from evals.muse import MUSEEvaluator
from evals.lm_eval import LMEvalEvaluator

# Maps evaluator class name -> class; populated via _register_evaluator below.
EVALUATOR_REGISTRY: Dict[str, Any] = {}


def _register_evaluator(evaluator_class):
    EVALUATOR_REGISTRY[evaluator_class.__name__] = evaluator_class


def get_evaluator(name: str, eval_cfg: DictConfig, **kwargs):
    # Look up the evaluator class named by the config's "handler" key.
    evaluator_handler_name = eval_cfg.get("handler")
    assert evaluator_handler_name is not None, ValueError(f"{name} handler not set")
    eval_handler = EVALUATOR_REGISTRY.get(evaluator_handler_name)
    if eval_handler is None:
        raise NotImplementedError(
            f"{evaluator_handler_name} 
not implemented or not registered"
        )
    return eval_handler(eval_cfg, **kwargs)


def get_evaluators(eval_cfgs: DictConfig, **kwargs):
    # Build one evaluator per entry in the eval config mapping.
    evaluators = {}
    for eval_name, eval_cfg in eval_cfgs.items():
        evaluators[eval_name] = get_evaluator(eval_name, eval_cfg, **kwargs)
    return evaluators


# Register Your benchmark evaluators
_register_evaluator(TOFUEvaluator)
_register_evaluator(MUSEEvaluator)
_register_evaluator(LMEvalEvaluator)
--------------------------------------------------------------------------------
/src/evals/base.py:
--------------------------------------------------------------------------------
import os
import json
import logging
from evals.metrics import get_metrics

logger = logging.getLogger("evaluator")


class Evaluator:
    """Runs a suite of metrics against a model, caching results as JSON logs."""

    def __init__(self, name, eval_cfg, **kwargs):
        self.name = name
        self.eval_cfg = eval_cfg
        self.metrics_cfg = self.eval_cfg.metrics
        self.metrics = self.load_metrics(self.metrics_cfg)
        logger.info(
            f"Evaluations stored in the experiment directory: {self.eval_cfg.output_dir}"
        )

    def get_logs_file_path(self, output_dir, suffix="EVAL"):
        """Returns the path to json file to store results"""
        logs_filename = os.path.join(output_dir, f"{self.name}_{suffix}.json")
        return logs_filename

    def load_logs_from_file(self, file):
        """Returns the cache of existing results"""
        logs = {}
        if os.path.exists(file):
            logger.info(f"Loading existing evaluations from {file}")
            with open(file, "r") as f:
                logs = json.load(f)
        return logs

    def save_logs(self, logs, file):
        """Save the logs in a json file"""
        # Sort keys for stable, diff-friendly output.
        logs = dict(sorted(logs.items()))
        os.makedirs(os.path.dirname(file), exist_ok=True)
        try:
            with open(file, "w") as f:
                json.dump(logs, f, indent=4)
        except Exception as e:
            raise RuntimeError(f"Failed to save {file}: {e}")

    def prepare_model(self, model):
        """Prepare model for evaluation"""
        model.eval()
        return model

    def load_metrics(self, metrics_cfg):
        """Load metrics for evaluation"""
        metrics = get_metrics(metrics_cfg)
        return metrics

    def summarize(self, logs):
        """Summarize the metrics results"""
        # Keep only the aggregate value of each currently-configured metric.
        metric_summary = {}
        for metric_name, metric_results in logs.items():
            if metric_name not in self.metrics:
                continue
            agg_value = metric_results.get("agg_value", None)
            if agg_value is not None:
                metric_summary[metric_name] = agg_value
        return metric_summary

    def evaluate(self, model, output_dir=None, overwrite=None, **kwargs):
        """Run every configured metric, reusing cached results unless overwrite."""
        # set flag to overwrite metrics
        overwrite = self.eval_cfg.overwrite if overwrite is None else overwrite

        # Prepare model for evaluation
        model = self.prepare_model(model)

        # Set output_dir and file to store results
        output_dir = output_dir if output_dir else self.eval_cfg.output_dir
        logs_file_path = self.get_logs_file_path(output_dir)
        summary_file_path = self.get_logs_file_path(output_dir, suffix="SUMMARY")

        # Load existing results from file if any.
        logs = self.load_logs_from_file(logs_file_path) if not overwrite else {}

        logger.info(f"***** Running {self.name} evaluation suite *****")
        logger.info(f"Fine-grained evaluations will be saved to: {logs_file_path}")
        logger.info(
            f"Aggregated evaluations will be summarised in: {summary_file_path}"
        )
        for metric_name, metric_fn in self.metrics.items():
            # Skip metrics already present in the cached logs.
            if not overwrite and metric_name in logs and logs[metric_name]:
                logger.info(f"Skipping {metric_name}, already evaluated.")
                if "agg_value" in logs[metric_name]:
                    logger.info(
                        f"Result for metric {metric_name}:\t{logs[metric_name]['agg_value']}"
                    )
                self.save_logs(self.summarize(logs), summary_file_path)
                continue
            _ = logs.pop(metric_name, None) # overwriting existing evals if present
            # Only these two kwargs are forwarded to metric functions.
            kwargs = {
                "tokenizer": kwargs.get("tokenizer", None),
                "template_args": kwargs.get("template_args", None),
            }
            metrics_args = self.eval_cfg.metrics[metric_name]
            _  # NOTE(review): no-op expression statement -- likely a leftover; confirm and remove
            result = metric_fn(
                model,
                metric_name=metric_name,
                cache=logs,
                **kwargs,
                **metrics_args,
            )
            if "agg_value" in result:
                logger.info(f"Result for metric {metric_name}:\t{result['agg_value']}")
            self.save_logs(logs, logs_file_path)
            self.save_logs(self.summarize(logs), summary_file_path)

        return self.summarize(logs)
--------------------------------------------------------------------------------
/src/evals/lm_eval.py:
--------------------------------------------------------------------------------
import logging
from omegaconf import OmegaConf

from lm_eval.models.hf_vlms import HFLM
from lm_eval.tasks import TaskManager
from lm_eval import simple_evaluate

from evals.base import Evaluator


logger = logging.getLogger("evaluator")


class LMEvalEvaluator(Evaluator):
    def __init__(self, eval_cfg, **kwargs):
        self.name = "LMEval"
class LMEvalEvaluator(Evaluator):
    """Evaluator that delegates to EleutherAI's lm-evaluation-harness:
    runs each configured task via `simple_evaluate` and stores per-sample
    logs plus a flattened `task/metric` summary."""

    def __init__(self, eval_cfg, **kwargs):
        # Does not call super().__init__(); lm-eval tasks replace the base
        # Evaluator's metric machinery entirely.
        self.name = "LMEval"
        self.eval_cfg = eval_cfg
        # Resolve the task list out of OmegaConf into plain Python containers.
        self.tasks = OmegaConf.to_container(
            self.eval_cfg.tasks, resolve=True, throw_on_missing=True
        )
        self.task_manager = TaskManager()
        # Extra keyword arguments forwarded verbatim to lm_eval.simple_evaluate.
        self.simple_evaluate_args = dict(kwargs.get("simple_evaluate_args", {}))

    def prepare_model(self, model, **kwargs):
        """Prepare model for evaluation: wrap the HF model in lm-eval's HFLM
        adapter (extra kwargs such as tokenizer are currently ignored here)."""
        model.eval()
        return HFLM(model)

    def summarize(self, eval_results: dict, task_name: str) -> dict:
        """
        Summarize evaluation metrics from lm_eval.simple_evaluate.
        - If task_name is a group, return only aggregated group-level metrics.
        - If it's a single task, return per-task metrics from 'results'.
        - Always exclude 'alias' entries and strip ',none' suffixes.
        """
        summary = {}

        def clean_metric_key(prefix: str, metric_name: str) -> str | None:
            # lm-eval reports keys like "acc,none"; keep only the base name.
            if metric_name == "alias":
                return None
            base = metric_name.split(",", 1)[0].strip()
            return f"{prefix}/{base}"

        # Check if task is a group (e.g., 'mmlu')
        if task_name in self.task_manager.all_groups:
            group_metrics = eval_results.get("groups", {}).get(task_name, {})
            for metric_name, value in group_metrics.items():
                key = clean_metric_key(task_name, metric_name)
                if key is None:
                    continue
                try:
                    summary[key] = float(value)
                except (TypeError, ValueError):
                    # Non-numeric metric values are kept as-is.
                    summary[key] = value
        else:
            task_metrics = eval_results.get("results", {}).get(task_name, {})
            for metric_name, value in task_metrics.items():
                key = clean_metric_key(task_name, metric_name)
                if key is None:
                    continue
                try:
                    summary[key] = float(value)
                except (TypeError, ValueError):
                    summary[key] = value

        return summary

    def get_task_name(self, task):
        """Extract the task name from either a plain string or a dict config."""
        if isinstance(task, str):
            return task
        elif isinstance(task, dict):
            if "task" in task:
                return task.get("task")
        raise ValueError(f"Invalid task format: {task}")

    def evaluate(self, model, output_dir=None, overwrite=None, **kwargs):
        """Run each configured lm-eval task, resuming from cached logs unless
        `overwrite` is set. Returns the accumulated summary dict."""
        # set flag to overwrite metrics
        overwrite = self.eval_cfg.overwrite if overwrite is None else overwrite

        # Prepare model for evaluation
        kwargs = {"tokenizer": kwargs.get("tokenizer", None)}
        model = self.prepare_model(model, **kwargs)

        # Set output_dir and file to store results
        output_dir = output_dir if output_dir else self.eval_cfg.output_dir
        logs_file_path = self.get_logs_file_path(output_dir)
        summary_file_path = self.get_logs_file_path(output_dir, suffix="SUMMARY")

        # Load existing results from file if any.
        logs = self.load_logs_from_file(logs_file_path) if not overwrite else {}
        summary = self.load_logs_from_file(summary_file_path) if not overwrite else {}

        logger.info(f"***** Running {self.name} evaluation suite *****")
        logger.info(f"Fine-grained evaluations will be saved to: {logs_file_path}")
        logger.info(
            f"Aggregated evaluations will be summarised in: {summary_file_path}"
        )

        for task in self.tasks:
            task_name = self.get_task_name(task)
            if not overwrite and task_name in logs and logs[task_name]:
                logger.info(f"Skipping {task_name}, already evaluated.")
                continue
            _ = logs.pop(task_name, None)  # overwriting existing evals if present
            results = simple_evaluate(
                model=model,
                tasks=[task],
                task_manager=self.task_manager,
                **self.simple_evaluate_args,
            )
            # Keep raw per-sample outputs; the summary gets the flat metrics.
            logs.update({task_name: results["samples"]})
            summary.update(self.summarize(results, task_name))
            # Persist after every task so a crash loses at most one task.
            self.save_logs(logs, logs_file_path)
            self.save_logs(summary, summary_file_path)
        return summary
from evals.metrics.base import UnlearningMetric
from evals.metrics.memorization import (
    probability,
    probability_w_options,
    rouge,
    truth_ratio,
    extraction_strength,
    exact_memorization,
)
from evals.metrics.privacy import ks_test, privleak, rel_diff
from evals.metrics.mia import (
    mia_loss,
    mia_min_k,
    mia_min_k_plus_plus,
    mia_gradnorm,
    mia_zlib,
    mia_reference,
)
from evals.metrics.utility import (
    hm_aggregate,
    classifier_prob,
)

METRICS_REGISTRY: Dict[str, UnlearningMetric] = {}


def _register_metric(metric):
    """Add a metric object to the global registry, keyed by its name."""
    METRICS_REGISTRY[metric.name] = metric


def _get_single_metric(name: str, metric_cfg, **kwargs):
    """Resolve one metric from its config entry.

    Looks up the configured handler in the registry and recursively attaches
    any `pre_compute` dependency metrics before returning the metric object.
    """
    handler_name = metric_cfg.get("handler")
    assert handler_name is not None, ValueError(f"{name} handler not set")
    metric = METRICS_REGISTRY.get(handler_name)
    if metric is None:
        raise NotImplementedError(
            f"{handler_name} not implemented or not registered"
        )
    pre_compute_cfg = metric_cfg.get("pre_compute", {})
    metric.set_pre_compute_metrics(get_metrics(pre_compute_cfg, **kwargs))
    return metric


def get_metrics(metric_cfgs: DictConfig, **kwargs):
    """Resolve a config mapping of metric name -> metric config into a dict
    of metric name -> metric object."""
    return {
        cfg_name: _get_single_metric(cfg_name, cfg, **kwargs)
        for cfg_name, cfg in metric_cfgs.items()
    }


# Register metrics here
_register_metric(probability)
_register_metric(probability_w_options)
_register_metric(rouge)
_register_metric(truth_ratio)
_register_metric(ks_test)
_register_metric(hm_aggregate)
_register_metric(privleak)
_register_metric(rel_diff)
_register_metric(exact_memorization)
_register_metric(extraction_strength)

# Register MIA metrics
_register_metric(mia_loss)
_register_metric(mia_min_k)
_register_metric(mia_min_k_plus_plus)
_register_metric(mia_gradnorm)
_register_metric(mia_zlib)
_register_metric(mia_reference)

# Register Utility metrics
_register_metric(classifier_prob)


"""
Attack implementations.
"""

from transformers import AutoModelForCausalLM

from evals.metrics.base import unlearning_metric
from evals.metrics.mia.loss import LOSSAttack
from evals.metrics.mia.min_k import MinKProbAttack
from evals.metrics.mia.min_k_plus_plus import MinKPlusPlusAttack
from evals.metrics.mia.gradnorm import GradNormAttack
from evals.metrics.mia.zlib import ZLIBAttack
from evals.metrics.mia.reference import ReferenceAttack

from evals.metrics.mia.utils import mia_auc
import logging

logger = logging.getLogger("metrics")

## NOTE: all MIA attack statistics are signed as required in order to show the
# same trends as loss (higher the score on an example, less likely the membership)


@unlearning_metric(name="mia_loss")
def mia_loss(model, **kwargs):
    """LOSS attack AUC. Expects kwargs: data (with "forget"/"holdout" splits),
    collators, batch_size."""
    return mia_auc(
        LOSSAttack,
        model,
        data=kwargs["data"],
        collator=kwargs["collators"],
        batch_size=kwargs["batch_size"],
    )


@unlearning_metric(name="mia_min_k")
def mia_min_k(model, **kwargs):
    """Min-k% probability attack AUC; `k` is the fraction of lowest-probability
    tokens scored per sample."""
    return mia_auc(
        MinKProbAttack,
        model,
        data=kwargs["data"],
        collator=kwargs["collators"],
        batch_size=kwargs["batch_size"],
        k=kwargs["k"],
    )


@unlearning_metric(name="mia_min_k_plus_plus")
def mia_min_k_plus_plus(model, **kwargs):
    """Min-K%++ attack AUC (vocabulary-normalized variant of min-k)."""
    return mia_auc(
        MinKPlusPlusAttack,
        model,
        data=kwargs["data"],
        collator=kwargs["collators"],
        batch_size=kwargs["batch_size"],
        k=kwargs["k"],
    )


@unlearning_metric(name="mia_gradnorm")
def mia_gradnorm(model, **kwargs):
    """Gradient-norm attack AUC; `p` selects the parameter-gradient norm order."""
    return mia_auc(
        GradNormAttack,
        model,
        data=kwargs["data"],
        collator=kwargs["collators"],
        batch_size=kwargs["batch_size"],
        p=kwargs["p"],
    )


@unlearning_metric(name="mia_zlib")
def mia_zlib(model, **kwargs):
    """zlib-entropy-normalized loss attack AUC; tokenizer is optional
    (the attack falls back to the model's tokenizer when absent)."""
    return mia_auc(
        ZLIBAttack,
        model,
        data=kwargs["data"],
        collator=kwargs["collators"],
        batch_size=kwargs["batch_size"],
        tokenizer=kwargs.get("tokenizer"),
    )


@unlearning_metric(name="mia_reference")
def mia_reference(model, **kwargs):
    """Reference-model attack AUC: loads a second model from
    `reference_model_path` (required) onto the target model's device/dtype."""
    if "reference_model_path" not in kwargs:
        raise ValueError("Reference model must be provided in kwargs")
    logger.info(f"Loading reference model from {kwargs['reference_model_path']}")
    reference_model = AutoModelForCausalLM.from_pretrained(
        kwargs["reference_model_path"],
        torch_dtype=model.dtype,
        device_map={"": model.device},
    )
    return mia_auc(
        ReferenceAttack,
        model,
        data=kwargs["data"],
        collator=kwargs["collators"],
        batch_size=kwargs["batch_size"],
        reference_model=reference_model,
    )
"""
Enum class for attacks. Also contains the base attack class.
"""

from enum import Enum
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm


# Attack definitions
class AllAttacks(str, Enum):
    LOSS = "loss"
    REFERENCE_BASED = "ref"
    ZLIB = "zlib"
    MIN_K = "min_k"
    MIN_K_PLUS_PLUS = "min_k++"
    GRADNORM = "gradnorm"
    RECALL = "recall"


# Base attack class
class Attack:
    """Template for MIA attacks: subclasses implement per-batch statistics
    (`compute_batch_values`) and a per-sample score (`compute_score`)."""

    def __init__(self, model, data, collator, batch_size, **kwargs):
        """Initialize attack with model and create dataloader."""
        self.model = model
        self.dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator)
        self.setup(**kwargs)

    def setup(self, **kwargs):
        """Setup attack-specific parameters (no-op by default)."""
        pass

    def compute_batch_values(self, batch):
        """Process a batch through model to get needed statistics."""
        raise NotImplementedError

    def compute_score(self, sample_stats):
        """Compute MIA score for a single sample."""
        raise NotImplementedError

    def attack(self):
        """Run the full MIA attack over the dataloader.

        Returns the mean score plus a per-index score mapping keyed by the
        dataset's "index" field (stringified).
        """
        collected_scores = []
        collected_indices = []

        for batch in tqdm(self.dataloader, total=len(self.dataloader)):
            batch_indices = batch.pop("index").cpu().numpy().tolist()
            batch_values = self.compute_batch_values(batch)
            collected_scores.extend(
                self.compute_score(values) for values in batch_values
            )
            collected_indices.extend(batch_indices)

        return {
            "agg_value": float(np.mean(collected_scores)),
            "value_by_index": {
                str(idx): {"score": float(score)}
                for idx, score in zip(collected_indices, collected_scores)
            },
        }
Proposed for MIA in multiple settings, and particularly
experimented for pre-training data and LLMs in https://arxiv.org/abs/2402.17012
"""

import torch
from evals.metrics.mia.all_attacks import Attack
from evals.metrics.utils import tokenwise_logprobs


# DO NOT use gradnorm in a way so that it runs when your accumulated gradients during training aren't used yet
# gradnorm zeros out the gradients of the model during its computation
class GradNormAttack(Attack):
    """MIA score from the norm of each sample's loss gradient w.r.t. the
    model parameters (computed per-sample, one backward pass each)."""

    def setup(self, p, **kwargs):
        """Validate and store the norm order `p` (must be 1, 2 or inf)."""
        if p not in [1, 2, float("inf")]:
            raise ValueError(f"Invalid p-norm value: {p}")
        self.p = p

    def compute_batch_values(self, batch):
        """Compute gradients of examples w.r.t model parameters. More grad norm => more loss."""
        self.model.train()  # gradients required; switched back to eval below
        batch_log_probs = tokenwise_logprobs(self.model, batch, grad=True)
        # One scalar loss per sample: mean negative token log-prob.
        batch_loss = [-torch.mean(lps) for lps in batch_log_probs]
        batch_grad_norms = []
        for sample_loss in batch_loss:
            sample_grad_norms = []
            self.model.zero_grad()  # clobbers accumulated grads (see warning above)
            sample_loss.backward()
            for param in self.model.parameters():
                if param.grad is not None:
                    sample_grad_norms.append(param.grad.detach().norm(p=self.p))
            # Mean of per-parameter-tensor norms (not the norm of the fully
            # flattened gradient vector).
            batch_grad_norms.append(torch.stack(sample_grad_norms).mean())
        self.model.eval()
        return batch_grad_norms

    def compute_score(self, sample_stats):
        """Return the gradient norm (cast to float32) as the attack score;
        higher norm tracks higher loss, matching the module's sign convention."""
        return sample_stats.cpu().to(torch.float32).numpy()
class LOSSAttack(Attack):
    """Membership score is simply the sample's average loss."""

    def compute_batch_values(self, batch):
        """Compute probabilities and losses for the batch."""
        return evaluate_probability(self.model, batch)

    def compute_score(self, sample_stats):
        """Return the average loss for the sample."""
        return sample_stats["avg_loss"]


"""
Min-k % Prob Attack: https://arxiv.org/pdf/2310.16789.pdf
"""

import numpy as np
from evals.metrics.mia.all_attacks import Attack
from evals.metrics.utils import tokenwise_logprobs


class MinKProbAttack(Attack):
    """Scores a sample by the mean negative log-prob of its k% least likely
    tokens."""

    def setup(self, k=0.2, **kwargs):
        # `k` is the fraction of lowest-probability tokens used for scoring.
        self.k = k

    def compute_batch_values(self, batch):
        """Get token-wise log probabilities for the batch."""
        return tokenwise_logprobs(self.model, batch, grad=False)

    def compute_score(self, sample_stats):
        """Score single sample using min-k negative log probs scores attack."""
        log_probs = sample_stats.cpu().numpy()
        if log_probs.size == 0:
            return 0

        num_k = max(1, int(self.k * len(log_probs)))
        # Partition instead of a full sort: the num_k smallest values end up
        # in the first num_k slots, which is all the mean needs.
        lowest = np.partition(log_probs, num_k - 1)[:num_k]
        return -np.mean(lowest)
import torch as torch
import numpy as np
from evals.metrics.mia.min_k import MinKProbAttack
from evals.metrics.utils import tokenwise_vocab_logprobs, tokenwise_logprobs


class MinKPlusPlusAttack(MinKProbAttack):
    """Min-K%++ attack: min-k scoring on log-probs normalized by the mean and
    standard deviation of the model's own per-position vocabulary distribution."""

    def compute_batch_values(self, batch):
        """Get both token-wise and vocab-wise log probabilities for the batch."""
        vocab_log_probs = tokenwise_vocab_logprobs(self.model, batch, grad=False)
        token_log_probs = tokenwise_logprobs(self.model, batch, grad=False)
        return [
            {"vocab_log_probs": vlp, "token_log_probs": tlp}
            for vlp, tlp in zip(vocab_log_probs, token_log_probs)
        ]

    def compute_score(self, sample_stats):
        """Score using min-k negative log probs scores with vocab-wise normalization."""
        # Assumes vocab_log_probs is (tokens, vocab) and token_log_probs is
        # (tokens,) — TODO confirm against tokenwise_vocab_logprobs.
        all_probs = sample_stats["vocab_log_probs"]
        target_prob = sample_stats["token_log_probs"]

        if len(target_prob) == 0:
            return 0

        # Per-position moments of log p under the model's own distribution:
        # mu = E[log p], sigma = E[(log p)^2] - mu^2.
        mu = (torch.exp(all_probs) * all_probs).sum(-1)
        sigma = (torch.exp(all_probs) * torch.square(all_probs)).sum(-1) - torch.square(
            mu
        )

        # Handle numerical stability (variance can underflow to <= 0)
        sigma = torch.clamp(sigma, min=1e-6)
        scores = (target_prob.cpu().numpy() - mu.cpu().numpy()) / torch.sqrt(
            sigma
        ).cpu().numpy()

        # Take bottom k% as the attack score (`self.k` comes from
        # MinKProbAttack.setup).
        num_k = max(1, int(len(scores) * self.k))
        return -np.mean(sorted(scores)[:num_k])
"""
Reference-based attacks.
"""

from evals.metrics.mia.all_attacks import Attack
from evals.metrics.utils import evaluate_probability


class ReferenceAttack(Attack):
    """Scores membership by how much lower the target model's loss is than a
    reference model's loss on the same sample."""

    def setup(self, reference_model, **kwargs):
        """Store the calibration (reference) model."""
        self.reference_model = reference_model

    def compute_batch_values(self, batch):
        """Evaluate the batch under both models and pair up per-sample losses."""
        reference_stats = evaluate_probability(self.reference_model, batch)
        target_stats = evaluate_probability(self.model, batch)
        return [
            {"target_loss": tgt["avg_loss"], "ref_loss": ref["avg_loss"]}
            for tgt, ref in zip(target_stats, reference_stats)
        ]

    def compute_score(self, sample_stats):
        """Score is the target-minus-reference loss difference."""
        return sample_stats["target_loss"] - sample_stats["ref_loss"]
def get_attacker(attack: str):
    """Map an AllAttacks enum value to its implementing attack class."""
    mapping = {
        AllAttacks.LOSS: LOSSAttack,
        AllAttacks.REFERENCE_BASED: ReferenceAttack,
        AllAttacks.ZLIB: ZLIBAttack,
        AllAttacks.MIN_K: MinKProbAttack,
        AllAttacks.MIN_K_PLUS_PLUS: MinKPlusPlusAttack,
        AllAttacks.GRADNORM: GradNormAttack,
    }
    attack_cls = mapping.get(attack, None)
    if attack_cls is None:
        raise ValueError(f"Attack {attack} not found")
    return attack_cls


def mia_auc(attack_cls, model, data, collator, batch_size, **kwargs):
    """
    Compute the MIA AUC and accuracy.

    Parameters:
    - attack_cls: the attack class to use.
    - model: the target model.
    - data: a dict with keys "forget" and "holdout".
    - collator: data collator.
    - batch_size: batch size.
    - kwargs: additional optional parameters (e.g. k, p, tokenizer, reference_model).

    Returns a dict containing the attack outputs, including "acc" and "auc".

    Note on convention: auc is 1 when the forget data is much more likely than the holdout data
    """
    # Shared constructor arguments; attack-specific extras ride along in kwargs.
    common_args = dict(model=model, collator=collator, batch_size=batch_size, **kwargs)

    # Run the attack once per split (forget first, matching the label order below).
    output = {
        split: attack_cls(data=data[split], **common_args).attack()
        for split in ("forget", "holdout")
    }

    def _split_scores(split):
        return [entry["score"] for entry in output[split]["value_by_index"].values()]

    forget_scores = _split_scores("forget")
    holdout_scores = _split_scores("holdout")
    scores = np.array(forget_scores + holdout_scores)
    # Holdout is labelled as the positive class — see the convention note above.
    labels = np.array([0] * len(forget_scores) + [1] * len(holdout_scores))
    auc_value = roc_auc_score(labels, scores)
    output["auc"] = output["agg_value"] = auc_value
    return output
"""
zlib-normalization Attack: https://www.usenix.org/system/files/sec21-carlini-extracting.pdf
"""

import zlib

from evals.metrics.mia.all_attacks import Attack
from evals.metrics.utils import (
    evaluate_probability,
    extract_target_texts_from_processed_data,
)


class ZLIBAttack(Attack):
    """Loss normalized by the sample's zlib-compressed length: low loss on
    highly compressible text is less surprising than on incompressible text."""

    def setup(self, tokenizer=None, **kwargs):
        """Store a tokenizer, falling back to the model's own."""
        self.tokenizer = tokenizer or self.model.tokenizer

    def compute_batch_values(self, batch):
        """Pair each sample's average loss with its decoded target text."""
        loss_stats = evaluate_probability(self.model, batch)
        texts = extract_target_texts_from_processed_data(self.tokenizer, batch)
        return [
            {"loss": stats["avg_loss"], "text": text}
            for stats, text in zip(loss_stats, texts)
        ]

    def compute_score(self, sample_stats):
        """Score = average loss divided by the zlib-compressed byte length."""
        compressed_len = len(zlib.compress(sample_stats["text"].encode("utf-8")))
        return sample_stats["loss"] / compressed_len
import numpy as np
from scipy.stats import ks_2samp
from evals.metrics.base import unlearning_metric, logger


@unlearning_metric(name="ks_test")
def ks_test(model, **kwargs):
    """Compare two forget and retain model distributions with a 2-sample KS-test and report the p-value.
    Used in the TOFU benchmark as forget_quality when computed over the truth_ratio statistic."""
    # Per-sample scores of the unlearned model on the forget split, produced
    # by the pre_compute dependency metric (e.g. truth_ratio).
    forget_tr_stats = np.array(
        [
            evals["score"]
            for evals in kwargs["pre_compute"]["forget"]["value_by_index"].values()
        ]
    )
    reference_logs = kwargs.get("reference_logs", None)
    if reference_logs:
        # Comparison distribution: the retain-only reference model's scores.
        reference_logs = reference_logs["retain_model_logs"]
        retain_tr_stats = np.array(
            [
                evals["score"]
                for evals in reference_logs["retain"]["value_by_index"].values()
            ]
        )
        fq = ks_2samp(forget_tr_stats, retain_tr_stats)
        pvalue = fq.pvalue
    else:
        # Without reference logs the test cannot be run; report None.
        logger.warning(
            "retain_model_logs not provided in reference_logs, setting forget_quality to None"
        )
        pvalue = None
    return {"agg_value": pvalue}
@unlearning_metric(name="privleak")
def privleak(model, **kwargs):
    """Compare two forget and retain model scores using a relative comparison of a single statistic.
    To be used for MIA AUC scores in ensuring consistency and reproducibility of the MUSE benchmark.
    This function is similar to the rel_diff function below, but because the MUSE benchmark reports
    AUC scores as (1 - x) rather than the more conventional x, both the score and the reference
    are flipped to (1 - value) here before the relative difference is taken."""
    score = kwargs["pre_compute"]["forget"]["agg_value"]
    try:
        ref = kwargs["reference_logs"]["retain_model_logs"]["retain"]["agg_value"]
    except Exception as _:
        logger.warning(
            f"retain_model_logs evals not provided for privleak, using default retain auc of {kwargs['ref_value']}"
        )
        ref = kwargs["ref_value"]
    # MUSE convention: report AUCs as (1 - auc) before comparing.
    score = 1 - score
    ref = 1 - ref
    return {"agg_value": (score - ref) / (ref + 1e-10) * 100}


@unlearning_metric(name="rel_diff")
def rel_diff(model, **kwargs):
    """Compare two forget and retain model scores using a relative comparison of a single statistic.

    Returns (score - ref) / ref * 100, where `score` is the forget-model
    statistic and `ref` comes from retain_model_logs (falling back to
    kwargs["ref_value"] when reference logs are absent or malformed).
    """
    score = kwargs["pre_compute"]["forget"]["agg_value"]
    try:
        ref = kwargs["reference_logs"]["retain_model_logs"]["retain"]["agg_value"]
    except (KeyError, TypeError):
        # Fix: the warning previously said "for privleak" (copy-paste from the
        # function above); also narrowed from a blanket `except Exception`.
        logger.warning(
            f"retain_model_logs evals not provided for rel_diff, using default retain value of {kwargs['ref_value']}"
        )
        ref = kwargs["ref_value"]
    return {"agg_value": (score - ref) / (ref + 1e-10) * 100}


import torch
import numpy as np
import scipy as sc
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from evals.metrics.utils import aggregate_to_1D
from evals.metrics.base import unlearning_metric


@unlearning_metric(name="hm_aggregate")
def hm_aggregate(model, **kwargs):
    """Harmonic mean of the agg_value of all pre_compute dependency metrics."""
    values = [result["agg_value"] for _, result in kwargs["pre_compute"].items()]
    return {"agg_value": sc.stats.hmean(values)}
@unlearning_metric(name="classifier_prob")
def classifier_prob(model, **kwargs):
    """Score pre-computed generations with an external sequence classifier.

    Reads texts from the "text" pre_compute metric's value_by_index, runs a
    HF sequence-classification model over them in batches, and reports the
    softmax probability of `class_id` per sample plus its aggregate mean.
    """
    batch_size = kwargs.get("batch_size", 32)
    max_length = kwargs.get("max_length", 512)
    class_id = kwargs.get("class_id", 0)
    text_key = kwargs.get("text_key", "generation")
    classifier_model_args = kwargs["classifier_model_args"]
    classifier_tokenization_args = kwargs["classifier_tokenization_args"]
    device = kwargs.get("device", "cuda")

    tokenizer = AutoTokenizer.from_pretrained(**classifier_tokenization_args)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        **classifier_model_args
    ).to(device)

    # Flatten the pre_compute results into a list the DataLoader can batch.
    data = kwargs["pre_compute"]["text"]["value_by_index"]
    data_list = [
        {"text": entry[text_key], "index": int(key)} for key, entry in data.items()
    ]

    # Create DataLoader
    dataloader = DataLoader(data_list, batch_size=batch_size, shuffle=False)

    scores_by_index = {}
    for batch in tqdm(dataloader):
        batch_texts = batch["text"]
        batch_indices = batch["index"].tolist()

        # Tokenize the batch of texts
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            return_attention_mask=True,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run the classifier
        with torch.no_grad():
            outputs = classifier(**inputs)
        # Convert logits to probabilities for the requested class.
        scores = F.softmax(outputs.logits, dim=-1)[:, class_id].cpu().numpy().tolist()

        # Map predictions to labels
        for idx, prob, text in zip(batch_indices, scores, batch_texts):
            # Add the prediction to the original data
            scores_by_index[idx] = {"score": prob, text_key: text}
    class_scores = np.array(
        [
            evals["score"]
            for evals in scores_by_index.values()
            if evals["score"] is not None
        ]
    )
    class_scores = aggregate_to_1D(class_scores)
    return {"agg_value": np.mean(class_scores), "value_by_index": scores_by_index}
from evals.base import Evaluator


class MUSEEvaluator(Evaluator):
    """Evaluator preconfigured for the MUSE benchmark (name "MUSE")."""

    def __init__(self, eval_cfg, **kwargs):
        super().__init__("MUSE", eval_cfg, **kwargs)


class TOFUEvaluator(Evaluator):
    """Evaluator preconfigured for the TOFU benchmark (name "TOFU")."""

    def __init__(self, eval_cfg, **kwargs):
        super().__init__("TOFU", eval_cfg, **kwargs)


from transformers import AutoModelForCausalLM, AutoTokenizer
from omegaconf import DictConfig, open_dict
from typing import Dict, Any
import os
import torch
import logging
from model.probe import ProbedLlamaForCausalLM

hf_home = os.getenv("HF_HOME", default=None)

logger = logging.getLogger(__name__)

# Maps handler-class names (strings in configs) to loadable model classes.
MODEL_REGISTRY: Dict[str, Any] = {}


def _register_model(model_class):
    """Register a model class under its own class name."""
    MODEL_REGISTRY[model_class.__name__] = model_class


def get_dtype(model_args):
    """Pop `torch_dtype` from model_args and translate it to a torch dtype.

    Mutates `model_args` (removes the key) so the remaining args can be passed
    straight to `from_pretrained`. Defaults to torch.float32 when unset.
    """
    with open_dict(model_args):
        torch_dtype = model_args.pop("torch_dtype", None)
    if model_args.get("attn_implementation", None) == "flash_attention_2":
        # This check handles https://github.com/Dao-AILab/flash-attention/blob/7153673c1a3c7753c38e4c10ef2c98a02be5f778/flash_attn/flash_attn_triton.py#L820
        # If you want to run at other precisions consider running "training or inference using
        # Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):`
        # decorator" or using an attn_implementation compatible with the precision in the model
        # config.
        # NOTE(review): assert-based validation is stripped under `python -O`;
        # the ValueError instance serves only as the assertion message here.
        assert torch_dtype in ["float16", "bfloat16"], ValueError(
            f"Invalid torch_dtype '{torch_dtype}' for the requested attention "
            f"implementation: 'flash_attention_2'. Supported types are 'float16' "
            f"and 'bfloat16'."
        )
    if torch_dtype == "float16":
        return torch.float16
    elif torch_dtype == "bfloat16":
        return torch.bfloat16
    return torch.float32


def get_model(model_cfg: DictConfig):
    """Load a (model, tokenizer) pair as described by a configs/model entry.

    Raises ValueError when the config is missing or the underlying
    `from_pretrained` call fails (original exception is chained implicitly).
    """
    assert model_cfg is not None and model_cfg.model_args is not None, ValueError(
        "Model config not found or model_args absent in configs/model."
    )
    model_args = model_cfg.model_args
    tokenizer_args = model_cfg.tokenizer_args
    torch_dtype = get_dtype(model_args)
    # Handler string selects the loading class (e.g. the probed Llama variant).
    model_handler = model_cfg.get("model_handler", "AutoModelForCausalLM")
    model_cls = MODEL_REGISTRY[model_handler]
    with open_dict(model_args):
        model_path = model_args.pop("pretrained_model_name_or_path", None)
        try:
            model = model_cls.from_pretrained(
                pretrained_model_name_or_path=model_path,
                torch_dtype=torch_dtype,
                **model_args,
                cache_dir=hf_home,
            )
        except Exception as e:
            logger.warning(f"Model {model_path} requested with {model_cfg.model_args}")
            raise ValueError(
                f"Error {e} while fetching model using {model_handler}.from_pretrained()."
            )
    tokenizer = get_tokenizer(tokenizer_args)
    return model, tokenizer


def _add_or_replace_eos_token(tokenizer, eos_token: str) -> None:
    """Set `eos_token` on the tokenizer, logging whether it was added anew or
    replaced an existing one."""
    is_added = tokenizer.eos_token_id is None
    num_added_tokens = tokenizer.add_special_tokens({"eos_token": eos_token})

    if is_added:
        logger.info("Add eos token: {}".format(tokenizer.eos_token))
    else:
        logger.info("Replace eos token: {}".format(tokenizer.eos_token))

    if num_added_tokens > 0:
        logger.info("New tokens have been added, make sure `resize_vocab` is True.")


def get_tokenizer(tokenizer_cfg: DictConfig):
    """Load a tokenizer from config, guaranteeing usable eos and pad tokens."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(**tokenizer_cfg, cache_dir=hf_home)
    except Exception as e:
        error_message = (
            f"{'--' * 40}\n"
            f"Error {e} fetching tokenizer using AutoTokenizer.\n"
            f"Tokenizer requested from path: {tokenizer_cfg.get('pretrained_model_name_or_path', None)}\n"
            f"Full tokenizer config: {tokenizer_cfg}\n"
            f"{'--' * 40}"
        )
        raise RuntimeError(error_message)

    if tokenizer.eos_token_id is None:
        logger.info("replacing eos_token with <|endoftext|>")
        _add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")

    if tokenizer.pad_token_id is None:
        # Fall back to eos as padding so batched generation works.
        tokenizer.pad_token = tokenizer.eos_token
        logger.info("Setting pad_token as eos token: {}".format(tokenizer.pad_token))

    return tokenizer


# register models
_register_model(AutoModelForCausalLM)
_register_model(ProbedLlamaForCausalLM)


from transformers import AutoConfig, LlamaForCausalLM
import torch
import torch.nn as nn
import logging
import gc
from copy import deepcopy
from transformers import AutoModelForCausalLM
class ProbedLlamaForCausalLM(LlamaForCausalLM):
    """
    Class for loading a LlamaForCausalLM model with the following custom behavior:
    - Initializes only the first `n_layers` of the model.
    - Sets up a newly initialized `lm_head`, optionally using weights from
      `head_pretrained_model_name_or_path`
    - Trains only the lm_head parameters with rest of the model frozen.
    - Once the model is saved during training, for inference it can also be loaded using
      AutoModelForCausalLM
    """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        head_pretrained_model_name_or_path: str = None,
        n_layers: int = 100,
        freeze_base_model: bool = True,
        **kwargs,
    ):
        """Load a layer-truncated Llama with a re-initialized, trainable lm_head.

        Args:
            pretrained_model_name_or_path: Base checkpoint to load.
            head_pretrained_model_name_or_path: Optional checkpoint whose
                ``lm_head`` weights seed the probe head; when None the head is
                freshly initialized via ``_init_weights``.
            n_layers: Keep at most this many transformer layers (capped at the
                model's actual depth, so the default of 100 effectively keeps
                all layers for typical models).
            freeze_base_model: When True, only parameters whose names start
                with ``lm_head`` remain trainable; otherwise all are trainable.
            **kwargs: Forwarded to ``AutoConfig.from_pretrained``; unused keys
                are passed on to the model ``from_pretrained`` calls.

        Returns:
            LlamaForCausalLM: The truncated, probe-headed model.
        """
        config, unused_kwargs = AutoConfig.from_pretrained(
            pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs
        )
        # Untie input/output embeddings so the new lm_head can be trained
        # independently of the token embeddings.
        config.tie_word_embeddings = False
        model: LlamaForCausalLM = super().from_pretrained(
            pretrained_model_name_or_path, config=config, **unused_kwargs
        )

        # Limit number of transformer layers (cap requested depth at actual).
        n_layers = min(n_layers, model.config.num_hidden_layers)
        model.config.num_hidden_layers = n_layers
        model.model.layers = nn.ModuleList(model.model.layers[:n_layers])

        # Reinitialize lm_head.
        # Use a parameter of the last kept layer only to infer the target device.
        ref_params = list(model.model.layers[-1].parameters())[0]
        device = ref_params.device
        if head_pretrained_model_name_or_path is not None:
            logger.info(
                f"Initialising lm_head from {head_pretrained_model_name_or_path}"
            )
            # Load a donor model just to copy its lm_head weights.
            head_model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
                head_pretrained_model_name_or_path, config=config, **unused_kwargs
            )
            lm_head = deepcopy(head_model.lm_head).to(device)
            model.set_output_embeddings(lm_head)
        else:
            logger.info("Initialising new lm_head")
            model._init_weights(model.lm_head)

        # Cleanup: reclaim memory from the dropped layers / temporary donor model.
        gc.collect()
        torch.cuda.empty_cache()

        # Set trainable params: freeze everything except lm_head when requested.
        for name, p in model.named_parameters():
            p.requires_grad = not freeze_base_model or name.startswith("lm_head")
        logger.info(
            f"Initialised a ProbedLlamaForCausalLM model with {n_layers} layers"
        )
        return model
21 | model, tokenizer = get_model(model_cfg) 22 | 23 | # Load Dataset 24 | data_cfg = cfg.data 25 | data = get_data( 26 | data_cfg, mode=mode, tokenizer=tokenizer, template_args=template_args 27 | ) 28 | 29 | # Load collator 30 | collator_cfg = cfg.collator 31 | collator = get_collators(collator_cfg, tokenizer=tokenizer) 32 | 33 | # Get Trainer 34 | trainer_cfg = cfg.trainer 35 | assert trainer_cfg is not None, ValueError("Please set trainer") 36 | 37 | # Get Evaluators 38 | evaluators = None 39 | eval_cfgs = cfg.get("eval", None) 40 | if eval_cfgs: 41 | evaluators = get_evaluators( 42 | eval_cfgs=eval_cfgs, 43 | template_args=template_args, 44 | model=model, 45 | tokenizer=tokenizer, 46 | ) 47 | 48 | trainer, trainer_args = load_trainer( 49 | trainer_cfg=trainer_cfg, 50 | model=model, 51 | train_dataset=data.get("train", None), 52 | eval_dataset=data.get("eval", None), 53 | tokenizer=tokenizer, 54 | data_collator=collator, 55 | evaluators=evaluators, 56 | template_args=template_args, 57 | ) 58 | 59 | if trainer_args.do_train: 60 | trainer.train() 61 | trainer.save_state() 62 | trainer.save_model(trainer_args.output_dir) 63 | 64 | if trainer_args.do_eval: 65 | trainer.evaluate(metric_key_prefix="eval") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /src/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Dict, Any 3 | from omegaconf import DictConfig 4 | from transformers import Trainer, TrainingArguments 5 | 6 | from trainer.base import FinetuneTrainer 7 | from trainer.unlearn.grad_ascent import GradAscent 8 | from trainer.unlearn.grad_diff import GradDiff 9 | from trainer.unlearn.npo import NPO 10 | from trainer.unlearn.dpo import DPO 11 | from trainer.unlearn.simnpo import SimNPO 12 | from trainer.unlearn.rmu import RMU 13 | from trainer.unlearn.undial import UNDIAL 14 | 15 | import 
def load_trainer_args(trainer_args: DictConfig, dataset):
    """Materialize HF ``TrainingArguments``, resolving ``warmup_epochs``.

    If ``warmup_epochs`` is present in the config it is converted to
    ``warmup_steps`` from the dataset length and the effective global batch
    size (per-device batch * grad accumulation * device count).

    Args:
        trainer_args (DictConfig): Raw trainer arguments from the config.
        dataset: Training dataset; its length is only needed when
            ``warmup_epochs`` is set.

    Returns:
        TrainingArguments: The constructed training arguments.
    """
    trainer_args = dict(trainer_args)
    warmup_epochs = trainer_args.pop("warmup_epochs", None)
    if warmup_epochs:
        batch_size = trainer_args["per_device_train_batch_size"]
        grad_accum_steps = trainer_args["gradient_accumulation_steps"]
        # Fix: torch.cuda.device_count() is 0 on CPU-only machines, which
        # previously made the division below raise ZeroDivisionError.
        # Treat a GPU-less host as a single device.
        num_devices = max(torch.cuda.device_count(), 1)
        dataset_len = len(dataset)
        trainer_args["warmup_steps"] = int(
            (warmup_epochs * dataset_len)
            // (batch_size * grad_accum_steps * num_devices)
        )

    trainer_args = TrainingArguments(**trainer_args)
    return trainer_args
class FinetuneTrainer(Trainer):
    """Hugging Face ``Trainer`` extended to run custom benchmark evaluators.

    Args:
        evaluators: Optional dict mapping evaluator name -> evaluator object;
            each evaluator must expose ``evaluate(**kwargs) -> dict`` of
            metrics.
        template_args: Chat-template arguments forwarded to each evaluator.
    """

    def __init__(self, evaluators=None, template_args=None, *args, **kwargs):
        self.evaluators = evaluators
        self.template_args = template_args
        super().__init__(*args, **kwargs)

    def evaluate(
        self,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
        trial: Dict[str, Any] = None,
    ) -> Dict[str, float]:
        """Run custom evaluators when configured, else defer to HF evaluation.

        Returns:
            Dict[str, float]: Collected metrics; empty when nothing ran.
        """
        # Run a custom evaluator and save results
        if self.evaluators:
            # Bug fix: eval_metrics was previously bound only on the local
            # main process, so every other process hit a NameError at the
            # `return eval_metrics` below. Bind it once, up front.
            eval_metrics = {}
            if self.accelerator.is_local_main_process:
                if self.accelerator.num_processes == 1:
                    run_dir = self._get_output_dir(trial=trial)
                    checkpoint_folder = (
                        f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
                    )
                    # Evaluation artifacts live under
                    # <run_dir>/checkpoint-<step>/evals
                    output_dir = os.path.join(run_dir, checkpoint_folder, "evals")
                    os.makedirs(output_dir, exist_ok=True)
                    for _, evaluator in self.evaluators.items():
                        eval_args = {
                            "output_dir": output_dir,
                            "template_args": self.template_args,
                            "model": self.model,
                            "tokenizer": self.tokenizer,
                        }
                        eval_metrics.update(evaluator.evaluate(**eval_args))
                    self.log(eval_metrics)
                else:
                    logger.warning(
                        "Custom evaluator can be run with this Trainer only when a single accelerator process is running."
                    )
            return eval_metrics

        if eval_dataset is None:
            return {}
        # Run the default HF Trainer evaluate method when eval dataset is provided
        return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
class GradDiff(UnlearnTrainer):
    """Gradient-difference unlearning: ascend on forget data, descend on retain."""

    def __init__(self, gamma=1.0, alpha=1.0, retain_loss_type="NLL", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        self.alpha = alpha
        self.retain_loss_type = retain_loss_type
        # A frozen reference model is only required for the KL retain objective.
        self.ref_model = (
            self._prepare_ref_model(self.model) if retain_loss_type == "KL" else None
        )

    def _prepare_ref_model(self, model):
        """Deep-copy `model`, freeze it, and wrap it for distributed inference."""
        frozen = copy.deepcopy(model).to(self.accelerator.device)
        frozen.eval()
        if self.is_deepspeed_enabled:
            return self._prepare_deepspeed(frozen)
        return self.accelerator.prepare_model(frozen, evaluation_mode=True)

    def compute_retain_loss(self, model, retain_inputs):
        """Loss anchoring the model to the retain distribution (NLL or KL)."""
        retain_outputs = model(**retain_inputs)
        if self.retain_loss_type == "NLL":
            return 0.0 + retain_outputs.loss
        if self.retain_loss_type == "KL":
            # NOTE(review): this compares self.model (not the `model` argument)
            # against the frozen reference — confirm that is intended when the
            # trainer hands compute_loss a wrapped (e.g. DDP) model.
            kl_loss, _ = compute_kl_divergence(
                self.model, self.ref_model, retain_inputs
            )
            return 0.0 + kl_loss
        raise NotImplementedError(
            f"{self.retain_loss_type} not implemented for retain set"
        )

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return gamma * (negated forget NLL) + alpha * retain loss."""
        forget_batch = {
            key: inputs["forget"][key]
            for key in ("input_ids", "attention_mask", "labels")
        }
        forget_outputs = model(**forget_batch)
        forget_loss = -forget_outputs.loss

        retain_batch = {
            key: inputs["retain"][key]
            for key in ("input_ids", "attention_mask", "labels")
        }
        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_batch)

        loss = self.gamma * forget_loss + self.alpha * retain_loss

        return (loss, forget_outputs) if return_outputs else loss
class SimNPO(GradDiff):
    """Reference-free NPO variant using a length-normalized NLL margin."""

    def __init__(self, delta=0.0, beta=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.delta = delta
        self.beta = beta

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return gamma * SimNPO forget loss + alpha * retain loss."""
        forget_batch = inputs["forget"]

        # Per-sequence NLL, normalized by the number of supervised tokens
        # (labels != -100), then shifted by the delta margin.
        supervised_mask = forget_batch["labels"] != -100
        seq_nll, forget_outputs = compute_batch_nll(model, forget_batch)
        margin = seq_nll / supervised_mask.sum(-1) - self.delta
        forget_loss = -F.logsigmoid(self.beta * margin).mean() * 2 / self.beta

        retain_batch = {
            key: inputs["retain"][key]
            for key in ("input_ids", "attention_mask", "labels")
        }
        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_batch)

        loss = self.gamma * forget_loss + self.alpha * retain_loss
        return (loss, forget_outputs) if return_outputs else loss
def compute_kl_divergence(model, target_model, inputs):
    """Token-level KL divergence between a frozen reference and the model.

    Runs ``target_model`` without gradients to obtain reference log-probs and
    ``model`` with gradients, then computes
    ``F.kl_div(current, ref, log_target=True)`` — i.e. KL(reference || model)
    — averaged via "batchmean" over all flattened token positions.

    Args:
        model: Trainable model; called as ``model(**inputs)`` and expected to
            return an object with a ``logits`` attribute.
        target_model: Frozen reference model with the same calling convention.
        inputs: Keyword arguments forwarded to both models.

    Returns:
        tuple: ``(kl_loss, outputs)`` where ``outputs`` is the trainable
        model's forward result, so callers can reuse its logits.
    """
    with torch.no_grad():
        ref_outputs = target_model(**inputs)

    # Fix: the original computed this log_softmax twice on consecutive lines;
    # the duplicate was dead work and has been removed.
    ref_probs = F.log_softmax(ref_outputs.logits, dim=-1)
    ref_probs = ref_probs.view(-1, ref_outputs.logits.shape[-1])

    outputs = model(**inputs)
    current_probs = F.log_softmax(outputs.logits, dim=-1)
    current_probs = current_probs.view(-1, outputs.logits.shape[-1])

    # log_target=True: both arguments are log-probabilities.
    return nn.functional.kl_div(
        current_probs, ref_probs, reduction="batchmean", log_target=True
    ), outputs
pass on the teacher model (no grad) 81 | with torch.no_grad(): 82 | teacher_logits = ref_model(**inputs).logits 83 | shift_teacher_logits = teacher_logits[..., :-1, :].contiguous() 84 | 85 | # Build the mask that identifies the tokens need to be unlearned 86 | mask = torch.zeros_like(shift_teacher_logits) 87 | batch_idx = torch.arange(mask.shape[0]).view(-1, 1, 1) 88 | seq_idx = torch.arange(mask.shape[1]).view(1, -1, 1) 89 | mask[batch_idx, seq_idx, shift_labels.unsqueeze(-1)] = 1.0 90 | 91 | # Adjust teacher logits: subtract di_strength on the correct token 92 | pre_softmax = shift_teacher_logits - mask * beta 93 | soft_label = F.softmax(pre_softmax, dim=-1) 94 | 95 | loss_fct = nn.CrossEntropyLoss(reduction="none") 96 | loss = loss_fct( 97 | shift_logits.view(-1, shift_logits.size(-1)), 98 | soft_label.view(-1, soft_label.size(-1)), 99 | ) 100 | return loss.mean(), outputs 101 | --------------------------------------------------------------------------------