├── .github
├── ISSUE_TEMPLATE
│ ├── bug-report.yaml
│ ├── config.yaml
│ └── feature-request.yaml
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── Makefile
├── README.md
├── assets
├── banner.png
└── logo.png
├── community
├── benchmarks
│ └── template
│ │ ├── README.md
│ │ └── run.sh
├── leaderboard.md
└── methods
│ ├── AltPO
│ ├── README.md
│ ├── generate.py
│ ├── generate.yaml
│ └── run.sh
│ ├── UNDIAL
│ ├── README.md
│ └── run.sh
│ └── template
│ ├── README.md
│ └── run.sh
├── configs
├── accelerate
│ ├── default_config.yaml
│ └── zero_stage3_offload_config.json
├── collator
│ ├── DataCollatorForSupervisedDataset.yaml
│ └── DataCollatorForSupervisedDatasetwithIndex.yaml
├── data
│ ├── datasets
│ │ ├── MUSE_MIA.yaml
│ │ ├── MUSE_forget.yaml
│ │ ├── MUSE_forget_knowmem.yaml
│ │ ├── MUSE_forget_scal.yaml
│ │ ├── MUSE_forget_sust.yaml
│ │ ├── MUSE_forget_verbmem.yaml
│ │ ├── MUSE_retain.yaml
│ │ ├── MUSE_retain_knowmem.yaml
│ │ ├── TOFU_MIA.yaml
│ │ ├── TOFU_QA_forget.yaml
│ │ ├── TOFU_QA_forget_idk.yaml
│ │ ├── TOFU_QA_forget_para.yaml
│ │ ├── TOFU_QA_forget_pert.yaml
│ │ ├── TOFU_QA_full.yaml
│ │ ├── TOFU_QA_ra.yaml
│ │ ├── TOFU_QA_ra_pert.yaml
│ │ ├── TOFU_QA_retain.yaml
│ │ ├── TOFU_QA_retain_eval.yaml
│ │ ├── TOFU_QA_retain_para.yaml
│ │ ├── TOFU_QA_retain_pert.yaml
│ │ ├── TOFU_QA_wf.yaml
│ │ ├── TOFU_QA_wf_pert.yaml
│ │ ├── WMDP_forget.yaml
│ │ └── WMDP_retain.yaml
│ ├── finetune.yaml
│ └── unlearn.yaml
├── eval.yaml
├── eval
│ ├── lm_eval.yaml
│ ├── muse.yaml
│ ├── muse_metrics
│ │ ├── exact_memorization.yaml
│ │ ├── extraction_strength.yaml
│ │ ├── forget_gibberish.yaml
│ │ ├── forget_knowmem_ROUGE.yaml
│ │ ├── forget_verbmem_ROUGE.yaml
│ │ ├── mia_gradnorm.yaml
│ │ ├── mia_loss.yaml
│ │ ├── mia_min_k.yaml
│ │ ├── mia_min_k_plus_plus.yaml
│ │ ├── mia_reference.yaml
│ │ ├── mia_zlib.yaml
│ │ ├── privleak.yaml
│ │ └── retain_knowmem_ROUGE.yaml
│ ├── tofu.yaml
│ └── tofu_metrics
│ │ ├── exact_memorization.yaml
│ │ ├── extraction_strength.yaml
│ │ ├── forget_Q_A_PARA_Prob.yaml
│ │ ├── forget_Q_A_PARA_ROUGE.yaml
│ │ ├── forget_Q_A_PERT_Prob.yaml
│ │ ├── forget_Q_A_PERT_ROUGE.yaml
│ │ ├── forget_Q_A_Prob.yaml
│ │ ├── forget_Q_A_ROUGE.yaml
│ │ ├── forget_Q_A_gibberish.yaml
│ │ ├── forget_Truth_Ratio.yaml
│ │ ├── forget_quality.yaml
│ │ ├── mia_gradnorm.yaml
│ │ ├── mia_loss.yaml
│ │ ├── mia_min_k.yaml
│ │ ├── mia_min_k_plus_plus.yaml
│ │ ├── mia_reference.yaml
│ │ ├── mia_zlib.yaml
│ │ ├── model_utility.yaml
│ │ ├── privleak.yaml
│ │ ├── ra_Q_A_PERT_Prob.yaml
│ │ ├── ra_Q_A_Prob.yaml
│ │ ├── ra_Q_A_Prob_normalised.yaml
│ │ ├── ra_Q_A_ROUGE.yaml
│ │ ├── ra_Truth_Ratio.yaml
│ │ ├── retain_Q_A_PARA_Prob.yaml
│ │ ├── retain_Q_A_PERT_Prob.yaml
│ │ ├── retain_Q_A_Prob.yaml
│ │ ├── retain_Q_A_ROUGE.yaml
│ │ ├── retain_Truth_Ratio.yaml
│ │ ├── wf_Q_A_PERT_Prob.yaml
│ │ ├── wf_Q_A_Prob.yaml
│ │ ├── wf_Q_A_Prob_normalised.yaml
│ │ ├── wf_Q_A_ROUGE.yaml
│ │ └── wf_Truth_Ratio.yaml
├── experiment
│ ├── eval
│ │ ├── muse
│ │ │ └── default.yaml
│ │ ├── tofu
│ │ │ └── default.yaml
│ │ └── wmdp
│ │ │ └── default.yaml
│ ├── examples
│ │ ├── muse_unlearn.yaml
│ │ └── tofu_eval.yaml
│ ├── finetune
│ │ └── tofu
│ │ │ └── default.yaml
│ └── unlearn
│ │ ├── muse
│ │ ├── default.yaml
│ │ ├── scalability.yaml
│ │ └── sustainabilty.yaml
│ │ ├── tofu
│ │ ├── default.yaml
│ │ └── idk.yaml
│ │ └── wmdp
│ │ └── default.yaml
├── generation
│ └── default.yaml
├── hydra
│ ├── default.yaml
│ └── eval.yaml
├── model
│ ├── Llama-2-7b-chat-hf.yaml
│ ├── Llama-2-7b-hf.yaml
│ ├── Llama-3.1-8B-Instruct.yaml
│ ├── Llama-3.2-1B-Instruct.yaml
│ ├── Llama-3.2-3B-Instruct.yaml
│ ├── Phi-3.5-mini-instruct.yaml
│ ├── gemma-7b-it.yaml
│ ├── phi-1_5.yaml
│ └── zephyr-7b-beta.yaml
├── paths
│ └── default.yaml
├── train.yaml
├── trainer
│ ├── DPO.yaml
│ ├── GradAscent.yaml
│ ├── GradDiff.yaml
│ ├── NPO.yaml
│ ├── RMU.yaml
│ ├── SimNPO.yaml
│ ├── UNDIAL.yaml
│ └── finetune.yaml
└── unlearn.yaml
├── docs
├── components.md
├── contributing.md
├── evaluation.md
├── experiments.md
├── hydra.md
├── links.md
└── repro.md
├── requirements.txt
├── scripts
├── muse_unlearn.sh
├── tofu_finetune.sh
└── tofu_unlearn.sh
├── setup.py
├── setup_data.py
└── src
├── data
├── __init__.py
├── collators.py
├── pretraining.py
├── qa.py
├── unlearn.py
└── utils.py
├── eval.py
├── evals
├── __init__.py
├── base.py
├── lm_eval.py
├── metrics
│ ├── __init__.py
│ ├── base.py
│ ├── memorization.py
│ ├── mia
│ │ ├── __init__.py
│ │ ├── all_attacks.py
│ │ ├── gradnorm.py
│ │ ├── loss.py
│ │ ├── min_k.py
│ │ ├── min_k_plus_plus.py
│ │ ├── reference.py
│ │ ├── utils.py
│ │ └── zlib.py
│ ├── privacy.py
│ ├── utility.py
│ └── utils.py
├── muse.py
└── tofu.py
├── model
├── __init__.py
└── probe.py
├── train.py
└── trainer
├── __init__.py
├── base.py
├── unlearn
├── base.py
├── dpo.py
├── grad_ascent.py
├── grad_diff.py
├── npo.py
├── rmu.py
├── simnpo.py
└── undial.py
└── utils.py
/.github/ISSUE_TEMPLATE/bug-report.yaml:
--------------------------------------------------------------------------------
1 | # Picked from https://github.com/huggingface/transformers/blob/main/.github/ISSUE_TEMPLATE/bug-report.yml
2 | name: "\U0001F41B Bug Report"
3 | description: Submit a bug report to help us improve open-unlearning
4 | labels: [ "bug" ]
5 | body:
6 | - type: checkboxes
7 | id: information-scripts-examples
8 | attributes:
9 | label: Information
10 | description: 'The problem arises when using:'
11 | options:
12 | - label: "The official example scripts"
13 | - label: "My own modified scripts"
14 |
15 | - type: checkboxes
16 | id: information-tasks
17 | attributes:
18 | label: Tasks
19 | description: "The tasks I am working on are:"
20 | options:
21 | - label: "An officially supported task"
22 | - label: "My own task or dataset (give details below)"
23 |
24 | - type: textarea
25 | id: reproduction
26 | validations:
27 | required: true
28 | attributes:
29 | label: Reproduction
30 | description: |
31 | Please provide a code sample that reproduces the problem you ran into.
32 | Please include relevant config information such as deepspeed configs and experiment configs in .hydra folder of your experiment.
33 | If you have code snippets, error messages, stack traces please provide them here as well.
34 | Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
35 | Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
36 |
37 | placeholder: |
38 | Steps to reproduce the behavior:
39 |
40 | 1.
41 | 2.
42 | 3.
43 |
44 |
45 | - type: textarea
46 | id: expected-behavior
47 | validations:
48 | required: true
49 | attributes:
50 | label: Expected behavior
51 | description: "A clear and concise description of what you would expect to happen."
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yaml:
--------------------------------------------------------------------------------
1 | # Picked from https://github.com/huggingface/transformers/blob/main/.github/ISSUE_TEMPLATE/config.yml
2 | blank_issues_enabled: true
3 | version: 2.1
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.yaml:
--------------------------------------------------------------------------------
1 | # Picked from https://github.com/huggingface/transformers/blob/main/.github/ISSUE_TEMPLATE/feature-request.yml
2 | name: "\U0001F680 Feature request"
3 | description: Submit a proposal/request for a new open-unlearning feature
4 | labels: [ "Feature request" ]
5 | body:
6 | - type: checkboxes
7 | id: information-tasks
8 | attributes:
9 | label: Tasks
10 | description: "New feature belongs to adding"
11 | options:
12 | - label: "Benchmark"
13 | - label: "Unlearning method"
14 | - label: "Evaluation"
15 | - label: "Dataset"
16 | - label: "None of the above"
17 |
18 | - type: textarea
19 | id: feature-request
20 | validations:
21 | required: true
22 | attributes:
23 | label: Feature request
24 | description: |
25 | A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist.
26 |
27 | - type: textarea
28 | id: motivation
29 | validations:
30 | required: true
31 | attributes:
32 | label: Motivation
33 | description: |
34 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
35 |
36 | - type: textarea
37 | id: implementation
38 | validations:
39 | required: false
40 | attributes:
41 | label: Implementation
42 | description: |
43 | Please describe your proposed solution in detail. Outline the implementation approach, including any key technical considerations. If there are challenges or blockers preventing implementation, specify them along with potential workarounds or dependencies.
44 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # What does this PR do?
2 |
3 |
4 | Fixes # (issue)
5 |
6 |
7 | ## Before submitting
8 | - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
9 | - [ ] Have you gone through the contributions [guide](../docs/contributing.md)?
10 | - [ ] Are your changes documented? Read documentation guidelines [here](../README.md#-further-documentation).
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | paths:
6 | - "**.py"
7 | - "requirements.txt"
8 | - ".github/workflows/*.yml"
9 | pull_request:
10 | paths:
11 | - "**.py"
12 | - "requirements.txt"
13 | - ".github/workflows/*.yml"
14 |
15 | jobs:
16 | tests:
17 | strategy:
18 | fail-fast: false
19 | matrix:
20 | python-version:
21 | - "3.11"
22 | os:
23 | - "ubuntu-latest"
24 |
25 | runs-on: ubuntu-latest
26 |
27 | environment:
28 | name: tests
29 |
30 | env:
31 | # HF_TOKEN: ${{ secrets.HF_TOKEN }}
32 | OS_NAME: ${{ matrix.os }}
33 |
34 | steps:
35 | - name: Checkout
36 | uses: actions/checkout@v4
37 |
38 | - name: Set up Python
39 | uses: actions/setup-python@v5
40 | with:
41 | python-version: ${{ matrix.python-version }}
42 | cache: "pip"
43 | cache-dependency-path: "setup.py"
44 |
45 | - name: Install dependencies
46 | run: |
47 | python -m pip install --upgrade pip
48 | pip install ruff==0.6.6
49 |
50 | - name: Check Quality
51 | run: make quality
52 |
53 | # - name: Test with pytest
54 | # run: |
55 | # cd
56 | # make test
57 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # custom .gitignore
2 | submit.py
3 | !src/data/__init__.py
4 | logs/
5 | unity/
6 | src/data/test.py
7 | src/data/__pycache__
8 | ms_cache/
9 | logs/
10 | hf_cache/
11 | cache*/
12 | saves*/
13 | notebooks/
14 | output*/
15 | wandb/
16 | data/
17 | !*/data/
18 | evals/
19 | !*/evals/
20 | # Byte-compiled / optimized / DLL files
21 | __pycache__/
22 | *.py[cod]
23 | *$py.class
24 | eval_logs/
25 | eval_dumps/
26 | # C extensions
27 | *.so
28 |
29 | # Distribution / packaging
30 | .Python
31 | build/
32 | develop-eggs/
33 | dist/
34 | downloads/
35 | eggs/
36 | .eggs/
37 | lib/
38 | lib64/
39 | parts/
40 | sdist/
41 | var/
42 | wheels/
43 | share/python-wheels/
44 | *.egg-info/
45 | .installed.cfg
46 | *.egg
47 | MANIFEST
48 |
49 | # PyInstaller
50 | # Usually these files are written by a python script from a template
51 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
52 | *.manifest
53 | *.spec
54 |
55 | # Installer logs
56 | pip-log.txt
57 | pip-delete-this-directory.txt
58 |
59 | # Unit test / coverage reports
60 | htmlcov/
61 | .tox/
62 | .nox/
63 | .coverage
64 | .coverage.*
65 | .cache
66 | nosetests.xml
67 | coverage.xml
68 | *.cover
69 | *.py,cover
70 | .hypothesis/
71 | .pytest_cache/
72 | .ruff_cache/
73 | cover/
74 |
75 | # Translations
76 | *.mo
77 | *.pot
78 |
79 | # Django stuff:
80 | *.log
81 | local_settings.py
82 | db.sqlite3
83 | db.sqlite3-journal
84 |
85 | # Flask stuff:
86 | instance/
87 | .webassets-cache
88 |
89 | # Scrapy stuff:
90 | .scrapy
91 |
92 | # Sphinx documentation
93 | docs/_build/
94 |
95 | # PyBuilder
96 | .pybuilder/
97 | target/
98 |
99 | # Jupyter Notebook
100 | .ipynb_checkpoints
101 |
102 | # IPython
103 | profile_default/
104 | ipython_config.py
105 |
106 | # pyenv
107 | # For a library or package, you might want to ignore these files since the code is
108 | # intended to run in multiple environments; otherwise, check them in:
109 | # .python-version
110 |
111 | # pipenv
112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
115 | # install all needed dependencies.
116 | #Pipfile.lock
117 |
118 | # poetry
119 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
120 | # This is especially recommended for binary packages to ensure reproducibility, and is more
121 | # commonly ignored for libraries.
122 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
123 | #poetry.lock
124 |
125 | # pdm
126 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
127 | #pdm.lock
128 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
129 | # in version control.
130 | # https://pdm.fming.dev/#use-with-ide
131 | .pdm.toml
132 |
133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
134 | __pypackages__/
135 |
136 | # Celery stuff
137 | celerybeat-schedule
138 | celerybeat.pid
139 |
140 | # SageMath parsed files
141 | *.sage.py
142 |
143 | # Environments
144 | .env
145 | .venv
146 | env/
147 | venv/
148 | ENV/
149 | env.bak/
150 | venv.bak/
151 |
152 | # Spyder project settings
153 | .spyderproject
154 | .spyproject
155 |
156 | # Rope project settings
157 | .ropeproject
158 |
159 | # mkdocs documentation
160 | /site
161 |
162 | # mypy
163 | .mypy_cache/
164 | .dmypy.json
165 | dmypy.json
166 |
167 | # Pyre type checker
168 | .pyre/
169 |
170 | # pytype static type analyzer
171 | .pytype/
172 |
173 | # Cython debug symbols
174 | cython_debug/
175 |
176 | # PyCharm
177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
179 | # and can be added to the global gitignore or merged into this file. For a more nuclear
180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
181 | .idea/
182 |
183 | .vscode/
184 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |
3 | - repo: https://github.com/astral-sh/ruff-pre-commit
4 | rev: v0.6.9
5 | hooks:
6 | - id: ruff
7 | args: [check, --fix, scripts, src, setup.py, setup_data.py]
8 | - id: ruff
9 | args: [format, --check, scripts, src, setup.py, setup_data.py]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 CMU Locus Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: quality style
2 |
3 | check_dirs := scripts src #setup.py
4 |
5 | quality:
6 | ruff check $(check_dirs) setup.py setup_data.py
7 | ruff format --check $(check_dirs) setup.py setup_data.py
8 |
9 | style:
10 | ruff check $(check_dirs) setup.py setup_data.py --fix
11 | ruff format $(check_dirs) setup.py setup_data.py
12 |
13 | test:
14 | CUDA_VISIBLE_DEVICES= pytest tests/
15 |
--------------------------------------------------------------------------------
/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/locuslab/open-unlearning/b71de54c179408d447bc383c86fd1fafcc99dc14/assets/banner.png
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/locuslab/open-unlearning/b71de54c179408d447bc383c86fd1fafcc99dc14/assets/logo.png
--------------------------------------------------------------------------------
/community/benchmarks/template/README.md:
--------------------------------------------------------------------------------
1 | # TITLE
2 |
3 | - Paper title, authors, links.
4 |
5 | Provide a concise summary of your benchmark details and its contributions. Please avoid using images to keep the repository size manageable.
6 |
7 | # Datasets
8 |
9 | Use a clear and consistent naming convention for dataset splits.
10 |
11 | - [ ] Provide a link to find/download the datasets (preferably HuggingFace).
12 |
13 | # Models
14 |
15 |
16 | - [ ] Upload any unlearning target or reference retain models for unlearning preferably on HuggingFace and provide the path.
17 | - [ ] Model creation details and how they fit in benchmark.
18 |
19 | # Baselines & Results
20 |
21 | Discuss the baselines used and their results.
22 |
23 |
24 | ## Setup
25 | Please include the experimental setup for the baselines
26 |
27 | - [ ] **Hyperparameters & Search Space:** Specify key hyperparameters, their search ranges, number of trials etc.
28 | - [ ] **Computational Setup:** Mention the type and number of GPUs used.
29 | - [ ] **DeepSpeed Configuration** (if used): If any modifications were made to the default DeepSpeed config, specify them here. (You may include the config as a code block.)
30 | - [ ] **Other Details:** Any additional setup details crucial for reproducing your method.
31 |
32 | To replicate your results, provide a `run.sh` script that contains all necessary commands to reproduce the final results. Ensure the script is well-documented.
33 |
34 |
35 | # Citation
36 |
37 |
38 | If you use this work, please cite:
39 |
40 | ```bibtex
41 |
42 |
43 |
44 | @misc{openunlearning2025,
45 | title={OpenUnlearning: A Unified Framework for LLM Unlearning Benchmarks},
46 | author={Dorna, Vineeth and Mekala, Anmol and Zhao, Wenlong and McCallum, Andrew and Kolter, J Zico and Maini, Pratyush},
47 | year={2025},
48 | howpublished={\url{https://github.com/locuslab/open-unlearning}},
49 | note={Accessed: February 27, 2025}
50 | }
51 | ```
--------------------------------------------------------------------------------
/community/benchmarks/template/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ########################################################################################################################
4 | ########################################### RETAIN Finetuned ##########################################################
5 | ########################################################################################################################
6 |
7 |
8 |
9 | #########################################################################################################################
10 | ############################################ FULL Finetuned models ######################################################
11 | #########################################################################################################################
12 |
13 |
14 |
15 |
16 | #########################################################################################################################
17 | ############################################ Baseline methods ##########################################################
18 | #########################################################################################################################
19 |
--------------------------------------------------------------------------------
/community/leaderboard.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Leaderboard
4 |
5 |
6 |
7 | We encourage the community to develop new methods, optimize them for specific benchmarks, and compare results with existing approaches.
8 |
9 | To implement a new method, refer to our [contributing guide](../docs/contributing.md).
10 |
11 | > [!NOTE]
12 | > The [repro.md](../docs/repro.md) file is maintained for reproducibility purposes. However, we encourage contributors to update the leaderboard table instead of the reproducibility table. We will continue refining and tuning baseline methods to keep the leaderboard up to date.
13 |
14 |
15 | ### TOFU unlearning on the `Llama-2-7b-chat-hf` architecture
16 |
17 |
18 |
19 |
20 |
21 | Method |
22 | forget10 |
23 |
24 |
25 | |
26 | forget_quality |
27 | model_utility |
28 |
29 |
30 |
31 |
32 | Finetuned |
33 | 4.35e-25 |
34 | 0.63 |
35 |
36 |
37 | Retain |
38 | 1.0 |
39 | 0.61 |
40 |
41 |
42 | |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 | ### TOFU unlearning on the `Llama-3.2-1B-Instruct` architecture
54 |
55 |
56 |
57 |
58 |
59 | Method |
60 | forget10 |
61 |
62 |
63 | |
64 | forget_quality |
65 | model_utility |
66 |
67 |
68 |
69 |
70 | Finetuned |
71 | 3.91e-22 |
72 | 0.6 |
73 |
74 |
75 | Retain |
76 | 1.0 |
77 | 0.59 |
78 |
79 |
80 | |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 | ### MUSE unlearning on the benchmark's target models
91 |
92 |
93 |
94 |
95 |
96 |
97 | Method |
98 | News |
99 | Books |
100 |
101 |
102 | |
103 | forget_knowmem_ROUGE |
104 | forget_verbmem_ROUGE |
105 | privleak |
106 | retain_knowmem_ROUGE |
107 | forget_knowmem_ROUGE |
108 | forget_verbmem_ROUGE |
109 | privleak |
110 | retain_knowmem_ROUGE |
111 |
112 |
113 |
114 |
115 | Finetuned |
116 | 0.64 |
117 | 0.58 |
118 | -99.81 |
119 | 0.56 |
120 | 0.47 |
121 | 1.0 |
122 | -57.34 |
123 | 0.69 |
124 |
125 |
126 | Retain |
127 | 0.33 |
128 | 0.20 |
129 | 0 |
130 | 0.56 |
131 | 0.3 |
132 | 0.14 |
133 | 0 |
134 | 0.69 |
135 |
136 |
137 | |
138 |
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/community/methods/AltPO/README.md:
--------------------------------------------------------------------------------
1 | # Alternate Preference Optimization for Unlearning Factual Knowledge in Large Language Models
2 | - Authors: Anmol Mekala, Vineeth Dorna, Shreya Dubey, Abhishek Lalwani, David Koleczek, Mukund Rungta, Sadid Hasan, Elita Lobo
3 | - Paper Link: https://arxiv.org/pdf/2409.13474
4 | - Code Link: https://github.com/molereddy/Alternate-Preference-Optimization
5 |
6 |
7 | LLMs struggle to suppress forget set responses using only negative feedback during unlearning, often resulting in inconsistent outputs, reduced utility, and potential privacy risks. To address this, AltPO enables stable and effective unlearning by combining negative feedback on the forget set along with positive feedback through plausible alternative responses.
8 |
9 |
10 | ## Setup
11 |
12 | #### Generate Alternate Dataset
13 |
14 | The following command generates alternate responses for TOFU, which are then used for unlearning.
15 | ```python
16 | python generate.py dataset_config.dataset_kwargs.name=forget10
17 | ```
18 |
19 | #### Hyperparameters & Search Space
20 | The original paper experiments with LLaMA2-7B; however, the following parameter ranges are reasonable to explore. You can adjust them based on the model and task. Perform a grid search over: beta in [0.05, 0.1, 0.5], learning rate in [1e-5, 2e-5, 5e-5], and alpha in [1, 2, 5].
21 |
22 | #### Computational Setup
23 | All experiments in `run.sh` are run on single A100 GPU. If larger models are used you can use deepspeed to launch the unlearning job.
24 |
25 |
26 | ## Results
27 | Run `run.sh` script.
28 |
29 |
30 | ## Citation
31 | ```bibtex
32 | @article{mekala2024alternate,
33 | title={Alternate preference optimization for unlearning factual knowledge in large language models},
34 | author={Mekala, Anmol and Dorna, Vineeth and Dubey, Shreya and Lalwani, Abhishek and Koleczek, David and Rungta, Mukund and Hasan, Sadid and Lobo, Elita},
35 | journal={arXiv preprint arXiv:2409.13474},
36 | year={2024}
37 | }
38 | ```
--------------------------------------------------------------------------------
/community/methods/AltPO/generate.yaml:
--------------------------------------------------------------------------------
1 | model_config:
2 | model_name: tofu_Llama-3.2-1B-Instruct_full
3 | model_kwargs:
4 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full
5 | trust_remote_code: True
6 | device_map: auto
7 |
8 | dataset_config:
9 | dataset_name: tofu
10 | dataset_kwargs:
11 | path: 'locuslab/TOFU'
12 | name: 'forget10'
13 | split: train
14 | cache_dir: _cache_data/
15 |
16 | prompt_config:
17 | prompt_name: INST_QAS_LLAMA3_TEMPLATE
18 | examples_path: null
19 | fewshot_delimiter: "\n\n"
20 |
21 | repeats: 5
22 |
23 | generation_kwargs:
24 | max_new_tokens: 200
25 | do_sample: True
26 | temperature: 1.0
27 |
28 | until:
29 | - "Question:"
30 | - "Question: "
31 | - "Q: "
32 | - "Q:"
33 |
34 |
35 | batch_size: 1
36 | padding_size: left
37 | truncation: False
38 | seed: 0
39 | device: cuda
40 | output_file: data/${model_config.model_name}/${dataset_config.dataset_kwargs.name}/alt${repeats}_seed_${seed}.json
41 | # limit: 5
--------------------------------------------------------------------------------
/community/methods/AltPO/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
4 | echo "Master Port: $MASTER_PORT"
5 |
6 | ########################################################################################################################
7 | ########################################### Unlearn TOFU models ########################################################
8 | ########################################################################################################################
9 |
10 |
11 | models=(
12 | "Llama-3.2-1B-Instruct"
13 | )
14 | trainers_experiments=(
15 | "DPO unlearn/tofu/default.yaml"
16 | )
17 | forget_retain_splits=(
18 | "forget10 retain90"
19 | "forget05 retain95"
20 | "forget01 retain99"
21 | )
22 |
23 | per_device_train_batch_size=8
24 | gradient_accumulation_steps=4
25 |
26 |
27 | lrs=(1e-5 2e-5 5e-5)
28 | betas=(0.05 0.1 0.5)
29 | alphas=(1 2 5)
30 |
31 |
32 | for split in "${forget_retain_splits[@]}"; do
33 | forget_split=$(echo $split | cut -d' ' -f1)
34 | retain_split=$(echo $split | cut -d' ' -f2)
35 | for model in "${models[@]}"; do
36 | for trainer_experiment in "${trainers_experiments[@]}"; do
37 | trainer=$(echo $trainer_experiment | cut -d' ' -f1)
38 | experiment=$(echo $trainer_experiment | cut -d' ' -f2)
39 | for lr in "${lrs[@]}"; do
40 | for beta in "${betas[@]}"; do
41 | for alpha in "${alphas[@]}"; do
42 | task_name=tofu_${model}_${forget_split}_AltPO_lr${lr}_beta${beta}_alpha${alpha}
43 | model_path=open-unlearning/tofu_${model}_full
44 | echo ${task_name}: Unlearning ${model_path} using ${trainer}
45 |
46 | # Unlearn
47 | CUDA_VISIBLE_DEVICES=0 \
48 | python src/train.py --config-name=unlearn.yaml \
49 | experiment=${experiment} \
50 | trainer=${trainer} \
51 | task_name=${task_name} \
52 | model=${model} \
53 | forget_split=${forget_split} \
54 | retain_split=${retain_split} \
55 | model.model_args.pretrained_model_name_or_path=${model_path} \
56 | retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \
57 | trainer.args.per_device_train_batch_size=$per_device_train_batch_size \
58 | trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \
59 | trainer.args.eval_strategy=no \
60 | trainer.args.eval_on_start=False \
61 | trainer.args.num_train_epochs=2 \
62 | trainer.args.learning_rate=$lr \
63 | trainer.method_args.beta=$beta \
64 | trainer.method_args.alpha=$alpha \
65 | data.forget.TOFU_QA_forget.handler=QAwithAlternateDataset \
66 | ~data.forget.TOFU_QA_forget.args.hf_args.name \
67 | data.forget.TOFU_QA_forget.args.hf_args.path=json \
68 | +data.forget.TOFU_QA_forget.args.hf_args.data_files=community/methods/AltPO/data/tofu_Llama-3.2-1B-Instruct_full/${forget_split}/alt5_seed_0.json \
69 | data.forget.TOFU_QA_forget.args.hf_args.split=train \
70 | +data.forget.TOFU_QA_forget.args.alternate_key=alternate \
71 | +data.forget.TOFU_QA_forget.args.return_original=True
72 |
73 | # Eval
74 | CUDA_VISIBLE_DEVICES=0 python src/eval.py \
75 | experiment=eval/tofu/default.yaml \
76 | forget_split=${forget_split} \
77 | model=${model} \
78 | task_name=${task_name} \
79 | model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
80 | paths.output_dir=saves/unlearn/${task_name}/evals \
81 | retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json
82 | done
83 | done
84 | done
85 | done
86 | done
87 | done
88 |
--------------------------------------------------------------------------------
/community/methods/UNDIAL/README.md:
--------------------------------------------------------------------------------
1 | # UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models (NAACL 2025)
2 |
3 | - Authors: Yijiang River Dong, Hongzhou Lin, Mikhail Belkin, Ramón Huerta, Ivan Vulić
4 | - Link: https://arxiv.org/pdf/2402.10052
5 |
6 | # Setup
7 | - Hyperparameters: The original paper uses Llama-2 7B with LoRA to tune the model (rank=8, alpha=16) and learning rate of 1e-4. It's suggested to search the learning rate over [1e-5, 3e-4, 1e-4], and use an effective batch size of 32 (batch_size * gradient_accumulation). The other important hyperparameter is beta, the strength of penalty, which typically takes a number between [3,10,30]. If you change to other models, adjust the learning rate accordingly.
8 |
9 | - Computation Setup: All experiments are run on one A100.
10 | - Other Details: The original paper does not use the retain set and aims to retain knowledge in all domains, not just on the retain set. So alpha is set to 0. Practitioners could search over the alpha or gamma to better retain the performance on the retain set.
11 |
12 | # Results
13 | Run `run.sh` script.
14 |
15 | # Citation
16 | @misc{dong2024undial,
17 | title={UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models},
18 | author={Yijiang River Dong and Hongzhou Lin and Mikhail Belkin and Ramon Huerta and Ivan Vulić},
19 | year={2024},
20 | eprint={2402.10052},
21 | archivePrefix={arXiv},
22 | primaryClass={cs.CL},
23 | url={https://arxiv.org/abs/2402.10052},
24 | }
--------------------------------------------------------------------------------
/community/methods/UNDIAL/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Grab a free TCP port to use as the distributed-training master port.
export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "Master Port: $MASTER_PORT"

########################################################################################################################
########################################### Unlearn TOFU models ########################################################
########################################################################################################################

# Target models, trainer/experiment pairs, and forget/retain split pairs to sweep over.
models=(
    "Llama-3.2-1B-Instruct"
)
trainers_experiments=(
    "UNDIAL unlearn/tofu/default.yaml"
)
forget_retain_splits=(
    "forget10 retain90"
    "forget05 retain95"
    "forget01 retain99"
)

per_device_train_batch_size=16
gradient_accumulation_steps=2

# Hyperparameter search grid (see README.md for the suggested ranges).
lrs=(1e-5 1e-4 3e-4)
alphas=(1 2 5)
betas=(3 10 30)

for split in "${forget_retain_splits[@]}"; do
    # Each entry is "<forget_split> <retain_split>".
    read -r forget_split retain_split <<< "${split}"
    for model in "${models[@]}"; do
        for trainer_experiment in "${trainers_experiments[@]}"; do
            # Each entry is "<trainer> <experiment-config>".
            read -r trainer experiment <<< "${trainer_experiment}"
            for lr in "${lrs[@]}"; do
                for beta in "${betas[@]}"; do
                    for alpha in "${alphas[@]}"; do
                        task_name=tofu_${model}_${forget_split}_${trainer}_lr${lr}_beta${beta}_alpha${alpha}
                        model_path=open-unlearning/tofu_${model}_full
                        echo ${task_name}: Unlearning ${model_path} using ${trainer}

                        # Unlearn
                        CUDA_VISIBLE_DEVICES=0 \
                        python src/train.py --config-name=unlearn.yaml \
                        experiment=${experiment} \
                        trainer=${trainer} \
                        task_name=${task_name} \
                        model=${model} \
                        forget_split=${forget_split} \
                        retain_split=${retain_split} \
                        model.model_args.pretrained_model_name_or_path=${model_path} \
                        retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \
                        trainer.args.per_device_train_batch_size=$per_device_train_batch_size \
                        trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \
                        trainer.args.eval_strategy=no \
                        trainer.args.eval_on_start=False \
                        trainer.args.learning_rate=$lr \
                        trainer.method_args.beta=$beta \
                        trainer.method_args.alpha=$alpha

                        # Eval
                        CUDA_VISIBLE_DEVICES=0 python src/eval.py \
                        experiment=eval/tofu/default.yaml \
                        forget_split=${forget_split} \
                        model=${model} \
                        task_name=${task_name} \
                        model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
                        paths.output_dir=saves/unlearn/${task_name}/evals \
                        retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json
                    done
                done
            done
        done
    done
done
--------------------------------------------------------------------------------
/community/methods/template/README.md:
--------------------------------------------------------------------------------
1 | # TITLE
2 |
3 | - Paper title, authors, links.
4 |
5 |
6 | Provide a concise summary of your method details and its contributions. Please avoid using images to keep the repository size manageable.
7 |
8 | # Setup
9 |
10 | Please include the experimental setup such as
11 |
12 | - [ ] **Hyperparameters & Search Space:** Specify key hyperparameters, their search ranges, number of trials etc.
13 | - [ ] **Computational Setup:** Mention the type and number of GPUs used.
14 | - [ ] **DeepSpeed Configuration** (if used): If any modifications were made to the default DeepSpeed config, specify them here. (You may include the config as a code block.)
15 | - [ ] **Other Details:** Any additional setup details crucial for reproducing your method.
16 |
17 | # Results
18 |
19 | To replicate your results, provide a `run.sh` script that contains all necessary commands to reproduce the final results. Ensure the script is well-documented.
20 |
21 | It would be appreciated if you could upload the final unlearned model(s) along with their `evals` folders to HuggingFace and provide the link(s) here. As the evaluations are updated, this would help us re-evaluate your model(s).
22 |
23 | # Citation
24 |
25 |
26 | If you use this work, please cite:
27 |
28 | ```bibtex
29 |
30 |
31 |
32 | @misc{openunlearning2025,
33 | title={OpenUnlearning: A Unified Framework for LLM Unlearning Benchmarks},
34 | author={Dorna, Vineeth and Mekala, Anmol and Zhao, Wenlong and McCallum, Andrew and Kolter, J Zico and Maini, Pratyush},
35 | year={2025},
36 | howpublished={\url{https://github.com/locuslab/open-unlearning}},
37 | note={Accessed: February 27, 2025}
38 | }
39 | ```
--------------------------------------------------------------------------------
/community/methods/template/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ########################################################################################################################
4 | ########################################### Hyper parameter tuning #####################################################
5 | ########################################################################################################################
6 |
7 | # Optional
8 |
9 | ########################################################################################################################
10 | ########################################### Final best parameters #####################################################
11 | ########################################################################################################################
12 |
13 | # Required to replicate your results
--------------------------------------------------------------------------------
/configs/accelerate/default_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config:
3 | deepspeed_config_file: configs/accelerate/zero_stage3_offload_config.json
4 | zero3_init_flag: true
5 | distributed_type: DEEPSPEED
6 | fsdp_config: {}
7 | machine_rank: 0
8 | main_process_ip: null
9 | main_process_port: null
10 | main_training_function: main
11 | num_machines: 1
12 | num_processes: 2
13 | use_cpu: false
--------------------------------------------------------------------------------
/configs/accelerate/zero_stage3_offload_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "zero_optimization": {
3 | "stage": 3,
4 | "offload_optimizer": {
5 | "device": "none",
6 | "pin_memory": true
7 | },
8 | "offload_param": {
9 | "device": "none",
10 | "pin_memory": true
11 | },
12 | "overlap_comm": true,
13 | "contiguous_gradients": true,
14 | "reduce_bucket_size": "auto",
15 | "stage3_prefetch_bucket_size": "auto",
16 | "stage3_param_persistence_threshold": "auto",
17 | "sub_group_size": 1e9,
18 | "stage3_max_live_parameters": 1e9,
19 | "stage3_max_reuse_distance": 1e9,
20 | "stage3_gather_16bit_weights_on_model_save": true
21 | },
22 | "train_batch_size": "auto",
23 | "train_micro_batch_size_per_gpu": "auto",
24 | "gradient_accumulation_steps": "auto",
25 | "bf16": {
26 | "enabled": true
27 | }
28 | }
--------------------------------------------------------------------------------
/configs/collator/DataCollatorForSupervisedDataset.yaml:
--------------------------------------------------------------------------------
1 | DataCollatorForSupervisedDataset:
2 | handler: DataCollatorForSupervisedDataset
3 | args:
4 | padding_side: right
5 |
--------------------------------------------------------------------------------
/configs/collator/DataCollatorForSupervisedDatasetwithIndex.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - DataCollatorForSupervisedDataset
3 |
4 | DataCollatorForSupervisedDataset:
5 | args:
6 | index: index
--------------------------------------------------------------------------------
/configs/data/datasets/MUSE_MIA.yaml:
--------------------------------------------------------------------------------
1 | MUSE_MIA_holdout:
2 | access_key: holdout
3 | handler: CompletionDataset
4 | args:
5 | hf_args:
6 | path: "muse-bench/MUSE-News"
7 | name: "privleak"
8 | split: "holdout"
9 | prefix_key: "prompt" # doesn't exist in dataset
10 | text_key: "text"
11 | max_length: 2048
12 | MUSE_MIA_forget:
13 | access_key: forget
14 | handler: CompletionDataset
15 | args:
16 | hf_args:
17 | path: "muse-bench/MUSE-News"
18 | name: "privleak"
19 | split: "forget"
20 | prefix_key: "prompt" # doesn't exist in dataset
21 | text_key: "text"
22 | max_length: 2048
--------------------------------------------------------------------------------
/configs/data/datasets/MUSE_forget.yaml:
--------------------------------------------------------------------------------
1 | MUSE_forget:
2 | handler: PretrainingDataset
3 | args:
4 | hf_args:
5 | path: "muse-bench/MUSE-News"
6 | name: "raw"
7 | split: "forget"
8 | text_key: "text"
9 | max_length: 2048
--------------------------------------------------------------------------------
/configs/data/datasets/MUSE_forget_knowmem.yaml:
--------------------------------------------------------------------------------
1 | MUSE_forget_knowmem:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | path: "muse-bench/MUSE-News"
6 | name: "knowmem"
7 | split: "forget_qa"
8 | few_shot_dataset_hf_args:
9 | path: "muse-bench/MUSE-News"
10 | name: "knowmem"
11 | split: "forget_qa_icl"
12 | question_key: "question"
13 | answer_key: "answer"
14 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/MUSE_forget_scal.yaml:
--------------------------------------------------------------------------------
1 | MUSE_forget_scal:
2 | handler: PretrainingDataset
3 | args:
4 | hf_args:
5 | path: "muse-bench/MUSE-News"
6 | name: "scal"
7 | split: "forget_4"
8 | text_key: "text"
9 | max_length: 2048
--------------------------------------------------------------------------------
/configs/data/datasets/MUSE_forget_sust.yaml:
--------------------------------------------------------------------------------
1 | MUSE_forget_sust:
2 | handler: PretrainingDataset
3 | args:
4 | hf_args:
5 | path: "muse-bench/MUSE-News"
6 | name: "sust"
7 | split: "forget_1"
8 | text_key: "text"
9 | max_length: 2048
--------------------------------------------------------------------------------
/configs/data/datasets/MUSE_forget_verbmem.yaml:
--------------------------------------------------------------------------------
1 | MUSE_forget_verbmem:
2 | handler: CompletionDataset
3 | args:
4 | hf_args:
5 | path: "muse-bench/MUSE-News"
6 | name: "verbmem"
7 | split: "forget"
8 | prefix_key: "prompt"
9 | text_key: "gt"
10 | max_length: 2048
11 | insert_space: True
--------------------------------------------------------------------------------
/configs/data/datasets/MUSE_retain.yaml:
--------------------------------------------------------------------------------
1 | MUSE_retain:
2 | handler: PretrainingDataset
3 | args:
4 | hf_args:
5 | path: "muse-bench/MUSE-News"
6 | name: "raw"
7 | split: "retain1"
8 | text_key: "text"
9 | max_length: 2048
--------------------------------------------------------------------------------
/configs/data/datasets/MUSE_retain_knowmem.yaml:
--------------------------------------------------------------------------------
1 | MUSE_retain_knowmem:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | path: "muse-bench/MUSE-News"
6 | name: "knowmem"
7 | split: "retain_qa"
8 | few_shot_dataset_hf_args:
9 | path: "muse-bench/MUSE-News"
10 | name: "knowmem"
11 | split: "retain_qa_icl"
12 | question_key: "question"
13 | answer_key: "answer"
14 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_MIA.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_forget:
2 | access_key: forget
3 | handler: QADataset
4 | args:
5 | hf_args:
6 | name: "forget10"
7 | split: "train"
8 | path: "locuslab/TOFU"
9 | question_key: "question"
10 | answer_key: "answer"
11 | max_length: 512
12 | TOFU_QA_holdout:
13 | access_key: holdout
14 | handler: QADataset
15 | args:
16 | hf_args:
17 | name: "holdout10"
18 | path: "locuslab/TOFU"
19 | split: "train"
20 | question_key: "question"
21 | answer_key: "answer"
22 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_forget.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_forget:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "forget10"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "answer"
10 | max_length: 512
11 |
12 |
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_forget_idk.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_forget_idk:
2 | handler: QAwithIdkDataset
3 | args:
4 | hf_args:
5 | name: "forget10"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "answer"
10 | max_length: 512
11 | idk_path: ./data/idk.jsonl
12 | return_original: true
13 |
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_forget_para.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_forget_para:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "forget10_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "paraphrased_answer"
10 | max_length: 512
11 |
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_forget_pert.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_forget_pert:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "forget10_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "perturbed_answer"
10 | max_length: 512
11 |
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_full.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_full:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "full"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "answer"
10 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_ra.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_ra:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "real_authors_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "answer"
10 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_ra_pert.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_ra_pert:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "real_authors_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "perturbed_answer"
10 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_retain.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_retain:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "retain90"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "answer"
10 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_retain_eval.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_retain_eval:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "retain_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "answer"
10 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_retain_para.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_retain_para:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "retain_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "paraphrased_answer"
10 | max_length: 512
11 |
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_retain_pert.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_retain_pert:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "retain_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "perturbed_answer"
10 | max_length: 512
11 |
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_wf.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_wf:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "world_facts_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "answer"
10 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/TOFU_QA_wf_pert.yaml:
--------------------------------------------------------------------------------
1 | TOFU_QA_wf_pert:
2 | handler: QADataset
3 | args:
4 | hf_args:
5 | name: "world_facts_perturbed"
6 | split: "train"
7 | path: "locuslab/TOFU"
8 | question_key: "question"
9 | answer_key: "perturbed_answer"
10 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/WMDP_forget.yaml:
--------------------------------------------------------------------------------
1 | WMDP_forget:
2 | handler: PretrainingDataset
3 | args:
4 | hf_args:
5 | path: "text"
6 | data_files: "data/wmdp/wmdp-corpora/cyber-forget-corpus.jsonl"
7 | split: "train"
8 | text_key: "text"
9 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/datasets/WMDP_retain.yaml:
--------------------------------------------------------------------------------
1 | WMDP_retain:
2 | handler: PretrainingDataset
3 | args:
4 | hf_args:
5 | path: "text"
6 | data_files: "data/wmdp/wmdp-corpora/cyber-retain-corpus.jsonl"
7 | split: "train"
8 | text_key: "text"
9 | max_length: 512
--------------------------------------------------------------------------------
/configs/data/finetune.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - datasets@train: TOFU_QA_full
3 | - datasets@eval: null
--------------------------------------------------------------------------------
/configs/data/unlearn.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - datasets@forget: TOFU_QA_forget
3 | - datasets@retain: TOFU_QA_retain
4 | - datasets@eval: null
5 |
6 | anchor: forget
--------------------------------------------------------------------------------
/configs/eval.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - _self_
5 | - model: Llama-3.2-3B-Instruct
6 | - eval: tofu
7 | - paths: default
8 | - hydra: eval
9 | - experiment: null
10 |
11 | model:
12 | model_args:
13 | device_map: cuda
14 |
15 | mode: eval
16 | task_name: ???
17 | seed: 0
--------------------------------------------------------------------------------
/configs/eval/lm_eval.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.lm_eval
2 | # NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/
3 |
4 | handler: LMEvalEvaluator
5 | output_dir: ${paths.output_dir} # set to default eval directory
6 | overwrite: false
7 |
8 | # Define evaluation tasks here
9 | tasks:
10 | - mmlu
11 | # - task: gsm8k
12 | # dataset_path: gsm8k
13 | # # define the entire task config.
14 | # # ^ Example: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml
15 |
16 |
17 | simple_evaluate_args:
18 | batch_size: 16
19 | system_instruction: null
20 | apply_chat_template: false
--------------------------------------------------------------------------------
/configs/eval/muse.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse
2 | # NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/
3 |
4 | defaults:
5 | - muse_metrics:
6 | - forget_knowmem_ROUGE
7 | - retain_knowmem_ROUGE
8 | - forget_verbmem_ROUGE
9 | - privleak
10 | - extraction_strength
11 | # - exact_memorization
12 | # - mia_min_k_plus_plus
13 | # - mia_min_k
14 | # - mia_loss
15 | # - mia_reference
16 | # - mia_zlib
17 | # - mia_gradnorm
18 | # - forget_gibberish
19 |
20 | handler: MUSEEvaluator
21 | output_dir: ${paths.output_dir} # set to default eval directory
22 | metrics: {}
23 | overwrite: false
24 | data_split: News
25 | retain_logs_path: null
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/exact_memorization.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.exact_memorization
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_forget_verbmem
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 |
6 | handler: exact_memorization
7 | batch_size: 8
8 | datasets:
9 | MUSE_forget_verbmem:
10 | args:
11 | hf_args:
12 | path: muse-bench/MUSE-${eval.muse.data_split}
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/extraction_strength.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.extraction_strength
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_forget_verbmem
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 |
6 | handler: extraction_strength
7 | batch_size: 8
8 | datasets:
9 | MUSE_forget_verbmem:
10 | args:
11 | hf_args:
12 | path: muse-bench/MUSE-${eval.muse.data_split}
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/forget_gibberish.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.forget_gibberish
2 | defaults:
3 | - .@pre_compute.forget_verbmem_ROUGE: forget_verbmem_ROUGE
4 |
5 | pre_compute:
6 | forget_verbmem_ROUGE:
7 | access_key: text
8 |
9 | handler: classifier_prob
10 | batch_size: 32
11 | max_length: 512
12 | class_id: 0
13 | text_key: generation
14 | device: cuda
15 |
16 | classifier_model_args:
17 | pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457"
18 |
19 | classifier_tokenization_args:
20 | pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457"
21 |
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.forget_knowmem_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_forget_knowmem
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 | handler: rouge
7 | rouge_type: rougeL_f1
8 | batch_size: 16
9 | datasets:
10 | MUSE_forget_knowmem:
11 | args:
12 | hf_args:
13 | path: muse-bench/MUSE-${eval.muse.data_split}
14 | few_shot_dataset_hf_args:
15 | path: muse-bench/MUSE-${eval.muse.data_split}
16 | predict_with_generate: True
17 | collators:
18 | DataCollatorForSupervisedDataset:
19 | args:
20 | padding_side: left
21 | generation_args:
22 | max_new_tokens: 32
23 | stopwords: ["\n\n", "\nQuestion", "Question:"]
24 |
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/forget_verbmem_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.forget_verbmem_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_forget_verbmem
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 | handler: rouge
7 | rouge_type: rougeL_f1
8 | batch_size: 8
9 | datasets:
10 | MUSE_forget_verbmem:
11 | args:
12 | hf_args:
13 | path: muse-bench/MUSE-${eval.muse.data_split}
14 | predict_with_generate: True
15 | collators:
16 | DataCollatorForSupervisedDataset:
17 | args:
18 | padding_side: left
19 | generation_args:
20 | max_new_tokens: 128
21 |
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/mia_gradnorm.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.mia_gradnorm
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | datasets:
6 | MUSE_MIA_holdout:
7 | args:
8 | hf_args:
9 | path: muse-bench/MUSE-${eval.muse.data_split}
10 | MUSE_MIA_forget:
11 | access_key: forget
12 | args:
13 | hf_args:
14 | path: muse-bench/MUSE-${eval.muse.data_split}
15 |
16 | handler: mia_gradnorm
17 | batch_size: 1
18 | p: 2
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/mia_loss.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.mia_loss
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | datasets:
6 | MUSE_MIA_holdout:
7 | args:
8 | hf_args:
9 | path: muse-bench/MUSE-${eval.muse.data_split}
10 | MUSE_MIA_forget:
11 | access_key: forget
12 | args:
13 | hf_args:
14 | path: muse-bench/MUSE-${eval.muse.data_split}
15 |
16 | batch_size: 8
17 | handler: mia_loss
18 |
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/mia_min_k.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.mia_min_k
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | datasets:
6 | MUSE_MIA_holdout:
7 | args:
8 | hf_args:
9 | path: muse-bench/MUSE-${eval.muse.data_split}
10 | MUSE_MIA_forget:
11 | access_key: forget
12 | args:
13 | hf_args:
14 | path: muse-bench/MUSE-${eval.muse.data_split}
15 |
16 | batch_size: 8
17 | handler: mia_min_k
18 | k: 0.4
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/mia_min_k_plus_plus.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.mia_min_k_plus_plus
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | datasets:
6 | MUSE_MIA_holdout:
7 | args:
8 | hf_args:
9 | path: muse-bench/MUSE-${eval.muse.data_split}
10 | MUSE_MIA_forget:
11 | access_key: forget
12 | args:
13 | hf_args:
14 | path: muse-bench/MUSE-${eval.muse.data_split}
15 |
16 | batch_size: 8
17 | handler: mia_min_k_plus_plus
18 | k: 0.4
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/mia_reference.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.mia_reference
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | datasets:
6 | MUSE_MIA_holdout:
7 | args:
8 | hf_args:
9 | path: muse-bench/MUSE-${eval.muse.data_split}
10 | MUSE_MIA_forget:
11 | access_key: forget
12 | args:
13 | hf_args:
14 | path: muse-bench/MUSE-${eval.muse.data_split}
15 |
16 | batch_size: 8
17 | handler: mia_reference
18 | reference_model_path: muse-bench/MUSE-${eval.muse.data_split}_retrain # modify appropriately
19 |
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/mia_zlib.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.mia_zlib
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | datasets:
6 | MUSE_MIA_holdout:
7 | args:
8 | hf_args:
9 | path: muse-bench/MUSE-${eval.muse.data_split}
10 | MUSE_MIA_forget:
11 | access_key: forget
12 | args:
13 | hf_args:
14 | path: muse-bench/MUSE-${eval.muse.data_split}
15 |
16 | batch_size: 8
17 | handler: mia_zlib
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/privleak.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.privleak
2 | defaults:
3 | - .@pre_compute.mia_min_k: mia_min_k
4 |
5 | pre_compute:
6 | mia_min_k:
7 | access_key: forget
8 |
9 | reference_logs:
10 | retain_model_logs:
11 | path: ${eval.muse.retain_logs_path}
12 | include:
13 | mia_min_k:
14 | access_key: retain
15 |
16 | handler: privleak
17 | ref_value: 0.5
--------------------------------------------------------------------------------
/configs/eval/muse_metrics/retain_knowmem_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.muse.metrics.retain_knowmem_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: MUSE_retain_knowmem
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 | handler: rouge
7 | rouge_type: rougeL_f1
8 | batch_size: 16
9 | datasets:
10 | MUSE_retain_knowmem:
11 | args:
12 | hf_args:
13 | path: muse-bench/MUSE-${eval.muse.data_split}
14 | few_shot_dataset_hf_args:
15 | path: muse-bench/MUSE-${eval.muse.data_split}
16 | predict_with_generate: True
17 | collators:
18 | DataCollatorForSupervisedDataset:
19 | args:
20 | padding_side: left
21 | generation_args:
22 | max_new_tokens: 32
23 | stopwords: ["\n\n", "\nQuestion", "Question:"]
24 |
--------------------------------------------------------------------------------
/configs/eval/tofu.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu
2 | # NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/
3 |
4 | defaults: # include all defined metrics files
5 | - tofu_metrics: # When you import a metric here, its configuration automatically populates the
6 | # metric key below, enabled by the @package directive at the top of each configuration file.
7 | - forget_quality
8 | - forget_Q_A_Prob
9 | - forget_Q_A_ROUGE
10 | - model_utility # populated in the metrics key as metrics.model_utility
11 | - privleak
12 | - extraction_strength
13 | # - exact_memorization
14 | # - mia_min_k_plus_plus
15 | # - mia_min_k
16 | # - mia_loss
17 | # - mia_zlib
18 | # - mia_gradnorm
19 | # - mia_reference # set reference model path appropriately
20 | # - forget_Q_A_gibberish
21 |
22 | handler: TOFUEvaluator
23 | output_dir: ${paths.output_dir} # set to default eval directory
24 | metrics: {} # lists a mapping from each evaluation metric to its config
25 | # populated through the first (@package) line in each metric config
26 | overwrite: false
27 | forget_split: forget10
28 | holdout_split: holdout10
29 | retain_logs_path: null
30 | question_key: "question" # Specifies which key to use during forget and retain evaluations (e.g., "question" or "paraphrased_question")
31 | batch_size: 32
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/exact_memorization.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.exact_memorization
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_forget
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: exact_memorization
8 | batch_size: ${eval.tofu.batch_size}
9 |
10 | datasets:
11 | TOFU_QA_forget:
12 | args:
13 | hf_args:
14 | name: ${eval.tofu.forget_split}_perturbed
15 | question_key: ${eval.tofu.question_key}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/extraction_strength.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.extraction_strength
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_forget
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: extraction_strength
8 | batch_size: ${eval.tofu.batch_size}
9 |
10 | datasets:
11 | TOFU_QA_forget:
12 | args:
13 | hf_args:
14 | name: ${eval.tofu.forget_split}_perturbed
15 | question_key: ${eval.tofu.question_key}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_Q_A_PARA_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_Q_A_PARA_Prob
2 |
3 | defaults:
4 | - ../../data/datasets@datasets: TOFU_QA_forget_para
5 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
6 | # ^ get default dataset and collator config information
7 |
8 | handler: probability
9 | batch_size: ${eval.tofu.batch_size}
10 |
11 | datasets:
12 | TOFU_QA_forget_para:
13 | args:
14 | hf_args:
15 | name: ${eval.tofu.forget_split}_perturbed
16 | question_key: ${eval.tofu.question_key}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_Q_A_PARA_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_Q_A_PARA_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_forget_para
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 |
7 | # ^ get default dataset and generation config information
8 |
9 | handler: rouge
10 | rouge_type: rougeL_recall
11 | batch_size: ${eval.tofu.batch_size}
12 |
13 | datasets: # override as needed
14 | TOFU_QA_forget_para:
15 | args:
16 | hf_args:
17 | name: ${eval.tofu.forget_split}_perturbed
18 | question_key: ${eval.tofu.question_key}
19 | predict_with_generate: True
20 | collators:
21 | DataCollatorForSupervisedDataset:
22 | args:
23 | padding_side: left
24 |
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_Q_A_PERT_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_Q_A_PERT_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_forget_pert
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
9 |
10 | datasets:
11 | TOFU_QA_forget_pert:
12 | args:
13 | hf_args:
14 | name: ${eval.tofu.forget_split}_perturbed
15 | question_key: ${eval.tofu.question_key}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_Q_A_PERT_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_Q_A_PERT_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_forget_pert
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 | # ^ get default dataset and generation config information
7 |
8 | handler: rouge
9 | rouge_type: rougeL_recall
10 | batch_size: ${eval.tofu.batch_size}
11 |
12 | datasets: # override as needed
13 | TOFU_QA_forget_pert:
14 | args:
15 | hf_args:
16 | name: ${eval.tofu.forget_split}_perturbed
17 | question_key: ${eval.tofu.question_key}
18 | predict_with_generate: True
19 | collators:
20 | DataCollatorForSupervisedDataset:
21 | args:
22 | padding_side: left
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_Q_A_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_Q_A_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_forget
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
9 |
10 | datasets:
11 | TOFU_QA_forget:
12 | args:
13 | hf_args:
14 | name: ${eval.tofu.forget_split}_perturbed
15 | question_key: ${eval.tofu.question_key}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_Q_A_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_Q_A_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_forget
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 |
7 | # ^ get default dataset and generation config information
8 |
9 | handler: rouge
10 | rouge_type: rougeL_recall
11 | batch_size: ${eval.tofu.batch_size}
12 |
13 | datasets: # override as needed
14 | TOFU_QA_forget:
15 | args:
16 | hf_args:
17 | name: ${eval.tofu.forget_split}_perturbed
18 | question_key: ${eval.tofu.question_key}
19 | predict_with_generate: True
20 | collators:
21 | DataCollatorForSupervisedDataset:
22 | args:
23 | padding_side: left
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_Q_A_gibberish
2 | defaults:
3 | - .@pre_compute.forget_Q_A_ROUGE: forget_Q_A_ROUGE
4 |
5 | pre_compute:
6 | forget_Q_A_ROUGE:
7 | access_key: text
8 |
9 | handler: classifier_prob
10 | batch_size: 32
11 | max_length: 32
12 | class_id: 0
13 | text_key: generation
14 | device: cuda
15 |
16 | classifier_model_args:
17 | pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457"
18 |
19 | classifier_tokenization_args:
20 | pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457"
21 |
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_Truth_Ratio.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_truth_ratio
2 | defaults:
3 | - .@pre_compute.forget_Q_A_PARA_Prob: forget_Q_A_PARA_Prob
4 | - .@pre_compute.forget_Q_A_PERT_Prob: forget_Q_A_PERT_Prob
5 |
6 | pre_compute:
7 | forget_Q_A_PARA_Prob:
8 | access_key: correct
9 | forget_Q_A_PERT_Prob:
10 | access_key: wrong
11 |
12 | handler: truth_ratio
13 | aggregator: closer_to_1_better
14 |
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/forget_quality.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.forget_quality
2 | defaults:
3 | - .@pre_compute.forget_truth_ratio: forget_Truth_Ratio
4 |
5 | reference_logs:
6 | retain_model_logs:
7 | path: ${eval.tofu.retain_logs_path}
8 | include:
9 | forget_truth_ratio:
10 | access_key: retain
11 |
12 | pre_compute:
13 | forget_truth_ratio:
14 | access_key: forget
15 |
16 | handler: ks_test
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/mia_gradnorm.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.mia_gradnorm
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 |
6 | handler: mia_gradnorm
7 | batch_size: 1
8 | p: 2
9 |
10 | datasets:
11 | TOFU_QA_forget:
12 | args:
13 | hf_args:
14 | name: ${eval.tofu.forget_split}_perturbed
15 | question_key: ${eval.tofu.question_key}
16 | TOFU_QA_holdout:
17 | args:
18 | hf_args:
19 | name: ${eval.tofu.holdout_split}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/mia_loss.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.mia_loss
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | batch_size: ${eval.tofu.batch_size}
6 | handler: mia_loss
7 |
8 | datasets:
9 | TOFU_QA_forget:
10 | args:
11 | hf_args:
12 | name: ${eval.tofu.forget_split}_perturbed
13 | question_key: ${eval.tofu.question_key}
14 | TOFU_QA_holdout:
15 | args:
16 | hf_args:
17 | name: ${eval.tofu.holdout_split}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/mia_min_k.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.mia_min_k
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | batch_size: ${eval.tofu.batch_size}
6 | handler: mia_min_k
7 | k: 0.4
8 |
9 | datasets:
10 | TOFU_QA_forget:
11 | args:
12 | hf_args:
13 | name: ${eval.tofu.forget_split}_perturbed
14 | question_key: ${eval.tofu.question_key}
15 | TOFU_QA_holdout:
16 | args:
17 | hf_args:
18 | name: ${eval.tofu.holdout_split}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.mia_min_k_plus_plus
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | batch_size: ${eval.tofu.batch_size}
6 | k: 0.4
7 | handler: mia_min_k_plus_plus
8 |
9 | datasets:
10 | TOFU_QA_forget:
11 | args:
12 | hf_args:
13 | name: ${eval.tofu.forget_split}_perturbed
14 | question_key: ${eval.tofu.question_key}
15 | TOFU_QA_holdout:
16 | args:
17 | hf_args:
18 | name: ${eval.tofu.holdout_split}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/mia_reference.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.mia_reference
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | batch_size: ${eval.tofu.batch_size}
6 | handler: mia_reference
7 | reference_model_path: ??? # set this appropriately, e.g. open-unlearning/tofu_Llama-3.2-1B-Instruct_retain90
8 |
9 | datasets:
10 | TOFU_QA_forget:
11 | args:
12 | hf_args:
13 | name: ${eval.tofu.forget_split}_perturbed
14 | question_key: ${eval.tofu.question_key}
15 | TOFU_QA_holdout:
16 | args:
17 | hf_args:
18 | name: ${eval.tofu.holdout_split}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/mia_zlib.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.mia_zlib
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_MIA
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | batch_size: ${eval.tofu.batch_size}
6 | handler: mia_zlib
7 |
8 | datasets:
9 | TOFU_QA_forget:
10 | args:
11 | hf_args:
12 | name: ${eval.tofu.forget_split}_perturbed
13 | question_key: ${eval.tofu.question_key}
14 | TOFU_QA_holdout:
15 | args:
16 | hf_args:
17 | name: ${eval.tofu.holdout_split}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/model_utility.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.model_utility
2 | defaults:
3 | - .@pre_compute.retain_Q_A_Prob: retain_Q_A_Prob
4 | - .@pre_compute.retain_Q_A_ROUGE: retain_Q_A_ROUGE
5 | - .@pre_compute.retain_Truth_Ratio: retain_Truth_Ratio
6 | - .@pre_compute.ra_Q_A_Prob_normalised: ra_Q_A_Prob_normalised
7 | - .@pre_compute.ra_Q_A_ROUGE: ra_Q_A_ROUGE
8 | - .@pre_compute.ra_Truth_Ratio: ra_Truth_Ratio
9 | - .@pre_compute.wf_Q_A_Prob_normalised: wf_Q_A_Prob_normalised
10 | - .@pre_compute.wf_Q_A_ROUGE: wf_Q_A_ROUGE
11 | - .@pre_compute.wf_Truth_Ratio: wf_Truth_Ratio
12 |
13 | handler: hm_aggregate
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/privleak.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.privleak
2 | defaults:
3 | - .@pre_compute.mia_min_k: mia_min_k
4 |
5 | pre_compute:
6 | mia_min_k:
7 | access_key: forget
8 |
9 | reference_logs:
10 | retain_model_logs:
11 | path: ${eval.tofu.retain_logs_path}
12 | include:
13 | mia_min_k:
14 | access_key: retain
15 |
16 | handler: privleak
17 | ref_value: 0.5
18 |
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/ra_Q_A_PERT_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.ra_Q_A_PERT_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_ra_pert
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/ra_Q_A_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.ra_Q_A_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_ra
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/ra_Q_A_Prob_normalised.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.ra_Q_A_Prob_normalised
2 | defaults:
3 | - .@pre_compute.ra_Q_A_Prob: ra_Q_A_Prob
4 | - .@pre_compute.ra_Q_A_PERT_Prob: ra_Q_A_PERT_Prob
5 |
6 | pre_compute:
7 | ra_Q_A_Prob:
8 | access_key: correct
9 | ra_Q_A_PERT_Prob:
10 | access_key: wrong
11 |
12 | handler: probability_w_options
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/ra_Q_A_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.ra_Q_A_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_ra
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 |
7 | # ^ get default dataset and generation config information
8 |
9 | handler: rouge
10 | rouge_type: rougeL_recall
11 | batch_size: ${eval.tofu.batch_size}
12 | datasets: # override as needed
13 | TOFU_QA_ra:
14 | args:
15 | predict_with_generate: True
16 | collators:
17 | DataCollatorForSupervisedDataset:
18 | args:
19 | padding_side: left
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/ra_Truth_Ratio.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.ra_Truth_Ratio
2 | defaults:
3 | - .@pre_compute.ra_Q_A_Prob: ra_Q_A_Prob
4 | - .@pre_compute.ra_Q_A_PERT_Prob: ra_Q_A_PERT_Prob
5 |
6 | pre_compute:
7 | ra_Q_A_Prob:
8 | access_key: correct
9 | ra_Q_A_PERT_Prob:
10 | access_key: wrong
11 |
12 | handler: truth_ratio
13 | aggregator: true_better
14 |
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/retain_Q_A_PARA_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.retain_Q_A_PARA_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_retain_para
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
9 |
10 | datasets:
11 | TOFU_QA_retain_para:
12 | args:
13 | question_key: ${eval.tofu.question_key}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/retain_Q_A_PERT_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.retain_Q_A_PERT_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_retain_pert
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
9 |
10 | datasets:
11 | TOFU_QA_retain_pert:
12 | args:
13 | question_key: ${eval.tofu.question_key}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/retain_Q_A_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.retain_Q_A_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_retain_eval
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
9 |
10 | datasets:
11 | TOFU_QA_retain_eval:
12 | args:
13 | question_key: ${eval.tofu.question_key}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/retain_Q_A_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.retain_Q_A_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_retain_eval
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 |
7 | # ^ get default dataset and generation config information
8 |
9 | handler: rouge
10 | rouge_type: rougeL_recall
11 | batch_size: ${eval.tofu.batch_size}
12 | datasets: # override as needed
13 | TOFU_QA_retain_eval:
14 | args:
15 | question_key: ${eval.tofu.question_key}
16 | predict_with_generate: True
17 | collators:
18 | DataCollatorForSupervisedDataset:
19 | args:
20 | padding_side: left
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/retain_Truth_Ratio.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.retain_Truth_Ratio
2 | defaults:
3 | - .@pre_compute.retain_Q_A_PARA_Prob: retain_Q_A_PARA_Prob
4 | - .@pre_compute.retain_Q_A_PERT_Prob: retain_Q_A_PERT_Prob
5 |
6 | pre_compute:
7 | retain_Q_A_PARA_Prob:
8 | access_key: correct
9 | retain_Q_A_PERT_Prob:
10 | access_key: wrong
11 |
12 | handler: truth_ratio
13 | aggregator: true_better
14 |
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/wf_Q_A_PERT_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.wf_Q_A_PERT_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_wf_pert
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/wf_Q_A_Prob.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.wf_Q_A_Prob
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_wf
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | # ^ get default dataset and collator config information
6 |
7 | handler: probability
8 | batch_size: ${eval.tofu.batch_size}
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/wf_Q_A_Prob_normalised.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.wf_Q_A_Prob_normalised
2 | defaults:
3 | - .@pre_compute.wf_Q_A_Prob: wf_Q_A_Prob
4 | - .@pre_compute.wf_Q_A_PERT_Prob: wf_Q_A_PERT_Prob
5 |
6 | pre_compute:
7 | wf_Q_A_Prob:
8 | access_key: correct
9 | wf_Q_A_PERT_Prob:
10 | access_key: wrong
11 |
12 | handler: probability_w_options
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/wf_Q_A_ROUGE.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.wf_Q_A_ROUGE
2 | defaults:
3 | - ../../data/datasets@datasets: TOFU_QA_wf
4 | - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
5 | - ../../generation@generation_args: default
6 |
7 | # ^ get default dataset and generation config information
8 |
9 | handler: rouge
10 | rouge_type: rougeL_recall
11 | batch_size: ${eval.tofu.batch_size}
12 | datasets: # override as needed
13 | TOFU_QA_wf:
14 | args:
15 | predict_with_generate: True
16 | collators:
17 | DataCollatorForSupervisedDataset:
18 | args:
19 | padding_side: left
--------------------------------------------------------------------------------
/configs/eval/tofu_metrics/wf_Truth_Ratio.yaml:
--------------------------------------------------------------------------------
1 | # @package eval.tofu.metrics.wf_Truth_Ratio
2 | defaults:
3 | - .@pre_compute.wf_Q_A_Prob: wf_Q_A_Prob
4 | - .@pre_compute.wf_Q_A_PERT_Prob: wf_Q_A_PERT_Prob
5 |
6 | pre_compute:
7 | wf_Q_A_Prob:
8 | access_key: correct
9 | wf_Q_A_PERT_Prob:
10 | access_key: wrong
11 |
12 | handler: truth_ratio
13 | aggregator: true_better
14 |
--------------------------------------------------------------------------------
/configs/experiment/eval/muse/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: Llama-2-7b-hf
5 | - override /eval: muse
6 |
7 | data_split: News
8 | retain_logs_path: null
9 |
10 | model:
11 | model_args:
12 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target
13 |
14 | eval:
15 | muse:
16 | data_split: ${data_split}
17 | retain_logs_path: ${retain_logs_path}
18 |
19 | task_name: ???
--------------------------------------------------------------------------------
/configs/experiment/eval/tofu/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: Llama-3.2-1B-Instruct
5 | - override /eval: tofu
6 |
7 | forget_split: forget10
8 | holdout_split: holdout10
9 | retain_logs_path: null
10 |
11 | model:
12 | model_args:
13 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full
14 |
15 | eval:
16 | tofu:
17 | forget_split: ${forget_split}
18 | holdout_split: ${holdout_split}
19 | retain_logs_path: ${retain_logs_path}
20 |
21 | task_name: ???
--------------------------------------------------------------------------------
/configs/experiment/eval/wmdp/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: zephyr-7b-beta
5 | - override /eval: lm_eval
6 |
7 | data_split: cyber
8 |
9 | eval:
10 | lm_eval:
11 | tasks:
12 | - wmdp_${data_split}
13 | - mmlu
14 |
15 | task_name: ???
--------------------------------------------------------------------------------
/configs/experiment/examples/tofu_eval.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | model_args:
3 | device_map: cuda
4 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full
5 | attn_implementation: flash_attention_2
6 | torch_dtype: bfloat16
7 | tokenizer_args:
8 | pretrained_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
9 | template_args:
10 | apply_chat_template: true
11 | system_prompt: You are a helpful assistant.
12 | system_prompt_with_special_tokens: '<|begin_of_text|><|start_header_id|>system<|end_header_id|>
13 |
14 |
15 | You are a helpful assistant.<|eot_id|>'
16 | user_start_tag: '<|start_header_id|>user<|end_header_id|>
17 |
18 |
19 | '
20 | user_end_tag: <|eot_id|>
21 | asst_start_tag: '<|start_header_id|>assistant<|end_header_id|>
22 |
23 |
24 | '
25 | asst_end_tag: <|eot_id|>
26 | mode: eval
27 | task_name: SAMPLE_EVAL
28 | seed: 0
29 | eval:
30 | tofu:
31 | metrics:
32 | forget_quality:
33 | pre_compute:
34 | forget_truth_ratio:
35 | pre_compute:
36 | forget_Q_A_PARA_Prob:
37 | datasets:
38 | TOFU_QA_forget_para:
39 | handler: QADataset
40 | args:
41 | hf_args:
42 | name: ${eval.tofu.forget_split}_perturbed
43 | split: train
44 | path: locuslab/TOFU
45 | question_key: question
46 | answer_key: paraphrased_answer
47 | max_length: 512
48 | collators:
49 | DataCollatorForSupervisedDataset:
50 | handler: DataCollatorForSupervisedDataset
51 | args:
52 | padding_side: right
53 | index: index
54 | handler: probability
55 | batch_size: 32
56 | access_key: correct
57 | forget_Q_A_PERT_Prob:
58 | datasets:
59 | TOFU_QA_forget_pert:
60 | handler: QADataset
61 | args:
62 | hf_args:
63 | name: ${eval.tofu.forget_split}_perturbed
64 | split: train
65 | path: locuslab/TOFU
66 | question_key: question
67 | answer_key: perturbed_answer
68 | max_length: 512
69 | collators:
70 | DataCollatorForSupervisedDataset:
71 | handler: DataCollatorForSupervisedDataset
72 | args:
73 | padding_side: right
74 | index: index
75 | handler: probability
76 | batch_size: 32
77 | access_key: wrong
78 | handler: truth_ratio
79 | aggregator: closer_to_1_better
80 | access_key: forget
81 | reference_logs:
82 | retain_model_logs:
83 | path: ${eval.tofu.retain_logs_path}
84 | include:
85 | forget_truth_ratio:
86 | access_key: retain
87 | handler: ks_test
88 | forget_Q_A_Prob:
89 | datasets:
90 | TOFU_QA_forget:
91 | handler: QADataset
92 | args:
93 | hf_args:
94 | name: ${eval.tofu.forget_split}
95 | split: train
96 | path: locuslab/TOFU
97 | question_key: question
98 | answer_key: answer
99 | max_length: 512
100 | collators:
101 | DataCollatorForSupervisedDataset:
102 | handler: DataCollatorForSupervisedDataset
103 | args:
104 | padding_side: right
105 | index: index
106 | handler: probability
107 | batch_size: 32
108 | handler: TOFUEvaluator
109 | output_dir: ${paths.output_dir}
110 | overwrite: false
111 | forget_split: ${forget_split}
112 | holdout_split: ${holdout_split}
113 | retain_logs_path: ${retain_logs_path}
114 | paths:
115 | root_dir: .
116 | data_dir: ${paths.root_dir}/data/
117 | datasets: ${paths.root_dir}/configs/data/datasets
118 | output_dir: ${paths.root_dir}/saves/${mode}/${task_name}
119 | work_dir: ${hydra:runtime.cwd}
120 | forget_split: forget10
121 | holdout_split: holdout10
122 | retain_logs_path: saves/eval/tofu_Llama-3.2-1B-Instruct_retain90/TOFU_EVAL.json
123 |
--------------------------------------------------------------------------------
/configs/experiment/finetune/tofu/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: Llama-3.2-1B-Instruct
5 | - override /trainer: finetune
6 | - override /data/datasets@data.train: TOFU_QA_full
7 | - override /eval: tofu
8 |
9 | mode: finetune
10 | trainer:
11 | args:
12 | learning_rate: 1e-5
13 | weight_decay: 0.01
14 | warmup_epochs: 1.0 # custom parameter
15 | num_train_epochs: 5
16 |
17 |
18 | forget_split: forget10
19 | holdout_split: holdout10
20 | retain_logs_path: null
21 |
22 | eval:
23 | tofu:
24 | forget_split: ${forget_split}
25 | holdout_split: ${holdout_split}
26 | retain_logs_path: ${retain_logs_path}
27 | overwrite: true
28 |
29 |
30 | task_name: tofu_Llama-3.2-1B-Instruct_full
--------------------------------------------------------------------------------
/configs/experiment/unlearn/muse/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: Llama-2-7b-hf
5 | - override /trainer: GradAscent
6 | - override /data: unlearn
7 | - override /data/datasets@data.forget: MUSE_forget
8 | - override /data/datasets@data.retain: MUSE_retain
9 | - override /eval: muse
10 |
11 | data_split: News
12 | forget_split: forget
13 | retain_split: retain1
14 | retain_logs_path: null
15 |
16 | model:
17 | model_args:
18 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target
19 |
20 | data:
21 | anchor: forget
22 | forget:
23 | MUSE_forget:
24 | args:
25 | hf_args:
26 | split: ${forget_split}
27 | path: muse-bench/MUSE-${data_split}
28 | retain:
29 | MUSE_retain:
30 | args:
31 | hf_args:
32 | path: muse-bench/MUSE-${data_split}
33 | split: ${retain_split}
34 |
35 |
36 | eval:
37 | muse:
38 | data_split: ${data_split}
39 | retain_logs_path: ${retain_logs_path}
40 | overwrite: true
41 |
42 | trainer:
43 | args:
44 | per_device_train_batch_size: 4
45 | gradient_accumulation_steps: 8
46 | learning_rate: 1e-5
47 | num_train_epochs: 10
48 | lr_scheduler_type: constant
49 | # save_strategy: steps
50 | # save_steps: 0.5
51 | # optim: paged_adamw_32bit
52 | # optim: adamw_torch
53 |
54 | task_name: ???
55 |
--------------------------------------------------------------------------------
/configs/experiment/unlearn/muse/scalability.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: Llama-2-7b-hf
5 | - override /trainer: GradAscent
6 | - override /data: unlearn
7 | - override /data/datasets@data.forget: MUSE_forget_scal
8 | - override /data/datasets@data.retain: MUSE_retain
9 | - override /eval: muse
10 |
11 | data_split: News
12 | forget_split: forget_4
13 | retain_split: retain1
14 | retain_logs_path: null
15 |
16 | model:
17 | model_args:
18 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target
19 |
20 | data:
21 | anchor: forget
22 | forget:
23 | MUSE_forget_scal:
24 | args:
25 | hf_args:
26 | path: muse-bench/MUSE-${data_split}
27 | split: ${forget_split}
28 | retain:
29 | MUSE_retain:
30 | args:
31 | hf_args:
32 | path: muse-bench/MUSE-${data_split}
33 | split: ${retain_split}
34 |
35 | eval:
36 | muse:
37 | data_split: ${data_split}
38 | retain_logs_path: ${retain_logs_path}
39 | overwrite: true
40 |
41 | trainer:
42 | args:
43 | per_device_train_batch_size: 4
44 | gradient_accumulation_steps: 8
45 | learning_rate: 1e-5
46 | num_train_epochs: 10
47 | lr_scheduler_type: constant
48 | # save_strategy: steps
49 | # save_steps: 0.5
50 | # optim: paged_adamw_32bit
51 | # optim: adamw_torch
52 |
53 | task_name: ???
54 |
--------------------------------------------------------------------------------
/configs/experiment/unlearn/muse/sustainabilty.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: Llama-2-7b-hf
5 | - override /trainer: GradAscent
6 | - override /data: unlearn
7 | - override /data/datasets@data.forget: MUSE_forget_sust
8 | - override /data/datasets@data.retain: MUSE_retain
9 | - override /eval: muse
10 |
11 | data_split: News
12 | forget_split: forget_4
13 | retain_split: retain1
14 | retain_logs_path: null
15 |
16 | model:
17 | model_args:
18 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target
19 |
20 | data:
21 | anchor: forget
22 | forget:
23 | MUSE_forget_sust:
24 | args:
25 | hf_args:
26 | path: muse-bench/MUSE-${data_split}
27 | split: ${forget_split}
28 | retain:
29 | MUSE_retain:
30 | args:
31 | hf_args:
32 | path: muse-bench/MUSE-${data_split}
33 | split: ${retain_split}
34 |
35 | eval:
36 | muse:
37 | data_split: ${data_split}
38 | retain_logs_path: ${retain_logs_path}
39 | overwrite: true
40 |
41 | trainer:
42 | args:
43 | per_device_train_batch_size: 4
44 | gradient_accumulation_steps: 8
45 | learning_rate: 1e-5
46 | num_train_epochs: 10
47 | lr_scheduler_type: constant
48 | # save_strategy: steps
49 | # save_steps: 0.5
50 | # optim: paged_adamw_32bit
51 | # optim: adamw_torch
52 |
53 | task_name: ???
54 |
--------------------------------------------------------------------------------
/configs/experiment/unlearn/tofu/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: Llama-3.2-1B-Instruct
5 | - override /trainer: GradAscent
6 | - override /data: unlearn
7 | - override /data/datasets@data.forget: TOFU_QA_forget
8 | - override /data/datasets@data.retain: TOFU_QA_retain
9 | - override /eval: tofu
10 |
11 | model:
12 | model_args:
13 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full
14 |
15 | forget_split: forget10
16 | retain_split: retain90
17 | holdout_split: holdout10
18 | retain_logs_path: null
19 | question_key: "question"
20 |
21 | eval:
22 | tofu:
23 | forget_split: ${forget_split}
24 | holdout_split: ${holdout_split}
25 | retain_logs_path: ${retain_logs_path}
26 | overwrite: true
27 | question_key: ${question_key}
28 |
29 | data:
30 | anchor: forget
31 | forget:
32 | TOFU_QA_forget:
33 | args:
34 | hf_args:
35 | name: ${forget_split}
36 | retain:
37 | TOFU_QA_retain:
38 | args:
39 | hf_args:
40 | name: ${retain_split}
41 |
42 | trainer:
43 | args:
44 | warmup_epochs: 1.0 # custom parameter
45 | learning_rate: 1e-5
46 | weight_decay: 0.01
47 | num_train_epochs: 10
48 | # save_strategy: steps
49 | # save_steps: 0.5
50 |
51 | task_name: ???
--------------------------------------------------------------------------------
/configs/experiment/unlearn/tofu/idk.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: Llama-3.2-1B-Instruct
5 | - override /trainer: DPO
6 | - override /data: unlearn
7 | - override /data/datasets@data.forget: TOFU_QA_forget_idk
8 | - override /data/datasets@data.retain: TOFU_QA_retain
9 | - override /eval: tofu
10 |
11 | model:
12 | model_args:
13 | pretrained_model_name_or_path: open-unlearning/tofu_Llama-3.2-1B-Instruct_full
14 |
15 | forget_split: forget10
16 | retain_split: retain90
17 | retain_logs_path: null
18 |
19 | eval:
20 | tofu:
21 | forget_split: ${forget_split}
22 | retain_logs_path: ${retain_logs_path}
23 | overwrite: true
24 |
25 | data:
26 | anchor: forget
27 | forget:
28 | TOFU_QA_forget_idk:
29 | args:
30 | hf_args:
31 | name: ${forget_split}
32 | retain:
33 | TOFU_QA_retain:
34 | args:
35 | hf_args:
36 | name: ${retain_split}
37 |
38 | trainer:
39 | args:
40 | warmup_epochs: 1.0 # custom parameter
41 | learning_rate: 1e-5
42 | weight_decay: 0.01
43 | num_train_epochs: 10
44 | # save_strategy: steps
45 | # save_steps: 0.5
46 |
47 | task_name: ???
48 |
--------------------------------------------------------------------------------
/configs/experiment/unlearn/wmdp/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 |
3 | defaults:
4 | - override /model: zephyr-7b-beta
5 | - override /trainer: RMU
6 | - override /data: unlearn
7 | - override /data/datasets@data.forget: WMDP_forget
8 | - override /data/datasets@data.retain: WMDP_retain
9 | - override /eval: lm_eval
10 |
11 | data_split: cyber
12 |
13 | data:
14 | anchor: forget
15 | forget:
16 | WMDP_forget:
17 | args:
18 | hf_args:
19 | data_files: data/wmdp/wmdp-corpora/${data_split}-forget-corpus.jsonl
20 | retain:
21 | WMDP_retain:
22 | args:
23 | hf_args:
24 | data_files: data/wmdp/wmdp-corpora/${data_split}-retain-corpus.jsonl
25 |
26 | eval:
27 | lm_eval:
28 | tasks:
29 | - wmdp_${data_split}
30 | - mmlu
31 |
32 |
33 | collator:
34 | DataCollatorForSupervisedDataset:
35 | args:
36 | padding_side: left # Usually left but for mistral and zephyr its right (https://github.com/hongshi97/CAD/issues/2)
37 |
38 | trainer:
39 | args:
40 | per_device_train_batch_size: 1
41 | gradient_accumulation_steps: 16
42 | learning_rate: 5e-5
43 | eval_strategy: steps
44 | eval_steps: 0.5
45 | max_steps: 80
46 | lr_scheduler_type: constant
47 |
48 | method_args:
49 | # The params here are more dependent on model and dataset. Tune them carefully to work
50 | gamma: 1.0
51 | steering_coeff: 2
52 | retain_loss_type: EMBED_DIFF
53 | alpha: 1
54 | module_regex: model\.layers\.7
55 | trainable_params_regex:
56 | - model\.layers\.(5|6|7)\.mlp\.down_proj\.weight # If you want to update only these weights (as done in https://github.com/centerforaisafety/wmdp/blob/bc5e1ba0367ea826caeeeaa50656336a1e87acfb/rmu/unlearn.py#L26)
57 |
58 | task_name: ???
--------------------------------------------------------------------------------
/configs/generation/default.yaml:
--------------------------------------------------------------------------------
1 | do_sample: False
2 | top_p: null
3 | temperature: null
4 | max_new_tokens: 200
5 | use_cache: True
--------------------------------------------------------------------------------
/configs/hydra/default.yaml:
--------------------------------------------------------------------------------
1 | # https://hydra.cc/docs/configure_hydra/intro/
2 |
3 | # enable color logging
4 | defaults:
5 | - override hydra_logging: colorlog
6 | - override job_logging: colorlog
7 |
8 | # output directory, generated dynamically on each run
9 | run:
10 | # dir: ${paths.save_dir}/${now:%Y-%m-%d}_${now:%H-%M-%S}
11 | dir: ${paths.output_dir}
12 | # sweep:
13 | # dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
14 | # subdir: ${hydra.job.num}
15 |
16 | job_logging:
17 | handlers:
18 | file:
19 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
20 | filename: ${hydra.runtime.output_dir}/${trainer.handler}.log
--------------------------------------------------------------------------------
/configs/hydra/eval.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - default
3 |
4 | run:
5 | # dir: ${paths.save_dir}/${now:%Y-%m-%d}_${now:%H-%M-%S}
6 | dir: ${paths.output_dir}
7 |
8 | job_logging:
9 | handlers:
10 | file:
11 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
12 | filename: ${hydra.runtime.output_dir}/eval.log
--------------------------------------------------------------------------------
/configs/model/Llama-2-7b-chat-hf.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: meta-llama/Llama-2-7b-chat-hf
3 | attn_implementation: 'flash_attention_2'
4 | torch_dtype: bfloat16
5 | tokenizer_args:
6 | pretrained_model_name_or_path: meta-llama/Llama-2-7b-chat-hf
7 | template_args: # Used in creating prompts for the dataset. See src/data/utils.py#preprocess_chat_instance.
8 | # following https://www.reddit.com/r/LocalLLaMA/comments/1561vn5/here_is_a_practical_multiturn_llama2chat_prompt/
9 | apply_chat_template: False
10 | user_start_tag: "[INST] "
11 | user_end_tag: " [/INST]"
12 | asst_start_tag: ""
13 | # ^the above link says this must be " ", but we observed this leads to very bad tokenization at the border which affects scores
14 | asst_end_tag: " "
--------------------------------------------------------------------------------
/configs/model/Llama-2-7b-hf.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf"
3 | attn_implementation: 'flash_attention_2'
4 | torch_dtype: bfloat16
5 | tokenizer_args:
6 | pretrained_model_name_or_path: "meta-llama/Llama-2-7b-hf"
7 | template_args: # Used in creating prompts for the dataset. See src/data/utils.py#preprocess_chat_instance.
8 | apply_chat_template: False
9 | user_start_tag: "Question: "
10 | user_end_tag: "\n"
11 | asst_start_tag: "Answer: "
12 | asst_end_tag: "\n\n"
--------------------------------------------------------------------------------
/configs/model/Llama-3.1-8B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: "meta-llama/Llama-3.1-8B-Instruct"
3 | attn_implementation: 'flash_attention_2'
4 | torch_dtype: bfloat16
5 | tokenizer_args:
6 | pretrained_model_name_or_path: "meta-llama/Llama-3.1-8B-Instruct"
7 | template_args:
8 | apply_chat_template: True
9 | system_prompt: You are a helpful assistant.
10 | system_prompt_with_special_tokens: "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>"
11 | user_start_tag: "<|start_header_id|>user<|end_header_id|>\n\n"
12 | user_end_tag: "<|eot_id|>"
13 | asst_start_tag: "<|start_header_id|>assistant<|end_header_id|>\n\n"
14 | asst_end_tag: "<|eot_id|>"
15 | date_string: 10 Apr 2025
--------------------------------------------------------------------------------
/configs/model/Llama-3.2-1B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: "meta-llama/Llama-3.2-1B-Instruct"
3 | attn_implementation: 'flash_attention_2'
4 | torch_dtype: bfloat16
5 | tokenizer_args:
6 | pretrained_model_name_or_path: "meta-llama/Llama-3.2-1B-Instruct"
7 | template_args:
8 | apply_chat_template: True
9 | system_prompt: You are a helpful assistant.
10 | system_prompt_with_special_tokens: "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>"
11 | user_start_tag: "<|start_header_id|>user<|end_header_id|>\n\n"
12 | user_end_tag: "<|eot_id|>"
13 | asst_start_tag: "<|start_header_id|>assistant<|end_header_id|>\n\n"
14 | asst_end_tag: "<|eot_id|>"
15 | date_string: 10 Apr 2025
--------------------------------------------------------------------------------
/configs/model/Llama-3.2-3B-Instruct.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: "meta-llama/Llama-3.2-3B-Instruct"
3 | attn_implementation: 'flash_attention_2'
4 | torch_dtype: bfloat16
5 | tokenizer_args:
6 | pretrained_model_name_or_path: "meta-llama/Llama-3.2-3B-Instruct"
7 | template_args:
8 | apply_chat_template: True
9 | system_prompt: You are a helpful assistant.
10 | system_prompt_with_special_tokens: "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>"
11 | user_start_tag: "<|start_header_id|>user<|end_header_id|>\n\n"
12 | user_end_tag: "<|eot_id|>"
13 | asst_start_tag: "<|start_header_id|>assistant<|end_header_id|>\n\n"
14 | asst_end_tag: "<|eot_id|>"
15 | date_string: 10 Apr 2025
--------------------------------------------------------------------------------
/configs/model/Phi-3.5-mini-instruct.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: "microsoft/Phi-3.5-mini-instruct"
3 | attn_implementation: 'flash_attention_2'
4 | torch_dtype: bfloat16
5 | tokenizer_args:
6 | pretrained_model_name_or_path: "microsoft/Phi-3.5-mini-instruct"
7 | template_args:
8 | apply_chat_template: True
9 | system_prompt: You are a helpful assistant.
10 | system_prompt_with_special_tokens: "<|system|>\nYou are a helpful assistant.<|end|>\n"
11 | user_start_tag: "<|user|>\n"
12 | user_end_tag: "<|end|>\n"
13 | asst_start_tag: "<|assistant|>\n"
14 | asst_end_tag: "<|end|>\n"
--------------------------------------------------------------------------------
/configs/model/gemma-7b-it.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: "google/gemma-7b-it"
3 | attn_implementation: 'flash_attention_2'
4 | torch_dtype: bfloat16
5 | tokenizer_args:
6 | pretrained_model_name_or_path: "google/gemma-7b-it"
7 | template_args:
8 | apply_chat_template: True
9 | user_start_tag: "user\n"
10 | user_end_tag: "\n"
11 | asst_start_tag: "model\n"
12 | asst_end_tag: "\n"
13 |
--------------------------------------------------------------------------------
/configs/model/phi-1_5.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: "microsoft/phi-1_5" # "locuslab/phi-1_5"
3 | tokenizer_args:
4 | pretrained_model_name_or_path: "microsoft/phi-1_5"
5 | template_args:
6 | apply_chat_template: False
7 | user_start_tag: "Question: "
8 | user_end_tag: "\n"
9 | asst_start_tag: "Answer: "
10 | asst_end_tag: "\n\n"
--------------------------------------------------------------------------------
/configs/model/zephyr-7b-beta.yaml:
--------------------------------------------------------------------------------
1 | model_args:
2 | pretrained_model_name_or_path: "HuggingFaceH4/zephyr-7b-beta"
3 | attn_implementation: 'flash_attention_2'
4 | torch_dtype: bfloat16
5 | tokenizer_args:
6 | pretrained_model_name_or_path: "HuggingFaceH4/zephyr-7b-beta"
7 | template_args:
8 | apply_chat_template: True
9 | system_prompt: You are a helpful assistant.
10 | system_prompt_with_special_tokens: "<|system|>\nYou are a helpful assistant.\n"
11 | user_start_tag: "<|user|>\n"
12 | user_end_tag: ""
13 | asst_start_tag: "<|assistant|>\n"
14 | asst_end_tag: ""
15 | date_string: 10 Apr 2025
--------------------------------------------------------------------------------
/configs/paths/default.yaml:
--------------------------------------------------------------------------------
1 | # path to root directory
2 | root_dir: .
3 |
4 | # path to data directory
5 | data_dir: ${paths.root_dir}/data/
6 |
7 | # path to dataset configs
8 | datasets: ${paths.root_dir}/configs/data/datasets
9 |
10 | # path to output directory, created dynamically by hydra
11 | # path generation pattern is specified in `configs/hydra/default.yaml`
12 | # use it to store all files generated during the run, like ckpts and metrics
13 | # save_dir: ${paths.root_dir}/saves/${mode}/${task_name}
14 |
15 | output_dir: ${paths.root_dir}/saves/${mode}/${task_name}
16 |
17 | # path to working directory
18 | work_dir: ${hydra:runtime.cwd}
--------------------------------------------------------------------------------
/configs/train.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - _self_
3 | - model: Llama-3.2-3B-Instruct
4 | - trainer: finetune
5 | - data: finetune
6 | - collator: DataCollatorForSupervisedDataset
7 | - eval: tofu
8 | - hydra: default
9 | - paths: default
10 | - experiment: null
11 |
12 | mode: train
13 | task_name: ???
--------------------------------------------------------------------------------
/configs/trainer/DPO.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - GradDiff
3 |
4 | handler: DPO
5 | method_args:
6 | beta: 0.1
7 | alpha: 1.0
8 | gamma: 1.0
9 | retain_loss_type: NLL
--------------------------------------------------------------------------------
/configs/trainer/GradAscent.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - finetune
3 |
4 | handler: GradAscent
--------------------------------------------------------------------------------
/configs/trainer/GradDiff.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - finetune
3 |
4 | handler: GradDiff
5 | method_args:
6 | gamma: 1.0
7 | alpha: 1.0
8 | retain_loss_type: NLL
9 |
--------------------------------------------------------------------------------
/configs/trainer/NPO.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - GradDiff
3 |
4 | handler: NPO
5 | method_args:
6 | beta: 0.1
7 | alpha: 1.0
8 | gamma: 1.0
9 | retain_loss_type: NLL
10 |
--------------------------------------------------------------------------------
/configs/trainer/RMU.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - GradDiff
3 |
4 | handler: RMU
5 | method_args:
6 | # The params here are more dependent on model and dataset. Tune them carefully to work
7 | gamma: 1.0
8 | steering_coeff: 2
9 | retain_loss_type: EMBED_DIFF
10 | alpha: 1
11 | module_regex: model\.layers\.7
12 | trainable_params_regex:
13 | - .* # update all parameters (as done in https://github.com/tmlr-group/G-effect/blob/ef368eea3b2c6dba1e090b9ebb021ac9f047e0ae/dataloader.py#L271)
14 | # - model\.layers\.(5|6|7)\.mlp\.down_proj\.weight # If you want to update only these weights (as done in https://github.com/centerforaisafety/wmdp/blob/bc5e1ba0367ea826caeeeaa50656336a1e87acfb/rmu/unlearn.py#L26)
--------------------------------------------------------------------------------
/configs/trainer/SimNPO.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - GradDiff
3 |
4 | handler: SimNPO
5 | method_args:
6 | delta: 0.0 # gamma in https://github.com/OPTML-Group/Unlearn-Simple/blob/main/TOFU/config/forget.yaml
7 | beta: 4.5
8 | alpha: 1.0
9 | gamma: 0.125 # npo_coeff in https://github.com/OPTML-Group/Unlearn-Simple/blob/main/TOFU/config/forget.yaml
10 | retain_loss_type: NLL
11 |
12 |
--------------------------------------------------------------------------------
/configs/trainer/UNDIAL.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - finetune
3 |
4 | handler: UNDIAL # corresponds to the UNDIAL trainer class (see src/trainer/unlearn/ — not grad_diff.py)
5 | args: # HuggingFace TrainingArguments
6 | learning_rate: 1e-4
7 | num_train_epochs: 10
8 | method_args: # Your own method-specific arguments
9 | gamma: 1.0
10 | alpha: 0.0
11 | beta: 10.0 # the strength of penalty for memorized tokens
12 | retain_loss_type: NLL
--------------------------------------------------------------------------------
/configs/trainer/finetune.yaml:
--------------------------------------------------------------------------------
1 | handler: FinetuneTrainer
2 | args:
3 | per_device_train_batch_size: 8
4 | per_device_eval_batch_size: 16
5 | gradient_accumulation_steps: 4
6 | learning_rate: 1e-5
7 | bf16: True
8 | bf16_full_eval: True
9 | logging_steps: 5
10 | output_dir: ${paths.output_dir}
11 | logging_dir: ${trainer.args.output_dir}/logs
12 | report_to: tensorboard
13 |   ddp_find_unused_parameters: null # YAML null; bare `None` would parse as the string "None"
14 | gradient_checkpointing: False
15 | optim: paged_adamw_32bit
16 | save_strategy: 'no'
17 | save_only_model: True
18 | weight_decay: 0.00
19 | do_train: True
20 | do_eval: True
21 | eval_on_start: True
22 | eval_strategy: epoch
23 | num_train_epochs: 10
24 | seed: 0
--------------------------------------------------------------------------------
/configs/unlearn.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - model: Llama-3.2-3B-Instruct
3 | - trainer: GradAscent
4 | - data: unlearn
5 | - collator: DataCollatorForSupervisedDataset
6 | - eval: tofu
7 | - hydra: default
8 | - paths: default
9 | - experiment: null
10 | - _self_
11 |
12 | trainer:
13 | args:
14 | remove_unused_columns: False
15 |
16 | mode: unlearn
17 | task_name: ???
--------------------------------------------------------------------------------
/docs/hydra.md:
--------------------------------------------------------------------------------
1 | ## Hydra Features
2 |
3 | The below are some important Hydra features we use for flexible composition while writing configurations to our YAML files.
4 |
5 | We use this config file for illustration, from [`configs/experiment/unlearn/muse/default.yaml`](../configs/experiment/unlearn/muse/default.yaml):
6 |
7 | ```yaml
8 | # @package _global_
9 | # ^ not a comment, sets the path of this config to be the config root directory
10 | defaults:
11 | - override /model: Llama-2-7b-hf # loads from model/Llama-2-7b-hf.yaml into the model attribute
12 | - override /trainer: GradAscent # loads from trainer/GradAscent.yaml into the trainer attribute
13 |   - override /data: unlearn # loads from data/unlearn.yaml into the "data" attribute, setting up data structures for loading datasets during unlearning
14 | - override /eval: muse # loads MUSE evaluation suite from eval/muse.yaml into the eval attribute
15 |
16 | # define variables
17 | data_split: News
18 | forget_split: forget
19 | retain_split: retain1
20 | retain_logs_path: null
21 |
22 | model:
23 | model_args:
24 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target
25 | tokenizer_args:
26 | pretrained_model_name_or_path: muse-bench/MUSE-${data_split}_target
27 | data:
28 | anchor: forget
29 | forget:
30 | MUSE_forget:
31 | args:
32 | hf_args:
33 | split: ${forget_split}
34 | retain:
35 | MUSE_retain:
36 | args:
37 | hf_args:
38 | split: ${retain_split}
39 |
40 | eval:
41 | muse:
42 | data_split: ${data_split}
43 | retain_logs_path: ${retain_logs_path}
44 |
45 | trainer:
46 | args:
47 | per_device_train_batch_size: 4
48 | gradient_accumulation_steps: 8
49 | learning_rate: 1e-5
50 | num_train_epochs: 10
51 | lr_scheduler_type: constant
52 | # save_strategy: steps
53 | # save_steps: 0.5
54 | # optim: paged_adamw_32bit
55 | # optim: adamw_torch
56 |
57 | task_name: ??? # ??? raises an error if this attribute is not set
58 | ```
59 |
60 | - **Structure & Attribute Access:** Configs are written in YAML and structured hierarchically like a dictionary. Attributes are accessed using dot notation: In code `cfg.trainer.args.learning_rate`, in command-line: `trainer.args.learning_rate=1e-5`.
61 |
62 | - **Defaults & Overrides:** Config files are included in one another using `defaults` and `override` commands.
63 |
64 | - **Command-Line Overrides:** Any parameter can be overridden directly from the command line. For instance:
65 | ```bash
66 | python src/train.py --config-name=unlearn.yaml experiment=unlearn/muse/default \
67 | trainer.args.num_train_epochs=50 data_split=Books trainer=SimNPO trainer.method_args.beta=3 \
68 | task_name=unlearn_muse_simnpo
69 | ```
70 |
71 | - **Package Directives:** The `# @package` directive organizes configurations into namespaces for cleaner composition and specifies the configuration path. At the head of a YAML file, you might see directives like `# @package _global_` or more specific ones such as `# @package eval.muse.metrics.forget_knowmem_ROUGE` which inform Hydra exactly where the configuration parameters should be placed within the final composed config.
72 |
73 | For example, refer [`configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml`](../configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml)
74 |
75 | - **Variable Substitution:** Variables are defined once and reused using the `${}` syntax.
76 |
77 | - **Adding New Attributes with `+`:** Use the `+` prefix to add attributes that are not already in the config. For example, to add a new argument to the trainer:
78 | ```bash
79 | python src/train.py experiment=unlearn/muse/default +trainer.args.my_new_arg=10
80 | ```
81 |
82 | - **Attribute Removal with `~`:** You can remove an attribute from the config at runtime using the tilde `~`. For example, to remove flash attention setting:
83 | ```bash
84 | python src/train.py experiment=unlearn/muse/default ~model.model_args.attn_implementation
85 | ```
86 | > [!NOTE]
87 | > In `zsh`, you must **quote** or **escape** the `~` to avoid it being misinterpreted as a home directory: e.g.:
88 | ```bash
89 | python src/train.py \~model.model_args.attn_implementation
90 | python src/train.py "~model.model_args.attn_implementation"
91 | ```
92 | > [!NOTE]
93 | > Hydra uses PyYAML to handle yaml files and transform inputs while giving config inputs. This handles cases like converting `true` to `True`
94 |
95 | Refer to the following for config structures and overridable parameters:
96 | - Evaluation: [`configs/experiment/examples/tofu_eval.yaml`](../configs/experiment/examples/tofu_eval.yaml)
97 | - Unlearning: [`configs/experiment/examples/muse_unlearn.yaml`](../configs/experiment/examples/muse_unlearn.yaml)
--------------------------------------------------------------------------------
/docs/links.md:
--------------------------------------------------------------------------------
1 | # 🔗 Links and References
2 |
3 | Links to research papers and resources corresponding to implemented features in this repository. Please feel free to fill in any missing references!
4 |
5 | ---
6 |
7 | ## 📌 Table of Contents
8 | - [🔗 Links and References](#-links-and-references)
9 | - [📌 Table of Contents](#-table-of-contents)
10 | - [📗 Implemented Methods](#-implemented-methods)
11 | - [📘 Benchmarks](#-benchmarks)
12 | - [📙 Evaluation Metrics](#-evaluation-metrics)
13 | - [🌐 Useful Links](#-useful-links)
14 | - [📚 Surveys](#-surveys)
15 | - [🐙 Other GitHub Repositories](#-other-github-repositories)
16 |
17 | ---
18 |
19 | ## 📗 Implemented Methods
20 |
21 | | Method | Resource |
22 | |-----------------|----------|
23 | | GradAscent, GradDiff | Naive baselines found in many papers including MUSE, TOFU etc. |
24 | | NPO | Paper [📄](https://arxiv.org/abs/2404.05868), Code [🐙](https://github.com/licong-lin/negative-preference-optimization) |
25 | | SimNPO | Paper [📄](https://arxiv.org/abs/2410.07163), Code [🐙](https://github.com/OPTML-Group/Unlearn-Simple) |
26 | | IdkDPO | TOFU ([📄](https://arxiv.org/abs/2401.06121)) |
27 | | RMU | WMDP paper ([🐙](https://github.com/centerforaisafety/wmdp/tree/main/rmu), [🌐](https://www.wmdp.ai/)), later used in G-effect ([🐙](https://github.com/tmlr-group/G-effect/blob/main/dataloader.py)) |
28 | | UNDIAL | Paper [📄](https://arxiv.org/pdf/2402.10052), Code [🐙](https://github.com/dong-river/LLM_unlearning/tree/main) |
29 | | AltPO | Paper [📄](https://arxiv.org/pdf/2409.13474), Code [🐙](https://github.com/molereddy/Alternate-Preference-Optimization) |
30 |
31 | ---
32 |
33 | ## 📘 Benchmarks
34 |
35 | | Benchmark | Resource |
36 | |-----------|----------|
37 | | TOFU | Paper [📄](https://arxiv.org/abs/2401.06121) |
38 | | MUSE | Paper [📄](https://arxiv.org/abs/2407.06460) |
39 | | WMDP | Paper [📄](https://arxiv.org/abs/2403.03218) |
40 |
41 | ---
42 |
43 | ## 📙 Evaluation Metrics
44 |
45 | | Metric | Resource |
46 | |--------|----------|
47 | | Verbatim Probability / ROUGE, simple QA-ROUGE | Naive metrics found in many papers including MUSE, TOFU etc. |
48 | | Membership Inference Attacks (LOSS, ZLib, Reference, GradNorm, MinK, MinK++) | MIMIR ([🐙](https://github.com/iamgroot42/mimir)), MUSE ([📄](https://arxiv.org/abs/2407.06460)) |
49 | | PrivLeak | MUSE ([📄](https://arxiv.org/abs/2407.06460)) |
50 | | Forget Quality, Truth Ratio, Model Utility | TOFU ([📄](https://arxiv.org/abs/2401.06121)) |
51 | | Extraction Strength (ES) | Carlini et al., 2021 ([📄](https://www.usenix.org/conference/usenixsecurity21/presentation/carlini-extracting)), used for unlearning in Wang et al., 2025 ([📄](https://openreview.net/pdf?id=wUtCieKuQU)) |
52 | | Exact Memorization (EM) | Tirumala et al., 2022 ([📄](https://proceedings.neurips.cc/paper_files/paper/2022/hash/fa0509f4dab6807e2cb465715bf2d249-Abstract-Conference.html)), used for unlearning in Wang et al., 2025 ([📄](https://openreview.net/pdf?id=wUtCieKuQU)) |
53 | | lm-evaluation-harness | [💻](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) |
54 |
55 | ---
56 |
57 | ## 🌐 Useful Links
58 |
59 | ### 📚 Surveys
60 | - [Machine Unlearning in 2024](https://ai.stanford.edu/~kzliu/blog/unlearning)
61 | - [Rethinking Machine Unlearning for Large Language Models](https://arxiv.org/abs/2402.08787)
62 |
63 | ### 🐙 Other GitHub Repositories
64 | - [TOFU Benchmark (original)](https://github.com/locuslab/tofu)
65 | - [MUSE Benchmark (original)](https://github.com/swj0419/muse_bench)
66 | - [Awesome LLM Unlearning](https://github.com/chrisliu298/awesome-llm-unlearning)
67 | - [Awesome Machine Unlearning](https://github.com/tamlhp/awesome-machine-unlearning)
68 | - [Awesome GenAI Unlearning](https://github.com/franciscoliu/Awesome-GenAI-Unlearning)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | huggingface-hub==0.29.1
2 | transformers==4.45.1
3 | numpy==2.2.3
4 | hydra-core==1.3
5 | hydra_colorlog==1.2.0
6 | torch==2.4.1
7 | datasets==3.0.1
8 | accelerate==0.34.2
9 | bitsandbytes==0.44.1
10 | rouge-score==0.1.2
11 | scipy==1.14.1
12 | tensorboard==2.18.0
13 | scikit-learn==1.5.2
14 | deepspeed==0.15.4
15 |
--------------------------------------------------------------------------------
/scripts/muse_unlearn.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# MUSE unlearning pipeline: for each data split and trainer, run unlearning and
# then evaluate the unlearned model. Also runs the scalability and
# sustainability sweeps over the forget_1..forget_4 subsets.

# Pick a free TCP port for the distributed launcher's rendezvous.
export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "Master Port: $MASTER_PORT"


per_device_train_batch_size=4
gradient_accumulation_steps=8


model=Llama-2-7b-hf

data_splits=(
    "News"
    "Books"
)

trainers=(
    "GradAscent"
    "GradDiff"
    "NPO"
    "SimNPO"
)

# #########################################################
# #################### MUSE Unlearning ####################
# #########################################################


for data_split in "${data_splits[@]}"; do
    for trainer in "${trainers[@]}"; do

        task_name=muse_${model}_${data_split}_${trainer}

        # Unlearn on 2 GPUs, then evaluate the saved checkpoint on 1 GPU.
        CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
        src/train.py --config-name=unlearn.yaml \
        experiment=unlearn/muse/default.yaml \
        model=${model} \
        data_split=${data_split} \
        trainer=${trainer} \
        task_name=${task_name} \
        retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json \
        trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
        trainer.args.gradient_accumulation_steps=${gradient_accumulation_steps} \
        trainer.args.ddp_find_unused_parameters=true \
        trainer.args.gradient_checkpointing=true

        CUDA_VISIBLE_DEVICES=0 python src/eval.py \
        experiment=eval/muse/default.yaml \
        data_split=${data_split} \
        task_name=${task_name} \
        model=${model} \
        model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
        paths.output_dir=saves/unlearn/${trainer}/evals \
        retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json
    done
done



# #########################################################
# ########### MUSE News Unlearning Scalability ############
# #########################################################


for data_split in "${data_splits[@]}"; do
    for trainer in "${trainers[@]}"; do
        for scal in "forget_1" "forget_2" "forget_3" "forget_4"; do

            # FIX: removed a stray trailing backslash here that line-continued
            # this assignment into the following (blank) line.
            task_name=muse_${model}_${data_split}_${trainer}_scal_${scal}

            CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
            src/train.py --config-name=unlearn.yaml \
            experiment=unlearn/muse/scalability.yaml \
            model=${model} \
            data_split=${data_split} \
            forget_split=${scal} \
            trainer=${trainer} \
            task_name=${task_name} \
            retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json \
            trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
            trainer.args.gradient_accumulation_steps=${gradient_accumulation_steps} \
            trainer.args.ddp_find_unused_parameters=true \
            trainer.args.gradient_checkpointing=true

            CUDA_VISIBLE_DEVICES=0 python src/eval.py \
            experiment=eval/muse/default.yaml \
            data_split=${data_split} \
            task_name=${task_name} \
            model=${model} \
            model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
            paths.output_dir=saves/unlearn/${trainer}/evals \
            retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json
        done
    done
done



#########################################################
########### MUSE News Unlearning sustainability #########
#########################################################


for data_split in "${data_splits[@]}"; do
    for trainer in "${trainers[@]}"; do
        # Sequential unlearning: each round starts from the previous round's
        # checkpoint (model_path is updated at the bottom of the loop).
        model_path=muse-bench/MUSE-${data_split}_target
        for sust in "forget_1" "forget_2" "forget_3" "forget_4"; do

            task_name=muse_${model}_${data_split}_${trainer}_sust_${sust}

            # NOTE(review): "sustainabilty" spelling presumably matches the
            # actual config file name — verify before correcting.
            CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
            src/train.py --config-name=unlearn.yaml \
            experiment=unlearn/muse/sustainabilty.yaml \
            model=${model} \
            model.model_args.pretrained_model_name_or_path=${model_path} \
            data_split=${data_split} \
            trainer=${trainer} \
            task_name=${task_name} \
            retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json \
            trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
            trainer.args.gradient_accumulation_steps=${gradient_accumulation_steps} \
            trainer.args.ddp_find_unused_parameters=true \
            trainer.args.gradient_checkpointing=true

            CUDA_VISIBLE_DEVICES=0 python src/eval.py \
            experiment=eval/muse/default.yaml \
            data_split=${data_split} \
            task_name=${task_name} \
            model=${model} \
            model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
            paths.output_dir=saves/unlearn/${trainer}/evals \
            retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json

            model_path=saves/unlearn/${task_name}
        done
    done
done
--------------------------------------------------------------------------------
/scripts/tofu_finetune.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# TOFU finetuning pipeline: finetune retain-split models (gold standard for
# unlearning comparisons) and full-dataset models, evaluating each.

# Pick a free TCP port for the distributed launcher's rendezvous.
export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "Master Port: $MASTER_PORT"


models=(
    "Llama-3.2-1B-Instruct"
    "Llama-3.2-3B-Instruct"
    "Llama-3.1-8B-Instruct"
)
# NOTE(review): effective batch size 32 assumes gradient_accumulation_steps=8
# comes from the trainer defaults — it is not set explicitly in this script.
per_device_train_batch_size=4 # Effective batch size 32 on two GPUs with gradient_accumulation_steps=8

splits=(
    "forget01 holdout01 retain99"
    "forget05 holdout05 retain95"
    "forget10 holdout10 retain90"
)



########################################################################################################################
########################################### RETAIN Finetuned TOFU ######################################################
########################################################################################################################

for split in "${splits[@]}"; do
    forget_split=$(echo $split | cut -d' ' -f1)
    holdout_split=$(echo $split | cut -d' ' -f2)
    retain_split=$(echo $split | cut -d' ' -f3)

    for model in "${models[@]}"; do
        # FIX: use the per_device_train_batch_size variable instead of a
        # hard-coded 4, so changing it at the top actually takes effect.
        CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
        src/train.py experiment=finetune/tofu/default.yaml \
        task_name=tofu_${model}_${retain_split} \
        model=${model} \
        data/datasets@data.train=TOFU_QA_retain \
        data.train.TOFU_QA_retain.args.hf_args.name=${retain_split} \
        trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
        trainer.args.ddp_find_unused_parameters=true \
        trainer.args.gradient_checkpointing=true


        CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/tofu/default.yaml \
        forget_split=${forget_split} \
        holdout_split=${holdout_split} \
        task_name=tofu_${model}_${retain_split} \
        model=${model} \
        model.model_args.pretrained_model_name_or_path=saves/finetune/tofu_${model}_${retain_split}
    done
done


# ########################################################################################################################
# ########################################### FULL Finetuned TOFU models #################################################
# ########################################################################################################################


for model in "${models[@]}"; do
    CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
    src/train.py experiment=finetune/tofu/default.yaml \
    task_name=tofu_${model}_full \
    model=${model} \
    data/datasets@data.train=TOFU_QA_full \
    data.train.TOFU_QA_full.args.hf_args.name=full \
    trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
    trainer.args.ddp_find_unused_parameters=true \
    trainer.args.gradient_checkpointing=true

    # Evaluate the full models on each forget split
    for split in "${splits[@]}"; do
        forget_split=$(echo $split | cut -d' ' -f1)
        holdout_split=$(echo $split | cut -d' ' -f2)
        retain_split=$(echo $split | cut -d' ' -f3)

        CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/tofu/default.yaml \
        forget_split=${forget_split} \
        holdout_split=${holdout_split} \
        task_name=tofu_${model}_full_${forget_split} \
        model=${model} \
        model.model_args.pretrained_model_name_or_path=saves/finetune/tofu_${model}_full \
        retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \
        paths.output_dir=saves/eval/tofu_${model}_full/evals_${forget_split}
    done
done
--------------------------------------------------------------------------------
/scripts/tofu_unlearn.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Unlearn the full-finetuned TOFU models with each configured trainer, then
# evaluate every resulting checkpoint.


# Grab an unused port for the distributed launcher.
export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "Master Port: $MASTER_PORT"

models=(
    "Llama-3.2-1B-Instruct"
    "Llama-3.2-3B-Instruct"
    "Llama-3.1-8B-Instruct"
)
trainers_experiments=(
    "GradAscent unlearn/tofu/default.yaml"
    "GradDiff unlearn/tofu/default.yaml"
    "NPO unlearn/tofu/default.yaml"
    "DPO unlearn/tofu/idk.yaml"
    "RMU unlearn/tofu/default.yaml"
)
splits=(
    "forget01 holdout01 retain99"
    "forget05 holdout05 retain95"
    "forget10 holdout10 retain90"
)


per_device_train_batch_size=4 # on two gpus would make effective batch size 32
gradient_accumulation_steps=4


########################################################################################################################
########################################### Unlearn TOFU models ########################################################
########################################################################################################################


for split in "${splits[@]}"; do
    # Unpack the "forget holdout retain" triple in a single read.
    read -r forget_split holdout_split retain_split <<< "$split"

    for model in "${models[@]}"; do
        for trainer_experiment in "${trainers_experiments[@]}"; do
            # Each entry is "<trainer> <experiment-config>".
            read -r trainer experiment <<< "$trainer_experiment"

            task_name=tofu_${model}_${forget_split}_${trainer}
            model_path=open-unlearning/tofu_${model}_full
            echo ${task_name}: Unlearning ${model_path} using ${trainer}

            # Unlearn
            CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \
            src/train.py --config-name=unlearn.yaml \
            experiment=${experiment} \
            trainer=${trainer} \
            task_name=${task_name} \
            model=${model} \
            forget_split=${forget_split} \
            retain_split=${retain_split} \
            model.model_args.pretrained_model_name_or_path=${model_path} \
            retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \
            trainer.args.per_device_train_batch_size=$per_device_train_batch_size \
            trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \
            trainer.args.ddp_find_unused_parameters=true \
            trainer.args.gradient_checkpointing=true

            # Eval
            CUDA_VISIBLE_DEVICES=0 python src/eval.py \
            experiment=eval/tofu/default.yaml \
            forget_split=${forget_split} \
            holdout_split=${holdout_split} \
            model=${model} \
            task_name=${task_name} \
            model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
            paths.output_dir=saves/unlearn/${task_name}/evals \
            retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json
        done
    done
done
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages


def _read_text(path):
    """Return the UTF-8 contents of *path*, closing the handle promptly."""
    with open(path, encoding="utf-8") as f:
        return f.read()


# Read dependencies from requirements.txt, skipping blank and comment lines so
# a trailing newline cannot inject an empty (invalid) requirement specifier.
requirements = [
    line.strip()
    for line in _read_text("requirements.txt").splitlines()
    if line.strip() and not line.lstrip().startswith("#")
]

setup(
    name="open-unlearning",
    version="0.1.0",
    author="Vineeth Dorna, Anmol Mekala",
    author_email="vineethdorna@gmail.com, m.anmolreddy@gmail.com",
    description="A library for machine unlearning in LLMs.",
    long_description=_read_text("README.md"),
    long_description_content_type="text/markdown",
    url="https://github.com/locuslab/open-unlearning",
    license="MIT",
    packages=find_packages(),
    install_requires=requirements,  # Uses requirements.txt
    extras_require={
        "lm-eval": [
            "lm-eval==0.4.8",
        ],  # Install using `pip install .[lm-eval]`
        "dev": [
            "pre-commit==4.0.1",
            "ruff==0.6.9",
        ],  # Install using `pip install .[dev]`
    },
    python_requires=">=3.11",
)
30 |
--------------------------------------------------------------------------------
/setup_data.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | from huggingface_hub import snapshot_download
5 |
6 |
def download_eval_data():
    """Fetch the precomputed evaluation-log JSONs into ``saves/eval``."""
    snapshot_download(
        repo_id="open-unlearning/eval",
        repo_type="dataset",
        allow_patterns="*.json",
        local_dir="saves/eval",
    )
14 |
15 |
def download_idk_data():
    """Fetch the "I don't know" responses dataset (JSONL) into ``data``."""
    snapshot_download(
        repo_id="open-unlearning/idk",
        repo_type="dataset",
        allow_patterns="*.jsonl",
        local_dir="data",
    )
23 |
24 |
def download_wmdp():
    """Download the password-protected WMDP corpora zip and extract it into ``data/wmdp``."""
    target_dir = "data/wmdp"
    archive_path = os.path.join(target_dir, "wmdp-corpora.zip")

    os.makedirs(target_dir, exist_ok=True)
    # Fetch the archive, then extract it with the published archive password.
    subprocess.run(
        ["wget", "https://cais-wmdp.s3.us-west-1.amazonaws.com/wmdp-corpora.zip", "-O", archive_path],
        check=True,
    )
    subprocess.run(
        ["unzip", "-P", "wmdpcorpora", archive_path, "-d", target_dir],
        check=True,
    )
33 |
34 |
def main():
    """CLI entry point: parse the download flags and run the selected setups."""
    # (flag, help text, action) — drives both argument registration and dispatch.
    flag_specs = [
        (
            "--eval_logs",
            "Downloads TOFU, MUSE - retain and finetuned models eval logs and saves them in saves/eval",
            download_eval_data,
        ),
        (
            "--idk",
            "Download idk dataset from HF hub and stores it data/idk.jsonl",
            download_idk_data,
        ),
        (
            "--wmdp",
            "Download and unzip WMDP dataset into data/wmdp",
            download_wmdp,
        ),
    ]

    parser = argparse.ArgumentParser(description="Download and setup evaluation data.")
    for flag, help_text, _ in flag_specs:
        parser.add_argument(flag, action="store_true", help=help_text)

    args = parser.parse_args()

    # Run each requested download in declaration order.
    for flag, _, action in flag_specs:
        if getattr(args, flag.lstrip("-")):
            action()


if __name__ == "__main__":
    main()
65 |
--------------------------------------------------------------------------------
/src/data/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Any, Union
2 | from omegaconf import DictConfig
3 |
4 | from data.qa import QADataset, QAwithIdkDataset, QAwithAlternateDataset
5 | from data.collators import (
6 | DataCollatorForSupervisedDataset,
7 | )
8 | from data.unlearn import ForgetRetainDataset
9 | from data.pretraining import PretrainingDataset, CompletionDataset
10 |
11 | DATASET_REGISTRY: Dict[str, Any] = {}
12 | COLLATOR_REGISTRY: Dict[str, Any] = {}
13 |
14 |
def _register_data(data_class):
    """Expose *data_class* to config lookups under its class name."""
    DATASET_REGISTRY.update({data_class.__name__: data_class})
17 |
18 |
def _register_collator(collator_class):
    """Expose *collator_class* to config lookups under its class name."""
    COLLATOR_REGISTRY.update({collator_class.__name__: collator_class})
21 |
22 |
def _load_single_dataset(dataset_name, dataset_cfg: DictConfig, **kwargs):
    """Instantiate one dataset from its config entry.

    Args:
        dataset_name: Config key of the dataset (used in error messages).
        dataset_cfg: Config with a ``handler`` naming a registered dataset
            class and an ``args`` mapping passed to its constructor.
        **kwargs: Extra keyword arguments forwarded to the handler.

    Raises:
        ValueError: if the config has no ``handler`` entry.
        NotImplementedError: if the handler is not in DATASET_REGISTRY.
    """
    dataset_handler_name = dataset_cfg.get("handler")
    if dataset_handler_name is None:
        # `assert` is stripped under `python -O`; validate with a real raise.
        raise ValueError(f"{dataset_name} handler not set")
    dataset_handler = DATASET_REGISTRY.get(dataset_handler_name)
    if dataset_handler is None:
        raise NotImplementedError(
            f"{dataset_handler_name} not implemented or not registered"
        )
    return dataset_handler(**dataset_cfg.args, **kwargs)
35 |
36 |
def get_datasets(dataset_cfgs: Union[Dict, DictConfig], **kwargs):
    """Build every dataset in *dataset_cfgs*.

    Each entry is keyed by its ``access_key`` when present, otherwise by its
    config name. A single configured dataset is returned unwrapped; multiple
    datasets come back as a name -> dataset mapping.
    """
    built = {
        cfg.get("access_key", name): _load_single_dataset(name, cfg, **kwargs)
        for name, cfg in dataset_cfgs.items()
    }
    if len(built) == 1:
        (only,) = built.values()
        return only
    return built
47 |
48 |
def get_data(data_cfg: DictConfig, mode="train", **kwargs):
    """Build the per-split datasets described by *data_cfg*.

    In "train" mode the split -> dataset mapping is returned directly. In
    "unlearn" mode every non-eval/test split is wrapped into a single
    ForgetRetainDataset stored under "train" (anchored per the "anchor" key,
    defaulting to "forget").
    """
    cfg = dict(data_cfg)
    anchor = cfg.pop("anchor", "forget")
    data = {split: get_datasets(cfgs, **kwargs) for split, cfgs in cfg.items()}
    if mode == "train":
        return data
    if mode == "unlearn":
        held_out = ("eval", "test")
        unlearn_splits = {k: v for k, v in data.items() if k not in held_out}
        data = {k: v for k, v in data.items() if k in held_out}
        data["train"] = ForgetRetainDataset(**unlearn_splits, anchor=anchor)
        return data
64 |
65 |
def _get_single_collator(collator_name: str, collator_cfg: DictConfig, **kwargs):
    """Instantiate one collator from its config entry.

    Args:
        collator_name: Config key of the collator (used in error messages).
        collator_cfg: Config with a ``handler`` naming a registered collator
            class and an ``args`` mapping passed to its constructor.
        **kwargs: Extra keyword arguments forwarded to the handler.

    Raises:
        ValueError: if the config has no ``handler`` entry.
        NotImplementedError: if the handler is not in COLLATOR_REGISTRY.
    """
    collator_handler_name = collator_cfg.get("handler")
    if collator_handler_name is None:
        # `assert` is stripped under `python -O`; validate with a real raise.
        raise ValueError(f"{collator_name} handler not set")
    collator_handler = COLLATOR_REGISTRY.get(collator_handler_name)
    if collator_handler is None:
        raise NotImplementedError(
            f"{collator_handler_name} not implemented or not registered"
        )
    return collator_handler(**collator_cfg.args, **kwargs)
78 |
79 |
def get_collators(collator_cfgs, **kwargs):
    """Build every configured collator, unwrapping when exactly one exists."""
    built = {
        name: _get_single_collator(name, cfg, **kwargs)
        for name, cfg in collator_cfgs.items()
    }
    if len(built) == 1:
        # A single collator is handed back directly rather than in a dict.
        (only,) = built.values()
        return only
    return built
91 |
92 |
# Register datasets (including the composite ForgetRetainDataset used for
# unlearning; groups: unlearn).
for _dataset_class in (
    QADataset,
    QAwithIdkDataset,
    PretrainingDataset,
    CompletionDataset,
    QAwithAlternateDataset,
    ForgetRetainDataset,
):
    _register_data(_dataset_class)

# Register collators
_register_collator(DataCollatorForSupervisedDataset)
106 |
--------------------------------------------------------------------------------
/src/data/collators.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 | from typing import Dict, Sequence
4 | from data.utils import IGNORE_INDEX
5 |
6 |
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Pads each example's ``input_ids`` (and, when present, ``labels``) to the
    batch maximum and derives the matching ``attention_mask``. Batches of
    nested instance dicts (e.g. ``{"original": ..., "alternate": ...}``) are
    collated recursively, key by key.
    """

    def __init__(
        self,
        tokenizer: transformers.PreTrainedTokenizer,
        padding_side: str = "right",
        index: str | None = None,
    ):
        """
        Args:
            tokenizer: Tokenizer supplying ``pad_token_id`` for padding.
            padding_side: "right" pads at the end of each sequence; any other
                value pads at the start (left).
            index: Optional key whose per-example values are gathered into a
                batch tensor (e.g. a dataset row index).
        """
        self.tokenizer = tokenizer
        self.padding_side = padding_side
        self.index = index

    def get_instances_from_key(self, instances: Sequence[Dict], key: str):
        """Project a list of instance dicts to the values stored under *key*."""
        ret_instances = [instance[key] for instance in instances]
        return ret_instances

    def _pad_tokens(self, input_ids, padding_value):
        """Pad a list of 1-D token tensors to equal length with *padding_value*.

        Left padding is implemented by flipping each sequence, right-padding,
        then flipping the padded batch back.
        """
        if self.padding_side == "right":
            input_ids = torch.nn.utils.rnn.pad_sequence(
                input_ids, batch_first=True, padding_value=padding_value
            )
        else:
            input_ids = torch.nn.utils.rnn.pad_sequence(
                [torch.flip(i, dims=[0]) for i in input_ids],
                batch_first=True,
                padding_value=padding_value,
            ).flip(dims=[1])
        return input_ids

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Collate *instances* into a batch dict, recursing for nested dicts."""
        assert isinstance(instances[0], dict)
        return_dct = {}
        if "input_ids" not in instances[0]:
            # Nested structure: each instance maps sub-keys to instance dicts;
            # collate each sub-key's group of instances independently.
            for key in instances[0].keys():
                key_instances = self.get_instances_from_key(
                    instances=instances, key=key
                )
                return_dct[key] = self(key_instances)
        else:
            input_ids = [instance["input_ids"] for instance in instances]
            input_ids = self._pad_tokens(input_ids, self.tokenizer.pad_token_id)
            # Mask is derived from pad positions (tokens equal to pad_token_id).
            attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
            return_dct.update({"input_ids": input_ids})
            return_dct.update({"attention_mask": attention_mask})
            if "labels" in instances[0]:
                # Labels are padded with IGNORE_INDEX so loss skips pad slots.
                labels = [instance["labels"] for instance in instances]
                labels = self._pad_tokens(labels, IGNORE_INDEX)
                return_dct.update({"labels": labels})
            if self.index:
                if self.index in instances[0]:
                    return_dct.update(
                        {
                            self.index: torch.tensor(
                                [example[self.index] for example in instances]
                            )
                        }
                    )
                else:
                    # NOTE(review): raising a Warning class as an exception is
                    # unusual; warnings.warn() or ValueError may be intended.
                    raise Warning(f"{self.index} not found in dataset")
        return return_dct
68 |
--------------------------------------------------------------------------------
/src/data/pretraining.py:
--------------------------------------------------------------------------------
1 | # import torch
2 | from torch.utils.data import Dataset
3 | from data.utils import (
4 | load_hf_dataset,
5 | add_dataset_index,
6 | preprocess_pretraining_instance,
7 | )
8 |
9 |
class CompletionDataset(Dataset):
    """Dataset of (prefix, completion) text pairs for completion training.

    Rows missing ``prefix_key`` or ``text_key`` are treated as "" for that
    field. Each item carries the tokenized pair plus its dataset ``index``.
    """

    def __init__(
        self,
        hf_args,
        template_args,
        tokenizer,
        prefix_key="prompt",
        text_key="text",
        max_length=2048,
        predict_with_generate=False,
        insert_space=False,
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = add_dataset_index(load_hf_dataset(**hf_args))
        self.prefix_key = prefix_key
        self.text_key = text_key
        self.predict_with_generate = predict_with_generate
        self.insert_space = insert_space

    def __len__(self):
        return len(self.data)

    def _process_sample(self, prefix, text_content, index=-1):
        """Tokenize one (prefix, completion) pair; attach *index* unless -1."""
        tokenized = preprocess_pretraining_instance(
            self.tokenizer,
            prefix,
            text_content,
            self.max_length,
            self.predict_with_generate,
            self.insert_space,
        )
        sample = {
            key: tokenized[key]
            for key in ("input_ids", "labels", "attention_mask")
        }
        if index != -1:
            sample["index"] = index
        return sample

    def __getitem__(self, idx):
        row = self.data[idx]
        # Missing prefix/text keys fall back to empty strings.
        return self._process_sample(
            row.get(self.prefix_key, ""),
            row.get(self.text_key, ""),
            row["index"],
        )
60 |
61 |
class PretrainingDataset(Dataset):
    """Chunks a raw text corpus into fixed-size token windows for pretraining.

    All documents are joined with blank lines, tokenized once, and split into
    consecutive ``max_length``-token chunks (the final chunk may be shorter).
    """

    def __init__(
        self, hf_args, template_args, tokenizer, text_key="text", max_length=2048
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.chunks = self._chunk_raw_text(load_hf_dataset(**hf_args)[text_key])

    def _chunk_raw_text(self, raw_text):
        """Join the documents, tokenize, and split into max_length-token chunks."""
        raw_text = "\n\n".join(raw_text)
        full_token_sequence = self.tokenizer(raw_text, add_special_tokens=False)[
            "input_ids"
        ]
        # Ceil-divide so an exact multiple of max_length no longer produces a
        # trailing empty chunk (the old `len // max_length + 1` did). Keep at
        # least one chunk so empty corpora behave as before.
        num_chunks = max(1, -(-len(full_token_sequence) // self.max_length))
        return [
            self.tokenizer.decode(
                full_token_sequence[i * self.max_length : (i + 1) * self.max_length]
            )
            for i in range(num_chunks)
        ]

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, idx):
        return preprocess_pretraining_instance(
            self.tokenizer, "", self.chunks[idx], self.max_length
        )
93 |
--------------------------------------------------------------------------------
/src/data/qa.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 |
4 | from data.utils import load_hf_dataset, preprocess_chat_instance, add_dataset_index
5 |
6 |
class QADataset(Dataset):
    """Question-answer dataset tokenized with the model's chat template.

    Optionally prepends few-shot QA pairs from a second dataset. Rows whose
    answer is a list yield a dict of one tokenized sample per candidate.
    """

    def __init__(
        self,
        hf_args,
        template_args,
        tokenizer,
        question_key="question",
        answer_key="answer",
        few_shot_dataset_hf_args=None,
        max_length=512,
        predict_with_generate=False,
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = add_dataset_index(load_hf_dataset(**hf_args))
        self.fs_data = None
        if few_shot_dataset_hf_args is not None:
            fs_raw = load_hf_dataset(**few_shot_dataset_hf_args)
            # Keep only the QA columns of the few-shot pool.
            self.fs_data = {
                question_key: fs_raw[question_key],
                answer_key: fs_raw[answer_key],
            }
        self.template_args = template_args
        self.question_key = question_key
        self.answer_key = answer_key
        self.predict_with_generate = predict_with_generate

    def __len__(self):
        return len(self.data)

    def _process_sample(self, question, answer, index=-1):
        """Tokenize one QA pair (with few-shot context when configured)."""
        if self.fs_data is None:
            prompt_msgs, response_msgs = [question], [answer]
        else:
            prompt_msgs = self.fs_data[self.question_key] + [question]
            response_msgs = self.fs_data[self.answer_key] + [answer]
        tokenized = preprocess_chat_instance(
            self.tokenizer,
            self.template_args,
            prompt_msgs,
            response_msgs,
            self.max_length,
            self.predict_with_generate,
        )
        return {
            "input_ids": tokenized["input_ids"],
            "labels": tokenized["labels"],
            "attention_mask": tokenized["attention_mask"],
            "index": index,
        }

    def __getitem__(self, idx):
        row = self.data[idx]
        question = row[self.question_key]
        answer = row[self.answer_key]
        index = row["index"]
        if isinstance(answer, str):
            return self._process_sample(question=question, answer=answer, index=index)
        if isinstance(answer, list):
            # Multiple candidate answers: one tokenized sample per candidate.
            return {
                i: self._process_sample(question=question, answer=ans, index=index)
                for i, ans in enumerate(answer)
            }
        raise NotImplementedError("answer format not found")
76 |
77 |
class QAwithIdkDataset(QADataset):
    """QADataset variant pairing each sample with a random "I don't know" reply.

    Each item is ``{"original": ..., "alternate": ...}``; when
    ``return_original`` is False only the alternate (IDK) sample is returned.
    """

    def __init__(self, idk_path, return_original=True, *args, **kwargs):
        self.idk_path = idk_path
        self.return_original = return_original
        # Read the responses once and close the file (old code leaked the handle).
        with open(self.idk_path, "r") as f:
            self.idk_responses = f.readlines()
        super().__init__(*args, **kwargs)

    def item_with_idk(self, question):
        """Pair *question* with a uniformly sampled IDK response."""
        rand_pos = torch.randint(0, len(self.idk_responses), (1,)).item()
        idk_response = self.idk_responses[rand_pos].strip()
        return self._process_sample(question=question, answer=idk_response)

    def __getitem__(self, idx):
        item = super().__getitem__(idx)
        question = self.data[idx][self.question_key]
        if isinstance(item, (list, tuple)):
            # Defensive branch: QADataset.__getitem__ currently always returns
            # a dict. The previous implementation rebound the accumulator on
            # every pass, discarding all but the last pair; build it properly.
            pairs = [
                {"original": sample, "alternate": self.item_with_idk(question)}
                for sample in item
            ]
            return (
                pairs
                if self.return_original
                else [pair["alternate"] for pair in pairs]
            )
        pair = {"original": item, "alternate": self.item_with_idk(question)}
        return pair if self.return_original else pair["alternate"]
107 |
108 |
class QAwithAlternateDataset(QADataset):
    """QADataset variant pairing each sample with an alternate answer column.

    Each item is ``{"original": ..., "alternate": ...}``; when
    ``return_original`` is False only the alternate sample is returned.
    """

    def __init__(self, alternate_key, return_original=True, *args, **kwargs):
        self.alternate_key = alternate_key
        self.return_original = return_original
        super().__init__(*args, **kwargs)

    def _alternate_sample(self, idx, question):
        """Build a sample using the alternate answer stored on row *idx*."""
        return self._process_sample(
            question=question, answer=self.data[idx][self.alternate_key]
        )

    def __getitem__(self, idx):
        item = super().__getitem__(idx)
        question = self.data[idx][self.question_key]
        if isinstance(item, (list, tuple)):
            # Defensive branch: QADataset.__getitem__ currently always returns
            # a dict. The previous implementation rebound the accumulator on
            # every pass, discarding all but the last pair; build it properly.
            pairs = [
                {"original": sample, "alternate": self._alternate_sample(idx, question)}
                for sample in item
            ]
            return (
                pairs
                if self.return_original
                else [pair["alternate"] for pair in pairs]
            )
        pair = {"original": item, "alternate": self._alternate_sample(idx, question)}
        return pair if self.return_original else pair["alternate"]
135 |
--------------------------------------------------------------------------------
/src/data/unlearn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 |
4 |
class ForgetRetainDataset(Dataset):
    # https://github.com/OPTML-Group/SOUL/blob/main/src/dataset/Base.py
    def __init__(self, forget, retain, anchor="forget"):
        """Wraps the forget and retain datasets into one unlearning dataset.

        Args:
            forget (Dataset): Forget Dataset
            retain (Dataset): Retain Dataset
            anchor (str, optional): Specifies which dataset to anchor while
                randomly sampling from the other dataset. Defaults to 'forget'.
        """
        self.forget = forget
        self.retain = retain
        self.anchor = anchor

    def __len__(self):
        """Length of the anchor dataset.

        Raises:
            ValueError: if the anchor dataset is None.
            NotImplementedError: if anchor is neither 'forget' nor 'retain'.
        """
        if self.anchor == "forget":
            if self.forget is None:
                # `assert` is stripped under `python -O`; raise explicitly.
                raise ValueError("forget dataset can't be None when anchor=forget")
            return len(self.forget)
        if self.anchor == "retain":
            if self.retain is None:
                raise ValueError("retain dataset can't be None when anchor=retain")
            return len(self.retain)
        raise NotImplementedError(f"{self.anchor} can be only forget or retain")

    def __getitem__(self, idx):
        """Anchor-indexed sample plus a uniformly random one from the other split."""
        item = {}
        if self.anchor == "forget":
            item["forget"] = self.forget[idx]
            if self.retain:
                retain_idx = torch.randint(0, len(self.retain), (1,)).item()
                item["retain"] = self.retain[retain_idx]
        elif self.anchor == "retain":
            item["retain"] = self.retain[idx]
            if self.forget:
                forget_idx = torch.randint(0, len(self.forget), (1,)).item()
                item["forget"] = self.forget[forget_idx]
        return item
47 |
--------------------------------------------------------------------------------
/src/eval.py:
--------------------------------------------------------------------------------
1 | import hydra
2 | from omegaconf import DictConfig
3 |
4 | from trainer.utils import seed_everything
5 | from model import get_model
6 | from evals import get_evaluators
7 |
8 |
@hydra.main(version_base=None, config_path="../configs", config_name="eval.yaml")
def main(cfg: DictConfig):
    """Entry point of the code to evaluate models.

    Args:
        cfg (DictConfig): Config used for evaluation (model + eval suites).
    """
    seed_everything(cfg.seed)
    model_cfg = cfg.model
    # Validate before dereferencing: the original read `template_args` first,
    # so a missing model config raised AttributeError and the assert was dead.
    assert model_cfg is not None, "Invalid model yaml passed in eval config."
    template_args = model_cfg.template_args
    model, tokenizer = get_model(model_cfg)

    eval_cfgs = cfg.eval
    evaluators = get_evaluators(eval_cfgs)
    # The same model/tokenizer context is passed to every evaluator, so build
    # the argument dict once instead of per iteration.
    eval_args = {
        "template_args": template_args,
        "model": model,
        "tokenizer": tokenizer,
    }
    for evaluator in evaluators.values():
        _ = evaluator.evaluate(**eval_args)
30 |
31 |
# Hydra-decorated entry point; config overrides are taken from the CLI.
if __name__ == "__main__":
    main()
34 |
--------------------------------------------------------------------------------
/src/evals/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Any
2 | from omegaconf import DictConfig
3 | from evals.tofu import TOFUEvaluator
4 | from evals.muse import MUSEEvaluator
5 | from evals.lm_eval import LMEvalEvaluator
6 |
# Maps evaluator class names (e.g. "TOFUEvaluator") to their classes.
EVALUATOR_REGISTRY: Dict[str, Any] = {}
8 |
9 |
def _register_evaluator(evaluator_class):
    """Register an evaluator class in the global registry, keyed by its class name."""
    class_name = evaluator_class.__name__
    EVALUATOR_REGISTRY.update({class_name: evaluator_class})
12 |
13 |
def get_evaluator(name: str, eval_cfg: DictConfig, **kwargs):
    """Instantiate the evaluator configured for `name`.

    Args:
        name: Display name of the evaluation suite (used in error messages).
        eval_cfg: Config containing a `handler` key naming a registered class.
        **kwargs: Extra arguments forwarded to the evaluator constructor.

    Raises:
        ValueError: If the config does not set a handler.
        NotImplementedError: If the handler is not registered.
    """
    evaluator_handler_name = eval_cfg.get("handler")
    # `assert cond, ValueError(...)` never raises the ValueError and is
    # stripped entirely under `python -O`; raise explicitly instead.
    if evaluator_handler_name is None:
        raise ValueError(f"{name} handler not set")
    eval_handler = EVALUATOR_REGISTRY.get(evaluator_handler_name)
    if eval_handler is None:
        raise NotImplementedError(
            f"{evaluator_handler_name} not implemented or not registered"
        )
    return eval_handler(eval_cfg, **kwargs)
23 |
24 |
def get_evaluators(eval_cfgs: DictConfig, **kwargs):
    """Build a name -> evaluator mapping from the per-suite configs."""
    return {
        eval_name: get_evaluator(eval_name, eval_cfg, **kwargs)
        for eval_name, eval_cfg in eval_cfgs.items()
    }
30 |
31 |
# Register your benchmark evaluators here (runs once at module import time).
_register_evaluator(TOFUEvaluator)
_register_evaluator(MUSEEvaluator)
_register_evaluator(LMEvalEvaluator)
36 |
--------------------------------------------------------------------------------
/src/evals/base.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | from evals.metrics import get_metrics
5 |
6 | logger = logging.getLogger("evaluator")
7 |
8 |
class Evaluator:
    """Runs a configured suite of metrics over a model, caching fine-grained
    results and an aggregated summary as JSON files."""

    def __init__(self, name, eval_cfg, **kwargs):
        """Load the metric functions declared in `eval_cfg.metrics`.

        Args:
            name: Name of the evaluation suite (used in file names and logs).
            eval_cfg: Config with `metrics`, `output_dir` and `overwrite` keys.
        """
        self.name = name
        self.eval_cfg = eval_cfg
        self.metrics_cfg = self.eval_cfg.metrics
        self.metrics = self.load_metrics(self.metrics_cfg)
        logger.info(
            f"Evaluations stored in the experiment directory: {self.eval_cfg.output_dir}"
        )

    def get_logs_file_path(self, output_dir, suffix="EVAL"):
        """Returns the path to json file to store results"""
        logs_filename = os.path.join(output_dir, f"{self.name}_{suffix}.json")
        return logs_filename

    def load_logs_from_file(self, file):
        """Returns the cache of existing results (empty dict if file is absent)."""
        logs = {}
        if os.path.exists(file):
            logger.info(f"Loading existing evaluations from {file}")
            with open(file, "r") as f:
                logs = json.load(f)
        return logs

    def save_logs(self, logs, file):
        """Save the logs in a json file, sorted by metric name."""
        logs = dict(sorted(logs.items()))
        os.makedirs(os.path.dirname(file), exist_ok=True)
        try:
            with open(file, "w") as f:
                json.dump(logs, f, indent=4)
        except Exception as e:
            # Chain the original cause for easier debugging.
            raise RuntimeError(f"Failed to save {file}: {e}") from e

    def prepare_model(self, model):
        """Prepare model for evaluation (inference mode)."""
        model.eval()
        return model

    def load_metrics(self, metrics_cfg):
        """Load metrics for evaluation"""
        metrics = get_metrics(metrics_cfg)
        return metrics

    def summarize(self, logs):
        """Reduce detailed metric logs to a {metric_name: agg_value} summary,
        keeping only metrics configured for this suite."""
        metric_summary = {}
        for metric_name, metric_results in logs.items():
            if metric_name not in self.metrics:
                continue
            agg_value = metric_results.get("agg_value", None)
            if agg_value is not None:
                metric_summary[metric_name] = agg_value
        return metric_summary

    def evaluate(self, model, output_dir=None, overwrite=None, **kwargs):
        """Run every configured metric, reusing cached results unless overwriting.

        Returns:
            dict: Aggregated {metric_name: agg_value} summary.
        """
        # set flag to overwrite metrics
        overwrite = self.eval_cfg.overwrite if overwrite is None else overwrite

        # Prepare model for evaluation
        model = self.prepare_model(model)

        # Set output_dir and file to store results
        output_dir = output_dir if output_dir else self.eval_cfg.output_dir
        logs_file_path = self.get_logs_file_path(output_dir)
        summary_file_path = self.get_logs_file_path(output_dir, suffix="SUMMARY")

        # Load existing results from file if any.
        logs = self.load_logs_from_file(logs_file_path) if not overwrite else {}

        logger.info(f"***** Running {self.name} evaluation suite *****")
        logger.info(f"Fine-grained evaluations will be saved to: {logs_file_path}")
        logger.info(
            f"Aggregated evaluations will be summarised in: {summary_file_path}"
        )
        # Only these two context args are forwarded to metric functions; they
        # are loop-invariant, so build them once instead of per metric (the
        # original also clobbered the incoming kwargs inside the loop).
        metric_kwargs = {
            "tokenizer": kwargs.get("tokenizer", None),
            "template_args": kwargs.get("template_args", None),
        }
        for metric_name, metric_fn in self.metrics.items():
            if not overwrite and metric_name in logs and logs[metric_name]:
                logger.info(f"Skipping {metric_name}, already evaluated.")
                if "agg_value" in logs[metric_name]:
                    logger.info(
                        f"Result for metric {metric_name}:\t{logs[metric_name]['agg_value']}"
                    )
                self.save_logs(self.summarize(logs), summary_file_path)
                continue
            _ = logs.pop(metric_name, None)  # overwriting existing evals if present
            metrics_args = self.eval_cfg.metrics[metric_name]
            # Metrics write their detailed results into `logs` via the cache arg.
            result = metric_fn(
                model,
                metric_name=metric_name,
                cache=logs,
                **metric_kwargs,
                **metrics_args,
            )
            if "agg_value" in result:
                logger.info(f"Result for metric {metric_name}:\t{result['agg_value']}")
            self.save_logs(logs, logs_file_path)
            self.save_logs(self.summarize(logs), summary_file_path)

        return self.summarize(logs)
113 |
--------------------------------------------------------------------------------
/src/evals/lm_eval.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from omegaconf import OmegaConf
3 |
4 | from lm_eval.models.hf_vlms import HFLM
5 | from lm_eval.tasks import TaskManager
6 | from lm_eval import simple_evaluate
7 |
8 | from evals.base import Evaluator
9 |
10 |
11 | logger = logging.getLogger("evaluator")
12 |
13 |
class LMEvalEvaluator(Evaluator):
    """Evaluator that delegates benchmark execution to lm-eval-harness
    (`simple_evaluate`), storing per-task samples and a flattened summary."""

    def __init__(self, eval_cfg, **kwargs):
        self.name = "LMEval"
        self.eval_cfg = eval_cfg
        # Resolve the task list eagerly so missing interpolations fail here.
        self.tasks = OmegaConf.to_container(
            self.eval_cfg.tasks, resolve=True, throw_on_missing=True
        )
        self.task_manager = TaskManager()
        self.simple_evaluate_args = dict(kwargs.get("simple_evaluate_args", {}))

    def prepare_model(self, model, **kwargs):
        """Prepare model for evaluation (wrap in the harness's HFLM adapter)."""
        model.eval()
        return HFLM(model)

    @staticmethod
    def _collect_metrics(prefix: str, metrics: dict) -> dict:
        """Flatten a metrics dict into `{prefix}/{metric}` keys: drop 'alias'
        entries, strip ',none'-style suffixes, coerce to float when possible."""
        summary = {}
        for metric_name, value in metrics.items():
            if metric_name == "alias":
                continue
            base = metric_name.split(",", 1)[0].strip()
            key = f"{prefix}/{base}"
            try:
                summary[key] = float(value)
            except (TypeError, ValueError):
                summary[key] = value
        return summary

    def summarize(self, eval_results: dict, task_name: str) -> dict:
        """
        Summarize evaluation metrics from lm_eval.simple_evaluate.
        - If task_name is a group, return only aggregated group-level metrics.
        - If it's a single task, return per-task metrics from 'results'.
        - Always exclude 'alias' entries and strip ',none' suffixes.
        """
        # Groups (e.g. 'mmlu') report under 'groups'; single tasks under
        # 'results'. The flattening logic was duplicated in both branches,
        # so it now lives in _collect_metrics.
        if task_name in self.task_manager.all_groups:
            metrics = eval_results.get("groups", {}).get(task_name, {})
        else:
            metrics = eval_results.get("results", {}).get(task_name, {})
        return self._collect_metrics(task_name, metrics)

    def get_task_name(self, task):
        """Extract the task name from a task entry (a plain string, or a dict
        carrying a 'task' key).

        Raises:
            ValueError: For any other format.
        """
        if isinstance(task, str):
            return task
        if isinstance(task, dict) and "task" in task:
            return task.get("task")
        raise ValueError(f"Invalid task format: {task}")

    def evaluate(self, model, output_dir=None, overwrite=None, **kwargs):
        """Run each configured task through `simple_evaluate`, caching per-task
        samples (logs) and flattened metrics (summary) as JSON."""
        # set flag to overwrite metrics
        overwrite = self.eval_cfg.overwrite if overwrite is None else overwrite

        # Prepare model for evaluation
        kwargs = {"tokenizer": kwargs.get("tokenizer", None)}
        model = self.prepare_model(model, **kwargs)

        # Set output_dir and file to store results
        output_dir = output_dir if output_dir else self.eval_cfg.output_dir
        logs_file_path = self.get_logs_file_path(output_dir)
        summary_file_path = self.get_logs_file_path(output_dir, suffix="SUMMARY")

        # Load existing results from file if any.
        logs = self.load_logs_from_file(logs_file_path) if not overwrite else {}
        summary = self.load_logs_from_file(summary_file_path) if not overwrite else {}

        logger.info(f"***** Running {self.name} evaluation suite *****")
        logger.info(f"Fine-grained evaluations will be saved to: {logs_file_path}")
        logger.info(
            f"Aggregated evaluations will be summarised in: {summary_file_path}"
        )

        for task in self.tasks:
            task_name = self.get_task_name(task)
            if not overwrite and task_name in logs and logs[task_name]:
                logger.info(f"Skipping {task_name}, already evaluated.")
                continue
            _ = logs.pop(task_name, None)  # overwriting existing evals if present
            results = simple_evaluate(
                model=model,
                tasks=[task],
                task_manager=self.task_manager,
                **self.simple_evaluate_args,
            )
            logs.update({task_name: results["samples"]})
            summary.update(self.summarize(results, task_name))
            self.save_logs(logs, logs_file_path)
            self.save_logs(summary, summary_file_path)
        return summary
116 |
--------------------------------------------------------------------------------
/src/evals/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 | from omegaconf import DictConfig
3 | from evals.metrics.base import UnlearningMetric
4 | from evals.metrics.memorization import (
5 | probability,
6 | probability_w_options,
7 | rouge,
8 | truth_ratio,
9 | extraction_strength,
10 | exact_memorization,
11 | )
12 | from evals.metrics.privacy import ks_test, privleak, rel_diff
13 | from evals.metrics.mia import (
14 | mia_loss,
15 | mia_min_k,
16 | mia_min_k_plus_plus,
17 | mia_gradnorm,
18 | mia_zlib,
19 | mia_reference,
20 | )
21 | from evals.metrics.utility import (
22 | hm_aggregate,
23 | classifier_prob,
24 | )
25 |
# Maps metric handler names to their UnlearningMetric implementations.
METRICS_REGISTRY: Dict[str, UnlearningMetric] = {}
27 |
28 |
def _register_metric(metric):
    """Add a metric to the global registry under its declared name."""
    handler_name = metric.name
    METRICS_REGISTRY.update({handler_name: metric})
31 |
32 |
33 | def _get_single_metric(name: str, metric_cfg, **kwargs):
34 | metric_handler_name = metric_cfg.get("handler")
35 | assert metric_handler_name is not None, ValueError(f"{name} handler not set")
36 | metric = METRICS_REGISTRY.get(metric_handler_name)
37 | if metric is None:
38 | raise NotImplementedError(
39 | f"{metric_handler_name} not implemented or not registered"
40 | )
41 | pre_compute_cfg = metric_cfg.get("pre_compute", {})
42 | pre_compute_metrics = get_metrics(pre_compute_cfg, **kwargs)
43 | metric.set_pre_compute_metrics(pre_compute_metrics)
44 | return metric
45 |
46 |
def get_metrics(metric_cfgs: DictConfig, **kwargs):
    """Instantiate every metric declared in the config mapping."""
    return {
        metric_name: _get_single_metric(metric_name, metric_cfg, **kwargs)
        for metric_name, metric_cfg in metric_cfgs.items()
    }
52 |
53 |
# Register metrics here (runs once at module import time)
_register_metric(probability)
_register_metric(probability_w_options)
_register_metric(rouge)
_register_metric(truth_ratio)
_register_metric(ks_test)
_register_metric(hm_aggregate)
_register_metric(privleak)
_register_metric(rel_diff)
_register_metric(exact_memorization)
_register_metric(extraction_strength)

# Register MIA metrics
_register_metric(mia_loss)
_register_metric(mia_min_k)
_register_metric(mia_min_k_plus_plus)
_register_metric(mia_gradnorm)
_register_metric(mia_zlib)
_register_metric(mia_reference)

# Register Utility metrics
_register_metric(classifier_prob)
76 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Attack implementations.
3 | """
4 |
5 | from transformers import AutoModelForCausalLM
6 |
7 | from evals.metrics.base import unlearning_metric
8 | from evals.metrics.mia.loss import LOSSAttack
9 | from evals.metrics.mia.min_k import MinKProbAttack
10 | from evals.metrics.mia.min_k_plus_plus import MinKPlusPlusAttack
11 | from evals.metrics.mia.gradnorm import GradNormAttack
12 | from evals.metrics.mia.zlib import ZLIBAttack
13 | from evals.metrics.mia.reference import ReferenceAttack
14 |
15 | from evals.metrics.mia.utils import mia_auc
16 | import logging
17 |
18 | logger = logging.getLogger("metrics")
19 |
20 | ## NOTE: all MIA attack statistics are signed as required in order to show the
21 | # same trends as loss (higher the score on an example, less likely the membership)
22 |
23 |
@unlearning_metric(name="mia_loss")
def mia_loss(model, **kwargs):
    """AUC for the LOSS membership-inference attack."""
    attack_kwargs = {
        "data": kwargs["data"],
        "collator": kwargs["collators"],
        "batch_size": kwargs["batch_size"],
    }
    return mia_auc(LOSSAttack, model, **attack_kwargs)
33 |
34 |
@unlearning_metric(name="mia_min_k")
def mia_min_k(model, **kwargs):
    """AUC for the Min-k% Prob membership-inference attack."""
    attack_kwargs = {
        "data": kwargs["data"],
        "collator": kwargs["collators"],
        "batch_size": kwargs["batch_size"],
        "k": kwargs["k"],
    }
    return mia_auc(MinKProbAttack, model, **attack_kwargs)
45 |
46 |
@unlearning_metric(name="mia_min_k_plus_plus")
def mia_min_k_plus_plus(model, **kwargs):
    """AUC for the Min-K%++ membership-inference attack."""
    attack_kwargs = {
        "data": kwargs["data"],
        "collator": kwargs["collators"],
        "batch_size": kwargs["batch_size"],
        "k": kwargs["k"],
    }
    return mia_auc(MinKPlusPlusAttack, model, **attack_kwargs)
57 |
58 |
@unlearning_metric(name="mia_gradnorm")
def mia_gradnorm(model, **kwargs):
    """AUC for the gradient-norm membership-inference attack."""
    attack_kwargs = {
        "data": kwargs["data"],
        "collator": kwargs["collators"],
        "batch_size": kwargs["batch_size"],
        "p": kwargs["p"],
    }
    return mia_auc(GradNormAttack, model, **attack_kwargs)
69 |
70 |
@unlearning_metric(name="mia_zlib")
def mia_zlib(model, **kwargs):
    """AUC for the zlib-entropy-normalized membership-inference attack."""
    attack_kwargs = {
        "data": kwargs["data"],
        "collator": kwargs["collators"],
        "batch_size": kwargs["batch_size"],
        # Tokenizer is optional; the attack falls back to the model's own.
        "tokenizer": kwargs.get("tokenizer"),
    }
    return mia_auc(ZLIBAttack, model, **attack_kwargs)
81 |
82 |
@unlearning_metric(name="mia_reference")
def mia_reference(model, **kwargs):
    """AUC for the reference-model attack; loads the reference model from disk
    with the target model's dtype and device."""
    if "reference_model_path" not in kwargs:
        raise ValueError("Reference model must be provided in kwargs")
    ref_path = kwargs["reference_model_path"]
    logger.info(f"Loading reference model from {ref_path}")
    reference_model = AutoModelForCausalLM.from_pretrained(
        ref_path,
        torch_dtype=model.dtype,
        device_map={"": model.device},
    )
    attack_kwargs = {
        "data": kwargs["data"],
        "collator": kwargs["collators"],
        "batch_size": kwargs["batch_size"],
        "reference_model": reference_model,
    }
    return mia_auc(ReferenceAttack, model, **attack_kwargs)
101 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/all_attacks.py:
--------------------------------------------------------------------------------
1 | """
2 | Enum class for attacks. Also contains the base attack class.
3 | """
4 |
5 | from enum import Enum
6 | from torch.utils.data import DataLoader
7 | import numpy as np
8 | from tqdm import tqdm
9 |
10 |
11 | # Attack definitions
# Attack definitions
class AllAttacks(str, Enum):
    """String identifiers for the supported membership-inference attacks."""

    LOSS = "loss"
    REFERENCE_BASED = "ref"
    ZLIB = "zlib"
    MIN_K = "min_k"
    MIN_K_PLUS_PLUS = "min_k++"
    GRADNORM = "gradnorm"
    # NOTE(review): "recall" has no entry in get_attacker's mapping — presumably
    # not implemented yet; verify before relying on it.
    RECALL = "recall"
20 |
21 |
22 | # Base attack class
# Base attack class
class Attack:
    """Template for MIA attacks: subclasses implement per-batch statistics
    (`compute_batch_values`) and per-sample scoring (`compute_score`)."""

    def __init__(self, model, data, collator, batch_size, **kwargs):
        """Initialize attack with model and create dataloader."""
        self.model = model
        self.dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator)
        self.setup(**kwargs)

    def setup(self, **kwargs):
        """Setup attack-specific parameters (no-op by default)."""
        pass

    def compute_batch_values(self, batch):
        """Process a batch through model to get needed statistics."""
        raise NotImplementedError

    def compute_score(self, sample_stats):
        """Compute MIA score for a single sample."""
        raise NotImplementedError

    def attack(self):
        """Run the full MIA attack over the dataloader and aggregate scores."""
        scores = []
        indices = []

        for batch in tqdm(self.dataloader, total=len(self.dataloader)):
            # The collator is expected to attach an "index" field per sample.
            batch_indices = batch.pop("index").cpu().numpy().tolist()
            batch_values = self.compute_batch_values(batch)
            scores.extend(self.compute_score(values) for values in batch_values)
            indices.extend(batch_indices)

        scores_by_index = {
            str(sample_idx): {"score": float(sample_score)}
            for sample_idx, sample_score in zip(indices, scores)
        }

        return {
            "agg_value": float(np.mean(scores)),
            "value_by_index": scores_by_index,
        }
64 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/gradnorm.py:
--------------------------------------------------------------------------------
1 | """
2 | Gradient-norm attack. Proposed for MIA in multiple settings, and particularly
3 | experimented for pre-training data and LLMs in https://arxiv.org/abs/2402.17012
4 | """
5 |
6 | import torch
7 | from evals.metrics.mia.all_attacks import Attack
8 | from evals.metrics.utils import tokenwise_logprobs
9 |
10 |
11 | # DO NOT use gradnorm in a way so that it runs when your accumulated gradients during training aren't used yet
12 | # gradnorm zeros out the gradients of the model during its computation
class GradNormAttack(Attack):
    """MIA scoring each sample by the mean p-norm of the per-sample loss
    gradients w.r.t. the model parameters."""

    def setup(self, p, **kwargs):
        # Only L1, L2 and L-infinity norms are supported.
        if p not in [1, 2, float("inf")]:
            raise ValueError(f"Invalid p-norm value: {p}")
        self.p = p

    def compute_batch_values(self, batch):
        """Compute gradients of examples w.r.t model parameters. More grad norm => more loss."""
        self.model.train()
        batch_log_probs = tokenwise_logprobs(self.model, batch, grad=True)
        # Per-sample loss = negative mean token log-probability.
        batch_loss = [-torch.mean(lps) for lps in batch_log_probs]
        batch_grad_norms = []
        for sample_loss in batch_loss:
            sample_grad_norms = []
            # Clear gradients before each per-sample backward pass
            # (this is why the module warns against stale accumulated grads).
            self.model.zero_grad()
            sample_loss.backward()
            for param in self.model.parameters():
                if param.grad is not None:
                    sample_grad_norms.append(param.grad.detach().norm(p=self.p))
            # Mean of per-parameter norms, one scalar per sample.
            batch_grad_norms.append(torch.stack(sample_grad_norms).mean())
        self.model.eval()
        return batch_grad_norms

    def compute_score(self, sample_stats):
        """Return the gradient norm (as a float32 CPU value) as the attack score.

        NOTE(review): the previous docstring claimed the norm was negated, but
        no negation happens here — higher gradient norm means higher score.
        """
        return sample_stats.cpu().to(torch.float32).numpy()
39 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/loss.py:
--------------------------------------------------------------------------------
1 | """
2 | Straight-forward LOSS attack, as described in https://ieeexplore.ieee.org/abstract/document/8429311
3 | """
4 |
5 | from evals.metrics.mia.all_attacks import Attack
6 | from evals.metrics.utils import evaluate_probability
7 |
8 |
class LOSSAttack(Attack):
    """MIA that scores each sample by its average loss under the target model."""

    def compute_batch_values(self, batch):
        """Run the model over the batch and collect per-sample probability/loss stats."""
        return evaluate_probability(self.model, batch)

    def compute_score(self, sample_stats):
        """The attack score is simply the sample's average loss."""
        avg_loss = sample_stats["avg_loss"]
        return avg_loss
17 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/min_k.py:
--------------------------------------------------------------------------------
1 | """
2 | Min-k % Prob Attack: https://arxiv.org/pdf/2310.16789.pdf
3 | """
4 |
5 | import numpy as np
6 | from evals.metrics.mia.all_attacks import Attack
7 | from evals.metrics.utils import tokenwise_logprobs
8 |
9 |
class MinKProbAttack(Attack):
    """Min-k% Prob attack: the score is the negated mean of the k% smallest
    token log-probabilities of a sample."""

    def setup(self, k=0.2, **kwargs):
        # Fraction of lowest-probability tokens averaged into the score.
        self.k = k

    def compute_batch_values(self, batch):
        """Get token-wise log probabilities for the batch."""
        return tokenwise_logprobs(self.model, batch, grad=False)

    def compute_score(self, sample_stats):
        """Score a single sample with the min-k% negative log-prob statistic."""
        log_probs = sample_stats.cpu().numpy()
        if log_probs.size == 0:
            # Empty sequence: nothing to score.
            return 0

        num_lowest = max(1, int(len(log_probs) * self.k))
        lowest = np.sort(log_probs)[:num_lowest]
        return -np.mean(lowest)
27 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/min_k_plus_plus.py:
--------------------------------------------------------------------------------
1 | import torch as torch
2 | import numpy as np
3 | from evals.metrics.mia.min_k import MinKProbAttack
4 | from evals.metrics.utils import tokenwise_vocab_logprobs, tokenwise_logprobs
5 |
6 |
class MinKPlusPlusAttack(MinKProbAttack):
    """Min-k% variant that z-normalizes each token's log-prob against the
    mean/variance of the model's full vocabulary distribution at that position."""

    def compute_batch_values(self, batch):
        """Get both token-wise and vocab-wise log probabilities for the batch."""
        vocab_log_probs = tokenwise_vocab_logprobs(self.model, batch, grad=False)
        token_log_probs = tokenwise_logprobs(self.model, batch, grad=False)
        return [
            {"vocab_log_probs": vlp, "token_log_probs": tlp}
            for vlp, tlp in zip(vocab_log_probs, token_log_probs)
        ]

    def compute_score(self, sample_stats):
        """Score using min-k negative log probs scores with vocab-wise normalization."""
        all_probs = sample_stats["vocab_log_probs"]
        target_prob = sample_stats["token_log_probs"]

        # Empty sequence: nothing to score.
        if len(target_prob) == 0:
            return 0

        # Compute normalized scores using vocab distribution:
        # mu = E[log p] and sigma = Var[log p] under the per-position
        # vocabulary distribution (probabilities recovered via exp).
        mu = (torch.exp(all_probs) * all_probs).sum(-1)
        sigma = (torch.exp(all_probs) * torch.square(all_probs)).sum(-1) - torch.square(
            mu
        )

        # Handle numerical stability: variance can underflow to <= 0.
        sigma = torch.clamp(sigma, min=1e-6)
        scores = (target_prob.cpu().numpy() - mu.cpu().numpy()) / torch.sqrt(
            sigma
        ).cpu().numpy()

        # Take bottom k% as the attack score
        num_k = max(1, int(len(scores) * self.k))
        return -np.mean(sorted(scores)[:num_k])
40 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/reference.py:
--------------------------------------------------------------------------------
1 | """
2 | Reference-based attacks.
3 | """
4 |
5 | from evals.metrics.mia.all_attacks import Attack
6 | from evals.metrics.utils import evaluate_probability
7 |
8 |
class ReferenceAttack(Attack):
    """MIA comparing the target model's loss against a reference model's loss."""

    def setup(self, reference_model, **kwargs):
        """Store the reference model used for calibration."""
        self.reference_model = reference_model

    def compute_batch_values(self, batch):
        """Evaluate per-sample losses under both the reference and target models."""
        ref_results = evaluate_probability(self.reference_model, batch)
        target_results = evaluate_probability(self.model, batch)
        paired = zip(target_results, ref_results)
        return [
            {"target_loss": tgt["avg_loss"], "ref_loss": ref["avg_loss"]}
            for tgt, ref in paired
        ]

    def compute_score(self, sample_stats):
        """The score is the target loss minus the reference loss."""
        return sample_stats["target_loss"] - sample_stats["ref_loss"]
26 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/utils.py:
--------------------------------------------------------------------------------
1 | from evals.metrics.mia.all_attacks import AllAttacks
2 | from evals.metrics.mia.loss import LOSSAttack
3 | from evals.metrics.mia.reference import ReferenceAttack
4 | from evals.metrics.mia.zlib import ZLIBAttack
5 | from evals.metrics.mia.min_k import MinKProbAttack
6 | from evals.metrics.mia.min_k_plus_plus import MinKPlusPlusAttack
7 | from evals.metrics.mia.gradnorm import GradNormAttack
8 |
9 | from sklearn.metrics import roc_auc_score
10 |
11 |
12 | import numpy as np
13 |
14 |
def get_attacker(attack: str):
    """Map an attack identifier to its implementation class.

    Raises:
        ValueError: If the identifier is unknown.
    """
    mapping = {
        AllAttacks.LOSS: LOSSAttack,
        AllAttacks.REFERENCE_BASED: ReferenceAttack,
        AllAttacks.ZLIB: ZLIBAttack,
        AllAttacks.MIN_K: MinKProbAttack,
        AllAttacks.MIN_K_PLUS_PLUS: MinKPlusPlusAttack,
        AllAttacks.GRADNORM: GradNormAttack,
    }
    try:
        return mapping[attack]
    except KeyError:
        raise ValueError(f"Attack {attack} not found") from None
28 |
29 |
def mia_auc(attack_cls, model, data, collator, batch_size, **kwargs):
    """
    Compute the MIA AUC and accuracy.

    Parameters:
    - attack_cls: the attack class to use.
    - model: the target model.
    - data: a dict with keys "forget" and "holdout".
    - collator: data collator.
    - batch_size: batch size.
    - kwargs: additional optional parameters (e.g. k, p, tokenizer, reference_model).

    Returns a dict containing the attack outputs, including "acc" and "auc".

    Note on convention: auc is 1 when the forget data is much more likely than the holdout data
    """
    # Shared constructor arguments for both attack runs.
    common_args = {
        "model": model,
        "collator": collator,
        "batch_size": batch_size,
        **kwargs,
    }

    output = {
        split: attack_cls(data=data[split], **common_args).attack()
        for split in ("forget", "holdout")
    }

    def scores_of(split):
        return [entry["score"] for entry in output[split]["value_by_index"].values()]

    forget_scores = scores_of("forget")
    holdout_scores = scores_of("holdout")
    scores = np.array(forget_scores + holdout_scores)
    # Label convention (see note above): forget=0, holdout=1.
    labels = np.array([0] * len(forget_scores) + [1] * len(holdout_scores))
    auc_value = roc_auc_score(labels, scores)
    output["auc"] = output["agg_value"] = auc_value
    return output
71 |
--------------------------------------------------------------------------------
/src/evals/metrics/mia/zlib.py:
--------------------------------------------------------------------------------
1 | """
2 | zlib-normalization Attack: https://www.usenix.org/system/files/sec21-carlini-extracting.pdf
3 | """
4 |
5 | import zlib
6 |
7 | from evals.metrics.mia.all_attacks import Attack
8 | from evals.metrics.utils import (
9 | evaluate_probability,
10 | extract_target_texts_from_processed_data,
11 | )
12 |
13 |
14 | class ZLIBAttack(Attack):
15 | def setup(self, tokenizer=None, **kwargs):
16 | """Setup tokenizer."""
17 | self.tokenizer = tokenizer or self.model.tokenizer
18 |
19 | def compute_batch_values(self, batch):
20 | """Get loss and text for batch."""
21 | eval_results = evaluate_probability(self.model, batch)
22 | texts = extract_target_texts_from_processed_data(self.tokenizer, batch)
23 | return [{"loss": r["avg_loss"], "text": t} for r, t in zip(eval_results, texts)]
24 |
25 | def compute_score(self, sample_stats):
26 | """Score using loss normalized by compressed text length."""
27 | text = sample_stats["text"]
28 | zlib_entropy = len(zlib.compress(text.encode("utf-8")))
29 | return sample_stats["loss"] / zlib_entropy
30 |
--------------------------------------------------------------------------------
/src/evals/metrics/privacy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.stats import ks_2samp
3 | from evals.metrics.base import unlearning_metric, logger
4 |
5 |
@unlearning_metric(name="ks_test")
def ks_test(model, **kwargs):
    """Compare two forget and retain model distributions with a 2-sample KS-test and report the p-value.
    Used in the TOFU benchmark as forget_quality when computed over the truth_ratio statistic."""
    forget_entries = kwargs["pre_compute"]["forget"]["value_by_index"].values()
    forget_tr_stats = np.array([entry["score"] for entry in forget_entries])

    reference_logs = kwargs.get("reference_logs", None)
    if not reference_logs:
        # Without retain-model statistics the test cannot be computed.
        logger.warning(
            "retain_model_logs not provided in reference_logs, setting forget_quality to None"
        )
        return {"agg_value": None}

    retain_entries = reference_logs["retain_model_logs"]["retain"][
        "value_by_index"
    ].values()
    retain_tr_stats = np.array([entry["score"] for entry in retain_entries])
    fq = ks_2samp(forget_tr_stats, retain_tr_stats)
    return {"agg_value": fq.pvalue}
32 | return {"agg_value": pvalue}
33 |
34 |
@unlearning_metric(name="privleak")
def privleak(model, **kwargs):
    """Compare forget and retain model scores via a relative difference of a single statistic.
    Used for MIA AUC scores to ensure consistency and reproducibility of the MUSE benchmark.
    Similar to `rel_diff` below, but because the MUSE benchmark reports AUC scores as (1-x)
    rather than the conventional x, both scores are flipped here before comparison."""
    score = kwargs["pre_compute"]["forget"]["agg_value"]
    try:
        ref = kwargs["reference_logs"]["retain_model_logs"]["retain"]["agg_value"]
    except (KeyError, TypeError):
        # Missing or None reference logs: fall back to the configured default
        # (narrowed from a bare `except Exception`, which hid unrelated bugs).
        logger.warning(
            f"retain_model_logs evals not provided for privleak, using default retain auc of {kwargs['ref_value']}"
        )
        ref = kwargs["ref_value"]
    # MUSE convention: AUCs are reported as (1 - x).
    score = 1 - score
    ref = 1 - ref
    return {"agg_value": (score - ref) / (ref + 1e-10) * 100}
53 |
54 |
@unlearning_metric(name="rel_diff")
def rel_diff(model, **kwargs):
    """Compare two forget and retain model scores using a relative comparison of a single statistic."""
    score = kwargs["pre_compute"]["forget"]["agg_value"]
    try:
        ref = kwargs["reference_logs"]["retain_model_logs"]["retain"]["agg_value"]
    except (KeyError, TypeError):
        # Fixed copy-paste: the message previously named "privleak" although
        # this metric is rel_diff. Except clause narrowed from bare Exception.
        logger.warning(
            f"retain_model_logs evals not provided for rel_diff, using default retain auc of {kwargs['ref_value']}"
        )
        ref = kwargs["ref_value"]
    return {"agg_value": (score - ref) / (ref + 1e-10) * 100}
66 | return {"agg_value": (score - ref) / (ref + 1e-10) * 100}
67 |
--------------------------------------------------------------------------------
/src/evals/metrics/utility.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import scipy as sc
4 | from tqdm import tqdm
5 | import torch.nn.functional as F
6 | from torch.utils.data import DataLoader
7 | from transformers import AutoTokenizer, AutoModelForSequenceClassification
8 |
9 | from evals.metrics.utils import aggregate_to_1D
10 | from evals.metrics.base import unlearning_metric
11 |
12 |
@unlearning_metric(name="hm_aggregate")
def hm_aggregate(model, **kwargs):
    """Harmonic mean of the aggregated values of all pre-computed metrics."""
    # Iterate .values() directly; the metric names were unused (PERF102).
    values = [result["agg_value"] for result in kwargs["pre_compute"].values()]
    return {"agg_value": sc.stats.hmean(values)}
17 |
18 |
@unlearning_metric(name="classifier_prob")
def classifier_prob(model, **kwargs):
    """Mean probability assigned to `class_id` by an external sequence
    classifier, run over pre-computed text generations.

    Expects in kwargs:
        pre_compute: dict with a "text" entry whose value_by_index holds the
            generations to classify (under `text_key`).
        classifier_model_args / classifier_tokenization_args: HF
            `from_pretrained` keyword arguments for the classifier/tokenizer.
        batch_size, max_length, class_id, text_key, device: optional settings.
    """
    batch_size = kwargs.get("batch_size", 32)
    max_length = kwargs.get("max_length", 512)
    class_id = kwargs.get("class_id", 0)
    text_key = kwargs.get("text_key", "generation")
    classifier_model_args = kwargs["classifier_model_args"]
    classifier_tokenization_args = kwargs["classifier_tokenization_args"]
    device = kwargs.get("device", "cuda")

    tokenizer = AutoTokenizer.from_pretrained(**classifier_tokenization_args)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        **classifier_model_args
    ).to(device)

    # Pre-computed generations keyed by example index (string keys in the logs).
    data = kwargs["pre_compute"]["text"]["value_by_index"]
    data_list = [
        {"text": entry[text_key], "index": int(key)} for key, entry in data.items()
    ]

    # Create DataLoader (default collate turns dicts into column batches)
    dataloader = DataLoader(data_list, batch_size=batch_size, shuffle=False)

    scores_by_index = {}
    for batch in tqdm(dataloader):
        batch_texts = batch["text"]
        batch_indices = batch["index"].tolist()

        # Tokenize the batch of texts
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            return_attention_mask=True,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run the classifier
        with torch.no_grad():
            outputs = classifier(**inputs)
        # Convert logits to probabilities; keep only the target class column
        scores = F.softmax(outputs.logits, dim=-1)[:, class_id].cpu().numpy().tolist()

        # Map predictions to labels
        for idx, prob, text in zip(batch_indices, scores, batch_texts):
            # Add the prediction to the original data
            scores_by_index[idx] = {"score": prob, text_key: text}
    class_scores = np.array(
        [
            evals["score"]
            for evals in scores_by_index.values()
            if evals["score"] is not None
        ]
    )
    class_scores = aggregate_to_1D(class_scores)
    return {"agg_value": np.mean(class_scores), "value_by_index": scores_by_index}
77 |
--------------------------------------------------------------------------------
/src/evals/muse.py:
--------------------------------------------------------------------------------
1 | from evals.base import Evaluator
2 |
3 |
class MUSEEvaluator(Evaluator):
    """Evaluator preconfigured for the MUSE benchmark; all logic lives in Evaluator."""

    def __init__(self, eval_cfg, **kwargs):
        super().__init__("MUSE", eval_cfg, **kwargs)
7 |
--------------------------------------------------------------------------------
/src/evals/tofu.py:
--------------------------------------------------------------------------------
1 | from evals.base import Evaluator
2 |
3 |
class TOFUEvaluator(Evaluator):
    """Evaluator preconfigured for the TOFU benchmark; all logic lives in Evaluator."""

    def __init__(self, eval_cfg, **kwargs):
        super().__init__("TOFU", eval_cfg, **kwargs)
7 |
--------------------------------------------------------------------------------
/src/model/__init__.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 | from omegaconf import DictConfig, open_dict
3 | from typing import Dict, Any
4 | import os
5 | import torch
6 | import logging
7 | from model.probe import ProbedLlamaForCausalLM
8 |
9 | hf_home = os.getenv("HF_HOME", default=None)
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 | MODEL_REGISTRY: Dict[str, Any] = {}
14 |
15 |
16 | def _register_model(model_class):
17 | MODEL_REGISTRY[model_class.__name__] = model_class
18 |
19 |
def get_dtype(model_args):
    """Resolve and remove ``torch_dtype`` from ``model_args``.

    Returns:
        torch.float16 / torch.bfloat16 when requested, else torch.float32.

    Raises:
        ValueError: if flash_attention_2 is requested with a non-half dtype.
    """
    with open_dict(model_args):
        torch_dtype = model_args.pop("torch_dtype", None)
    if model_args.get("attn_implementation", None) == "flash_attention_2":
        # This check handles https://github.com/Dao-AILab/flash-attention/blob/7153673c1a3c7753c38e4c10ef2c98a02be5f778/flash_attn/flash_attn_triton.py#L820
        # If you want to run at other precisions consider running "training or inference using
        # Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):`
        # decorator" or using an attn_implementation compatible with the precision in the model
        # config.
        # Fix: explicit raise instead of `assert`, which is stripped under `python -O`.
        if torch_dtype not in ("float16", "bfloat16"):
            raise ValueError(
                f"Invalid torch_dtype '{torch_dtype}' for the requested attention "
                f"implementation: 'flash_attention_2'. Supported types are 'float16' "
                f"and 'bfloat16'."
            )
    if torch_dtype == "float16":
        return torch.float16
    elif torch_dtype == "bfloat16":
        return torch.bfloat16
    return torch.float32
39 |
40 |
def get_model(model_cfg: DictConfig):
    """Load a model (via its registered handler class) and its tokenizer.

    Args:
        model_cfg: config providing model_args, tokenizer_args and an optional
            model_handler naming a class in MODEL_REGISTRY
            (default: "AutoModelForCausalLM").

    Returns:
        (model, tokenizer) tuple.

    Raises:
        ValueError: if the config is missing/invalid or model loading fails.
    """
    # Fix: explicit raise instead of `assert` (stripped under `python -O`).
    if model_cfg is None or model_cfg.get("model_args") is None:
        raise ValueError("Model config not found or model_args absent in configs/model.")
    model_args = model_cfg.model_args
    tokenizer_args = model_cfg.tokenizer_args
    torch_dtype = get_dtype(model_args)
    model_handler = model_cfg.get("model_handler", "AutoModelForCausalLM")
    model_cls = MODEL_REGISTRY[model_handler]
    with open_dict(model_args):
        model_path = model_args.pop("pretrained_model_name_or_path", None)
    try:
        model = model_cls.from_pretrained(
            pretrained_model_name_or_path=model_path,
            torch_dtype=torch_dtype,
            **model_args,
            cache_dir=hf_home,
        )
    except Exception as e:
        logger.warning(f"Model {model_path} requested with {model_cfg.model_args}")
        # Fix: chain the original exception so the full traceback is preserved.
        raise ValueError(
            f"Error {e} while fetching model using {model_handler}.from_pretrained()."
        ) from e
    tokenizer = get_tokenizer(tokenizer_args)
    return model, tokenizer
66 |
67 |
def _add_or_replace_eos_token(tokenizer, eos_token: str) -> None:
    """Set ``eos_token`` on the tokenizer, logging whether it was added or replaced."""
    had_no_eos = tokenizer.eos_token_id is None
    num_added_tokens = tokenizer.add_special_tokens({"eos_token": eos_token})

    action = "Add" if had_no_eos else "Replace"
    logger.info("{} eos token: {}".format(action, tokenizer.eos_token))

    if num_added_tokens > 0:
        # A genuinely new token means the embedding matrix must grow.
        logger.info("New tokens have been added, make sure `resize_vocab` is True.")
79 |
80 |
def get_tokenizer(tokenizer_cfg: DictConfig):
    """Load a tokenizer from config, ensuring eos and pad tokens are set.

    Raises:
        RuntimeError: when AutoTokenizer fails to load from the given config.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(**tokenizer_cfg, cache_dir=hf_home)
    except Exception as e:
        divider = "--" * 40
        requested_path = tokenizer_cfg.get("pretrained_model_name_or_path", None)
        error_message = (
            f"{divider}\n"
            f"Error {e} fetching tokenizer using AutoTokenizer.\n"
            f"Tokenizer requested from path: {requested_path}\n"
            f"Full tokenizer config: {tokenizer_cfg}\n"
            f"{divider}"
        )
        raise RuntimeError(error_message)

    # Guarantee an eos token exists before it is reused as the pad token below.
    if tokenizer.eos_token_id is None:
        logger.info("replacing eos_token with <|endoftext|>")
        _add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
        logger.info("Setting pad_token as eos token: {}".format(tokenizer.pad_token))

    return tokenizer
103 |
104 |
# register models
# Handlers are resolved from MODEL_REGISTRY by class name (see get_model).
for _model_cls in (AutoModelForCausalLM, ProbedLlamaForCausalLM):
    _register_model(_model_cls)
108 |
--------------------------------------------------------------------------------
/src/model/probe.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoConfig, LlamaForCausalLM
2 | import torch
3 | import torch.nn as nn
4 | import logging
5 | import gc
6 | from copy import deepcopy
7 | from transformers import AutoModelForCausalLM
8 |
9 | logger = logging.getLogger("model")
10 |
11 |
class ProbedLlamaForCausalLM(LlamaForCausalLM):
    """
    Class for loading a LlamaForCausalLM model with the following custom behavior:
    - Initializes only the first `n_layers` of the model.
    - Sets up a newly initialized `lm_head`, optionally using weights from
    `head_pretrained_model_name_or_path`
    - Trains only the lm_head parameters with rest of the model frozen.
    - Once the model is saved during training, for inference it can also be loaded using
    AutoModelForCausalLM
    """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        head_pretrained_model_name_or_path: str = None,
        n_layers: int = 100,
        freeze_base_model: bool = True,
        **kwargs,
    ):
        """Load a layer-truncated Llama model with a re-initialized lm_head.

        Args:
            pretrained_model_name_or_path: base model to load.
            head_pretrained_model_name_or_path: optional model whose lm_head
                weights initialize the probe head; a fresh head is initialized
                when None.
            n_layers: keep only the first n_layers transformer layers
                (capped at the model's actual depth).
            freeze_base_model: if True, only lm_head parameters remain trainable.
            **kwargs: forwarded to AutoConfig.from_pretrained; unused keys are
                passed through to the parent from_pretrained.
        """
        config, unused_kwargs = AutoConfig.from_pretrained(
            pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs
        )
        # Untie input/output embeddings so the new lm_head can be trained
        # independently of the input embedding matrix.
        config.tie_word_embeddings = False
        model: LlamaForCausalLM = super().from_pretrained(
            pretrained_model_name_or_path, config=config, **unused_kwargs
        )

        # Limit number of transformer layers
        n_layers = min(n_layers, model.config.num_hidden_layers)
        model.config.num_hidden_layers = n_layers
        model.model.layers = nn.ModuleList(model.model.layers[:n_layers])

        # Reinitialize lm_head
        # Place the head on the same device as the last remaining layer.
        ref_params = list(model.model.layers[-1].parameters())[0]
        device = ref_params.device
        if head_pretrained_model_name_or_path is not None:
            logger.info(
                f"Initialising lm_head from {head_pretrained_model_name_or_path}"
            )
            head_model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
                head_pretrained_model_name_or_path, config=config, **unused_kwargs
            )
            lm_head = deepcopy(head_model.lm_head).to(device)
            model.set_output_embeddings(lm_head)
        else:
            logger.info("Initialising new lm_head")
            # Re-initialize the existing head in place using the model's init scheme.
            model._init_weights(model.lm_head)

        # Cleanup
        # Drop the temporary head model (if loaded) and release cached CUDA blocks.
        gc.collect()
        torch.cuda.empty_cache()

        # Set trainable params
        # With freeze_base_model=True only lm_head.* keeps requires_grad=True.
        for name, p in model.named_parameters():
            p.requires_grad = not freeze_base_model or name.startswith("lm_head")
        logger.info(
            f"Initialised a ProbedLlamaForCausalLM model with {n_layers} layers"
        )
        return model
72 |
--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
1 | import hydra
2 | from omegaconf import DictConfig
3 | from data import get_data, get_collators
4 | from model import get_model
5 | from trainer import load_trainer
6 | from evals import get_evaluators
7 | from trainer.utils import seed_everything
8 |
9 |
@hydra.main(version_base=None, config_path="../configs", config_name="train.yaml")
def main(cfg: DictConfig):
    """Entry point of the code to train models
    Args:
        cfg (DictConfig): Config to train
    """
    seed_everything(cfg.trainer.args.seed)
    mode = cfg.get("mode", "train")
    model_cfg = cfg.model
    # Fix: validate model_cfg BEFORE dereferencing it — the original read
    # model_cfg.template_args first, so a missing model config surfaced as an
    # AttributeError instead of this assertion's message.
    assert model_cfg is not None, "Invalid model yaml passed in train config."
    template_args = model_cfg.template_args
    model, tokenizer = get_model(model_cfg)

    # Load Dataset
    data_cfg = cfg.data
    data = get_data(
        data_cfg, mode=mode, tokenizer=tokenizer, template_args=template_args
    )

    # Load collator
    collator_cfg = cfg.collator
    collator = get_collators(collator_cfg, tokenizer=tokenizer)

    # Get Trainer
    trainer_cfg = cfg.trainer
    assert trainer_cfg is not None, ValueError("Please set trainer")

    # Get Evaluators (optional; run during/after training when configured)
    evaluators = None
    eval_cfgs = cfg.get("eval", None)
    if eval_cfgs:
        evaluators = get_evaluators(
            eval_cfgs=eval_cfgs,
            template_args=template_args,
            model=model,
            tokenizer=tokenizer,
        )

    trainer, trainer_args = load_trainer(
        trainer_cfg=trainer_cfg,
        model=model,
        train_dataset=data.get("train", None),
        eval_dataset=data.get("eval", None),
        tokenizer=tokenizer,
        data_collator=collator,
        evaluators=evaluators,
        template_args=template_args,
    )

    if trainer_args.do_train:
        trainer.train()
        trainer.save_state()
        trainer.save_model(trainer_args.output_dir)

    if trainer_args.do_eval:
        trainer.evaluate(metric_key_prefix="eval")
66 |
67 |
# Script entry point: Hydra parses CLI overrides and invokes main().
if __name__ == "__main__":
    main()
70 |
--------------------------------------------------------------------------------
/src/trainer/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing import Dict, Any
3 | from omegaconf import DictConfig
4 | from transformers import Trainer, TrainingArguments
5 |
6 | from trainer.base import FinetuneTrainer
7 | from trainer.unlearn.grad_ascent import GradAscent
8 | from trainer.unlearn.grad_diff import GradDiff
9 | from trainer.unlearn.npo import NPO
10 | from trainer.unlearn.dpo import DPO
11 | from trainer.unlearn.simnpo import SimNPO
12 | from trainer.unlearn.rmu import RMU
13 | from trainer.unlearn.undial import UNDIAL
14 |
15 | import logging
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 | TRAINER_REGISTRY: Dict[str, Any] = {}
20 |
21 |
22 | def _register_trainer(trainer_class):
23 | TRAINER_REGISTRY[trainer_class.__name__] = trainer_class
24 |
25 |
def load_trainer_args(trainer_args: DictConfig, dataset):
    """Build TrainingArguments from config, converting warmup_epochs to warmup_steps.

    Args:
        trainer_args: trainer argument config; an optional ``warmup_epochs`` key
            is popped and converted into ``warmup_steps``.
        dataset: train dataset, used for its length in the warmup computation.

    Returns:
        A transformers.TrainingArguments instance.
    """
    trainer_args = dict(trainer_args)
    warmup_epochs = trainer_args.pop("warmup_epochs", None)
    if warmup_epochs:
        batch_size = trainer_args["per_device_train_batch_size"]
        grad_accum_steps = trainer_args["gradient_accumulation_steps"]
        # Fix: device_count() is 0 on CPU-only machines, which made the
        # division below raise ZeroDivisionError; treat that as one device.
        num_devices = max(1, torch.cuda.device_count())
        dataset_len = len(dataset)
        # steps = epochs * (examples per epoch) / (effective global batch size)
        trainer_args["warmup_steps"] = int(
            (warmup_epochs * dataset_len)
            // (batch_size * grad_accum_steps * num_devices)
        )

    trainer_args = TrainingArguments(**trainer_args)
    return trainer_args
41 |
42 |
def load_trainer(
    trainer_cfg: DictConfig,
    model,
    train_dataset=None,
    eval_dataset=None,
    tokenizer=None,
    data_collator=None,
    evaluators=None,
    template_args=None,
):
    """Instantiate the trainer named by ``trainer_cfg.handler`` from the registry.

    Returns:
        (trainer, trainer_args): the constructed trainer and its TrainingArguments.

    Raises:
        ValueError: if no handler is set in the trainer config.
        NotImplementedError: if the handler is not registered.
    """
    trainer_args = trainer_cfg.args
    method_args = trainer_cfg.get("method_args", {})
    trainer_args = load_trainer_args(trainer_args, train_dataset)
    trainer_handler_name = trainer_cfg.get("handler")
    # Fix: explicit raises instead of `assert`, which is stripped under `python -O`.
    if trainer_handler_name is None:
        raise ValueError(f"{trainer_handler_name} handler not set")
    trainer_cls = TRAINER_REGISTRY.get(trainer_handler_name, None)
    if trainer_cls is None:
        raise NotImplementedError(
            f"{trainer_handler_name} not implemented or not registered"
        )
    trainer = trainer_cls(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        args=trainer_args,
        evaluators=evaluators,
        template_args=template_args,
        **method_args,
    )
    logger.info(
        f"{trainer_handler_name} Trainer loaded, output_dir: {trainer_args.output_dir}"
    )
    return trainer, trainer_args
79 |
80 |
# Register finetuning and unlearning trainers; handlers are resolved from
# TRAINER_REGISTRY by class name (see load_trainer).
for _trainer_cls in (
    Trainer,
    FinetuneTrainer,
    GradAscent,
    GradDiff,
    NPO,
    DPO,
    SimNPO,
    RMU,
    UNDIAL,
):
    _register_trainer(_trainer_cls)
93 |
--------------------------------------------------------------------------------
/src/trainer/base.py:
--------------------------------------------------------------------------------
1 | # Modified from https://github.com/huggingface/transformers/blob/v4.45.1/src/transformers/trainer.py
2 |
3 | from typing import Dict, List, Optional, Union
4 |
5 | import os
6 | import logging
7 | from transformers import Trainer
8 | from torch.utils.data import Dataset
9 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
10 | from typing import Any
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
class FinetuneTrainer(Trainer):
    """HF Trainer extended to run custom benchmark evaluators during evaluate()."""

    def __init__(self, evaluators=None, template_args=None, *args, **kwargs):
        # evaluators: optional dict name -> evaluator object exposing .evaluate()
        # template_args: chat-template arguments forwarded to each evaluator
        self.evaluators = evaluators
        self.template_args = template_args
        super().__init__(*args, **kwargs)

    def evaluate(
        self,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
        trial: Dict[str, Any] = None,
    ) -> Dict[str, float]:
        # Run a custom evaluator and save results
        if self.evaluators:
            if self.accelerator.is_local_main_process:
                eval_metrics = {}
                # Custom evaluators only run in single-process (no DDP) mode.
                if self.accelerator.num_processes == 1:
                    # Results go under <run_dir>/checkpoint-<global_step>/evals
                    run_dir = self._get_output_dir(trial=trial)
                    checkpoint_folder = (
                        f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
                    )
                    output_dir = os.path.join(run_dir, checkpoint_folder, "evals")
                    os.makedirs(output_dir, exist_ok=True)
                    eval_metrics = {}
                    for _, evaluator in self.evaluators.items():
                        eval_args = {
                            "output_dir": output_dir,
                            "template_args": self.template_args,
                            "model": self.model,
                            "tokenizer": self.tokenizer,
                        }
                        eval_metrics.update(evaluator.evaluate(**eval_args))
                    self.log(eval_metrics)
                else:
                    logger.warning(
                        "Custom evaluator can be run with this Trainer only when a single accelerator process is running."
                    )
        # NOTE(review): eval_metrics is only ever assigned on the local main
        # process inside the branch above; reaching this return on non-main
        # processes (or with no evaluators) would raise NameError, and the
        # eval_dataset fallback below appears unreachable — confirm intended
        # control flow / indentation against the upstream Trainer contract.
        return eval_metrics

        if eval_dataset is None:
            return {}
        # Run the default HF Trainer evaluate method when eval dataset is provided
        return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
59 |
--------------------------------------------------------------------------------
/src/trainer/unlearn/dpo.py:
--------------------------------------------------------------------------------
1 | from trainer.utils import compute_dpo_loss
2 | from trainer.unlearn.grad_diff import GradDiff
3 |
4 |
class DPO(GradDiff):
    """DPO-style unlearning: prefer alternate answers over the original forget answers."""

    def __init__(self, beta=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.beta = beta
        # DPO always requires a frozen reference model.
        if self.ref_model is None:
            self.ref_model = self._prepare_ref_model(self.model)

    def compute_loss(self, model, inputs, return_outputs=False):
        """gamma * DPO(alternate > original) + alpha * retain loss."""
        forget = inputs["forget"]
        forget_loss, forget_outputs = compute_dpo_loss(
            model=model,
            ref_model=self.ref_model,
            win_inputs=forget["alternate"],
            lose_inputs=forget["original"],
            beta=self.beta,
        )

        retain = inputs["retain"]
        retain_batch = {
            key: retain[key] for key in ("input_ids", "attention_mask", "labels")
        }
        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_batch)

        loss = self.gamma * forget_loss + self.alpha * retain_loss
        return (loss, forget_outputs) if return_outputs else loss
34 |
--------------------------------------------------------------------------------
/src/trainer/unlearn/grad_ascent.py:
--------------------------------------------------------------------------------
1 | from trainer.unlearn.base import UnlearnTrainer
2 |
3 |
class GradAscent(UnlearnTrainer):
    """Unlearning by gradient ascent: maximize the LM loss on the forget set."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the negated forget-set loss (minimizing it ascends the NLL)."""
        batch = inputs["forget"]
        model_inputs = {
            key: batch[key] for key in ("input_ids", "attention_mask", "labels")
        }
        outputs = model(**model_inputs)
        loss = -outputs.loss
        return (loss, outputs) if return_outputs else loss
15 |
--------------------------------------------------------------------------------
/src/trainer/unlearn/grad_diff.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from trainer.utils import compute_kl_divergence
3 | from trainer.unlearn.base import UnlearnTrainer
4 |
5 |
class GradDiff(UnlearnTrainer):
    """Gradient-difference unlearning: ascend the loss on forget data while
    descending an NLL or KL-to-reference loss on retain data."""

    def __init__(self, gamma=1.0, alpha=1.0, retain_loss_type="NLL", *args, **kwargs):
        """
        Args:
            gamma: weight on the (negated) forget loss.
            alpha: weight on the retain loss.
            retain_loss_type: "NLL" or "KL" (KL builds a frozen reference model).
        """
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        self.alpha = alpha
        self.retain_loss_type = retain_loss_type
        self.ref_model = None
        if retain_loss_type == "KL":
            self.ref_model = self._prepare_ref_model(self.model)

    def _prepare_ref_model(self, model):
        """Deep-copy `model` into a frozen eval-mode reference prepared for the
        current accelerator/deepspeed setup."""
        ref_model = copy.deepcopy(model).to(self.accelerator.device)
        ref_model.eval()
        if self.is_deepspeed_enabled:
            ref_model = self._prepare_deepspeed(ref_model)
        else:
            ref_model = self.accelerator.prepare_model(ref_model, evaluation_mode=True)
        return ref_model

    def compute_retain_loss(self, model, retain_inputs):
        """Retain-set loss: plain NLL, or KL divergence to the reference model."""
        retain_loss = 0.0
        if self.retain_loss_type == "NLL":
            # Fix: forward pass moved inside this branch — the original ran a
            # full forward unconditionally and discarded the result on the KL
            # path, doubling the retain-set compute for KL.
            retain_outputs = model(**retain_inputs)
            retain_loss += retain_outputs.loss
        elif self.retain_loss_type == "KL":
            # Fix: use the passed-in `model` (possibly accelerator-wrapped)
            # rather than self.model, consistent with callers passing model=model.
            kl_loss, retain_outputs = compute_kl_divergence(
                model, self.ref_model, retain_inputs
            )
            retain_loss += kl_loss
        else:
            raise NotImplementedError(
                f"{self.retain_loss_type} not implemented for retain set"
            )
        return retain_loss

    def compute_loss(self, model, inputs, return_outputs=False):
        """gamma * (-forget NLL) + alpha * retain loss."""
        forget_inputs = inputs["forget"]
        forget_inputs = {
            "input_ids": forget_inputs["input_ids"],
            "attention_mask": forget_inputs["attention_mask"],
            "labels": forget_inputs["labels"],
        }

        forget_outputs = model(**forget_inputs)
        # Ascend on the forget set by negating its loss.
        forget_loss = -forget_outputs.loss

        retain_inputs = inputs["retain"]
        retain_inputs = {
            "input_ids": retain_inputs["input_ids"],
            "attention_mask": retain_inputs["attention_mask"],
            "labels": retain_inputs["labels"],
        }
        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs)

        loss = self.gamma * forget_loss + self.alpha * retain_loss

        return (loss, forget_outputs) if return_outputs else loss
63 |
--------------------------------------------------------------------------------
/src/trainer/unlearn/npo.py:
--------------------------------------------------------------------------------
1 | from trainer.utils import compute_dpo_loss
2 | from trainer.unlearn.grad_diff import GradDiff
3 |
4 |
class NPO(GradDiff):
    """Negative Preference Optimization: DPO with forget samples as losers only."""

    def __init__(self, beta=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.beta = beta
        # NPO always needs a frozen reference model for the log-ratio term.
        if self.ref_model is None:
            self.ref_model = self._prepare_ref_model(self.model)

    def compute_loss(self, model, inputs, return_outputs=False):
        """gamma * NPO(forget) + alpha * retain loss."""
        forget_loss, forget_outputs = compute_dpo_loss(
            model=model,
            ref_model=self.ref_model,
            win_inputs=None,
            lose_inputs=inputs["forget"],
            beta=self.beta,
        )

        retain = inputs["retain"]
        retain_batch = {
            key: retain[key] for key in ("input_ids", "attention_mask", "labels")
        }
        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_batch)

        loss = self.gamma * forget_loss + self.alpha * retain_loss
        return (loss, forget_outputs) if return_outputs else loss
33 |
--------------------------------------------------------------------------------
/src/trainer/unlearn/simnpo.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 |
3 | from trainer.utils import compute_batch_nll
4 | from trainer.unlearn.grad_diff import GradDiff
5 |
6 |
class SimNPO(GradDiff):
    """SimNPO unlearning: reference-free NPO on length-normalized forget loss."""

    def __init__(self, delta=0.0, beta=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.delta = delta
        self.beta = beta

    def compute_loss(self, model, inputs, return_outputs=False):
        """gamma * SimNPO(forget) + alpha * retain loss."""
        forget_batch = inputs["forget"]
        valid_mask = forget_batch["labels"] != -100
        seq_nll, forget_outputs = compute_batch_nll(model, forget_batch)
        # Length-normalize the per-sequence NLL, then shift by the delta margin.
        normalized = seq_nll / valid_mask.sum(-1) - self.delta
        forget_loss = -F.logsigmoid(self.beta * normalized).mean() * 2 / self.beta

        retain = inputs["retain"]
        retain_batch = {
            key: retain[key] for key in ("input_ids", "attention_mask", "labels")
        }
        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_batch)

        loss = self.gamma * forget_loss + self.alpha * retain_loss
        return (loss, forget_outputs) if return_outputs else loss
32 |
--------------------------------------------------------------------------------
/src/trainer/unlearn/undial.py:
--------------------------------------------------------------------------------
1 | from trainer.utils import compute_undial_loss
2 | from trainer.unlearn.grad_diff import GradDiff
3 |
4 |
class UNDIAL(GradDiff):
    """UNDIAL unlearning: self-distillation from de-amplified teacher logits."""

    def __init__(self, beta=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.beta = beta
        # UNDIAL distills from a frozen copy of the starting model.
        if self.ref_model is None:
            self.ref_model = self._prepare_ref_model(self.model)

    def compute_loss(self, model, inputs, return_outputs=False):
        """gamma * UNDIAL(forget) + alpha * retain loss."""
        forget_loss, forget_outputs = compute_undial_loss(
            model, self.ref_model, inputs["forget"], self.beta
        )

        retain = inputs["retain"]
        retain_batch = {
            key: retain[key] for key in ("input_ids", "attention_mask", "labels")
        }
        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_batch)

        total = self.gamma * forget_loss + self.alpha * retain_loss
        return (total, forget_outputs) if return_outputs else total
28 |
--------------------------------------------------------------------------------
/src/trainer/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import random
3 | import numpy as np
4 | from torch import nn
5 | import torch.nn.functional as F
6 |
7 |
def seed_everything(seed=42):
    """Seed python, numpy and torch (CPU + CUDA) RNGs and make cuDNN deterministic."""
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
15 |
16 |
def compute_kl_divergence(model, target_model, inputs):
    """KL divergence (batch-mean) from the reference model's token distribution
    to the current model's, computed over all positions.

    Returns:
        (kl_loss, outputs): the KL loss tensor and the current model's outputs.
    """
    with torch.no_grad():
        ref_outputs = target_model(**inputs)

    # Fix: the original computed this log_softmax twice (duplicated line).
    ref_probs = F.log_softmax(ref_outputs.logits, dim=-1)
    ref_probs = ref_probs.view(-1, ref_outputs.logits.shape[-1])

    outputs = model(**inputs)
    current_probs = F.log_softmax(outputs.logits, dim=-1)
    current_probs = current_probs.view(-1, outputs.logits.shape[-1])

    # minimum KL divergence; both inputs are log-probabilities (log_target=True)
    return nn.functional.kl_div(
        current_probs, ref_probs, reduction="batchmean", log_target=True
    ), outputs
33 |
34 |
def compute_batch_nll(model, inputs):
    """Per-sequence summed NLL over shifted next-token targets.

    NOTE: unlike model(**inputs).loss (a scalar mean), this returns a (batch,)
    tensor with the token losses summed per sequence; -100 labels are ignored.

    Returns:
        (loss, outputs): summed-per-sequence losses and the model outputs.
    """
    outputs = model(**inputs)
    shifted_labels = inputs["labels"][..., 1:].contiguous()
    shifted_logits = outputs.logits[..., :-1, :].contiguous()
    criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
    per_token_loss = criterion(shifted_logits.transpose(-1, -2), shifted_labels)
    return per_token_loss.sum(dim=-1), outputs
46 |
47 |
def compute_dpo_loss(model, ref_model, win_inputs=None, lose_inputs=None, beta=1.0):
    """DPO-style loss from win/lose batches against a frozen reference model.

    Either side may be omitted (e.g. NPO passes only lose_inputs); at least one
    must be given. Returns (loss, (win_outputs, lose_outputs)).
    """
    if win_inputs is None and lose_inputs is None:
        raise ValueError("Both win_inputs and lose_inputs can't be None")

    win_log_ratio, lose_log_ratio = 0.0, 0.0
    win_outputs, lose_outputs = None, None

    if win_inputs is not None:
        win_loss, win_outputs = compute_batch_nll(model, win_inputs)
        with torch.no_grad():
            win_ref_loss, _ = compute_batch_nll(ref_model, win_inputs)
        # log pi(y_w|x) - log pi_ref(y_w|x), expressed via NLL difference
        win_log_ratio = win_ref_loss - win_loss

    if lose_inputs is not None:
        lose_loss, lose_outputs = compute_batch_nll(model, lose_inputs)
        with torch.no_grad():
            lose_ref_loss, _ = compute_batch_nll(ref_model, lose_inputs)
        lose_log_ratio = lose_ref_loss - lose_loss

    margin = beta * (win_log_ratio - lose_log_ratio)
    loss = -2 / beta * F.logsigmoid(margin).mean()
    return loss, (win_outputs, lose_outputs)
69 |
70 |
def compute_undial_loss(model, ref_model, inputs, beta):
    """UNDIAL loss: cross-entropy from the student to a softened teacher
    distribution whose true-token logit is reduced by ``beta``.

    Returns:
        (loss, outputs): mean soft-target cross-entropy and the student outputs.
    """
    # Forward pass on the student (trainable) model
    outputs = model(**inputs)
    logits = outputs.logits
    labels = inputs["labels"]

    # Next-token alignment: position t predicts token t+1.
    shift_labels = labels[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()

    # Forward pass on the teacher model (no grad)
    with torch.no_grad():
        teacher_logits = ref_model(**inputs).logits
        shift_teacher_logits = teacher_logits[..., :-1, :].contiguous()

    # Build the mask that identifies the tokens need to be unlearned
    mask = torch.zeros_like(shift_teacher_logits)
    batch_idx = torch.arange(mask.shape[0]).view(-1, 1, 1)
    seq_idx = torch.arange(mask.shape[1]).view(1, -1, 1)
    # NOTE(review): -100 (ignore) labels index the vocab dimension negatively
    # here, flagging an arbitrary token near the end of the vocab — confirm
    # labels are pre-filtered or that this is acceptable for padded positions.
    mask[batch_idx, seq_idx, shift_labels.unsqueeze(-1)] = 1.0

    # Adjust teacher logits: subtract di_strength on the correct token
    pre_softmax = shift_teacher_logits - mask * beta
    soft_label = F.softmax(pre_softmax, dim=-1)

    # Soft-target cross-entropy; NOTE(review): the mean runs over every
    # position (CrossEntropyLoss has no ignore_index for probability targets),
    # so padded positions contribute — confirm this is intended.
    loss_fct = nn.CrossEntropyLoss(reduction="none")
    loss = loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)),
        soft_label.view(-1, soft_label.size(-1)),
    )
    return loss.mean(), outputs
101 |
--------------------------------------------------------------------------------