├── .github ├── CODEOWNERS └── workflows │ ├── build-package.yml │ ├── codeql.yml │ ├── publish-package.yml │ ├── test-examples-env.yml │ └── test-examples.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── examples ├── nlg-reddit │ ├── author-level-dp │ │ ├── README.md │ │ ├── aml │ │ │ ├── fuft-eps_8.yml │ │ │ └── peft-eps_8.yml │ │ ├── environment.yml │ │ └── fine-tune-dp.py │ └── sample-level-dp │ │ ├── README.md │ │ ├── aml │ │ ├── fuft-eps_8.yml │ │ ├── fuft-eps_inf.yml │ │ ├── peft-eps_8-gpus_1.yml │ │ ├── peft-eps_8.yml │ │ └── peft-eps_inf.yml │ │ ├── environment.yml │ │ ├── fine-tune-dp.py │ │ └── fine-tune-nodp.py └── test_examples.py ├── research ├── fine_tune_llm_w_qlora │ ├── README.md │ ├── aml │ │ ├── cnn │ │ │ ├── peft-eps_8.yml │ │ │ └── peft-eps_inf.yml │ │ ├── qnli │ │ │ ├── peft-eps_8.yml │ │ │ └── peft-eps_inf.yml │ │ └── sst2 │ │ │ ├── peft-eps_8.yml │ │ │ └── peft-eps_inf.yml │ ├── data_utils.py │ ├── environment.yml │ ├── fine-tune-dp.py │ ├── fine-tune-nodp.py │ └── linear.py └── synthetic-text-generation-with-DP │ ├── NOTICE.txt │ ├── README.md │ ├── fine-tune-dp.py │ ├── fine-tune-nodp.py │ ├── generate-text.py │ ├── requirements.txt │ └── run-classification.py ├── setup.py ├── src └── dp_transformers │ ├── __init__.py │ ├── arguments.py │ ├── dp_utils.py │ ├── grad_sample │ ├── __init__.py │ └── transformers │ │ ├── __init__.py │ │ └── conv_1d.py │ ├── module_modification.py │ └── sampler.py └── tests ├── test_dp_utils.py ├── test_grad_sample ├── __init__.py └── test_transformers_conv_1d.py └── test_models.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @microsoft/ppml will be requested for 4 | # review when someone opens a pull request. 5 | * @microsoft/ppml 6 | -------------------------------------------------------------------------------- /.github/workflows/build-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8", "3.9"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | python -m pip install .[test] 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest tests 42 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # This is based on the standard CodeQL workflow provided by Github 2 | name: "CodeQL" 3 | 4 | on: 5 | push: 6 | branches: [ "main" ] 7 | pull_request: 8 | # The branches below must be a subset of the branches above 9 | branches: [ "main" ] 10 | schedule: 11 | - cron: '35 2 * * 3' 12 | 13 | jobs: 14 | analyze: 15 | name: Analyze 16 | runs-on: ubuntu-latest 17 | permissions: 18 | actions: read 19 | contents: read 20 | security-events: write 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | language: [ 'python' ] 26 | 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v3 30 | 31 | # Initializes the CodeQL tools for scanning. 32 | - name: Initialize CodeQL 33 | uses: github/codeql-action/init@v2 34 | with: 35 | languages: ${{ matrix.language }} 36 | 37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 38 | # If this step fails, then you should remove it and run the build manually (see below) 39 | - name: Autobuild 40 | uses: github/codeql-action/autobuild@v2 41 | 42 | - name: Perform CodeQL Analysis 43 | uses: github/codeql-action/analyze@v2 44 | -------------------------------------------------------------------------------- /.github/workflows/publish-package.yml: -------------------------------------------------------------------------------- 1 | name: Publish package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | python -m pip install setuptools wheel twine build 18 | - name: Build 19 | run: | 20 | python -m build --sdist --wheel --outdir dist/ . 21 | - name: Publish to PyPI 22 | uses: pypa/gh-action-pypi-publish@master 23 | with: 24 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/test-examples-env.yml: -------------------------------------------------------------------------------- 1 | name: Test examples environment 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | run: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | example: [ 16 | "examples/nlg-reddit/sample-level-dp", 17 | "examples/nlg-reddit/author-level-dp" 18 | ] 19 | steps: 20 | - uses: actions/checkout@v3 21 | - uses: conda-incubator/setup-miniconda@v2 22 | with: 23 | activate-environment: env 24 | environment-file: ${{ matrix.example }}/environment.yml 25 | auto-activate-base: false 26 | - name: Install package 27 | run: | 28 | /usr/share/miniconda/envs/env/bin/pip install -e . 
29 | -------------------------------------------------------------------------------- /.github/workflows/test-examples.yml: -------------------------------------------------------------------------------- 1 | name: Test examples 2 | 3 | on: 4 | workflow_dispatch 5 | 6 | jobs: 7 | submit: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 3.9 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: 3.9 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | python -m pip install azure-cli 20 | az extension add -n ml 21 | python -m pip install pytest pytest-xdist azureml-core 22 | - name: Set up Azure ML CLI 23 | run: | 24 | az login --service-principal -u "${{ secrets.AZ_CLIENT_ID }}" -p "${{ secrets.AZ_CLIENT_SECRET }}" --tenant "${{ secrets.AZ_TENANT_ID }}" 25 | az account set --subscription "${{ secrets.AZ_SUBSCRIPTION_ID }}" 26 | az configure --defaults group=${{ secrets.AZ_RESOURCE_GROUP }} workspace=${{ secrets.AZ_WORKSPACE_NAME }} 27 | - name: Run examples with pytest 28 | run: | 29 | pytest -n 16 -s examples -v --junitxml=junit/test-results.xml 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | /data 141 | /.vscode 142 | /.amltconfig 143 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dp-transformers 2 | 3 | :warning: This repo is intended for research projects and prototypes. 4 | While we try to provide tests for all the functionality, the repo has not (yet) undergone the detailed review process that is necessary for deploying a system of critical nature such as privacy. 5 | 6 | ## Introduction 7 | 8 | See [dp-transformers](https://www.microsoft.com/en-us/research/project/dp-transformers) for a brief introduction to our repository. 9 | 10 | ## Installation 11 | 12 | For installing the `dp-transformers` package, you can just type 13 | 14 | ``` 15 | pip install . 
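# Optionally include the test extras that the CI workflow installs as well (see .github/workflows/build-package.yml):
pip install .[test]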
16 | ```
17 | 
18 | ## Examples
19 | 
20 | See `./examples` for end-to-end examples of how to use the library.
21 | 
22 | A basic example can be found in `examples/nlg-reddit/sample-level-dp/fine-tune-dp.py`.
23 | First, create an Anaconda environment by doing `conda env create -f examples/nlg-reddit/sample-level-dp/environment.yml`.
24 | Then, you can run the example using the following command (here we assume there are 16 GPUs in the machine, and thus set `--nproc_per_node 16`):
25 | 
26 | ```
27 | python -m torch.distributed.run --nproc_per_node 16 examples/nlg-reddit/sample-level-dp/fine-tune-dp.py \
28 | --output_dir scratch \
29 | --model_name gpt2 \
30 | --sequence_len 128 \
31 | --per_device_train_batch_size 32 \
32 | --gradient_accumulation_steps 2 \
33 | --evaluation_strategy steps \
34 | --eval_steps 45 \
35 | --log_level info \
36 | --per_device_eval_batch_size 64 \
37 | --eval_accumulation_steps 1 \
38 | --seed 42 \
39 | --target_epsilon 8 \
40 | --per_sample_max_grad_norm 1.0 \
41 | --prediction_loss_only \
42 | --weight_decay 0.01 \
43 | --remove_unused_columns False \
44 | --num_train_epochs 3 \
45 | --logging_steps 5 \
46 | --max_grad_norm 0 \
47 | --lr_scheduler_type constant \
48 | --learning_rate 1e-4 \
49 | --disable_tqdm True \
50 | --dataloader_num_workers 2
51 | ```
52 | 
53 | ## 🤗 Transformers with Opacus
54 | 
55 | ### Trainer
56 | 
57 | Hugging Face's `Trainer` provides callback hooks, which we use to make sure the required methods of the privacy engine are called.
58 | 
59 | You can use the callback as demonstrated in the example below:
60 | 
61 | ``` python
62 | privacy_engine = opacus.PrivacyEngine(module=model, ...)
63 | 
64 | # No need to attach the privacy engine to the optimizer. The callback will automatically attach it to the optimizer.
65 | 
66 | trainer = transformers.Trainer(
67 |     model = model,
68 |     [...],
69 |     callbacks = [dp_transformers.PrivacyEngineCallback(privacy_engine)] # <-- Add this line to make sure the privacy engine is used in the trainer
70 |     [...]
71 | )
72 | ```
73 | 
74 | ### Data Collation
75 | 
76 | The 🤗 Transformers library often provides sensible default arguments.
77 | For example, when no `position_ids` are provided, the library will automatically use incrementing integers.
78 | The way this is implemented is by first creating a tensor of shape `[1, sequence_length]` filled with increasing integers.
79 | During a second step that tensor is replicated for the whole batch.
80 | However, the replication is part of the computational graph and hence Opacus cannot infer the batch size from this input tensor.
81 | 
82 | We have therefore implemented a custom data collator (see `dp_transformers.DataCollatorForPrivateCausalLanguageModeling`) which automatically creates the `position_ids` input tensor by using `torch.repeat`.
83 | This works with Opacus since the `position_ids` tensor then appears as batch-size many separate inputs in the computation graph.
84 | 
85 | ### GPT2
86 | 
87 | The 🤗 Transformers implementation for GPT2 uses a custom layer type, namely `Conv1D`.
88 | It is not quite clear why this was introduced, since it is essentially a regular linear layer.
89 | This causes problems with Opacus, however, since Opacus does not know how to apply the backward hooks for this layer.
90 | 
91 | In this repo we provide an implementation for handling this type of layer.
92 | See `dp_transformers.grad_sample.transformers.conv_1d`.
93 | 
94 | All necessary grad samplers can be registered by merely importing `conv_1d` before the model training.
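For example, a minimal sketch of the import order used by the example scripts in `./examples` (the import alone performs the registration; the rest of the training setup proceeds as usual):

``` python
import transformers
import dp_transformers

# Importing this module registers the grad sampler for Conv1D with Opacus.
# It must run before the model is wrapped for private training.
from dp_transformers.grad_sample.transformers import conv_1d  # noqa: F401

model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
# ... then build dp_transformers.dp_utils.OpacusDPTrainer as in the example scripts
```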
95 | See the Known Issues section below for more details.
96 | 
97 | ## General tips for DP training
98 | 
99 | In this section, we collect a few helpful strategies for training models with DP.
100 | Opacus's FAQ also has a few tips on how to get started with DP training (see [Opacus FAQ](https://opacus.ai/docs/faq)).
101 | 
102 | ### Hyper-parameters
103 | 
104 | Larger batch sizes help DP training.
105 | As a general rule, try starting with a batch size of $\sqrt{|D|}$, where $|D|$ is the size of the training dataset (for the 500,000-sample Reddit example above, that is roughly 700).
106 | Since Opacus increases memory consumption significantly, this is usually only possible using gradient accumulation.
107 | 
108 | We have found a surprisingly small dependence on the clipping norm.
109 | As a general rule of thumb, start with a clipping parameter of 0.1.
110 | 
111 | Fine-tuning the model longer is also helpful.
112 | 
113 | 
114 | ### Deploying DP trained models
115 | 
116 | Pay attention to which pseudo-random number generator (PRNG) was used.
117 | PyTorch's default (Mersenne Twister) might be attackable.
118 | See the [Opacus FAQ](https://opacus.ai/docs/faq#what-is-the-secure_rng-argument-in-privacyengine).
119 | Make sure to use a better PRNG before deploying models.
120 | 
121 | ## Known issues
122 | 
123 | ### Registering custom grad samplers late in the training process
124 | 
125 | When registering a custom grad sampler like `dp_transformers.grad_sample.transformers.conv_1d`, functions are added to a global dictionary that Opacus handles.
126 | This global dictionary is used to establish whether models are compatible with Opacus and how to handle the per-sample gradient computation.
127 | All grad samplers need to be registered as early as possible in the training process,
128 | definitely before the model is wrapped with `GradSampleModule`.
129 | 
130 | ## How to Cite
131 | 
132 | ```
133 | @misc{dp-transformers,
134 |   author = {Lukas Wutschitz and Huseyin A. Inan and Andre Manoel},
135 |   title = {dp-transformers: Training transformer models with differential privacy},
136 |   year = {2022},
137 |   month = {August},
138 |   howpublished = {\url{https://www.microsoft.com/en-us/research/project/dp-transformers}}
139 | }
140 | ```
141 | 
142 | ## Contributing
143 | 
144 | This project welcomes contributions and suggestions. Most contributions require you to
145 | agree to a Contributor License Agreement (CLA) declaring that you have the right to,
146 | and actually do, grant us the rights to use your contribution. For details, visit
147 | https://cla.microsoft.com.
148 | 
149 | When you submit a pull request, a CLA-bot will automatically determine whether you need
150 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
151 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
152 | 
153 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
154 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
155 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
156 | 
157 | For any other questions, feel free to open an issue on GitHub.
158 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/README.md: -------------------------------------------------------------------------------- 1 | # Author-level differentially private fine-tuning of a GPT-2 style model 2 | 3 | This example fine-tunes generative language models (such as GPT-2 series) with Author-level Differential Privacy on a text corpus. 4 | In this case 500,000 samples of Reddit comments belong to 304,279 authors in the dataset. 
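The example script in this directory groups training samples by author and passes the resulting mapping to the DP trainer. A minimal sketch of the relevant calls (the remaining objects, such as the model, data collator, and privacy arguments, are constructed exactly as in `fine-tune-dp.py` below):

``` python
import datasets
import dp_transformers

# Load the Reddit slice used in this example and group training samples by author,
# so that the DP guarantee holds per author rather than per sample.
# (In the full script the mapping is built after tokenization; shown here in simplified form.)
dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=42)
author_mapping = dp_transformers.dp_utils.create_author_mapping(dataset['train'], author="author")

# The mapping is then passed to the trainer, e.g.
# dp_transformers.dp_utils.OpacusDPTrainer(..., author_mapping=author_mapping, privacy_args=privacy_args)
```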
5 | 6 | **We point out that `Author` is only an abstraction that can represent any of the following: user, group, organization, etc.** 7 | 8 | We compare different fine-tuning techniques (full fine-tuning, LoRA) and also provide a data distributed implementation for faster training. 9 | These merely serve as examples as hyperparameters are not optimized and corresponding commands are presented below. 10 | 11 | # Results 12 | 13 | | Model (HF) | Fine-tuning Method | DP | GPUs | Epochs | Train Loss | Eval Loss | $\varepsilon$ | Run Time [s] | 14 | | ---------- | ------------------ | --- | ------- | ------ | ---------- | --------- | ------------- | ------------ | 15 | | gpt2 | Full | Yes | 16xV100 | 3 | 3.76 | 3.62 | 8.0 | 1167 | 16 | | gpt2 | LoRA | Yes | 16xV100 | 3 | 3.75 | 3.60 | 8.0 | 659 | 17 | 18 | ## Fine-tune the full model with DP 19 | 20 | ``` 21 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-dp.py \ 22 | --output_dir scratch \ 23 | --model_name gpt2 \ 24 | --sequence_len 128 \ 25 | --per_device_train_batch_size 32 \ 26 | --gradient_accumulation_steps 2 \ 27 | --evaluation_strategy steps \ 28 | --eval_steps 45 \ 29 | --log_level info \ 30 | --per_device_eval_batch_size 64 \ 31 | --eval_accumulation_steps 1 \ 32 | --seed 42 \ 33 | --target_epsilon 8 \ 34 | --per_sample_max_grad_norm 1.0 \ 35 | --prediction_loss_only \ 36 | --weight_decay 0.01 \ 37 | --remove_unused_columns False \ 38 | --num_train_epochs 3 \ 39 | --logging_steps 5 \ 40 | --max_grad_norm 0 \ 41 | --lr_scheduler_type constant \ 42 | --learning_rate 1e-4 \ 43 | --disable_tqdm True \ 44 | --label_names labels \ 45 | --dataloader_num_workers 2 46 | ``` 47 | 48 | ## Fine-tune only the LoRA layers introduced into the model with DP 49 | 50 | ``` 51 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-dp.py \ 52 | --output_dir scratch \ 53 | --model_name gpt2 \ 54 | --sequence_len 128 \ 55 | --per_device_train_batch_size 64 \ 56 | --gradient_accumulation_steps 1 \ 57 | --evaluation_strategy steps \ 58 | --eval_steps 45 \ 59 | --log_level info \ 60 | --per_device_eval_batch_size 64 \ 61 | --eval_accumulation_steps 1 \ 62 | --seed 42 \ 63 | --target_epsilon 8 \ 64 | --per_sample_max_grad_norm 1.0 \ 65 | --prediction_loss_only \ 66 | --weight_decay 0.01 \ 67 | --remove_unused_columns False \ 68 | --num_train_epochs 3 \ 69 | --logging_steps 5 \ 70 | --lora_dim 4 \ 71 | --lora_alpha 32 \ 72 | --lora_dropout 0.0 \ 73 | --max_grad_norm 0 \ 74 | --lr_scheduler_type constant \ 75 | --learning_rate 3e-4 \ 76 | --disable_tqdm True \ 77 | --dataloader_num_workers 2 \ 78 | --label_names labels \ 79 | --enable_lora 80 | ``` -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/aml/fuft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/author-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 32 \ 9 | --gradient_accumulation_steps 4 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --label_names labels 30 | environment: 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 32 | conda_file: ../environment.yml 33 | compute: azureml:ND40rsv2 34 | display_name: full_fine_tuning-epsilon_8 35 | experiment_name: dp-transformers-nlg-reddit-author-level-dp 36 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/aml/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/author-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 128 \ 9 | --gradient_accumulation_steps 1 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --lora_dim 4 \ 30 | --lora_alpha 32 \ 31 | --lora_dropout 0.0 \ 32 | --label_names labels \ 33 | --enable_lora 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../environment.yml 37 | compute: azureml:ND40rsv2 38 | display_name: parameter_efficient_fine_tuning-epsilon_8 39 | experiment_name: dp-transformers-nlg-reddit-author-level-dp 40 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/environment.yml: -------------------------------------------------------------------------------- 1 | name: dp-transformers 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1 8 | - _openmp_mutex=5.1 9 | - blas=1.0 10 | - bzip2=1.0.8 11 | - ca-certificates=2023.05.30 12 | - cuda-cudart=11.8.89 13 | - cuda-cupti=11.8.87 14 | - cuda-libraries=11.8.0 15 | - cuda-nvrtc=11.8.89 16 | - cuda-nvtx=11.8.86 17 | - cuda-runtime=11.8.0 18 | - filelock=3.9.0 19 | - gmp=6.2.1 20 | - gmpy2=2.1.2 21 | - 
intel-openmp=2023.1.0 22 | - jinja2=3.1.2 23 | - ld_impl_linux-64=2.38 24 | - libcublas=11.11.3.6 25 | - libcufft=10.9.0.58 26 | - libcufile=1.7.1.12 27 | - libcurand=10.3.3.129 28 | - libcusolver=11.4.1.48 29 | - libcusparse=11.7.5.86 30 | - libffi=3.4.4 31 | - libgcc-ng=11.2.0 32 | - libgomp=11.2.0 33 | - libnpp=11.8.0.86 34 | - libnvjpeg=11.9.0.86 35 | - libstdcxx-ng=11.2.0 36 | - libuuid=1.41.5 37 | - markupsafe=2.1.1 38 | - mkl=2023.1.0 39 | - mpc=1.1.0 40 | - mpfr=4.0.2 41 | - mpmath=1.3.0 42 | - ncurses=6.4 43 | - networkx=3.1 44 | - openssl=3.0.10 45 | - pip=23.2.1 46 | - python=3.10.12 47 | - pytorch=2.0.1 48 | - pytorch-cuda=11.8 49 | - pytorch-mutex=1.0 50 | - readline=8.2 51 | - setuptools=68.0.0 52 | - sqlite=3.41.2 53 | - sympy=1.11.1 54 | - tbb=2021.8.0 55 | - tk=8.6.12 56 | - torchtriton=2.0.0 57 | - typing_extensions=4.7.1 58 | - wheel=0.38.4 59 | - xz=5.4.2 60 | - zlib=1.2.13 61 | - pip: 62 | - accelerate==0.21.0 63 | - aiohttp==3.8.5 64 | - aiosignal==1.3.1 65 | - alembic==1.11.2 66 | - async-timeout==4.0.3 67 | - attrs==23.1.0 68 | - azure-common==1.1.28 69 | - azure-core==1.29.2 70 | - azure-identity==1.14.0 71 | - azure-mgmt-core==1.4.0 72 | - azure-storage-blob==12.13.0 73 | - azureml-mlflow==1.52.0 74 | - blinker==1.6.2 75 | - certifi==2023.7.22 76 | - cffi==1.15.1 77 | - charset-normalizer==3.2.0 78 | - click==8.1.6 79 | - cloudpickle==2.2.1 80 | - contourpy==1.1.0 81 | - cryptography==41.0.3 82 | - cycler==0.11.0 83 | - databricks-cli==0.17.7 84 | - datasets==2.14.4 85 | - dill==0.3.7 86 | - docker==6.1.3 87 | - entrypoints==0.4 88 | - exceptiongroup==1.1.3 89 | - flask==2.3.2 90 | - fonttools==4.42.0 91 | - frozenlist==1.4.0 92 | - fsspec==2023.6.0 93 | - gitdb==4.0.10 94 | - gitpython==3.1.32 95 | - greenlet==2.0.2 96 | - gunicorn==21.2.0 97 | - huggingface-hub==0.19.4 98 | - idna==3.4 99 | - importlib-metadata==6.8.0 100 | - iniconfig==2.0.0 101 | - isodate==0.6.1 102 | - itsdangerous==2.1.2 103 | - joblib==1.3.2 104 | - jsonpickle==3.0.2 105 | - kiwisolver==1.4.4 106 | - mako==1.2.4 107 | - markdown==3.4.4 108 | - matplotlib==3.7.2 109 | - mlflow==2.6.0 110 | - mlflow-skinny==2.6.0 111 | - msal==1.23.0 112 | - msal-extensions==1.0.0 113 | - msrest==0.7.1 114 | - multidict==6.0.4 115 | - multiprocess==0.70.15 116 | - numpy==1.25.2 117 | - oauthlib==3.2.2 118 | - opacus==1.4.0 119 | - opt-einsum==3.3.0 120 | - packaging==23.1 121 | - pandas==2.0.3 122 | - peft==0.4.0 123 | - pillow==10.0.0 124 | - pluggy==1.2.0 125 | - portalocker==2.7.0 126 | - protobuf==4.24.0 127 | - prv-accountant==0.1.1.post1 128 | - psutil==5.9.5 129 | - pyarrow==12.0.1 130 | - pycparser==2.21 131 | - pyjwt==2.8.0 132 | - pyparsing==3.0.9 133 | - pytest==7.4.0 134 | - python-dateutil==2.8.2 135 | - pytz==2023.3 136 | - pyyaml==6.0.1 137 | - querystring-parser==1.2.4 138 | - regex==2023.8.8 139 | - requests==2.31.0 140 | - requests-oauthlib==1.3.1 141 | - safetensors==0.3.2 142 | - scikit-learn==1.3.0 143 | - scipy==1.11.1 144 | - six==1.16.0 145 | - smmap==5.0.0 146 | - sqlalchemy==2.0.19 147 | - sqlparse==0.4.4 148 | - tabulate==0.9.0 149 | - threadpoolctl==3.2.0 150 | - tokenizers==0.15.0 151 | - tomli==2.0.1 152 | - tqdm==4.66.1 153 | - transformers==4.36.1 154 | - tzdata==2023.3 155 | - urllib3==1.26.16 156 | - websocket-client==1.6.1 157 | - werkzeug==2.3.7 158 | - xxhash==3.3.0 159 | - yarl==1.9.2 160 | - zipp==3.16.2 161 | -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/fine-tune-dp.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series with author-level DP (w/ parameter-efficient approach LoRA when lora_dim > 0)''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | 12 | from dataclasses import dataclass, field, asdict 13 | from peft import get_peft_model, LoraConfig 14 | 15 | from dp_transformers.grad_sample.transformers import conv_1d 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class ModelArguments: 23 | model_name: str = field(default="gpt2", metadata={ 24 | "help": "Model name in HuggingFace, e.g. 'gpt2'" 25 | }) 26 | 27 | sequence_len: int = field(default=128, metadata={ 28 | "help": "Model sequence length" 29 | }) 30 | 31 | 32 | @dataclass 33 | class LoraArguments: 34 | enable_lora: bool = field(default=False, metadata={ 35 | "help": "Whether to enable LoRA" 36 | }) 37 | lora_dim: int = field(default=8, metadata={ 38 | "help": "LoRA dimension" 39 | }) 40 | lora_alpha: int = field(default=8, metadata={ 41 | "help": "LoRA alpha" 42 | }) 43 | lora_dropout: float = field(default=0.0, metadata={ 44 | "help": "LoRA dropout" 45 | }) 46 | 47 | def as_peft_config(self) -> LoraConfig: 48 | if not self.enable_lora: 49 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 50 | params = asdict(self) 51 | params.pop("enable_lora") 52 | params["r"] = params.pop("lora_dim") 53 | return LoraConfig(**params) 54 | 55 | 56 | @dataclass 57 | class Arguments: 58 | train: dp_transformers.TrainingArguments 59 | privacy: dp_transformers.PrivacyArguments 60 | model: ModelArguments 61 | lora: LoraArguments 62 | 63 | 64 | def main(args: Arguments): 65 | transformers.set_seed(args.train.seed) 66 | 67 | # Setup logging 68 | logging.basicConfig( 69 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 70 | datefmt="%m/%d/%Y %H:%M:%S", 71 | handlers=[logging.StreamHandler(sys.stdout)], 72 | ) 73 | 74 | log_level = train_args.get_process_log_level() 75 | logger.setLevel(log_level) 76 | datasets.utils.logging.set_verbosity(log_level) 77 | transformers.utils.logging.set_verbosity(log_level) 78 | transformers.utils.logging.enable_default_handler() 79 | transformers.utils.logging.enable_explicit_format() 80 | 81 | # Log on each process the small summary: 82 | logger.warning( 83 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 84 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}, " 85 | f"world size: {train_args.world_size}" 86 | ) 87 | logger.info(f"Training/evaluation parameters {train_args}") 88 | logger.info(f"Privacy parameters {privacy_args}") 89 | 90 | # Load model 91 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 92 | model = model.to(train_args.device) 93 | 94 | # Load data 95 | dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=args.train.seed) 96 | train_dataset = dataset['train'] 97 | test_dataset = dataset['test'] 98 | 99 | # Load tokenizer 100 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 101 | tokenizer.pad_token = tokenizer.eos_token 102 | 103 | # Tokenize data 104 | with train_args.main_process_first(desc="tokenizing dataset"): 105 | train_dataset = train_dataset.map( 106 | lambda batch: 
tokenizer(batch['content'], padding="max_length", truncation=True, max_length=args.model.sequence_len), 107 | batched=True, num_proc=8, desc="tokenizing dataset", 108 | remove_columns=[c for c in train_dataset.column_names if c != 'author'] 109 | ) 110 | test_dataset = test_dataset.map( 111 | lambda batch: tokenizer(batch['content'], padding="max_length", truncation=True, max_length=args.model.sequence_len), 112 | batched=True, num_proc=8, desc="tokenizing dataset", remove_columns=test_dataset.column_names 113 | ) 114 | 115 | author_mapping = dp_transformers.dp_utils.create_author_mapping(train_dataset, author="author") 116 | train_dataset = train_dataset.remove_columns('author') 117 | 118 | if train_args.local_rank == 0 or train_args.local_rank == -1: 119 | logger.info(f"Number of authors in the training set: {len(author_mapping)}") 120 | 121 | if args.lora.enable_lora: 122 | logger.info("Using LoRA") 123 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 124 | else: 125 | logger.info("Not using LoRA") 126 | 127 | if train_args.local_rank == 0 or train_args.local_rank == -1: 128 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 129 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 130 | 131 | model = model.cuda() 132 | model.train() 133 | 134 | 135 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 136 | 137 | trainer = dp_transformers.dp_utils.OpacusDPTrainer( 138 | args=train_args, 139 | model=model, 140 | train_dataset=train_dataset, 141 | eval_dataset=test_dataset, 142 | data_collator=data_collator, 143 | author_mapping=author_mapping, 144 | privacy_args=privacy_args, 145 | ) 146 | 147 | try: 148 | trainer.train() 149 | finally: 150 | eps_prv = trainer.get_prv_epsilon() 151 | eps_rdp = trainer.get_rdp_epsilon() 152 | trainer.log({ 153 | "final_epsilon_prv": eps_prv, 154 | "final_epsilon_rdp": eps_rdp 155 | }) 156 | 157 | if __name__ == "__main__": 158 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, dp_transformers.PrivacyArguments, ModelArguments, LoraArguments )) 159 | train_args, privacy_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 160 | main(Arguments(train=train_args, privacy=privacy_args, model=model_args, lora=lora_args)) 161 | -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/README.md: -------------------------------------------------------------------------------- 1 | # Differentially private fine-tuning of a GPT-2 style model 2 | 3 | This example fine-tunes generative language models (such as GPT-2 series) with Differential Privacy on a text corpus. 4 | In this case 500,000 samples of Reddit comments. 5 | We compare different fine-tuning techniques (full fine-tuning, LoRA) and also provide a data distributed implementation for faster training. 6 | These merely serve as examples as hyperparameters are not optimized and corresponding commands are presented below. 
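For reference, when `--enable_lora` is set the example script wraps the model with PEFT roughly as follows (a sketch mirroring `LoraArguments.as_peft_config()` in `fine-tune-dp.py`; `model` is assumed to be the GPT-2 model loaded as in the script, and the values correspond to the `--lora_dim 4 --lora_alpha 32 --lora_dropout 0.0` flags used below):

``` python
from peft import LoraConfig, get_peft_model

# --lora_dim maps to LoraConfig's `r`; only the injected LoRA parameters stay trainable.
peft_config = LoraConfig(r=4, lora_alpha=32, lora_dropout=0.0)
model = get_peft_model(model=model, peft_config=peft_config)
```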
7 | 
8 | # Results
9 | 
10 | | Model (HF) | Fine-tuning Method | DP | GPUs | Epochs | Train Loss | Eval Loss | $\varepsilon$ | Run Time [s] | AML Config |
11 | | ---------- | ------------------ | --- | ------ | ------ | ---------- | --------- | ------------- | ------------ | --------------------- |
12 | | gpt2 | Full | Yes | 8xV100 | 3 | 3.75 | 3.61 | 8.0 | 1944 | fuft-eps_8.yml |
13 | | gpt2 | Full | No | 8xV100 | 3 | 3.56 | 3.46 | - | 1227 | fuft-eps_inf.yml |
14 | | gpt2 | LoRA | Yes | 8xV100 | 3 | 3.74 | 3.60 | 8.0 | 1128 | peft-eps_8.yml |
15 | | gpt2 | LoRA | Yes | 1xV100 | 3 | 3.74 | 3.60 | 8.0 | 12248 | peft-eps_8-gpus_1.yml |
16 | | gpt2 | LoRA | No | 8xV100 | 3 | 3.70 | 3.58 | - | 1006 | peft-eps_inf.yml |
17 | 
18 | 
19 | ## Azure Machine Learning
20 | 
21 | We provide Azure Machine Learning (AML) configuration files for the above experiments (see the AML Config column in the table).
22 | 
23 | ```
24 | az ml job create --file aml/<config>.yml
25 | ```
26 | 
27 | 
28 | ## Local Training
29 | 
30 | Alternatively, you can run the training script directly on your local machine.
31 | 
32 | Install the environment (assuming CUDA 11.8, matching `environment.yml`) with
33 | 
34 | ```
35 | conda env create -f environment.yml
36 | conda activate dp-transformers
37 | ```
38 | 
39 | Then run one of the following training scripts.
40 | 
41 | ### Fine-tune the full model with DP
42 | 
43 | ```
44 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-dp.py \
45 | --output_dir scratch \
46 | --model_name gpt2 \
47 | --sequence_len 128 \
48 | --per_device_train_batch_size 32 \
49 | --gradient_accumulation_steps 2 \
50 | --evaluation_strategy steps \
51 | --eval_steps 45 \
52 | --log_level info \
53 | --per_device_eval_batch_size 64 \
54 | --eval_accumulation_steps 1 \
55 | --seed 42 \
56 | --target_epsilon 8 \
57 | --per_sample_max_grad_norm 1.0 \
58 | --prediction_loss_only \
59 | --weight_decay 0.01 \
60 | --remove_unused_columns False \
61 | --num_train_epochs 3 \
62 | --logging_steps 5 \
63 | --max_grad_norm 0 \
64 | --lr_scheduler_type constant \
65 | --learning_rate 1e-4 \
66 | --disable_tqdm True \
67 | --dataloader_num_workers 2 \
68 | --label_names labels
69 | ```
70 | 
71 | ### Fine-tune the full model without DP
72 | 
73 | ```
74 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-nodp.py \
75 | --output_dir scratch \
76 | --model_name gpt2 \
77 | --sequence_len 128 \
78 | --per_device_train_batch_size 64 \
79 | --gradient_accumulation_steps 1 \
80 | --evaluation_strategy steps \
81 | --eval_steps 45 \
82 | --log_level info \
83 | --per_device_eval_batch_size 64 \
84 | --eval_accumulation_steps 1 \
85 | --seed 42 \
86 | --prediction_loss_only \
87 | --weight_decay 0.01 \
88 | --remove_unused_columns False \
89 | --num_train_epochs 3 \
90 | --logging_steps 5 \
91 | --max_grad_norm 0 \
92 | --lr_scheduler_type constant \
93 | --learning_rate 2e-4 \
94 | --disable_tqdm True \
95 | --dataloader_num_workers 2 \
96 | --label_names labels
97 | ```
98 | 
99 | ### Fine-tune only the LoRA layers introduced into the model with DP
100 | 
101 | ```
102 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-dp.py \
103 | --output_dir scratch \
104 | --model_name gpt2 \
105 | --sequence_len 128 \
106 | --per_device_train_batch_size 64 \
107 | --gradient_accumulation_steps 1 \
108 | --evaluation_strategy steps \
109 | --eval_steps 45 \
110 | --log_level info \
111 | --per_device_eval_batch_size 64 \
112 | --eval_accumulation_steps 1 \
113 | --seed 42 \
114 | --target_epsilon 8 \
115 | --per_sample_max_grad_norm 1.0 \
116 | --prediction_loss_only \
117 | --weight_decay 
0.01 \ 118 | --remove_unused_columns False \ 119 | --num_train_epochs 3 \ 120 | --logging_steps 5 \ 121 | --lora_dim 4 \ 122 | --lora_alpha 32 \ 123 | --lora_dropout 0.0 \ 124 | --max_grad_norm 0 \ 125 | --lr_scheduler_type constant \ 126 | --learning_rate 3e-4 \ 127 | --disable_tqdm True \ 128 | --dataloader_num_workers 2 \ 129 | --label_names labels \ 130 | --enable_lora 131 | ``` 132 | 133 | ### Fine-tune only the LoRA layers introduced into the model without DP 134 | 135 | ``` 136 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-nodp.py \ 137 | --output_dir scratch \ 138 | --model_name gpt2 \ 139 | --sequence_len 128 \ 140 | --per_device_train_batch_size 64 \ 141 | --gradient_accumulation_steps 1 \ 142 | --evaluation_strategy steps \ 143 | --eval_steps 45 \ 144 | --log_level info \ 145 | --per_device_eval_batch_size 64 \ 146 | --eval_accumulation_steps 1 \ 147 | --seed 42 \ 148 | --prediction_loss_only \ 149 | --weight_decay 0.01 \ 150 | --remove_unused_columns False \ 151 | --num_train_epochs 3 \ 152 | --logging_steps 5 \ 153 | --lora_dim 4 \ 154 | --lora_alpha 32 \ 155 | --lora_dropout 0.0 \ 156 | --max_grad_norm 0 \ 157 | --lr_scheduler_type constant \ 158 | --learning_rate 5e-4 \ 159 | --disable_tqdm True \ 160 | --dataloader_num_workers 2 \ 161 | --label_names labels \ 162 | --enable_lora 163 | ``` -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/fuft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/sample-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 16 \ 9 | --gradient_accumulation_steps 8 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --label_names labels 30 | environment: 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 32 | conda_file: ../environment.yml 33 | compute: azureml:ND40rsv2 34 | display_name: full_fine_tuning-epsilon_8 35 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 36 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/fuft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/sample-level-dp/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 64 \ 9 | --gradient_accumulation_steps 2 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --prediction_loss_only \ 17 | --weight_decay 0.01 \ 18 | --remove_unused_columns False \ 19 | --num_train_epochs 3 \ 20 | --logging_steps 5 \ 21 | --max_grad_norm 0 \ 22 | --lr_scheduler_type constant \ 23 | --learning_rate 1e-4 \ 24 | --disable_tqdm True \ 25 | --dataloader_num_workers 2 \ 26 | --label_names labels 27 | environment: 28 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 29 | conda_file: ../environment.yml 30 | compute: azureml:ND40rsv2 31 | display_name: full_fine_tuning-epsilon_inf 32 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 33 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/peft-eps_8-gpus_1.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 1 examples/nlg-reddit/sample-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 64 \ 9 | --gradient_accumulation_steps 16 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --lora_dim 4 \ 30 | --lora_alpha 32 \ 31 | --lora_dropout 0.0 \ 32 | --label_names labels \ 33 | --enable_lora 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../environment.yml 37 | compute: azureml:NC6v3 38 | display_name: parameter_efficient_fine_tuning-epsilon_8-gpus_1 39 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 40 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/sample-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 128 \ 9 | --gradient_accumulation_steps 1 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --lora_dim 4 \ 30 | --lora_alpha 32 \ 31 | --lora_dropout 0.0 \ 32 | --enable_lora \ 33 | --label_names labels 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../environment.yml 37 | compute: azureml:ND40rsv2 38 | display_name: parameter_efficient_fine_tuning-epsilon_8 39 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 40 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/peft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/sample-level-dp/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 128 \ 9 | --gradient_accumulation_steps 1 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --prediction_loss_only \ 17 | --weight_decay 0.01 \ 18 | --remove_unused_columns False \ 19 | --num_train_epochs 3 \ 20 | --logging_steps 5 \ 21 | --max_grad_norm 0 \ 22 | --lr_scheduler_type constant \ 23 | --learning_rate 1e-4 \ 24 | --disable_tqdm True \ 25 | --dataloader_num_workers 2 \ 26 | --lora_dim 4 \ 27 | --lora_alpha 32 \ 28 | --lora_dropout 0.0 \ 29 | --enable_lora \ 30 | --label_names labels 31 | environment: 32 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 33 | conda_file: ../environment.yml 34 | compute: azureml:ND40rsv2 35 | display_name: parameter_efficient_fine_tuning-epsilon_inf 36 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 37 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/environment.yml: -------------------------------------------------------------------------------- 1 | name: dp-transformers 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1 8 | - _openmp_mutex=5.1 9 | - blas=1.0 10 | - bzip2=1.0.8 11 | - ca-certificates=2023.05.30 12 | - cuda-cudart=11.8.89 13 | - cuda-cupti=11.8.87 14 | - cuda-libraries=11.8.0 15 | - cuda-nvrtc=11.8.89 16 | - cuda-nvtx=11.8.86 17 | - cuda-runtime=11.8.0 18 | - filelock=3.9.0 19 | - gmp=6.2.1 20 | - 
gmpy2=2.1.2 21 | - intel-openmp=2023.1.0 22 | - jinja2=3.1.2 23 | - ld_impl_linux-64=2.38 24 | - libcublas=11.11.3.6 25 | - libcufft=10.9.0.58 26 | - libcufile=1.7.1.12 27 | - libcurand=10.3.3.129 28 | - libcusolver=11.4.1.48 29 | - libcusparse=11.7.5.86 30 | - libffi=3.4.4 31 | - libgcc-ng=11.2.0 32 | - libgomp=11.2.0 33 | - libnpp=11.8.0.86 34 | - libnvjpeg=11.9.0.86 35 | - libstdcxx-ng=11.2.0 36 | - libuuid=1.41.5 37 | - markupsafe=2.1.1 38 | - mkl=2023.1.0 39 | - mpc=1.1.0 40 | - mpfr=4.0.2 41 | - mpmath=1.3.0 42 | - ncurses=6.4 43 | - networkx=3.1 44 | - openssl=3.0.10 45 | - pip=23.2.1 46 | - python=3.10.12 47 | - pytorch=2.0.1 48 | - pytorch-cuda=11.8 49 | - pytorch-mutex=1.0 50 | - readline=8.2 51 | - setuptools=68.0.0 52 | - sqlite=3.41.2 53 | - sympy=1.11.1 54 | - tbb=2021.8.0 55 | - tk=8.6.12 56 | - torchtriton=2.0.0 57 | - typing_extensions=4.7.1 58 | - wheel=0.38.4 59 | - xz=5.4.2 60 | - zlib=1.2.13 61 | - pip: 62 | - accelerate==0.21.0 63 | - aiohttp==3.8.5 64 | - aiosignal==1.3.1 65 | - alembic==1.11.2 66 | - async-timeout==4.0.3 67 | - attrs==23.1.0 68 | - azure-common==1.1.28 69 | - azure-core==1.29.2 70 | - azure-identity==1.14.0 71 | - azure-mgmt-core==1.4.0 72 | - azure-storage-blob==12.13.0 73 | - azureml-mlflow==1.52.0 74 | - blinker==1.6.2 75 | - certifi==2023.7.22 76 | - cffi==1.15.1 77 | - charset-normalizer==3.2.0 78 | - click==8.1.6 79 | - cloudpickle==2.2.1 80 | - contourpy==1.1.0 81 | - cryptography==41.0.3 82 | - cycler==0.11.0 83 | - databricks-cli==0.17.7 84 | - datasets==2.14.4 85 | - dill==0.3.7 86 | - docker==6.1.3 87 | - entrypoints==0.4 88 | - exceptiongroup==1.1.3 89 | - flask==2.3.2 90 | - fonttools==4.42.0 91 | - frozenlist==1.4.0 92 | - fsspec==2023.6.0 93 | - gitdb==4.0.10 94 | - gitpython==3.1.32 95 | - greenlet==2.0.2 96 | - gunicorn==21.2.0 97 | - huggingface-hub==0.19.4 98 | - idna==3.4 99 | - importlib-metadata==6.8.0 100 | - iniconfig==2.0.0 101 | - isodate==0.6.1 102 | - itsdangerous==2.1.2 103 | - joblib==1.3.2 104 | - jsonpickle==3.0.2 105 | - kiwisolver==1.4.4 106 | - mako==1.2.4 107 | - markdown==3.4.4 108 | - matplotlib==3.7.2 109 | - mlflow==2.6.0 110 | - mlflow-skinny==2.6.0 111 | - msal==1.23.0 112 | - msal-extensions==1.0.0 113 | - msrest==0.7.1 114 | - multidict==6.0.4 115 | - multiprocess==0.70.15 116 | - numpy==1.25.2 117 | - oauthlib==3.2.2 118 | - opacus==1.4.0 119 | - opt-einsum==3.3.0 120 | - packaging==23.1 121 | - pandas==2.0.3 122 | - peft==0.4.0 123 | - pillow==10.0.0 124 | - pluggy==1.2.0 125 | - portalocker==2.7.0 126 | - protobuf==4.24.0 127 | - prv-accountant==0.1.1.post1 128 | - psutil==5.9.5 129 | - pyarrow==12.0.1 130 | - pycparser==2.21 131 | - pyjwt==2.8.0 132 | - pyparsing==3.0.9 133 | - pytest==7.4.0 134 | - python-dateutil==2.8.2 135 | - pytz==2023.3 136 | - pyyaml==6.0.1 137 | - querystring-parser==1.2.4 138 | - regex==2023.8.8 139 | - requests==2.31.0 140 | - requests-oauthlib==1.3.1 141 | - safetensors==0.3.2 142 | - scikit-learn==1.3.0 143 | - scipy==1.11.1 144 | - six==1.16.0 145 | - smmap==5.0.0 146 | - sqlalchemy==2.0.19 147 | - sqlparse==0.4.4 148 | - tabulate==0.9.0 149 | - threadpoolctl==3.2.0 150 | - tokenizers==0.15.0 151 | - tomli==2.0.1 152 | - tqdm==4.66.1 153 | - transformers==4.36.1 154 | - tzdata==2023.3 155 | - urllib3==1.26.16 156 | - websocket-client==1.6.1 157 | - werkzeug==2.3.7 158 | - xxhash==3.3.0 159 | - yarl==1.9.2 160 | - zipp==3.16.2 161 | -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/fine-tune-dp.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series with DP (w/ optional parameter-efficient approach LoRA)''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | 12 | from dataclasses import dataclass, field, asdict 13 | from peft import get_peft_model, LoraConfig 14 | 15 | from dp_transformers.grad_sample.transformers import conv_1d 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class ModelArguments: 23 | model_name: str = field(default="gpt2", metadata={ 24 | "help": "Model name in HuggingFace, e.g. 'gpt2'" 25 | }) 26 | sequence_len: int = field(default=128, metadata={ 27 | "help": "Maximum sequence length" 28 | }) 29 | 30 | 31 | @dataclass 32 | class LoraArguments: 33 | enable_lora: bool = field(default=False, metadata={ 34 | "help": "Whether to enable LoRA" 35 | }) 36 | lora_dim: int = field(default=8, metadata={ 37 | "help": "LoRA dimension" 38 | }) 39 | lora_alpha: int = field(default=8, metadata={ 40 | "help": "LoRA alpha" 41 | }) 42 | lora_dropout: float = field(default=0.0, metadata={ 43 | "help": "LoRA dropout" 44 | }) 45 | 46 | def as_peft_config(self) -> LoraConfig: 47 | if not self.enable_lora: 48 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 49 | params = asdict(self) 50 | params.pop("enable_lora") 51 | params["r"] = params.pop("lora_dim") 52 | return LoraConfig(**params) 53 | 54 | 55 | @dataclass 56 | class Arguments: 57 | train: dp_transformers.TrainingArguments 58 | privacy: dp_transformers.PrivacyArguments 59 | model: ModelArguments 60 | lora: LoraArguments 61 | 62 | 63 | def main(args: Arguments): 64 | transformers.set_seed(args.train.seed) 65 | 66 | # Setup logging 67 | logging.basicConfig( 68 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 69 | datefmt="%m/%d/%Y %H:%M:%S", 70 | handlers=[logging.StreamHandler(sys.stdout)], 71 | ) 72 | 73 | log_level = train_args.get_process_log_level() 74 | logger.setLevel(log_level) 75 | datasets.utils.logging.set_verbosity(log_level) 76 | transformers.utils.logging.set_verbosity(log_level) 77 | transformers.utils.logging.enable_default_handler() 78 | transformers.utils.logging.enable_explicit_format() 79 | 80 | # Log on each process the small summary: 81 | logger.warning( 82 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 83 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 84 | ) 85 | logger.info(f"Training/evaluation parameters {train_args}") 86 | logger.info(f"Privacy parameters {privacy_args}") 87 | 88 | # Load model 89 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 90 | model = model.to(train_args.device) 91 | 92 | # Load data 93 | dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=args.train.seed) 94 | 95 | # Load tokenizer 96 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 97 | tokenizer.pad_token = tokenizer.eos_token 98 | 99 | # Tokenize data 100 | with train_args.main_process_first(desc="tokenizing dataset"): 101 | dataset = dataset.map( 102 | lambda batch: tokenizer(batch['content'], padding="max_length", truncation=True, max_length=args.model.sequence_len), 103 | batched=True, num_proc=8, desc="tokenizing dataset", 
remove_columns=dataset.column_names['train'] 104 | ) 105 | 106 | if args.lora.enable_lora: 107 | logger.info("Using LoRA") 108 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 109 | else: 110 | logger.info("Not using LoRA") 111 | 112 | if train_args.local_rank == 0: 113 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 114 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 115 | 116 | model = model.cuda() 117 | model.train() 118 | 119 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 120 | 121 | trainer = dp_transformers.dp_utils.OpacusDPTrainer( 122 | args=train_args, 123 | model=model, 124 | train_dataset=dataset['train'], 125 | eval_dataset=dataset['test'], 126 | data_collator=data_collator, 127 | privacy_args=privacy_args, 128 | ) 129 | 130 | try: 131 | trainer.train() 132 | finally: 133 | eps_prv = trainer.get_prv_epsilon() 134 | eps_rdp = trainer.get_rdp_epsilon() 135 | trainer.log({ 136 | "final_epsilon_prv": eps_prv, 137 | "final_epsilon_rdp": eps_rdp 138 | }) 139 | 140 | if __name__ == "__main__": 141 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, dp_transformers.PrivacyArguments, ModelArguments, LoraArguments)) 142 | train_args, privacy_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 143 | main(Arguments(train=train_args, privacy=privacy_args, model=model_args, lora=lora_args)) 144 | -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/fine-tune-nodp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series without DP (w/ parameter-efficient approach LoRA when lora_dim > 0)''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | 12 | from dataclasses import dataclass, field 13 | from dataclasses import dataclass, field, asdict 14 | from peft import get_peft_model, LoraConfig 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | @dataclass 21 | class ModelArguments: 22 | model_name: str = field(default="gpt2", metadata={ 23 | "help": "Model name in HuggingFace, e.g. 
'gpt2'" 24 | }) 25 | sequence_len: int = field(default=128, metadata={ 26 | "help": "Maximum sequence length" 27 | }) 28 | 29 | 30 | @dataclass 31 | class LoraArguments: 32 | enable_lora: bool = field(default=False, metadata={ 33 | "help": "Whether to enable LoRA" 34 | }) 35 | lora_dim: int = field(default=8, metadata={ 36 | "help": "LoRA dimension" 37 | }) 38 | lora_alpha: int = field(default=8, metadata={ 39 | "help": "LoRA alpha" 40 | }) 41 | lora_dropout: float = field(default=0.0, metadata={ 42 | "help": "LoRA dropout" 43 | }) 44 | 45 | def as_peft_config(self) -> LoraConfig: 46 | if not self.enable_lora: 47 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 48 | params = asdict(self) 49 | params.pop("enable_lora") 50 | params["r"] = params.pop("lora_dim") 51 | return LoraConfig(**params) 52 | 53 | 54 | @dataclass 55 | class Arguments: 56 | train: dp_transformers.TrainingArguments 57 | model: ModelArguments 58 | lora: LoraArguments 59 | 60 | 61 | def main(args: Arguments): 62 | transformers.set_seed(args.train.seed) 63 | 64 | # Setup logging 65 | logging.basicConfig( 66 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 67 | datefmt="%m/%d/%Y %H:%M:%S", 68 | handlers=[logging.StreamHandler(sys.stdout)], 69 | ) 70 | 71 | log_level = train_args.get_process_log_level() 72 | logger.setLevel(log_level) 73 | datasets.utils.logging.set_verbosity(log_level) 74 | transformers.utils.logging.set_verbosity(log_level) 75 | transformers.utils.logging.enable_default_handler() 76 | transformers.utils.logging.enable_explicit_format() 77 | 78 | # Log on each process the small summary: 79 | logger.warning( 80 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 81 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 82 | ) 83 | logger.info(f"Training/evaluation parameters {train_args}") 84 | 85 | # Load model 86 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 87 | model = model.to(train_args.device) 88 | 89 | # Load data 90 | dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=args.train.seed) 91 | 92 | # Load tokenizer 93 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 94 | tokenizer.pad_token = tokenizer.eos_token 95 | 96 | # Tokenize data 97 | with train_args.main_process_first(desc="tokenizing dataset"): 98 | dataset = dataset.map( 99 | lambda batch: tokenizer(batch['content'], padding="max_length", truncation=True, max_length=args.model.sequence_len), 100 | batched=True, num_proc=8, desc="tokenizing dataset", remove_columns=dataset.column_names['train'] 101 | ) 102 | 103 | if args.lora.enable_lora: 104 | logger.info("Using LoRA") 105 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 106 | else: 107 | logger.info("Not using LoRA") 108 | 109 | if train_args.local_rank == 0: 110 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 111 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 112 | 113 | model = model.cuda() 114 | model.train() 115 | 116 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 117 | 118 | trainer = transformers.Trainer( 119 | args=train_args, 120 | model=model, 121 | train_dataset=dataset['train'], 122 | eval_dataset=dataset['test'], 123 | data_collator=data_collator 124 | ) 
125 | 126 | trainer.train() 127 | 128 | if __name__ == "__main__": 129 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, ModelArguments, LoraArguments)) 130 | train_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 131 | main(Arguments(train=train_args, model=model_args, lora=lora_args)) 132 | -------------------------------------------------------------------------------- /examples/test_examples.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | import time 4 | 5 | from subprocess import check_output 6 | from pathlib import Path 7 | from typing import Dict, Union 8 | from azureml.core import Workspace, Run 9 | from dataclasses import dataclass 10 | from datetime import timedelta, datetime 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def az_workspace() -> Workspace: 15 | subscription_id = json.loads(check_output(["az", "account", "show", "--query", "id"])) 16 | 17 | output = json.loads(check_output(["az", "configure", "--list-defaults"])) 18 | resource_group = next(item for item in output if item["name"] == "group")["value"] 19 | workspace_name = next(item for item in output if item["name"] == "workspace")["value"] 20 | 21 | workspace = Workspace( 22 | subscription_id=subscription_id, 23 | resource_group=resource_group, 24 | workspace_name=workspace_name, 25 | ) 26 | 27 | return workspace 28 | 29 | 30 | def submit_example_and_wait_for_metrics(ws: Workspace, aml_config_path: Path) -> Dict[str, Union[float, int]]: 31 | raw_output = check_output(["az", "ml", "job", "create", "--file", aml_config_path]) 32 | output = json.loads(raw_output) 33 | run = Run.get(ws, run_id=output["name"]) 34 | print(f"Submitted run {run.get_portal_url()}") 35 | try: 36 | run.wait_for_completion() 37 | except KeyboardInterrupt as e: 38 | run.cancel() 39 | raise e 40 | except Exception as e: 41 | run.cancel() 42 | raise e 43 | 44 | waiting_for_details = True 45 | while waiting_for_details: 46 | details = run.get_details() 47 | if "endTimeUtc" in details: 48 | waiting_for_details = False 49 | else: 50 | time.sleep(30) 51 | 52 | if run.get_status() != "Completed": 53 | raise RuntimeError(f"Run did not complete successfully. 
Status: {run.get_status()}, AML URL: {run.get_portal_url()}") 54 | 55 | 56 | metrics = run.get_metrics() 57 | 58 | metrics["runtime"] = ( 59 | datetime.strptime(details["endTimeUtc"], '%Y-%m-%dT%H:%M:%S.%fZ') - 60 | datetime.strptime(details["startTimeUtc"], '%Y-%m-%dT%H:%M:%S.%fZ') 61 | ) 62 | 63 | return metrics 64 | 65 | 66 | @dataclass 67 | class ExampleTest: 68 | aml_config_path: Path 69 | expected_trn_loss: float 70 | expected_val_loss: float 71 | expected_time: timedelta 72 | 73 | def __str__(self): 74 | return f"Example({self.aml_config_path})" 75 | 76 | 77 | @pytest.mark.parametrize("example_test", 78 | [ 79 | ExampleTest( 80 | aml_config_path=Path("examples")/"nlg-reddit"/"author-level-dp"/"aml"/"fuft-eps_8.yml", 81 | expected_trn_loss=3.76, 82 | expected_val_loss=3.62, 83 | expected_time=timedelta(minutes=52, seconds=15), 84 | ), 85 | ExampleTest( 86 | aml_config_path=Path("examples")/"nlg-reddit"/"author-level-dp"/"aml"/"peft-eps_8.yml", 87 | expected_trn_loss=3.79, 88 | expected_val_loss=3.62, 89 | expected_time=timedelta(minutes=32, seconds=45), 90 | ), 91 | ExampleTest( 92 | aml_config_path=Path("examples")/"nlg-reddit"/"sample-level-dp"/"aml"/"fuft-eps_8.yml", 93 | expected_trn_loss=3.74, 94 | expected_val_loss=3.59, 95 | expected_time=timedelta(hours=1, minutes=15), 96 | ), 97 | ExampleTest( 98 | aml_config_path=Path("examples")/"nlg-reddit"/"sample-level-dp"/"aml"/"fuft-eps_inf.yml", 99 | expected_trn_loss=3.58, 100 | expected_val_loss=3.47, 101 | expected_time=timedelta(minutes=50, seconds=15), 102 | ), 103 | ExampleTest( 104 | aml_config_path=Path("examples")/"nlg-reddit"/"sample-level-dp"/"aml"/"peft-eps_8.yml", 105 | expected_trn_loss=3.76, 106 | expected_val_loss=3.60, 107 | expected_time=timedelta(minutes=42, seconds=30), 108 | ), 109 | ExampleTest( 110 | aml_config_path=Path("examples")/"nlg-reddit"/"sample-level-dp"/"aml"/"peft-eps_inf.yml", 111 | expected_trn_loss=3.72, 112 | expected_val_loss=3.60, 113 | expected_time=timedelta(minutes=42, seconds=0), 114 | ), 115 | ], 116 | ids=str 117 | ) 118 | def test_nlg_reddit(az_workspace, example_test: ExampleTest): 119 | metrics = submit_example_and_wait_for_metrics(az_workspace, aml_config_path=example_test.aml_config_path) 120 | 121 | print(f"Test {example_test.aml_config_path}: {metrics}") 122 | assert metrics["train_loss"] == pytest.approx(example_test.expected_trn_loss, abs=0.02) 123 | assert metrics["eval_loss"][-1] == pytest.approx(example_test.expected_val_loss, abs=0.02) 124 | allowed_time_delta = timedelta(minutes=5) 125 | if abs(metrics["runtime"] - example_test.expected_time) > allowed_time_delta: 126 | print(f"::warning file={__file__}:: {example_test.aml_config_path} took {metrics['runtime']} to run, expected " 127 | f"{example_test.expected_time} +- {allowed_time_delta}") 128 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/README.md: -------------------------------------------------------------------------------- 1 | # Differentially private fine-tuning of LLMs using QLoRA 2 | 3 | We demonstrate examples of fine-tuning Mistral 7B using QLoRA with and without DP. 4 | 5 | # Results 6 | 7 | | Dataset (HF) | DP | GPUs | Epochs | Max Eval Accuracy | $\varepsilon$ | Run Time [s] | AML Config | 8 | | ---------- | --- | ------ | ------ | --------- | ------------- | ------------ | --------------------- | 9 | | sst2 | Yes | 8xA100 | 3 | 96.44 | 8.0 | . | sst2/peft-eps_8.yml | 10 | | sst2 | No | 8xA100 | 3 | 97.25 | - | . 
| sst2/peft-eps_inf.yml | 11 | | qnli | Yes | 8xA100 | 3 | 94.80 | 8.0 | . | qnli/peft-eps_8.yml | 12 | | qnli | No | 8xA100 | 3 | 96.40 | - | . | qnli/peft-eps_inf.yml | 13 | 14 | | Dataset (HF) | DP | GPUs | Epochs | Min Eval Loss | Test ROUGE1 | Test ROUGE2 | Test ROUGEL | $\varepsilon$ | Run Time [s] | AML Config | 15 | | ------------ | --- | ------ | ------ | ------------- | ----------- | ----------- | ----------- | ------------- | ------------ | ---------- | 16 | | cnn | Yes | 8xA100 | 3 | 0.9624 | 44.16 | 22.16 | 30.99 | 8.0 | . | cnn/peft-eps_8.yml | 17 | | cnn | No | 8xA100 | 3 | 0.9188 | 45.05 | 22.99 | 31.69 | - | . | cnn/peft-eps_inf.yml | 18 | 19 | ## Azure Machine Learning 20 | 21 | We provide Azure Machine Learning (AML) configuration files for the above experiments. 22 | 23 | ``` 24 | az ml job create --file aml/ 25 | ``` 26 | 27 | 28 | ## Local Training 29 | 30 | Alternatively, you can run the training script directly on your local machine. 31 | 32 | Install the environment with 33 | 34 | ``` 35 | conda env create -f environment.yml 36 | conda activate dp-transformers 37 | ``` 38 | 39 | Follow the training scripts under aml folder. 40 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/cnn/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name cnn \ 8 | --sequence_len 1024 \ 9 | --per_device_train_batch_size 16 \ 10 | --gradient_accumulation_steps 8 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 10 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 32 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --target_epsilon 8 \ 19 | --target_delta 1e-5 \ 20 | --per_sample_max_grad_norm 1.0 \ 21 | --weight_decay 0.01 \ 22 | --remove_unused_columns False \ 23 | --num_train_epochs 3 \ 24 | --logging_steps 4 \ 25 | --max_grad_norm 0 \ 26 | --lr_scheduler_type constant \ 27 | --learning_rate 3e-4 \ 28 | --disable_tqdm True \ 29 | --dataloader_num_workers 2 \ 30 | --lora_dim 4 \ 31 | --lora_alpha 32 \ 32 | --lora_dropout 0.0 \ 33 | --enable_lora \ 34 | --target_modules "['q_proj', 'v_proj']" \ 35 | --label_names labels \ 36 | --bf16 \ 37 | --gradient_checkpointing 38 | environment: 39 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 40 | conda_file: ../../environment.yml 41 | compute: azureml:ND96asrv4 42 | display_name: mistral_7b_qlora_dp_cnn 43 | experiment_name: dp-transformers-mistral-7b-qlora-dp-cnn 44 | description: DP fine-tune Mistral 7B model with QLoRA on CNN dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/cnn/peft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name cnn \ 8 | --sequence_len 1024 \ 9 | --per_device_train_batch_size 4 \ 10 | --gradient_accumulation_steps 1 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 300 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 32 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --weight_decay 0.01 \ 19 | --remove_unused_columns False \ 20 | --num_train_epochs 3 \ 21 | --logging_steps 5 \ 22 | --max_grad_norm 0 \ 23 | --lr_scheduler_type constant \ 24 | --learning_rate 2.5e-5 \ 25 | --disable_tqdm True \ 26 | --dataloader_num_workers 2 \ 27 | --lora_dim 4 \ 28 | --lora_alpha 32 \ 29 | --lora_dropout 0.0 \ 30 | --enable_lora \ 31 | --target_modules "['q_proj', 'v_proj']" \ 32 | --label_names labels \ 33 | --ddp_find_unused_parameters False \ 34 | --bf16 \ 35 | --gradient_checkpointing 36 | environment: 37 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 38 | conda_file: ../../environment.yml 39 | compute: azureml:ND96asrv4 40 | display_name: mistral_7b_qlora_nodp_cnn 41 | experiment_name: dp-transformers-mistral-7b-qlora-nodp-cnn 42 | description: Fine-tune Mistral 7B model with QLoRA on CNN dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/qnli/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name qnli \ 8 | --sequence_len 256 \ 9 | --per_device_train_batch_size 8 \ 10 | --gradient_accumulation_steps 16 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 4 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 16 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --target_epsilon 8 \ 19 | --target_delta 1e-5 \ 20 | --per_sample_max_grad_norm 1.0 \ 21 | --weight_decay 0.01 \ 22 | --remove_unused_columns False \ 23 | --num_train_epochs 3 \ 24 | --logging_steps 4 \ 25 | --max_grad_norm 0 \ 26 | --lr_scheduler_type constant \ 27 | --learning_rate 3e-4 \ 28 | --disable_tqdm True \ 29 | --dataloader_num_workers 2 \ 30 | --lora_dim 4 \ 31 | --lora_alpha 32 \ 32 | --lora_dropout 0.0 \ 33 | --enable_lora \ 34 | --target_modules "['q_proj', 'v_proj']" \ 35 | --label_names labels \ 36 | --bf16 37 | environment: 38 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 39 | conda_file: ../../environment.yml 40 | compute: azureml:ND96asrv4 41 | display_name: mistral_7b_qlora_dp_qnli 42 | experiment_name: dp-transformers-mistral-7b-qlora-dp-qnli 43 | description: DP fine-tune Mistral 7B model with QLoRA on QNLI dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/qnli/peft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name qnli \ 8 | --sequence_len 256 \ 9 | --per_device_train_batch_size 4 \ 10 | --gradient_accumulation_steps 1 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 100 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 8 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --weight_decay 0.01 \ 19 | --remove_unused_columns False \ 20 | --num_train_epochs 3 \ 21 | --logging_steps 5 \ 22 | --max_grad_norm 0 \ 23 | --lr_scheduler_type constant \ 24 | --learning_rate 2.5e-5 \ 25 | --disable_tqdm True \ 26 | --dataloader_num_workers 2 \ 27 | --lora_dim 4 \ 28 | --lora_alpha 32 \ 29 | --lora_dropout 0.0 \ 30 | --enable_lora \ 31 | --target_modules "['q_proj', 'v_proj']" \ 32 | --label_names labels \ 33 | --bf16 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../../environment.yml 37 | compute: azureml:ND96asrv4 38 | display_name: mistral_7b_qlora_nodp_qnli 39 | experiment_name: dp-transformers-mistral-7b-qlora-nodp-qnli 40 | description: Fine-tune Mistral 7B model with QLoRA on QNLI dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/sst2/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name sst2 \ 8 | --sequence_len 128 \ 9 | --per_device_train_batch_size 8 \ 10 | --gradient_accumulation_steps 16 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 4 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 16 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --target_epsilon 8 \ 19 | --target_delta 1e-5 \ 20 | --per_sample_max_grad_norm 1.0 \ 21 | --weight_decay 0.01 \ 22 | --remove_unused_columns False \ 23 | --num_train_epochs 3 \ 24 | --logging_steps 4 \ 25 | --max_grad_norm 0 \ 26 | --lr_scheduler_type constant \ 27 | --learning_rate 3e-4 \ 28 | --disable_tqdm True \ 29 | --dataloader_num_workers 2 \ 30 | --lora_dim 4 \ 31 | --lora_alpha 32 \ 32 | --lora_dropout 0.0 \ 33 | --enable_lora \ 34 | --target_modules "['q_proj', 'v_proj']" \ 35 | --label_names labels \ 36 | --bf16 37 | environment: 38 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 39 | conda_file: ../../environment.yml 40 | compute: azureml:ND96asrv4 41 | display_name: mistral_7b_qlora_dp_sst2 42 | experiment_name: dp-transformers-mistral-7b-qlora-dp-sst2 43 | description: DP fine-tune Mistral 7B model with QLoRA on SST-2 dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/sst2/peft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name sst2 \ 8 | --sequence_len 128 \ 9 | --per_device_train_batch_size 4 \ 10 | --gradient_accumulation_steps 1 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 100 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 8 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --weight_decay 0.01 \ 19 | --remove_unused_columns False \ 20 | --num_train_epochs 3 \ 21 | --logging_steps 5 \ 22 | --max_grad_norm 0 \ 23 | --lr_scheduler_type constant \ 24 | --learning_rate 2.5e-5 \ 25 | --disable_tqdm True \ 26 | --dataloader_num_workers 2 \ 27 | --lora_dim 4 \ 28 | --lora_alpha 32 \ 29 | --lora_dropout 0.0 \ 30 | --enable_lora \ 31 | --target_modules "['q_proj', 'v_proj']" \ 32 | --label_names labels \ 33 | --bf16 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../../environment.yml 37 | compute: azureml:ND96asrv4 38 | display_name: mistral_7b_qlora_nodp_sst2 39 | experiment_name: dp-transformers-mistral-7b-qlora-nodp-sst2 40 | description: Fine-tune Mistral 7B model with QLoRA on SST-2 dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/data_utils.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import evaluate 3 | import torch 4 | import numpy as np 5 | 6 | 7 | # Modified from https://huggingface.co/docs/peft/task_guides/clm-prompt-tuning 8 | def main_preprocess_function(examples, tokenizer, text_field, prompt_begin, prompt_end, label_field, sequence_len, single_token=True): 9 | batch_size = len(examples[text_field]) 10 | 11 | # Prepare the context with the text in between of prompts, e.g. 
"Sentence : Label :" 12 | inputs = [prompt_begin + x + prompt_end for x in examples[text_field]] 13 | 14 | # Prepare the prediction part 15 | targets = [str(x) for x in examples[label_field]] 16 | 17 | model_inputs = tokenizer(inputs) 18 | labels = tokenizer(targets) 19 | 20 | # Concatenate the context and prediction parts as one input and set -100 to the labels of the context part 21 | # This is because only the label part will be used to calculate the loss 22 | for i in range(batch_size): 23 | sample_input_ids = model_inputs["input_ids"][i] 24 | if single_token: 25 | # Tokenizer adds to input_ids so just take the last id 26 | # NOTE THAT THIS ASSUMES THE LABEL IS SINGLE TOKEN 27 | label_input_ids = [labels["input_ids"][i][-1]] 28 | else: 29 | # Tokenizer adds to input_ids so just take the rest 30 | label_input_ids = labels["input_ids"][i][1:] 31 | model_inputs["input_ids"][i] = sample_input_ids + label_input_ids 32 | labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids 33 | model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i]) 34 | 35 | # Pad the samples with sequence_len and trim if longer than sequence_len 36 | # NOTE THAT IF CONTEXT IS LONGER THAN SEQUENCE_LEN, THERE WILL BE NOTHING TO PREDICT, LABEL IS ALL -100 37 | for i in range(batch_size): 38 | sample_input_ids = model_inputs["input_ids"][i] 39 | label_input_ids = labels["input_ids"][i] 40 | model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * ( 41 | sequence_len - len(sample_input_ids) 42 | ) + sample_input_ids 43 | model_inputs["attention_mask"][i] = [0] * (sequence_len - len(sample_input_ids)) + model_inputs[ 44 | "attention_mask" 45 | ][i] 46 | labels["input_ids"][i] = [-100] * (sequence_len - len(sample_input_ids)) + label_input_ids 47 | model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:sequence_len]) 48 | model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:sequence_len]) 49 | labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:sequence_len]) 50 | 51 | model_inputs["labels"] = labels["input_ids"] 52 | return model_inputs 53 | 54 | 55 | class Dataset: 56 | dataset = None 57 | classes = None # List of class labels 58 | text_field = None # Name of the field in the dataset that contains the text 59 | prompt_begin = None # Prompt to add to the beginning of the text, e.g. "Sentence : " 60 | prompt_end = None # Prompt to add to the end of the text, e.g. 
" Label :" 61 | label_field = None # Name of the field in the dataset that contains the label 62 | evaluate = None # Evaluation metric 63 | run_test = False # Whether to run test set evaluation 64 | 65 | def __init__(self, tokenizer, sequence_len): 66 | self.tokenizer = tokenizer 67 | self.sequence_len = sequence_len 68 | 69 | def target_max_len(self): 70 | target_lens = [len(self.tokenizer(class_label)["input_ids"]) for class_label in self.classes] 71 | target_max_len = max(target_lens) 72 | return target_max_len 73 | 74 | def preprocess_function(self, example): 75 | return main_preprocess_function(example, self.tokenizer, self.text_field, self.prompt_begin, 76 | self.prompt_end, self.label_field, self.sequence_len) 77 | 78 | # Define the evaluation metric (NOTE THAT THIS ASSUMES THE LABEL IS SINGLE TOKEN) 79 | def compute_metrics(self, eval_pred): 80 | predictions, labels = eval_pred 81 | # Only keep predictions for the last token shifted by 1 82 | predictions = predictions[..., -2] 83 | # Only keep labels for the last token 84 | labels = labels[..., -1] 85 | return self.evaluate.compute(predictions=predictions, references=labels) 86 | 87 | def preprocess_logits_for_metrics(self, logits, labels): 88 | """ 89 | Original Trainer may lead to a memory issue. 90 | This is a workaround to avoid storing too many tensors that are not needed. 91 | """ 92 | pred_ids = torch.argmax(logits, dim=-1) 93 | return pred_ids 94 | 95 | 96 | class SST2Dataset(Dataset): 97 | def __init__(self, tokenizer, sequence_len): 98 | # Load data 99 | self.dataset = datasets.load_dataset('sst2') 100 | # Map labels from 0/1 to negative/positive 101 | self.classes = ['negative', 'positive'] 102 | self.dataset = self.dataset.map( 103 | lambda x: {"text_label": [self.classes[label] for label in x["label"]]}, 104 | batched=True, 105 | num_proc=8, 106 | ) 107 | self.text_field = "sentence" 108 | self.prompt_begin = "Sentence : " 109 | self.prompt_end = " Label :" 110 | self.label_field = "text_label" 111 | self.evaluate = evaluate.load("accuracy") 112 | super().__init__(tokenizer, sequence_len) 113 | 114 | 115 | class QNLIDataset(Dataset): 116 | def __init__(self, tokenizer, sequence_len): 117 | # Load data 118 | self.dataset = datasets.load_dataset('glue', 'qnli') 119 | self.classes = ['0', '1'] 120 | self.dataset = self.dataset.map( 121 | lambda x: {"text_concat": [question + " ### " + sentence for question, sentence in zip(x["question"], x["sentence"])]}, 122 | batched=True, 123 | num_proc=8, 124 | ) 125 | # 5k eval samples too large, shuffle and reduce it to 1k 126 | self.dataset['validation'] = self.dataset['validation'].shuffle().select(range(1000)) 127 | self.text_field = "text_concat" 128 | self.prompt_begin = "Two sentences separated with ### : " 129 | self.prompt_end = " Label :" 130 | self.label_field = "label" 131 | self.evaluate = evaluate.load("accuracy") 132 | super().__init__(tokenizer, sequence_len) 133 | 134 | 135 | class CNNDataset(Dataset): 136 | def __init__(self, tokenizer, sequence_len): 137 | # Load data 138 | self.dataset = datasets.load_dataset("cnn_dailymail", "3.0.0") 139 | # 13.4k eval samples too large, shuffle and reduce it to 1k 140 | self.dataset['validation'] = self.dataset['validation'].shuffle().select(range(1000)) 141 | # Get rid of the test dataset 142 | del self.dataset['test'] 143 | self.text_field = "article" 144 | self.prompt_begin = "Article : " 145 | self.prompt_end = " Summary :" 146 | self.label_field = "highlights" 147 | self.evaluate = evaluate.load("rouge") 148 | self.run_test 
= True 149 | super().__init__(tokenizer, sequence_len) 150 | 151 | def preprocess_function(self, example): 152 | return main_preprocess_function(example, self.tokenizer, self.text_field, self.prompt_begin, 153 | self.prompt_end, self.label_field, self.sequence_len, single_token=False) 154 | 155 | def compute_metrics(self, eval_pred): 156 | predictions, labels = eval_pred 157 | # Only keep predictions up to last token 158 | predictions = predictions[..., :-1] 159 | # Only keep labels from the first token 160 | labels = labels[..., 1:] 161 | # Replace -100 of the labels as we don't want the content 162 | predictions = np.where(labels != -100, predictions, self.tokenizer.pad_token_id) 163 | # Decode generated summaries into text 164 | decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True) 165 | # Replace -100 in the labels as we can't decode them 166 | labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id) 167 | # Decode reference summaries into text 168 | decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) 169 | # Compute ROUGE scores 170 | result = self.evaluate.compute( 171 | predictions=decoded_preds, references=decoded_labels, use_stemmer=True 172 | ) 173 | return {k: round(v, 4) for k, v in result.items()} 174 | 175 | def compute_test_metrics(self, trainer): 176 | test_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split='test') 177 | # Filter out samples too long, e.g. more than 500 words 178 | test_dataset = test_dataset.filter(lambda x: len(x['article'].split()) < 500) 179 | # 11.4k test samples too large, shuffle and reduce it to 1k 180 | test_dataset = test_dataset.shuffle().select(range(1000)) 181 | # Add prompt_begin and prompt_end 182 | test_dataset = test_dataset.map( 183 | lambda x: {"article": [self.prompt_begin + article + self.prompt_end for article in x["article"]]}, 184 | batched=True, 185 | num_proc=None, 186 | ) 187 | 188 | # Tokenize data 189 | def test_preprocess_function(examples): 190 | model_inputs = trainer.tokenizer(examples['article'], padding=False) 191 | 192 | # 2. reserve the original article and summary for saving 193 | model_inputs['summary'] = examples['highlights'] 194 | return model_inputs 195 | 196 | with trainer.args.main_process_first(desc="tokenizing test dataset"): 197 | test_dataset = test_dataset.map( 198 | test_preprocess_function, 199 | batched=True, num_proc=None, desc="tokenizing dataset", 200 | remove_columns=test_dataset.column_names) 201 | 202 | # Filter out samples too long, e.g. 
more than 750 tokens 203 | test_dataset = test_dataset.filter(lambda x: len(x['input_ids']) < 750) 204 | 205 | test_dataset.set_format(type="torch") 206 | 207 | def generate_batched( 208 | model, 209 | tokenizer, 210 | device, 211 | query_tensors, 212 | batch_size: int = 4, 213 | return_prompt: bool = True, 214 | pad_to_multiple_of: int = None, 215 | **generation_kwargs, 216 | ): 217 | outputs = [] 218 | 219 | tokenizer.padding_side = "left" 220 | 221 | # handle distributed case and distribute query_tensors among gpus 222 | query_tensors = query_tensors[device.index::trainer.args.world_size] 223 | 224 | # in case we have fewer examples than bs 225 | batch_size = min(len(query_tensors), batch_size) 226 | 227 | for i in range(0, len(query_tensors), batch_size): 228 | # prevent overflow if query tensors are not even multiple of bs 229 | end_index = min(len(query_tensors), i + batch_size) 230 | 231 | batch = query_tensors[i:end_index] 232 | batch_mask = [torch.ones_like(element) for element in batch] 233 | inputs = {"input_ids": batch, "attention_mask": batch_mask} 234 | 235 | padded_inputs = tokenizer.pad( 236 | inputs, 237 | padding=True, 238 | max_length=None, 239 | pad_to_multiple_of=pad_to_multiple_of, 240 | return_tensors="pt", 241 | ).to(device) 242 | 243 | with torch.no_grad(): 244 | generations = model.generate(**padded_inputs, **generation_kwargs) 245 | 246 | for generation, mask in zip(generations, padded_inputs["attention_mask"]): 247 | output = generation[(1 - mask).sum() :] # remove padding 248 | 249 | if not return_prompt: 250 | output = output[(mask).sum() :] # remove prompt 251 | outputs.append(output) 252 | 253 | return outputs 254 | 255 | if hasattr(trainer.model, "generate"): 256 | model = trainer.model 257 | # The following is for GradSampleModule wrapping 258 | elif hasattr(trainer.model._module, "generate"): 259 | model = trainer.model._module 260 | # The following is for GradSampleModule and DPDDP wrapping 261 | elif hasattr(trainer.model._module.module, "generate"): 262 | model = trainer.model._module.module 263 | else: 264 | raise ValueError("Cannot find generate function in the model.") 265 | 266 | model.eval() 267 | generation_kwargs = {"max_new_tokens": 100, "pad_token_id": trainer.tokenizer.pad_token_id, 268 | "eos_token_id": trainer.tokenizer.eos_token_id,} 269 | 270 | response_tensors = generate_batched( 271 | model, trainer.tokenizer, trainer.args.device, 272 | test_dataset["input_ids"], 273 | batch_size=trainer.args.eval_batch_size, return_prompt=False, 274 | **generation_kwargs 275 | ) 276 | 277 | responses = [trainer.tokenizer.decode(r.squeeze(), skip_special_tokens=True) 278 | for r in response_tensors] 279 | 280 | result = self.evaluate.compute( 281 | predictions=responses, references=test_dataset["summary"][trainer.args.device.index::trainer.args.world_size], 282 | use_stemmer=True 283 | ) 284 | 285 | r1 = trainer.accelerator.reduce(torch.tensor(result['rouge1']).to(trainer.args.device), reduction="mean") 286 | r2 = trainer.accelerator.reduce(torch.tensor(result['rouge2']).to(trainer.args.device), reduction="mean") 287 | rl = trainer.accelerator.reduce(torch.tensor(result['rougeL']).to(trainer.args.device), reduction="mean") 288 | 289 | result = {'rouge1': r1.item(), 'rouge2': r2.item(), 'rougeL': rl.item()} 290 | return {k: round(v, 4) for k, v in result.items()} 291 | 292 | 293 | ALL_DATASETS = {"sst2": SST2Dataset, "qnli": QNLIDataset, "cnn": CNNDataset} -------------------------------------------------------------------------------- 
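The `main_preprocess_function` in data_utils.py above implements a prompt-completion scheme: the tokenized prompt and the tokenized label are concatenated into one input, the prompt positions in `labels` are masked with -100 (the index that the cross-entropy loss ignores) so only the label tokens are trained on, and everything is left-padded and trimmed to `sequence_len`. The sketch below illustrates just that masking logic in isolation; the toy whitespace tokenizer, `VOCAB`, and `PAD_ID` are illustrative assumptions and not part of the repository, which uses a HuggingFace tokenizer instead.

```python
# Minimal sketch of the prompt-completion masking used in main_preprocess_function.
# The toy whitespace tokenizer, VOCAB and PAD_ID are illustrative assumptions.

PAD_ID = 0
VOCAB = {"Sentence": 1, ":": 2, "great": 3, "movie": 4, "Label": 5, "positive": 6}


def toy_tokenize(text):
    return [VOCAB[tok] for tok in text.split()]


def build_example(sentence, label, sequence_len=12):
    # Context "Sentence : <text> Label :" followed by the label to be predicted
    prompt_ids = toy_tokenize(f"Sentence : {sentence} Label :")
    label_ids = toy_tokenize(label)

    input_ids = prompt_ids + label_ids
    # Only the label positions carry a target; the prompt is masked with -100
    labels = [-100] * len(prompt_ids) + label_ids
    attention_mask = [1] * len(input_ids)

    # Left-pad to sequence_len, then trim (mirrors the original padding logic)
    pad = sequence_len - len(input_ids)
    input_ids = ([PAD_ID] * pad + input_ids)[:sequence_len]
    attention_mask = ([0] * pad + attention_mask)[:sequence_len]
    labels = ([-100] * pad + labels)[:sequence_len]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


print(build_example("great movie", "positive"))
# labels is -100 everywhere except the last position, which holds the id of "positive"
```

Because every position except the label carries -100, the causal LM loss reduces to predicting the label token(s) given the prompt, which is what compute_metrics above exploits when it reads off the prediction at the final position.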
/research/fine_tune_llm_w_qlora/environment.yml: -------------------------------------------------------------------------------- 1 | name: dp-transformers 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1 8 | - _openmp_mutex=5.1 9 | - blas=1.0 10 | - bzip2=1.0.8 11 | - ca-certificates=2023.05.30 12 | - cuda-cudart=11.8.89 13 | - cuda-cupti=11.8.87 14 | - cuda-libraries=11.8.0 15 | - cuda-nvrtc=11.8.89 16 | - cuda-nvtx=11.8.86 17 | - cuda-runtime=11.8.0 18 | - filelock=3.9.0 19 | - gmp=6.2.1 20 | - gmpy2=2.1.2 21 | - intel-openmp=2023.1.0 22 | - jinja2=3.1.2 23 | - ld_impl_linux-64=2.38 24 | - libcublas=11.11.3.6 25 | - libcufft=10.9.0.58 26 | - libcufile=1.7.1.12 27 | - libcurand=10.3.3.129 28 | - libcusolver=11.4.1.48 29 | - libcusparse=11.7.5.86 30 | - libffi=3.4.4 31 | - libgcc-ng=11.2.0 32 | - libgomp=11.2.0 33 | - libnpp=11.8.0.86 34 | - libnvjpeg=11.9.0.86 35 | - libstdcxx-ng=11.2.0 36 | - libuuid=1.41.5 37 | - markupsafe=2.1.1 38 | - mkl=2023.1.0 39 | - mpc=1.1.0 40 | - mpfr=4.0.2 41 | - mpmath=1.3.0 42 | - ncurses=6.4 43 | - networkx=3.1 44 | - openssl=3.0.10 45 | - pip=23.2.1 46 | - python=3.10.12 47 | - pytorch=2.0.1 48 | - pytorch-cuda=11.8 49 | - pytorch-mutex=1.0 50 | - readline=8.2 51 | - setuptools=68.0.0 52 | - sqlite=3.41.2 53 | - sympy=1.11.1 54 | - tbb=2021.8.0 55 | - tk=8.6.12 56 | - torchtriton=2.0.0 57 | - typing_extensions=4.7.1 58 | - wheel=0.38.4 59 | - xz=5.4.2 60 | - zlib=1.2.13 61 | - pip: 62 | - absl-py==2.0.0 63 | - accelerate==0.21.0 64 | - aiohttp==3.8.5 65 | - aiosignal==1.3.1 66 | - alembic==1.11.2 67 | - async-timeout==4.0.3 68 | - attrs==23.1.0 69 | - azure-common==1.1.28 70 | - azure-core==1.29.2 71 | - azure-identity==1.14.0 72 | - azure-mgmt-core==1.4.0 73 | - azure-storage-blob==12.13.0 74 | - azureml-mlflow==1.52.0 75 | - bitsandbytes==0.41.1 76 | - blinker==1.6.2 77 | - certifi==2023.7.22 78 | - cffi==1.15.1 79 | - charset-normalizer==3.2.0 80 | - click==8.1.6 81 | - cloudpickle==2.2.1 82 | - contourpy==1.1.0 83 | - cryptography==41.0.3 84 | - cycler==0.11.0 85 | - databricks-cli==0.17.7 86 | - datasets==2.14.4 87 | - dill==0.3.7 88 | - docker==6.1.3 89 | - entrypoints==0.4 90 | - evaluate==0.4.1 91 | - exceptiongroup==1.1.3 92 | - flask==2.3.2 93 | - fonttools==4.42.0 94 | - frozenlist==1.4.0 95 | - fsspec==2023.6.0 96 | - gitdb==4.0.10 97 | - gitpython==3.1.32 98 | - greenlet==2.0.2 99 | - gunicorn==21.2.0 100 | - huggingface-hub==0.16.4 101 | - idna==3.4 102 | - importlib-metadata==6.8.0 103 | - iniconfig==2.0.0 104 | - isodate==0.6.1 105 | - itsdangerous==2.1.2 106 | - joblib==1.3.2 107 | - jsonpickle==3.0.2 108 | - kiwisolver==1.4.4 109 | - mako==1.2.4 110 | - markdown==3.4.4 111 | - matplotlib==3.7.2 112 | - mlflow==2.6.0 113 | - mlflow-skinny==2.6.0 114 | - msal==1.23.0 115 | - msal-extensions==1.0.0 116 | - msrest==0.7.1 117 | - multidict==6.0.4 118 | - multiprocess==0.70.15 119 | - nltk==3.8.1 120 | - numpy==1.25.2 121 | - nvidia-ml-py3==7.352.0 122 | - oauthlib==3.2.2 123 | - opacus==1.4.0 124 | - opt-einsum==3.3.0 125 | - packaging==23.1 126 | - pandas==2.0.3 127 | - peft==0.4.0 128 | - pillow==10.0.0 129 | - pluggy==1.2.0 130 | - portalocker==2.7.0 131 | - protobuf==4.24.0 132 | - prv-accountant==0.1.1.post1 133 | - psutil==5.9.5 134 | - pyarrow==12.0.1 135 | - pycparser==2.21 136 | - pyjwt==2.8.0 137 | - pynvml==11.5.0 138 | - pyparsing==3.0.9 139 | - pytest==7.4.0 140 | - python-dateutil==2.8.2 141 | - pytz==2023.3 142 | - pyyaml==6.0.1 143 | - querystring-parser==1.2.4 144 | - 
regex==2023.8.8 145 | - requests==2.31.0 146 | - requests-oauthlib==1.3.1 147 | - responses==0.18.0 148 | - rouge-score==0.1.2 149 | - safetensors==0.3.2 150 | - scikit-learn==1.3.0 151 | - scipy==1.11.1 152 | - six==1.16.0 153 | - smmap==5.0.0 154 | - sqlalchemy==2.0.19 155 | - sqlparse==0.4.4 156 | - tabulate==0.9.0 157 | - threadpoolctl==3.2.0 158 | - tokenizers==0.14.1 159 | - tomli==2.0.1 160 | - tqdm==4.66.1 161 | - transformers==4.35.1 162 | - tzdata==2023.3 163 | - urllib3==1.26.16 164 | - websocket-client==1.6.1 165 | - werkzeug==2.3.7 166 | - xxhash==3.3.0 167 | - yarl==1.9.2 168 | - zipp==3.16.2 169 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/fine-tune-dp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train LLMs with DP using QLoRA''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | import torch 12 | import ast 13 | import linear 14 | import data_utils 15 | 16 | from dataclasses import dataclass, field, asdict 17 | from typing import List, Optional, Tuple, Union 18 | from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training 19 | 20 | from pynvml import * 21 | 22 | def print_gpu_utilization(): 23 | nvmlInit() 24 | handle = nvmlDeviceGetHandleByIndex(0) 25 | info = nvmlDeviceGetMemoryInfo(handle) 26 | print(f"GPU memory occupied: {info.used//1024**2} MB.") 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | @dataclass 32 | class ModelArguments: 33 | model_name: str = field(default="gpt2", metadata={ 34 | "help": "Model name in HuggingFace, e.g. 'gpt2'" 35 | }) 36 | dataset_name: str = field(default="sst2", metadata={ 37 | "help": "Dataset name in HuggingFace, e.g. 'sst2'" 38 | }) 39 | sequence_len: int = field(default=128, metadata={ 40 | "help": "Maximum sequence length" 41 | }) 42 | 43 | 44 | @dataclass 45 | class LoraArguments: 46 | enable_lora: bool = field(default=False, metadata={ 47 | "help": "Whether to enable LoRA" 48 | }) 49 | lora_dim: int = field(default=8, metadata={ 50 | "help": "LoRA dimension" 51 | }) 52 | lora_alpha: int = field(default=8, metadata={ 53 | "help": "LoRA alpha" 54 | }) 55 | lora_dropout: float = field(default=0.0, metadata={ 56 | "help": "LoRA dropout" 57 | }) 58 | 59 | target_modules: List[str] = field( 60 | default_factory=list, 61 | metadata={ 62 | "help": "List of module names or regex expression of the module names to replace with Lora." 
63 | "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " 64 | }, 65 | ) 66 | 67 | def as_peft_config(self) -> LoraConfig: 68 | if not self.enable_lora: 69 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 70 | params = asdict(self) 71 | params.pop("enable_lora") 72 | params["r"] = params.pop("lora_dim") 73 | params["target_modules"] = ast.literal_eval(params["target_modules"][0]) 74 | return LoraConfig(**params) 75 | 76 | 77 | @dataclass 78 | class Arguments: 79 | train: dp_transformers.TrainingArguments 80 | privacy: dp_transformers.PrivacyArguments 81 | model: ModelArguments 82 | lora: LoraArguments 83 | 84 | 85 | def main(args: Arguments): 86 | transformers.set_seed(args.train.seed) 87 | 88 | # Setup logging 89 | logging.basicConfig( 90 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 91 | datefmt="%m/%d/%Y %H:%M:%S", 92 | handlers=[logging.StreamHandler(sys.stdout)], 93 | ) 94 | 95 | log_level = train_args.get_process_log_level() 96 | logger.setLevel(log_level) 97 | datasets.utils.logging.set_verbosity(log_level) 98 | transformers.utils.logging.set_verbosity(log_level) 99 | transformers.utils.logging.enable_default_handler() 100 | transformers.utils.logging.enable_explicit_format() 101 | 102 | # Log on each process the small summary: 103 | logger.warning( 104 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 105 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 106 | ) 107 | logger.info(f"Training/evaluation parameters {train_args}") 108 | logger.info(f"Privacy parameters {privacy_args}") 109 | 110 | # Load tokenizer 111 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 112 | if tokenizer.pad_token_id is None: 113 | tokenizer.pad_token_id = tokenizer.eos_token_id 114 | 115 | # Load dataset 116 | dataset = data_utils.ALL_DATASETS[args.model.dataset_name](tokenizer, args.model.sequence_len) 117 | 118 | if dataset.classes is not None: 119 | target_max_len = dataset.target_max_len() 120 | logger.info(f"Labels tokenized into max length: {target_max_len}") 121 | 122 | # Tokenize data 123 | with train_args.main_process_first(desc="tokenizing dataset"): 124 | dataset.dataset = dataset.dataset.map( 125 | dataset.preprocess_function, batched=True, num_proc=8, desc="tokenizing dataset", 126 | remove_columns=dataset.dataset.column_names['train'] 127 | ) 128 | 129 | bnb_config = transformers.BitsAndBytesConfig( 130 | load_in_4bit=True, 131 | bnb_4bit_use_double_quant=True, 132 | bnb_4bit_quant_type="nf4", 133 | bnb_4bit_compute_dtype=torch.bfloat16 134 | ) 135 | 136 | # Load model 137 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name, quantization_config=bnb_config) 138 | model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=train_args.gradient_checkpointing) 139 | 140 | if args.lora.enable_lora: 141 | logger.info("Using LoRA") 142 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 143 | else: 144 | logger.info("Not using LoRA") 145 | 146 | if train_args.local_rank == 0: 147 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 148 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 149 | 150 | trainer = dp_transformers.dp_utils.OpacusDPTrainer( 151 | args=train_args, 152 | model=model, 153 | 
train_dataset=dataset.dataset['train'], 154 | eval_dataset=dataset.dataset['validation'], 155 | tokenizer=tokenizer, 156 | compute_metrics=dataset.compute_metrics, 157 | preprocess_logits_for_metrics=dataset.preprocess_logits_for_metrics, 158 | privacy_args=privacy_args, 159 | ) 160 | 161 | if hasattr(trainer.model._module, "config"): 162 | # The following is for GradSampleModule wrapping 163 | ignore_keys = getattr(trainer.model._module.config, "keys_to_ignore_at_inference", []) 164 | elif hasattr(trainer.model._module.module, "config"): 165 | # The following is for GradSampleModule and DPDDP wrapping 166 | ignore_keys = getattr(trainer.model._module.module.config, "keys_to_ignore_at_inference", []) 167 | else: 168 | ignore_keys = [] 169 | 170 | try: 171 | # A workaround to avoid the following error: 172 | # AttributeError: 'GradSampleModule' object has no attribute 'gradient_checkpointing_enable' 173 | # inside Trainer _inner_training_loop. Already done by prepare_model_for_kbit_training 174 | trainer.args.gradient_checkpointing = False 175 | result = trainer.train(ignore_keys_for_eval=ignore_keys) 176 | finally: 177 | eps_prv = trainer.get_prv_epsilon() 178 | eps_rdp = trainer.get_rdp_epsilon() 179 | trainer.log({ 180 | "final_epsilon_prv": eps_prv, 181 | "final_epsilon_rdp": eps_rdp 182 | }) 183 | 184 | if dataset.run_test: 185 | logger.info("Running test set evaluation after training") 186 | test_metrics = dataset.compute_test_metrics(trainer) 187 | trainer.log(test_metrics) 188 | 189 | def print_summary(result): 190 | print(f"Time: {result.metrics['train_runtime']:.2f}") 191 | print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") 192 | print_gpu_utilization() 193 | 194 | print_summary(result) 195 | 196 | if __name__ == "__main__": 197 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, dp_transformers.PrivacyArguments, ModelArguments, LoraArguments)) 198 | train_args, privacy_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 199 | main(Arguments(train=train_args, privacy=privacy_args, model=model_args, lora=lora_args)) 200 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/fine-tune-nodp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train LLMs without DP using QLoRA''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | import torch 12 | import ast 13 | import data_utils 14 | 15 | from dataclasses import dataclass, field, asdict 16 | from typing import List, Optional, Tuple, Union 17 | from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training 18 | 19 | from pynvml import * 20 | 21 | def print_gpu_utilization(): 22 | nvmlInit() 23 | handle = nvmlDeviceGetHandleByIndex(0) 24 | info = nvmlDeviceGetMemoryInfo(handle) 25 | print(f"GPU memory occupied: {info.used//1024**2} MB.") 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | @dataclass 31 | class ModelArguments: 32 | model_name: str = field(default="gpt2", metadata={ 33 | "help": "Model name in HuggingFace, e.g. 'gpt2'" 34 | }) 35 | dataset_name: str = field(default="sst2", metadata={ 36 | "help": "Dataset name in HuggingFace, e.g. 
'sst2'" 37 | }) 38 | sequence_len: int = field(default=128, metadata={ 39 | "help": "Maximum sequence length" 40 | }) 41 | 42 | 43 | @dataclass 44 | class LoraArguments: 45 | enable_lora: bool = field(default=False, metadata={ 46 | "help": "Whether to enable LoRA" 47 | }) 48 | lora_dim: int = field(default=8, metadata={ 49 | "help": "LoRA dimension" 50 | }) 51 | lora_alpha: int = field(default=8, metadata={ 52 | "help": "LoRA alpha" 53 | }) 54 | lora_dropout: float = field(default=0.0, metadata={ 55 | "help": "LoRA dropout" 56 | }) 57 | 58 | target_modules: List[str] = field( 59 | default_factory=list, 60 | metadata={ 61 | "help": "List of module names or regex expression of the module names to replace with Lora." 62 | "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " 63 | }, 64 | ) 65 | 66 | def as_peft_config(self) -> LoraConfig: 67 | if not self.enable_lora: 68 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 69 | params = asdict(self) 70 | params.pop("enable_lora") 71 | params["r"] = params.pop("lora_dim") 72 | params["target_modules"] = ast.literal_eval(params["target_modules"][0]) 73 | return LoraConfig(**params) 74 | 75 | 76 | @dataclass 77 | class Arguments: 78 | train: dp_transformers.TrainingArguments 79 | model: ModelArguments 80 | lora: LoraArguments 81 | 82 | 83 | def main(args: Arguments): 84 | transformers.set_seed(args.train.seed) 85 | 86 | # Setup logging 87 | logging.basicConfig( 88 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 89 | datefmt="%m/%d/%Y %H:%M:%S", 90 | handlers=[logging.StreamHandler(sys.stdout)], 91 | ) 92 | 93 | log_level = train_args.get_process_log_level() 94 | logger.setLevel(log_level) 95 | datasets.utils.logging.set_verbosity(log_level) 96 | transformers.utils.logging.set_verbosity(log_level) 97 | transformers.utils.logging.enable_default_handler() 98 | transformers.utils.logging.enable_explicit_format() 99 | 100 | # Log on each process the small summary: 101 | logger.warning( 102 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 103 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 104 | ) 105 | logger.info(f"Training/evaluation parameters {train_args}") 106 | 107 | # Load tokenizer 108 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 109 | if tokenizer.pad_token_id is None: 110 | tokenizer.pad_token_id = tokenizer.eos_token_id 111 | 112 | # Load dataset 113 | dataset = data_utils.ALL_DATASETS[args.model.dataset_name](tokenizer, args.model.sequence_len) 114 | 115 | if dataset.classes is not None: 116 | target_max_len = dataset.target_max_len() 117 | logger.info(f"Labels tokenized into max length: {target_max_len}") 118 | 119 | # Tokenize data 120 | with train_args.main_process_first(desc="tokenizing dataset"): 121 | dataset.dataset = dataset.dataset.map( 122 | dataset.preprocess_function, batched=True, num_proc=8, desc="tokenizing dataset", 123 | remove_columns=dataset.dataset.column_names['train'] 124 | ) 125 | 126 | bnb_config = transformers.BitsAndBytesConfig( 127 | load_in_4bit=True, 128 | bnb_4bit_use_double_quant=True, 129 | bnb_4bit_quant_type="nf4", 130 | bnb_4bit_compute_dtype=torch.bfloat16 131 | ) 132 | 133 | # Load model 134 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name, quantization_config=bnb_config) 135 | model = prepare_model_for_kbit_training(model, 
use_gradient_checkpointing=train_args.gradient_checkpointing) 136 | 137 | if args.lora.enable_lora: 138 | logger.info("Using LoRA") 139 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 140 | else: 141 | logger.info("Not using LoRA") 142 | 143 | if train_args.local_rank == 0: 144 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 145 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 146 | 147 | trainer = transformers.Trainer( 148 | args=train_args, 149 | model=model, 150 | train_dataset=dataset.dataset['train'], 151 | eval_dataset=dataset.dataset['validation'], 152 | tokenizer=tokenizer, 153 | compute_metrics=dataset.compute_metrics, 154 | preprocess_logits_for_metrics=dataset.preprocess_logits_for_metrics, 155 | ) 156 | 157 | result = trainer.train() 158 | 159 | if dataset.run_test: 160 | logger.info("Running test set evaluation after training") 161 | test_metrics = dataset.compute_test_metrics(trainer) 162 | trainer.log(test_metrics) 163 | 164 | def print_summary(result): 165 | print(f"Time: {result.metrics['train_runtime']:.2f}") 166 | print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") 167 | print_gpu_utilization() 168 | 169 | print_summary(result) 170 | 171 | if __name__ == "__main__": 172 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, ModelArguments, LoraArguments)) 173 | train_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 174 | main(Arguments(train=train_args, model=model_args, lora=lora_args)) 175 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/linear.py: -------------------------------------------------------------------------------- 1 | ### Convert activations and backprops to float for per-sample gradient computation 2 | ### During mixed precision training it is possible that the activations and/or backprops are not in full precision 3 | 4 | from typing import Dict, List 5 | 6 | import torch 7 | import torch.nn as nn 8 | from opt_einsum import contract 9 | 10 | from opacus.grad_sample.utils import register_grad_sampler 11 | 12 | 13 | @register_grad_sampler(nn.Linear) 14 | def compute_linear_grad_sample( 15 | layer: nn.Linear, activations: List[torch.Tensor], backprops: torch.Tensor 16 | ) -> Dict[nn.Parameter, torch.Tensor]: 17 | """ 18 | Computes per sample gradients for ``nn.Linear`` layer 19 | 20 | Args: 21 | layer: Layer 22 | activations: Activations 23 | backprops: Backpropagations 24 | """ 25 | activations = activations[0] 26 | ret = {} 27 | if layer.weight.requires_grad: 28 | gs = contract("n...i,n...j->nij", backprops.float(), activations.float()) 29 | ret[layer.weight] = gs 30 | if layer.bias is not None and layer.bias.requires_grad: 31 | ret[layer.bias] = contract("n...k->nk", backprops.float()) 32 | return ret 33 | -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/NOTICE.txt: -------------------------------------------------------------------------------- 1 | THIRD-PARTY SOFTWARE NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates components from the projects listed below. The original copyright notices 5 | and the licenses under which Microsoft received such components are set forth below and are provided for 6 | informational purposes only. 
Microsoft reserves all rights not expressly granted herein, whether by 7 | implication, estoppel or otherwise. 8 | 9 | This software includes parts of the Huggingface/Transformers Library (https://github.com/huggingface/transformers). 10 | State-of-the-art of Natural Language Processing for Jax, PyTorch and TensorFlow. Huggingface/Transformers library is 11 | licensed under Apache License 2.0, you can find a copy of this license at https://github.com/huggingface/transformers/blob/master/LICENSE 12 | -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/README.md: -------------------------------------------------------------------------------- 1 | We present the code of our paper "Synthetic Text Generation with Differential Privacy: A Simple and Practical Recipe" at ACL 2023. 2 | 3 | ## Fine-tuning with DP 4 | 5 | The following script assumes distributed training on 8 GPUs. 6 | 7 | ```console 8 | python -m torch.distributed.run --nproc_per_node 8 fine-tune-dp.py \ 9 | --data_dir $DATA \ 10 | --output_dir $OUTPUT_DIR \ 11 | --model_name gpt2 \ 12 | --per_device_train_batch_size 32 \ 13 | --gradient_accumulation_steps 16 \ 14 | --evaluation_strategy epoch \ 15 | --save_strategy epoch \ 16 | --log_level info \ 17 | --per_device_eval_batch_size 64 \ 18 | --eval_accumulation_steps 1 \ 19 | --seed 42 \ 20 | --target_epsilon 4.0 \ 21 | --per_sample_max_grad_norm 1.0 \ 22 | --weight_decay 0.01 \ 23 | --remove_unused_columns False \ 24 | --num_train_epochs 50 \ 25 | --logging_steps 10 \ 26 | --max_grad_norm 0 \ 27 | --sequence_len 128 \ 28 | --learning_rate 0.0001 \ 29 | --lr_scheduler_type constant \ 30 | --dataloader_num_workers 2 \ 31 | --disable_tqdm True \ 32 | --load_best_model_at_end True \ 33 | ``` 34 | 35 | ## Fine-tuning without DP 36 | 37 | The following script assumes distributed training on 8 GPUs. 38 | 39 | ```console 40 | python -m torch.distributed.run --nproc_per_node 8 fine-tune-nodp.py \ 41 | --data_dir $DATA \ 42 | --output_dir $OUTPUT_DIR \ 43 | --model_name gpt2 \ 44 | --per_device_train_batch_size 4 \ 45 | --gradient_accumulation_steps 1 \ 46 | --evaluation_strategy epoch \ 47 | --save_strategy epoch \ 48 | --log_level info \ 49 | --per_device_eval_batch_size 64 \ 50 | --eval_accumulation_steps 1 \ 51 | --seed 42 \ 52 | --weight_decay 0.01 \ 53 | --remove_unused_columns False \ 54 | --num_train_epochs 5 \ 55 | --logging_steps 2400 \ 56 | --max_grad_norm 0 \ 57 | --sequence_len 128 \ 58 | --learning_rate 0.00005 \ 59 | --lr_scheduler_type constant \ 60 | --dataloader_num_workers 2 \ 61 | --disable_tqdm True \ 62 | --load_best_model_at_end True \ 63 | ``` 64 | 65 | ## Synthetic Text Generation 66 | 67 | The following script generates synthetic data from a fine-tuned model on a single GPU. 68 | 69 | ```console 70 | python generate-text.py \ 71 | --model_type gpt2 \ 72 | --model_name_or_path $CHECKPOINT_FOLDER \ 73 | --input_training_file $TRAINING_DATA_FILE \ 74 | --output_dir $OUTPUT_DIR \ 75 | --length 128 \ 76 | --total_sequences 100000 \ 77 | --do_sample \ 78 | --batch_size 8 \ 79 | ``` 80 | 81 | ## Classification model 82 | 83 | The following script assumes distributed training on 8 GPUs. 84 | Set --sample_dataset True to train the classifier on the original data to sample 100000 data points. 
85 | 86 | ```console 87 | python -m torch.distributed.run --nproc_per_node 8 run-classification.py \ 88 | --model_name_or_path roberta-base \ 89 | --output_dir $OUTPUT_DIR \ 90 | --train_file $TRAINING_DATA_FILE \ 91 | --validation_file $VAL_DATA_FILE \ 92 | --test_file $TEST_DATA_FILE \ 93 | --do_train \ 94 | --do_eval \ 95 | --do_predict \ 96 | --max_seq_length 512 \ 97 | --per_device_train_batch_size 4 \ 98 | --per_device_eval_batch_size 64 \ 99 | --learning_rate 3e-5 \ 100 | --num_train_epochs 3 \ 101 | --logging_steps 100 \ 102 | --overwrite_output_dir \ 103 | --overwrite_cache True \ 104 | --evaluation_strategy steps \ 105 | --eval_steps 31 \ 106 | --save_steps 31 \ 107 | --load_best_model_at_end True \ 108 | --label_column_name "label1" \ 109 | --sample_dataset False \ 110 | --disable_tqdm True 111 | ``` 112 | 113 | ## Using LoRA during fine-tuning 114 | 115 | Although not used in the paper, LoRA fine-tuning significantly improves the runtime by allowing much larger 116 | batch sizes to fit in each GPU. A starting point could be to add `--lora_dim 4 --lora_alpha 32 --lora_dropout 0.0` 117 | and use larger learning rates such as `--learning_rate 3e-4` or `4e-4`. 118 | 119 | ## Third Party Notice 120 | 121 | This software includes the files listed below from the Huggingface/Transformers Library (https://github.com/huggingface/transformers) 122 | as part of text generation and task performance. 123 | 124 | research/synthetic-text-generation-with-DP 125 | ├── generate-text.py 126 | └── run-classification.py 127 | -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/fine-tune-dp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series with DP (w/ parameter-efficient approach LoRA when lora_dim > 0)''' 5 | 6 | import os 7 | import datasets 8 | import dp_transformers 9 | import transformers 10 | import sys 11 | import logging 12 | 13 | from dataclasses import dataclass, field 14 | from dp_transformers.layers.dp_merged_linear import mark_only_lora_as_trainable 15 | from dp_transformers.module_modification import convert_gpt2_attention_to_lora 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class ModelArguments: 23 | data_dir: str = field(default="./", metadata={ 24 | "help": "Path to training data" 25 | }) 26 | 27 | model_name: str = field(default="gpt2", metadata={ 28 | "help": "Model name in HuggingFace, e.g. 
'gpt2'" 29 | }) 30 | 31 | lora_dim: int = field(default=0, metadata={ 32 | "help": "LoRA dimension; 0 means LoRA is disabled" 33 | }) 34 | 35 | sequence_len: int = field(default=128, metadata={ 36 | "help": "Model sequence length" 37 | }) 38 | 39 | lora_dropout: float = field(default=0.0, metadata={ 40 | "help": "Dropout probability for LoRA layers" 41 | }) 42 | 43 | lora_alpha: int = field(default=32, metadata={ 44 | "help": "LoRA attention alpha" 45 | }) 46 | 47 | @dataclass 48 | class Arguments: 49 | train: dp_transformers.TrainingArguments 50 | privacy: dp_transformers.PrivacyArguments 51 | model: ModelArguments 52 | 53 | 54 | def main(args: Arguments): 55 | 56 | transformers.set_seed(args.train.seed) 57 | 58 | # Setup logging 59 | logging.basicConfig( 60 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 61 | datefmt="%m/%d/%Y %H:%M:%S", 62 | handlers=[logging.StreamHandler(sys.stdout)], 63 | ) 64 | 65 | log_level = train_args.get_process_log_level() 66 | logger.setLevel(log_level) 67 | datasets.utils.logging.set_verbosity(log_level) 68 | transformers.utils.logging.set_verbosity(log_level) 69 | transformers.utils.logging.enable_default_handler() 70 | transformers.utils.logging.enable_explicit_format() 71 | 72 | # Log on each process the small summary: 73 | logger.warning( 74 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 75 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 76 | ) 77 | logger.info(f"Training/evaluation parameters {train_args}") 78 | logger.info(f"Privacy parameters {privacy_args}") 79 | 80 | # Load model 81 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 82 | model = model.to(train_args.device) 83 | 84 | # Load data 85 | data_path_train = os.path.join(args.model.data_dir, "train.csv") 86 | data_path_val = os.path.join(args.model.data_dir, "val.csv") 87 | dataset = datasets.load_dataset('csv', data_files={'train': data_path_train, 'validation': data_path_val}) 88 | 89 | # Load tokenizer 90 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 91 | num_added_toks = tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 92 | mean_tok_emb = model.transformer.wte.weight.data.mean(dim=0) 93 | model.resize_token_embeddings(len(tokenizer)) 94 | 95 | # Initialize the newly-added token embedding to the mean of all token embeddings 96 | for i in range(num_added_toks): 97 | model.transformer.wte.weight.data[-(i + 1), :] = mean_tok_emb 98 | 99 | label_column_names = [name for name in dataset["train"].column_names if "label" in name] 100 | 101 | # Tokenize data 102 | def preprocess_function(examples): 103 | batch = [] 104 | for t in range(len(examples['text'])): 105 | text = "\t".join([examples[name][t] for name in label_column_names]) + "\n\n" + examples['text'][t] + tokenizer.eos_token 106 | batch.append(text) 107 | 108 | result = tokenizer(batch, padding="max_length", truncation=True, 109 | max_length=args.model.sequence_len) 110 | 111 | return result 112 | 113 | # Tokenize data 114 | with train_args.main_process_first(desc="tokenizing dataset"): 115 | dataset = dataset.map( 116 | preprocess_function, batched=True, desc="tokenizing dataset", remove_columns=dataset.column_names['train'] 117 | ) 118 | 119 | if args.model.lora_dim > 0: 120 | model = convert_gpt2_attention_to_lora( 121 | model, r=args.model.lora_dim, lora_alpha=args.model.lora_alpha, lora_dropout=args.model.lora_dropout, 122 | 
enable_lora=[True, False, True], merge_weights=False 123 | ) 124 | mark_only_lora_as_trainable(model) 125 | 126 | if train_args.local_rank == 0: 127 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 128 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 129 | 130 | model = model.cuda() 131 | model.train() 132 | 133 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 134 | 135 | trainer = dp_transformers.dp_utils.OpacusDPTrainer( 136 | args=train_args, 137 | model=model, 138 | train_dataset=dataset['train'], 139 | eval_dataset=dataset['validation'], 140 | data_collator=data_collator, 141 | privacy_args=privacy_args, 142 | tokenizer=tokenizer 143 | ) 144 | 145 | try: 146 | train_result = trainer.train() 147 | finally: 148 | eps_prv = trainer.get_prv_epsilon() 149 | eps_rdp = trainer.get_rdp_epsilon() 150 | trainer.log({ 151 | "final_epsilon_prv": eps_prv, 152 | "final_epsilon_rdp": eps_rdp 153 | }) 154 | 155 | if train_args.local_rank == 0 or train_args.local_rank == -1: 156 | metrics = train_result.metrics 157 | trainer.save_model() 158 | trainer.log_metrics("train", metrics) 159 | trainer.save_metrics("train", metrics) 160 | 161 | 162 | if __name__ == "__main__": 163 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, dp_transformers.PrivacyArguments, ModelArguments)) 164 | train_args, privacy_args, model_args = arg_parser.parse_args_into_dataclasses() 165 | main(Arguments(train=train_args, privacy=privacy_args, model=model_args)) -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/fine-tune-nodp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series without DP (w/ parameter-efficient approach LoRA when lora_dim > 0)''' 5 | 6 | import os 7 | import datasets 8 | import dp_transformers 9 | import transformers 10 | import sys 11 | import logging 12 | 13 | from dataclasses import dataclass, field 14 | from dp_transformers.layers.dp_merged_linear import mark_only_lora_as_trainable 15 | from dp_transformers.module_modification import convert_gpt2_attention_to_lora 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class ModelArguments: 23 | data_dir: str = field(default="./", metadata={ 24 | "help": "Path to training data" 25 | }) 26 | 27 | model_name: str = field(default="gpt2", metadata={ 28 | "help": "Model name in HuggingFace, e.g. 
'gpt2'" 29 | }) 30 | 31 | lora_dim: int = field(default=0, metadata={ 32 | "help": "LoRA dimension; 0 means LoRA is disabled" 33 | }) 34 | 35 | sequence_len: int = field(default=128, metadata={ 36 | "help": "Model sequence length" 37 | }) 38 | 39 | lora_dropout: float = field(default=0.0, metadata={ 40 | "help": "Dropout probability for LoRA layers" 41 | }) 42 | 43 | lora_alpha: int = field(default=32, metadata={ 44 | "help": "LoRA attention alpha" 45 | }) 46 | 47 | 48 | @dataclass 49 | class Arguments: 50 | train: dp_transformers.TrainingArguments 51 | model: ModelArguments 52 | 53 | 54 | def main(args: Arguments): 55 | transformers.set_seed(args.train.seed) 56 | 57 | # Setup logging 58 | logging.basicConfig( 59 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 60 | datefmt="%m/%d/%Y %H:%M:%S", 61 | handlers=[logging.StreamHandler(sys.stdout)], 62 | ) 63 | 64 | log_level = train_args.get_process_log_level() 65 | logger.setLevel(log_level) 66 | datasets.utils.logging.set_verbosity(log_level) 67 | transformers.utils.logging.set_verbosity(log_level) 68 | transformers.utils.logging.enable_default_handler() 69 | transformers.utils.logging.enable_explicit_format() 70 | 71 | # Log on each process the small summary: 72 | logger.warning( 73 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 74 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 75 | ) 76 | logger.info(f"Training/evaluation parameters {train_args}") 77 | 78 | # Load model 79 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 80 | model = model.to(train_args.device) 81 | 82 | # Load data 83 | data_path_train = os.path.join(args.model.data_dir, "train.csv") 84 | data_path_val = os.path.join(args.model.data_dir, "val.csv") 85 | dataset = datasets.load_dataset('csv', data_files={'train': data_path_train, 'validation': data_path_val}) 86 | 87 | # Load tokenizer 88 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 89 | num_added_toks = tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 90 | mean_tok_emb = model.transformer.wte.weight.data.mean(dim=0) 91 | model.resize_token_embeddings(len(tokenizer)) 92 | 93 | # Initialize the newly-added token embedding to the mean of all token embeddings 94 | for i in range(num_added_toks): 95 | model.transformer.wte.weight.data[-(i + 1), :] = mean_tok_emb 96 | 97 | label_column_names = [name for name in dataset["train"].column_names if "label" in name] 98 | 99 | # Tokenize data 100 | def preprocess_function(examples): 101 | batch = [] 102 | for t in range(len(examples['text'])): 103 | text = "\t".join([examples[name][t] for name in label_column_names]) + "\n\n" + examples['text'][t] + tokenizer.eos_token 104 | batch.append(text) 105 | 106 | result = tokenizer(batch, padding="max_length", truncation=True, 107 | max_length=args.model.sequence_len) 108 | 109 | return result 110 | 111 | # Tokenize data 112 | with train_args.main_process_first(desc="tokenizing dataset"): 113 | dataset = dataset.map( 114 | preprocess_function, batched=True, desc="tokenizing dataset", remove_columns=dataset.column_names['train'] 115 | ) 116 | 117 | if args.model.lora_dim > 0: 118 | model = convert_gpt2_attention_to_lora( 119 | model, r=args.model.lora_dim, lora_alpha=args.model.lora_alpha, lora_dropout=args.model.lora_dropout, 120 | enable_lora=[True, False, True], merge_weights=False 121 | ) 122 | mark_only_lora_as_trainable(model) 123 | 124 | if 
train_args.local_rank == 0: 125 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 126 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 127 | 128 | model = model.cuda() 129 | model.train() 130 | 131 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 132 | 133 | trainer = transformers.Trainer( 134 | args=train_args, 135 | model=model, 136 | train_dataset=dataset['train'], 137 | eval_dataset=dataset['validation'], 138 | data_collator=data_collator, 139 | tokenizer=tokenizer 140 | ) 141 | 142 | train_result = trainer.train() 143 | 144 | if train_args.local_rank == 0 or train_args.local_rank == -1: 145 | metrics = train_result.metrics 146 | trainer.save_model() 147 | trainer.log_metrics("train", metrics) 148 | trainer.save_metrics("train", metrics) 149 | 150 | if __name__ == "__main__": 151 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, ModelArguments)) 152 | train_args, model_args = arg_parser.parse_args_into_dataclasses() 153 | main(Arguments(train=train_args, model=model_args)) -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/generate-text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | '''Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)''' 17 | 18 | import argparse 19 | import collections 20 | import csv 21 | import os.path 22 | import random 23 | import sys 24 | import numpy as np 25 | import torch 26 | import transformers 27 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 28 | from tqdm import tqdm 29 | 30 | from dp_transformers.module_modification import convert_gpt2_attention_to_lora 31 | 32 | import logging 33 | logger = logging.getLogger(__name__) 34 | logger.setLevel(logging.INFO) 35 | 36 | 37 | MODEL_CLASSES = { 38 | "distilgpt2": (GPT2LMHeadModel, GPT2Tokenizer), 39 | "gpt2": (GPT2LMHeadModel, GPT2Tokenizer), 40 | "gpt2-medium": (GPT2LMHeadModel, GPT2Tokenizer), 41 | "gpt2-large": (GPT2LMHeadModel, GPT2Tokenizer), 42 | "gpt2-xl": (GPT2LMHeadModel, GPT2Tokenizer), 43 | } 44 | 45 | 46 | def set_seed(args): 47 | np.random.seed(args.seed) 48 | torch.manual_seed(args.seed) 49 | if args.n_gpu > 0: 50 | torch.cuda.manual_seed_all(args.seed) 51 | 52 | def calc_perplexity(encodings, cur_model): 53 | max_length = cur_model.config.n_positions 54 | stride = 512 55 | device = 'cuda' if torch.cuda.is_available() else "cpu" 56 | nlls_cur = [] 57 | 58 | for i in range(0, encodings.size(1), stride): 59 | begin_loc = max(i + stride - max_length, 0) 60 | end_loc = min(i + stride, encodings.size(1)) 61 | trg_len = end_loc - i # may be different from stride on last loop 62 | input_ids = encodings[:, begin_loc:end_loc].to(device) 63 | target_ids = input_ids.clone() 64 | target_ids[:, :-trg_len] = -100 65 | target_ids[target_ids==cur_model.config.pad_token_id] = -100 66 | 67 | with torch.no_grad(): 68 | outputs = cur_model(input_ids, labels=target_ids) 69 | nlls_cur.append(outputs[0] * trg_len) 70 | 71 | ppl_cur = torch.exp(torch.stack(nlls_cur).sum() / end_loc) 72 | 73 | return ppl_cur.item() 74 | 75 | def convert_model(checkpoint_path): 76 | sd = torch.load(os.path.join(checkpoint_path, "pytorch_model.bin"), map_location="cpu") 77 | state_dict = {} 78 | for key, value in sd.items(): 79 | key = key.replace("_module.module.", "") 80 | state_dict[key] = value 81 | return state_dict 82 | 83 | 84 | def main(): 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument( 87 | "--model_type", 88 | default=None, 89 | type=str, 90 | required=True, 91 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), 92 | ) 93 | parser.add_argument( 94 | "--model_name_or_path", 95 | default=None, 96 | type=str, 97 | required=True, 98 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()), 99 | ) 100 | 101 | parser.add_argument( 102 | "--input_training_file", 103 | default=None, 104 | type=str, 105 | required=True, 106 | ) 107 | 108 | parser.add_argument( 109 | "--output_dir", 110 | default=None, 111 | type=str, 112 | required=True, 113 | ) 114 | 115 | parser.add_argument("--length", type=int, default=128) 116 | parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") 117 | 118 | parser.add_argument( 119 | "--temperature", 120 | type=float, 121 | default=1.0, 122 | help="temperature of 1.0 has no effect, lower tend toward greedy sampling", 123 | ) 124 | parser.add_argument( 125 | "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" 126 | ) 127 | parser.add_argument("--k", type=int, default=50) 128 | parser.add_argument("--p", 
type=float, default=0.9) 129 | parser.add_argument("--num_beams", type=int, default=5) 130 | parser.add_argument("--batch_size", type=int, default=32) 131 | 132 | parser.add_argument("--do_sample", action="store_true", help="sampling when generation") 133 | 134 | parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") 135 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 136 | parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.") 137 | parser.add_argument("--total_sequences", type=int, default=100000, help="The number of total samples to generate.") 138 | 139 | parser.add_argument( 140 | "--fp16", 141 | action="store_true", 142 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", 143 | ) 144 | 145 | parser.add_argument("--lora_dim", type=int, default=0) 146 | parser.add_argument("--lora_alpha", type=int, default=32) 147 | parser.add_argument("--lora_dropout", type=float, default=0.0) 148 | 149 | args = parser.parse_args() 150 | 151 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 152 | args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() 153 | 154 | logger.warning(f"device: {args.device}, n_gpu: {args.n_gpu}, 16-bits training: {args.fp16}") 155 | 156 | set_seed(args) 157 | 158 | # Initialize the model and tokenizer 159 | try: 160 | args.model_type = args.model_type.lower() 161 | model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 162 | except KeyError: 163 | raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)") 164 | 165 | tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) 166 | 167 | if tokenizer.pad_token_id: 168 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model_type, pad_token_id=tokenizer.pad_token_id) 169 | else: 170 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model_type, pad_token_id=tokenizer.eos_token_id) 171 | 172 | model.resize_token_embeddings(len(tokenizer)) 173 | 174 | if args.lora_dim > 0: 175 | model = convert_gpt2_attention_to_lora( 176 | model, r=args.lora_dim, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout, 177 | enable_lora=[True, False, True], merge_weights=False 178 | ) 179 | 180 | state_dict = convert_model(args.model_name_or_path) 181 | model, *_ = model_class._load_pretrained_model( 182 | model, 183 | state_dict, 184 | [k for k in state_dict.keys()], # XXX: rename? 
185 | os.path.join(args.model_name_or_path, "pytorch_model.bin"), 186 | args.model_name_or_path, 187 | ) 188 | 189 | # Make sure token embedding weights are still tied if needed 190 | model.tie_weights() 191 | 192 | model.eval() 193 | model.to(args.device) 194 | 195 | if args.fp16: 196 | model.half() 197 | 198 | logger.info(args) 199 | 200 | def generate_text(prompt,seq_num,prompt_length): 201 | ppls_cur = [] 202 | all_data = [] 203 | 204 | for _ in tqdm(range(seq_num // args.batch_size + 1)): 205 | input_ids = torch.tensor(prompt, device=args.device).repeat(args.batch_size, 1) 206 | output_sequences = model.generate( 207 | input_ids=input_ids, 208 | max_length=args.length, 209 | temperature=args.temperature, 210 | top_k=args.k, 211 | top_p=args.p, 212 | early_stopping=True, 213 | repetition_penalty=args.repetition_penalty, 214 | do_sample=args.do_sample, 215 | num_return_sequences=2, # overgenerate to ensure we have enough non-empty generated sequences 216 | no_repeat_ngram_size=2, 217 | ) 218 | 219 | ppl = calc_perplexity(output_sequences, model) 220 | ppls_cur.append(ppl) 221 | 222 | generated_sequences = tokenizer.batch_decode(output_sequences, skip_special_tokens=True, 223 | clean_up_tokenization_spaces=True) 224 | 225 | for g in generated_sequences: 226 | labels, seq = g[:prompt_length], g[prompt_length:] 227 | seq = " ".join(seq.split()) 228 | labels = labels.strip().split("\t") 229 | if seq: 230 | all_data.append([seq]+labels) 231 | 232 | if len(all_data) >seq_num: 233 | all_data = random.sample(all_data,seq_num) 234 | return all_data,ppls_cur 235 | 236 | with torch.no_grad(): 237 | prompt_counter = collections.Counter() 238 | with open(args.input_training_file,encoding='utf-8') as rf: 239 | csv_reader = csv.reader(rf) 240 | title = next(csv_reader) 241 | 242 | label_column_index = [i for i,name in enumerate(title) if "label" in name] 243 | 244 | for line in csv_reader: 245 | prompt = "\t".join([line[idx] for idx in label_column_index]) + "\n\n" 246 | prompt_counter[prompt] += 1 247 | 248 | ratio_generation_training = args.total_sequences / sum(prompt_counter.values()) 249 | all_sequences = [] 250 | ppls_cur = [] 251 | 252 | for prompt_text in tqdm(prompt_counter): 253 | prompt = tokenizer(prompt_text)['input_ids'] 254 | num_seq_to_generate = round(prompt_counter[prompt_text] * ratio_generation_training) 255 | if num_seq_to_generate>0: 256 | sequences, ppls = generate_text(prompt, num_seq_to_generate, len(prompt_text)) 257 | all_sequences += sequences 258 | ppls_cur += ppls 259 | 260 | logger.info(f"Current PPL: %.2f±%.2f", np.mean(ppls_cur),np.std(ppls_cur)) 261 | logger.info(f"Total generated sequences: %d", len(all_sequences)) 262 | random.shuffle(all_sequences) 263 | 264 | #prefix = list(filter(None, args.model_name_or_path.split("/"))).pop() 265 | os.makedirs(args.output_dir, exist_ok=True) 266 | output_path = os.path.join(args.output_dir, str(args.length) + ".generations.csv") 267 | with open(output_path, 'w', newline='', encoding="utf-8") as wf: 268 | csv_writer = csv.writer(wf) 269 | csv_writer.writerow(title) 270 | for obj in all_sequences: 271 | if obj[0]: # remove empty sequences 272 | csv_writer.writerow(obj) 273 | 274 | 275 | if __name__ == "__main__": 276 | main() -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas==1.5.3 3 | scikit-learn 4 | torch==1.12.1 5 | 
transformers==4.20.1 6 | datasets==2.0.0 7 | prv-accountant==0.1.1.post1 8 | opacus==1.2.0 9 | git+https://github.com/microsoft/dp-transformers.git@39fb6878623594cb0ab1c9a273058487b8f8a710 -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/run-classification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE.""" 17 | # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 18 | 19 | import logging 20 | import os 21 | import random 22 | import sys 23 | from dataclasses import dataclass, field 24 | from typing import Optional 25 | from numpy import expand_dims 26 | from numpy import log 27 | from numpy import mean,std 28 | from numpy import exp 29 | import datasets 30 | import numpy as np 31 | from datasets import load_dataset, load_metric 32 | from math import floor 33 | import transformers 34 | from transformers import ( 35 | AutoConfig, 36 | AutoModelForSequenceClassification, 37 | AutoTokenizer, 38 | DataCollatorWithPadding, 39 | EvalPrediction, 40 | HfArgumentParser, 41 | PretrainedConfig, 42 | Trainer, 43 | TrainingArguments, 44 | default_data_collator, 45 | set_seed, 46 | ) 47 | from transformers.trainer_utils import get_last_checkpoint 48 | from transformers.utils import check_min_version 49 | from transformers.utils.versions import require_version 50 | from sklearn.metrics import confusion_matrix 51 | from scipy.special import softmax 52 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 53 | check_min_version("4.18.0") 54 | 55 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") 56 | 57 | task_to_keys = { 58 | "cola": ("sentence", None), 59 | "mnli": ("premise", "hypothesis"), 60 | "mrpc": ("sentence1", "sentence2"), 61 | "qnli": ("question", "sentence"), 62 | "qqp": ("question1", "question2"), 63 | "rte": ("sentence1", "sentence2"), 64 | "sst2": ("sentence", None), 65 | "stsb": ("sentence1", "sentence2"), 66 | "wnli": ("sentence1", "sentence2"), 67 | } 68 | 69 | logger = logging.getLogger(__name__) 70 | 71 | 72 | @dataclass 73 | class DataTrainingArguments: 74 | """ 75 | Arguments pertaining to what data we are going to input our model for training and eval. 76 | Using `HfArgumentParser` we can turn this class 77 | into argparse arguments to be able to specify them on 78 | the command line. 
79 | """ 80 | 81 | task_name: Optional[str] = field( 82 | default=None, 83 | metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, 84 | ) 85 | dataset_name: Optional[str] = field( 86 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 87 | ) 88 | label_column_name: Optional[str] = field( 89 | default=None, metadata={"help": "The name of the label column"} 90 | ) 91 | dataset_config_name: Optional[str] = field( 92 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 93 | ) 94 | max_seq_length: int = field( 95 | default=128, 96 | metadata={ 97 | "help": ( 98 | "The maximum total input sequence length after tokenization. Sequences longer " 99 | "than this will be truncated, sequences shorter will be padded." 100 | ) 101 | }, 102 | ) 103 | overwrite_cache: bool = field( 104 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 105 | ) 106 | pad_to_max_length: bool = field( 107 | default=True, 108 | metadata={ 109 | "help": ( 110 | "Whether to pad all samples to `max_seq_length`. " 111 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 112 | ) 113 | }, 114 | ) 115 | max_train_samples: Optional[int] = field( 116 | default=None, 117 | metadata={ 118 | "help": ( 119 | "For debugging purposes or quicker training, truncate the number of training examples to this " 120 | "value if set." 121 | ) 122 | }, 123 | ) 124 | max_eval_samples: Optional[int] = field( 125 | default=None, 126 | metadata={ 127 | "help": ( 128 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 129 | "value if set." 130 | ) 131 | }, 132 | ) 133 | max_predict_samples: Optional[int] = field( 134 | default=None, 135 | metadata={ 136 | "help": ( 137 | "For debugging purposes or quicker training, truncate the number of prediction examples to this " 138 | "value if set." 139 | ) 140 | }, 141 | ) 142 | train_file: Optional[str] = field( 143 | default=None, metadata={"help": "A csv or a json file containing the training data."} 144 | ) 145 | validation_file: Optional[str] = field( 146 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 147 | ) 148 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) 149 | 150 | def __post_init__(self): 151 | if self.task_name is not None: 152 | self.task_name = self.task_name.lower() 153 | if self.task_name not in task_to_keys.keys(): 154 | raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) 155 | elif self.dataset_name is not None: 156 | pass 157 | elif self.train_file is None or self.validation_file is None: 158 | raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") 159 | else: 160 | train_extension = self.train_file.split(".")[-1] 161 | assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." 162 | validation_extension = self.validation_file.split(".")[-1] 163 | assert ( 164 | validation_extension == train_extension 165 | ), "`validation_file` should have the same extension (csv or json) as `train_file`." 166 | 167 | 168 | @dataclass 169 | class ModelArguments: 170 | """ 171 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
172 | """ 173 | 174 | model_name_or_path: str = field( 175 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 176 | ) 177 | config_name: Optional[str] = field( 178 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 179 | ) 180 | tokenizer_name: Optional[str] = field( 181 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 182 | ) 183 | cache_dir: Optional[str] = field( 184 | default=None, 185 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 186 | ) 187 | use_fast_tokenizer: bool = field( 188 | default=True, 189 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 190 | ) 191 | model_revision: str = field( 192 | default="main", 193 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 194 | ) 195 | use_auth_token: bool = field( 196 | default=False, 197 | metadata={ 198 | "help": ( 199 | "Will use the token generated when running `transformers-cli login` (necessary to use this script " 200 | "with private models)." 201 | ) 202 | }, 203 | ) 204 | ignore_mismatched_sizes: bool = field( 205 | default=False, 206 | metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, 207 | ) 208 | 209 | binary_classification: bool = field( 210 | default=False, 211 | ) 212 | 213 | sample_dataset: bool = field( 214 | default=False, 215 | ) 216 | 217 | def main(): 218 | # See all possible arguments in src/transformers/training_args.py 219 | # or by passing the --help flag to this script. 220 | # We now keep distinct sets of args, for a cleaner separation of concerns. 221 | 222 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 223 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 224 | # If we pass only one argument to the script and it's the path to a json file, 225 | # let's parse it to get our arguments. 226 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 227 | else: 228 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 229 | 230 | # Setup logging 231 | logging.basicConfig( 232 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 233 | datefmt="%m/%d/%Y %H:%M:%S", 234 | handlers=[logging.StreamHandler(sys.stdout)], 235 | ) 236 | 237 | log_level = training_args.get_process_log_level() 238 | logger.setLevel(log_level) 239 | datasets.utils.logging.set_verbosity(log_level) 240 | transformers.utils.logging.set_verbosity(log_level) 241 | transformers.utils.logging.enable_default_handler() 242 | transformers.utils.logging.enable_explicit_format() 243 | 244 | # Log on each process the small summary: 245 | logger.warning( 246 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 247 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 248 | ) 249 | logger.info(f"Training/evaluation parameters {training_args}") 250 | 251 | # Detecting last checkpoint. 
252 | last_checkpoint = None 253 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 254 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 255 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 256 | raise ValueError( 257 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 258 | "Use --overwrite_output_dir to overcome." 259 | ) 260 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 261 | logger.info( 262 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 263 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 264 | ) 265 | 266 | # Set seed before initializing model. 267 | set_seed(training_args.seed) 268 | 269 | def sample_dataset(dataset,label_column_name,sample_size=100000): 270 | training_dataset = dataset['train'] 271 | sample_indices = [] 272 | label_list = training_dataset.unique(label_column_name) 273 | for label in label_list: 274 | indices = np.where(np.array(training_dataset[label_column_name])==label)[0] 275 | sample_num = round(sample_size * (len(indices)/len(training_dataset))) 276 | sample_indices.append(np.random.choice(indices,size=sample_num,replace=False)) 277 | sample_indices = np.concatenate(sample_indices) 278 | np.random.shuffle(sample_indices) 279 | training_dataset = training_dataset.select(sample_indices) 280 | dataset['train'] = training_dataset 281 | return dataset 282 | 283 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) 284 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 285 | # 286 | # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the 287 | # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named 288 | # label if at least two columns are provided. 289 | # 290 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this 291 | # single column. You can easily tweak this behavior (see below) 292 | # 293 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 294 | # download the dataset. 295 | if data_args.task_name is not None: 296 | # Downloading and loading a dataset from the hub. 297 | raw_datasets = load_dataset( 298 | "glue", 299 | data_args.task_name, 300 | cache_dir=model_args.cache_dir, 301 | use_auth_token=True if model_args.use_auth_token else None, 302 | ) 303 | elif data_args.dataset_name is not None: 304 | # Downloading and loading a dataset from the hub. 305 | raw_datasets = load_dataset( 306 | data_args.dataset_name, 307 | data_args.dataset_config_name, 308 | cache_dir=model_args.cache_dir, 309 | use_auth_token=True if model_args.use_auth_token else None, 310 | ) 311 | else: 312 | # Loading a dataset from your local files. 313 | # CSV/JSON training and evaluation files are needed. 314 | data_files = {"train": data_args.train_file, "validation": data_args.validation_file} 315 | 316 | # Get the test dataset: you can provide your own CSV/JSON test file (see below) 317 | # when you use `do_predict` without specifying a GLUE benchmark task. 
318 | if training_args.do_predict: 319 | if data_args.test_file is not None: 320 | train_extension = data_args.train_file.split(".")[-1] 321 | test_extension = data_args.test_file.split(".")[-1] 322 | assert ( 323 | test_extension == train_extension 324 | ), "`test_file` should have the same extension (csv or json) as `train_file`." 325 | data_files["test"] = data_args.test_file 326 | else: 327 | raise ValueError("Need either a GLUE task or a test file for `do_predict`.") 328 | 329 | for key in data_files.keys(): 330 | logger.info(f"load a local file for {key}: {data_files[key]}") 331 | 332 | if data_args.train_file.endswith(".csv"): 333 | # Loading a dataset from local csv files 334 | raw_datasets = load_dataset( 335 | "csv", 336 | data_files=data_files, 337 | cache_dir=model_args.cache_dir, 338 | use_auth_token=True if model_args.use_auth_token else None, 339 | ) 340 | 341 | else: 342 | # Loading a dataset from local json files 343 | raw_datasets = load_dataset( 344 | "json", 345 | data_files=data_files, 346 | cache_dir=model_args.cache_dir, 347 | use_auth_token=True if model_args.use_auth_token else None, 348 | ) 349 | # See more about loading any type of standard or custom dataset at 350 | # https://huggingface.co/docs/datasets/loading_datasets.html. 351 | if not data_args.label_column_name: 352 | label_column_name = [name for name in raw_datasets["train"].column_names if "label" in name][-1] 353 | else: 354 | label_column_name = data_args.label_column_name 355 | 356 | # Labels 357 | if data_args.task_name is not None: 358 | is_regression = data_args.task_name == "stsb" 359 | if not is_regression: 360 | label_list = raw_datasets["train"].features[label_column_name].names 361 | num_labels = len(label_list) 362 | else: 363 | num_labels = 1 364 | else: 365 | # Trying to have good defaults here, don't hesitate to tweak to your needs. 366 | # is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] 367 | is_regression = False 368 | if is_regression: 369 | num_labels = 1 370 | else: 371 | # A useful fast method: 372 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique 373 | label_list = raw_datasets["train"].unique(label_column_name) 374 | label_list.sort() # Let's sort it for determinism 375 | num_labels = len(label_list) 376 | if model_args.sample_dataset: 377 | raw_datasets = sample_dataset(raw_datasets,label_column_name) 378 | 379 | # Load pretrained model and tokenizer 380 | # 381 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 382 | # download model & vocab. 
383 | config = AutoConfig.from_pretrained( 384 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 385 | num_labels=num_labels, 386 | finetuning_task=data_args.task_name, 387 | cache_dir=model_args.cache_dir, 388 | revision=model_args.model_revision, 389 | use_auth_token=True if model_args.use_auth_token else None, 390 | ) 391 | tokenizer = AutoTokenizer.from_pretrained( 392 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 393 | cache_dir=model_args.cache_dir, 394 | use_fast=model_args.use_fast_tokenizer, 395 | revision=model_args.model_revision, 396 | use_auth_token=True if model_args.use_auth_token else None, 397 | ) 398 | model = AutoModelForSequenceClassification.from_pretrained( 399 | model_args.model_name_or_path, 400 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 401 | config=config, 402 | cache_dir=model_args.cache_dir, 403 | revision=model_args.model_revision, 404 | use_auth_token=True if model_args.use_auth_token else None, 405 | ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, 406 | ) 407 | 408 | # Preprocessing the raw_datasets 409 | if data_args.task_name is not None: 410 | sentence1_key, sentence2_key = task_to_keys[data_args.task_name] 411 | else: 412 | # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 413 | non_label_column_names = [name for name in raw_datasets["train"].column_names if "label" not in name] 414 | if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: 415 | sentence1_key, sentence2_key = "sentence1", "sentence2" 416 | else: 417 | if len(non_label_column_names) >= 2: 418 | sentence1_key, sentence2_key = non_label_column_names[:2] 419 | else: 420 | sentence1_key, sentence2_key = non_label_column_names[0], None 421 | 422 | # Padding strategy 423 | if data_args.pad_to_max_length: 424 | padding = "max_length" 425 | else: 426 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 427 | padding = False 428 | 429 | # Some models have set the order of the labels to use, so let's make sure we do use it. 430 | label_to_id = None 431 | if ( 432 | model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id 433 | and data_args.task_name is not None 434 | and not is_regression 435 | ): 436 | # Some have all caps in their config, some don't. 437 | label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} 438 | if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): 439 | label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} 440 | else: 441 | logger.warning( 442 | "Your model seems to have been trained with labels, but they don't match the dataset: ", 443 | f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
444 | "\nIgnoring the model labels as a result.", 445 | ) 446 | elif data_args.task_name is None and not is_regression: 447 | label_to_id = {v: i for i, v in enumerate(label_list)} 448 | 449 | if label_to_id is not None: 450 | model.config.label2id = label_to_id 451 | model.config.id2label = {id: label for label, id in config.label2id.items()} 452 | elif data_args.task_name is not None and not is_regression: 453 | model.config.label2id = {l: i for i, l in enumerate(label_list)} 454 | model.config.id2label = {id: label for label, id in config.label2id.items()} 455 | 456 | if data_args.max_seq_length > tokenizer.model_max_length: 457 | logger.warning( 458 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 459 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 460 | ) 461 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 462 | 463 | def preprocess_function(examples): 464 | # Tokenize the texts 465 | args = ( 466 | (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) 467 | ) 468 | result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) 469 | 470 | # Map labels to IDs (not necessary for GLUE tasks) 471 | if label_to_id is not None and label_column_name in examples: 472 | result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples[label_column_name]] 473 | return result 474 | 475 | with training_args.main_process_first(desc="dataset map pre-processing"): 476 | raw_datasets = raw_datasets.map( 477 | preprocess_function, 478 | batched=True, 479 | load_from_cache_file=not data_args.overwrite_cache, 480 | desc="Running tokenizer on dataset", 481 | ) 482 | if training_args.do_train: 483 | if "train" not in raw_datasets: 484 | raise ValueError("--do_train requires a train dataset") 485 | train_dataset = raw_datasets["train"] 486 | if data_args.max_train_samples is not None: 487 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 488 | train_dataset = train_dataset.select(range(max_train_samples)) 489 | 490 | if training_args.do_eval: 491 | if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: 492 | raise ValueError("--do_eval requires a validation dataset") 493 | eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] 494 | if data_args.max_eval_samples is not None: 495 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 496 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 497 | 498 | if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: 499 | if "test" not in raw_datasets and "test_matched" not in raw_datasets: 500 | raise ValueError("--do_predict requires a test dataset") 501 | predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"] 502 | if data_args.max_predict_samples is not None: 503 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 504 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 505 | 506 | # Log a few random samples from the training set: 507 | if training_args.do_train: 508 | for index in random.sample(range(len(train_dataset)), 3): 509 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 510 | 511 | # Get the metric function 512 | if data_args.task_name is not None: 513 | metric = 
load_metric("glue", data_args.task_name) 514 | else: 515 | metric = load_metric("accuracy") 516 | 517 | # assumes images have the shape 299x299x3, pixels in [0,255] 518 | def calculate_inception_score(yhat, n_split=5, eps=1E-16): 519 | # enumerate splits of images/predictions 520 | scores = list() 521 | n_part = floor(yhat.shape[0] / n_split) 522 | for i in range(n_split): 523 | # retrieve p(y|x) 524 | ix_start, ix_end = i * n_part, i * n_part + n_part 525 | p_yx = yhat[ix_start:ix_end] 526 | # calculate p(y) 527 | p_y = expand_dims(p_yx.mean(axis=0), 0) 528 | # calculate KL divergence using log probabilities 529 | kl_d = p_yx * (log(p_yx + eps) - log(p_y + eps)) 530 | # sum over classes 531 | sum_kl_d = kl_d.sum(axis=1) 532 | # average over images 533 | avg_kl_d = mean(sum_kl_d) 534 | # undo the log 535 | is_score = exp(avg_kl_d) 536 | # store 537 | scores.append(is_score) 538 | # average across images 539 | is_avg, is_std = mean(scores), std(scores) 540 | return is_avg 541 | 542 | # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a 543 | # predictions and label_ids field) and has to return a dictionary string to float. 544 | def compute_metrics(p: EvalPrediction): 545 | preds_prob = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 546 | preds = np.squeeze(preds_prob) if is_regression else np.argmax(preds_prob, axis=1) 547 | if data_args.task_name is not None: 548 | result = metric.compute(predictions=preds, references=p.label_ids) 549 | if len(result) > 1: 550 | result["combined_score"] = np.mean(list(result.values())).item() 551 | return result 552 | elif is_regression: 553 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 554 | else: 555 | matrix = confusion_matrix(p.label_ids, preds) 556 | per_class_acc = matrix.diagonal() / matrix.sum(axis=1) 557 | results = {"accuracy_class_" + str(k+1):per_class_acc[k] for k in range(len(per_class_acc))} 558 | results["accuracy_all"] = (preds == p.label_ids).astype(np.float32).mean().item() 559 | results['inception_score'] = calculate_inception_score(softmax(preds_prob,axis=1)) 560 | return results 561 | 562 | # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if 563 | # we already did the padding. 
564 | if data_args.pad_to_max_length: 565 | data_collator = default_data_collator 566 | elif training_args.fp16: 567 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 568 | else: 569 | data_collator = None 570 | 571 | # Initialize our Trainer 572 | trainer = Trainer( 573 | model=model, 574 | args=training_args, 575 | train_dataset=train_dataset if training_args.do_train else None, 576 | eval_dataset=eval_dataset if training_args.do_eval else None, 577 | compute_metrics=compute_metrics, 578 | tokenizer=tokenizer, 579 | data_collator=data_collator, 580 | ) 581 | 582 | # Training 583 | if training_args.do_train: 584 | checkpoint = None 585 | if training_args.resume_from_checkpoint is not None: 586 | checkpoint = training_args.resume_from_checkpoint 587 | elif last_checkpoint is not None: 588 | checkpoint = last_checkpoint 589 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 590 | metrics = train_result.metrics 591 | max_train_samples = ( 592 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 593 | ) 594 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 595 | 596 | #trainer.save_model() # Saves the tokenizer too for easy upload 597 | 598 | trainer.log_metrics("train", metrics) 599 | trainer.save_metrics("train", metrics) 600 | #trainer.save_state() 601 | 602 | # Evaluation 603 | if training_args.do_eval: 604 | logger.info("*** Evaluate ***") 605 | 606 | # Loop to handle MNLI double evaluation (matched, mis-matched) 607 | tasks = [data_args.task_name] 608 | eval_datasets = [eval_dataset] 609 | if data_args.task_name == "mnli": 610 | tasks.append("mnli-mm") 611 | eval_datasets.append(raw_datasets["validation_mismatched"]) 612 | combined = {} 613 | 614 | for eval_dataset, task in zip(eval_datasets, tasks): 615 | metrics = trainer.evaluate(eval_dataset=eval_dataset) 616 | 617 | max_eval_samples = ( 618 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 619 | ) 620 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 621 | 622 | if task == "mnli-mm": 623 | metrics = {k + "_mm": v for k, v in metrics.items()} 624 | if task is not None and "mnli" in task: 625 | combined.update(metrics) 626 | 627 | trainer.log_metrics("eval", metrics) 628 | trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics) 629 | 630 | if training_args.do_predict: 631 | logger.info("*** Predict ***") 632 | 633 | # Loop to handle MNLI double evaluation (matched, mis-matched) 634 | tasks = [data_args.task_name] 635 | predict_datasets = [predict_dataset] 636 | if data_args.task_name == "mnli": 637 | tasks.append("mnli-mm") 638 | predict_datasets.append(raw_datasets["test_mismatched"]) 639 | 640 | for predict_dataset, task in zip(predict_datasets, tasks): 641 | metrics = trainer.evaluate(eval_dataset=predict_dataset) 642 | trainer.log_metrics("test", metrics) 643 | trainer.save_metrics("test", combined if task is not None and "mnli" in task else metrics) 644 | 645 | # Removing the `label` columns because it contains -1 and Trainer won't like that. 
646 | predict_dataset = predict_dataset.remove_columns("label") 647 | predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions 648 | predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) 649 | 650 | output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") 651 | if trainer.is_world_process_zero(): 652 | with open(output_predict_file, "w") as writer: 653 | logger.info(f"***** Predict results {task} *****") 654 | writer.write("index\tprediction\n") 655 | for index, item in enumerate(predictions): 656 | if is_regression: 657 | writer.write(f"{index}\t{item:3.3f}\n") 658 | else: 659 | item = label_list[item] 660 | writer.write(f"{index}\t{item}\n") 661 | 662 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} 663 | if data_args.task_name is not None: 664 | kwargs["language"] = "en" 665 | kwargs["dataset_tags"] = "glue" 666 | kwargs["dataset_args"] = data_args.task_name 667 | kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}" 668 | 669 | if training_args.push_to_hub: 670 | trainer.push_to_hub(**kwargs) 671 | else: 672 | trainer.create_model_card(**kwargs) 673 | 674 | 675 | def _mp_fn(index): 676 | # For xla_spawn (TPUs) 677 | main() 678 | 679 | 680 | if __name__ == "__main__": 681 | main() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | from setuptools import setup, find_packages 6 | 7 | version = '1.0.1' 8 | 9 | with open('README.md') as f: 10 | long_description = f.read() 11 | 12 | setup( 13 | name='dp-transformers', 14 | version=version, 15 | description='Differentially-private transformers using HuggingFace and Opacus', 16 | long_description=long_description, 17 | long_description_content_type='text/markdown', 18 | url="https://www.github.com/microsoft/dp-transformers", 19 | author='Microsoft', 20 | packages=find_packages('src'), 21 | package_dir={'': 'src'}, 22 | python_requires=">=3.7.0", 23 | include_package_data=True, 24 | extras_require={ 25 | "test": [ 26 | "pytest", 27 | ] 28 | }, 29 | install_requires=[ 30 | "transformers>=4.30.0", 31 | "datasets>=2.0.0", 32 | "opacus>=1.3.0", 33 | "peft", 34 | "prv_accountant<0.2.0", 35 | "torch>=1.13.1", 36 | ], 37 | test_suite="tests", 38 | zip_safe=False 39 | ) 40 | -------------------------------------------------------------------------------- /src/dp_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .arguments import PrivacyArguments, TrainingArguments # noqa: F401 5 | from .dp_utils import DPCallback, DataCollatorForPrivateCausalLanguageModeling # noqa: F401 6 | from .sampler import PoissonAuthorSampler, ShuffledAuthorSampler # noqa: F401 7 | -------------------------------------------------------------------------------- /src/dp_transformers/arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from typing import Optional 5 | 6 | import numpy as np 7 | from scipy import optimize 8 | from transformers import TrainingArguments as HfTrainingArguments 9 | from transformers import IntervalStrategy, logging 10 | from dataclasses import dataclass, field 11 | from datasets.utils import disable_progress_bar 12 | from prv_accountant import Accountant 13 | 14 | logger = logging.get_logger(__name__) 15 | 16 | 17 | @dataclass 18 | class PrivacyArguments: 19 | per_sample_max_grad_norm: Optional[float] = field(default=None, metadata={"help": "Max per sample clip norm"}) 20 | noise_multiplier: Optional[float] = field(default=None, metadata={"help": "Noise multiplier for DP training"}) 21 | target_epsilon: Optional[float] = field(default=None, metadata={ 22 | "help": "Target epsilon at end of training (mutually exclusive with noise multiplier)" 23 | }) 24 | target_delta: Optional[float] = field(default=None, metadata={ 25 | "help": "Target delta, defaults to 1/N" 26 | }) 27 | disable_dp: bool = field(default=False, metadata={ 28 | "help": "Disable DP training." 29 | }) 30 | 31 | def initialize(self, sampling_probability: float, num_steps: int, num_samples: int) -> None: 32 | if self.target_delta is None: 33 | self.target_delta = 1.0/num_samples 34 | logger.info(f"The target delta is set to be: {self.target_delta}") 35 | 36 | # Set up noise multiplier 37 | if self.noise_multiplier is None: 38 | self.noise_multiplier = find_noise_multiplier( 39 | sampling_probability=sampling_probability, 40 | num_steps=num_steps, 41 | target_delta=self.target_delta, 42 | target_epsilon=self.target_epsilon 43 | ) 44 | logger.info(f"The noise multiplier is set to be: {self.noise_multiplier}") 45 | 46 | @property 47 | def is_initialized(self) -> bool: 48 | return ( 49 | self.per_sample_max_grad_norm is not None and 50 | self.noise_multiplier is not None and 51 | self.target_delta is not None 52 | ) 53 | 54 | def __post_init__(self): 55 | if self.disable_dp: 56 | logger.warning("Disabling differentially private training...") 57 | self.noise_multiplier = 0.0 58 | self.per_sample_max_grad_norm = float('inf') 59 | self.target_epsilon = None 60 | else: 61 | if bool(self.target_epsilon) == bool(self.noise_multiplier): 62 | raise ValueError("Exactly one of the arguments --target_epsilon and --noise_multiplier must be used.") 63 | if self.per_sample_max_grad_norm is None: 64 | raise ValueError("DP training requires --per_sample_max_grad_norm argument.") 65 | 66 | 67 | @dataclass 68 | class TrainingArguments(HfTrainingArguments): 69 | dry_run: bool = field( 70 | default=False, 71 | metadata={"help": "Option for reducing training steps (2) and logging intervals (1) for quick sanity checking of arguments."} 72 | ) 73 | 74 | def __post_init__(self): 75 | super().__post_init__() 76 | if self.dry_run: 77 | logger.warning("--dry_run was specified. Reducing number of training steps to 2 and logging intervals to 1...") 78 | self.logging_steps = 1 79 | self.logging_strategy = IntervalStrategy.STEPS 80 | self.eval_steps = 1 81 | self.evaluation_strategy = IntervalStrategy.STEPS 82 | 83 | self.max_steps = 2 84 | 85 | if self.disable_tqdm: 86 | disable_progress_bar() 87 | 88 | 89 | def find_noise_multiplier(sampling_probability: float, num_steps: int, target_epsilon: float, target_delta: float, 90 | eps_error: float=0.1) -> float: 91 | """ 92 | Find a noise multiplier that satisfies a given target epsilon. 
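    A rough usage sketch (the concrete numbers are placeholders, not recommendations)::

        noise_multiplier = find_noise_multiplier(
            sampling_probability=256 / 50_000,
            num_steps=int(3 * 50_000 / 256),
            target_epsilon=8.0,
            target_delta=1e-5,
        )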
93 | 94 | :param float sampling_probability: Probability of a record being in batch for Poisson sampling 95 | :param int num_steps: Number of optimisation steps 96 | :param float target_epsilon: Desired target epsilon 97 | :param float target_delta: Value of DP delta 98 | :param float eps_error: Error allowed for final epsilon 99 | """ 100 | def compute_epsilon(mu: float) -> float: 101 | acc = Accountant( 102 | noise_multiplier=mu, 103 | sampling_probability=sampling_probability, 104 | delta=target_delta, 105 | max_compositions=num_steps, 106 | eps_error=eps_error/2 107 | ) 108 | return acc.compute_epsilon(num_steps) 109 | 110 | mu_max = 100.0 111 | 112 | mu_R = 1.0 113 | eps_R = float('inf') 114 | while eps_R > target_epsilon: 115 | mu_R *= np.sqrt(2) 116 | try: 117 | eps_R = compute_epsilon(mu_R)[2] 118 | except (OverflowError, RuntimeError): 119 | pass 120 | if mu_R > mu_max: 121 | raise RuntimeError("Finding a suitable noise multiplier has not converged. " 122 | "Try increasing target epsilon or decreasing sampling probability.") 123 | 124 | mu_L = mu_R 125 | eps_L = eps_R 126 | while eps_L < target_epsilon: 127 | mu_L /= np.sqrt(2) 128 | eps_L = compute_epsilon(mu_L)[0] 129 | 130 | has_converged = False 131 | bracket = [mu_L, mu_R] 132 | while not has_converged: 133 | mu_err = (bracket[1]-bracket[0])*0.01 134 | mu_guess = optimize.root_scalar(lambda mu: compute_epsilon(mu)[2]-target_epsilon, bracket=bracket, xtol=mu_err).root 135 | bracket = [mu_guess-mu_err, mu_guess+mu_err] 136 | eps_up = compute_epsilon(mu_guess-mu_err)[2] 137 | eps_low = compute_epsilon(mu_guess+mu_err)[0] 138 | has_converged = (eps_up - eps_low) < 2*eps_error 139 | assert compute_epsilon(bracket[1])[2] < target_epsilon + eps_error 140 | 141 | return bracket[1] -------------------------------------------------------------------------------- /src/dp_transformers/dp_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | import datasets 6 | from datasets import Dataset 7 | import torch 8 | from torch import nn 9 | from torch.utils.data import DataLoader 10 | from transformers import ( 11 | Trainer, TrainerCallback, TrainerState, TrainerControl, logging, 12 | DataCollatorForLanguageModeling, PreTrainedTokenizer, training_args, modeling_utils 13 | ) 14 | from transformers.file_utils import is_sagemaker_mp_enabled, is_datasets_available 15 | import opacus 16 | from opacus.accountants import RDPAccountant 17 | from prv_accountant import Accountant as PRVAccountant 18 | from contextlib import contextmanager 19 | from typing import Any, Callable, List, Optional, Union, Dict, Sequence 20 | from accelerate.optimizer import AcceleratedOptimizer 21 | 22 | from dp_transformers import sampler, arguments 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | 27 | class DPCallback(TrainerCallback): 28 | """ 29 | This class registers all the necessary callbacks to make transformers.Trainer compatible with opacus. 
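    In typical use it is not constructed by hand: `OpacusDPTrainer` further down in this module
    instantiates it with the matching accountants and registers it through the Trainer's
    `callbacks` argument.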
30 | """ 31 | def __init__( 32 | self, 33 | noise_multiplier: float, 34 | target_delta: float, 35 | sampling_probability: float, 36 | rdp_accountant: RDPAccountant, 37 | prv_accountant: PRVAccountant, 38 | max_epsilon: float = float('inf') 39 | ) -> None: 40 | 41 | self.noise_multiplier = noise_multiplier 42 | self.target_delta = target_delta 43 | self.sampling_probability = sampling_probability 44 | self.rdp_accountant = rdp_accountant 45 | self.prv_accountant = prv_accountant 46 | 47 | self.max_epsilon = max_epsilon 48 | self.on_substep_end_was_called = False 49 | self.compute_rdp_epsilon = lambda: self.rdp_accountant.get_epsilon(self.target_delta) 50 | self.compute_prv_epsilon = lambda s: self.prv_accountant.compute_epsilon(s)[2] 51 | 52 | def on_substep_end(self, args: training_args.TrainingArguments, state: TrainerState, control: TrainerControl, optimizer=None, **kwargs): 53 | if optimizer is None: 54 | raise RuntimeError("Impossible to access optimizer from inside callback") 55 | if isinstance(optimizer, AcceleratedOptimizer): 56 | dp_optimizer = optimizer.optimizer 57 | else: 58 | dp_optimizer = optimizer 59 | dp_optimizer.signal_skip_step(do_skip=True) 60 | dp_optimizer.step() 61 | dp_optimizer.zero_grad() 62 | 63 | self.on_substep_end_was_called = True 64 | 65 | def on_step_end(self, args: training_args.TrainingArguments, state: TrainerState, control: TrainerControl, optimizer=None, **kwargs): 66 | if not ( 67 | args.gradient_accumulation_steps <= 1 or 68 | self.on_substep_end_was_called 69 | ): 70 | raise RuntimeError( 71 | "Gradient accumulation was specified but `on_substep_end` wasn't called. " 72 | "Make sure you're using a recent version of transformers (>=4.10.0) " 73 | "which has an appropriate callback in the trainer." 74 | ) 75 | 76 | if optimizer is None: 77 | raise RuntimeError("Impossible to access optimizer from inside callback") 78 | optimizer.zero_grad() # Opacus is bothered that HF does not call .zero_grad() on the optimizer 79 | 80 | self.rdp_accountant.step(noise_multiplier=self.noise_multiplier, sample_rate=self.sampling_probability) 81 | 82 | def on_save(self, args: training_args.TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): 83 | return self._check_max_epsilon_exceeded(state, control) 84 | 85 | def on_evaluate(self, args: training_args.TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): 86 | return self._check_max_epsilon_exceeded(state, control) 87 | 88 | def _check_max_epsilon_exceeded(self, state: TrainerState, control: TrainerControl) -> TrainerControl: 89 | eps_rdp = self.compute_rdp_epsilon() 90 | eps_prv = self.compute_prv_epsilon(state.global_step) 91 | if eps_rdp > self.max_epsilon or eps_prv > self.max_epsilon: 92 | logger.error("Max epsilon exceeded. Stopping training...") 93 | control.should_training_stop = True 94 | return control 95 | 96 | 97 | class DataCollatorForPrivateCausalLanguageModeling(DataCollatorForLanguageModeling): 98 | def __init__(self, tokenizer: PreTrainedTokenizer): 99 | super().__init__(tokenizer=tokenizer, mlm=False) 100 | 101 | def __call__(self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]: 102 | batch = super().__call__(examples) 103 | 104 | # Huggingface's default way of constructing position_ids is not compatible with Opacus 105 | # since Opacus is not able to deduce the batch size from the input. 
Here we manually 106 | # generate a position_ids tensor which has the same values as Huggingface's default tensor 107 | # but it is constructed in a way that is compatible with Opacus by repeating it along the batch dimension. 108 | if "position_ids" not in batch: 109 | input_ids = batch["input_ids"] 110 | batch["position_ids"] = torch.arange( 111 | input_ids.shape[1], dtype=torch.long, device=input_ids.device 112 | ).repeat(input_ids.shape[0], 1) 113 | return batch 114 | 115 | 116 | class GradSampleModule(opacus.GradSampleModule): 117 | """ 118 | Little wrapper to provide the `no_sync` context which is assumed by the Huggingface trainer. 119 | We don't need to do anything in addition here. 120 | """ 121 | @contextmanager 122 | def no_sync(self): 123 | yield 124 | 125 | 126 | def create_author_mapping(dataset: Dataset, author: str) -> Sequence[Sequence[int]]: 127 | """ 128 | Creates a mapping from authors to samples in a dataset. 129 | """ 130 | with dataset.formatted_as(type="pandas"): 131 | authors = pd.DataFrame(data={"author": dataset[author]}) 132 | author_mapping = [g.index.values for _, g in authors.groupby("author")] 133 | return author_mapping 134 | 135 | 136 | class OpacusDPTrainer(Trainer): 137 | """ 138 | Wrapper to modify the Huggingface Trainer to: 139 | (i) remove the "loss = loss / self.args.gradient_accumulation_steps" operation in training_step 140 | as this is already handled by the Opacus package. 141 | (ii) enable author-level DP training by modifying the sampler and the dataloader. In the case 142 | of sample-level DP, each sample can be represented by a unique author. 143 | (iii) wrap the optimizer with Opacus' DPOptimizer/DistributedDPOptimizer 144 | """ 145 | def __init__( 146 | self, 147 | model: Union[modeling_utils.PreTrainedModel, torch.nn.modules.module.Module] = None, 148 | args: arguments.TrainingArguments = None, 149 | train_dataset: Optional[torch.utils.data.dataset.Dataset] = None, 150 | privacy_args: arguments.PrivacyArguments = None, 151 | author_mapping: Optional[Sequence[Sequence[int]]] = None, 152 | **kwargs: Dict 153 | ) -> None: 154 | 155 | self.train_args = args 156 | self.privacy_args = privacy_args 157 | 158 | # Sample-level DP is equivalent to mapping each sample to a unique author. &#13;
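# In that case the default mapping built below is simply [[0], [1], ..., [N-1]], so the
# author-level machinery reduces to ordinary example-level DP-SGD.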
159 | if author_mapping is None: 160 | author_mapping = [[i] for i in range(len(train_dataset))] 161 | self.author_mapping = author_mapping 162 | 163 | if not self.privacy_args.is_initialized: 164 | self.privacy_args.initialize( 165 | sampling_probability=self.sampling_probability, 166 | num_steps=self.num_steps, 167 | num_samples=len(self.author_mapping), 168 | ) 169 | 170 | # Wrap model in DDP and GradSampleModule 171 | if args.parallel_mode == training_args.ParallelMode.DISTRIBUTED: 172 | logger.info(f"Wrapping the model with DPDDP in distributed training.") 173 | model = opacus.distributed.DifferentiallyPrivateDistributedDataParallel(model) 174 | 175 | model = GradSampleModule(model) 176 | 177 | # Instantiate privacy accountants 178 | self.rdp_accountant = RDPAccountant() 179 | self.prv_accountant = PRVAccountant( 180 | noise_multiplier=self.privacy_args.noise_multiplier, 181 | sampling_probability=self.sampling_probability, 182 | delta=self.privacy_args.target_delta, 183 | eps_error=0.1, 184 | max_compositions=self.num_steps 185 | ) 186 | 187 | # Set up callback for accounting and handling grad acc 188 | self.dp_callback = DPCallback( 189 | noise_multiplier=self.privacy_args.noise_multiplier, 190 | target_delta=self.privacy_args.target_delta, 191 | sampling_probability=self.sampling_probability, 192 | rdp_accountant=self.rdp_accountant, 193 | prv_accountant=self.prv_accountant 194 | ) 195 | super().__init__(model=model, args=args, train_dataset=train_dataset, callbacks=[self.dp_callback], **kwargs) 196 | 197 | self.get_rdp_epsilon = lambda: self.rdp_accountant.get_epsilon(self.privacy_args.target_delta) # RDP epsilon 198 | self.get_prv_epsilon = lambda: self.prv_accountant.compute_epsilon(self.state.global_step)[2] 199 | 200 | @property 201 | def sampling_probability(self) -> float: 202 | return self.train_args.per_device_train_batch_size * self.train_args.world_size * \ 203 | self.train_args.gradient_accumulation_steps / len(self.author_mapping) 204 | 205 | @property 206 | def num_steps(self) -> int: 207 | return int(self.train_args.num_train_epochs * (1 / self.sampling_probability + 1)) 208 | 209 | def create_optimizer(self): 210 | _ = super().create_optimizer() 211 | 212 | if self.args.parallel_mode == training_args.ParallelMode.DISTRIBUTED: 213 | optimizer_generator = opacus.optimizers.DistributedDPOptimizer 214 | else: 215 | optimizer_generator = opacus.optimizers.DPOptimizer 216 | 217 | self.optimizer = optimizer_generator( 218 | optimizer=self.optimizer, 219 | noise_multiplier=self.privacy_args.noise_multiplier, 220 | max_grad_norm=self.privacy_args.per_sample_max_grad_norm, 221 | expected_batch_size=self.args.per_device_train_batch_size * self.args.gradient_accumulation_steps, 222 | ) 223 | 224 | return self.optimizer 225 | 226 | def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: 227 | """ 228 | Perform a training step on a batch of inputs. 229 | 230 | Subclass and override to inject custom behavior. 231 | 232 | Args: 233 | model (:obj:`nn.Module`): 234 | The model to train. 235 | inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): 236 | The inputs and targets of the model. 237 | 238 | The dictionary will be unpacked before being fed to the model. Most models expect the targets under the 239 | argument :obj:`labels`. Check your model's documentation for all accepted arguments. 240 | 241 | Return: 242 | :obj:`torch.Tensor`: The tensor with training loss on this batch. 
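        Note that the returned value is divided by `gradient_accumulation_steps` purely so that
        the Trainer's logging stays comparable to non-DP runs; the gradients themselves are not
        rescaled here because Opacus already accounts for gradient accumulation.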
243 | """ 244 | model.train() 245 | inputs = self._prepare_inputs(inputs) 246 | 247 | if is_sagemaker_mp_enabled(): 248 | raise NotImplementedError("DP currently doesn't support this") 249 | 250 | with self.compute_loss_context_manager(): 251 | loss = self.compute_loss(model, inputs) 252 | 253 | if self.args.n_gpu > 1: 254 | loss = loss.mean() # mean() to average on multi-gpu parallel training 255 | 256 | # Compared to the original HF implementation, we have to remove the loss scaling by the number of gradient 257 | # accumulation steps since opacus scales the gradients accordingly. However, we still need to scale the loss 258 | # that is returned in order for the logging to work correctly. Hence we scale the loss after the call to 259 | # loss.backward() 260 | 261 | if self.use_apex: 262 | raise NotImplementedError("DP currently doesn't support this") 263 | else: 264 | loss.backward() 265 | 266 | return loss.detach()/self.args.gradient_accumulation_steps 267 | 268 | def _get_train_sampler(self): 269 | """ 270 | Provides author sampler. 271 | """ 272 | train_sampler = sampler.ShuffledAuthorSampler( 273 | author_mapping=self.author_mapping, 274 | batch_size=self.args.per_device_train_batch_size, 275 | world_size=self.args.world_size 276 | ) 277 | return train_sampler 278 | 279 | def get_train_dataloader(self) -> DataLoader: 280 | """ 281 | Returns the training :class:`~torch.utils.data.DataLoader`. 282 | 283 | Will use the author-level sampler from dp_transformers. 284 | """ 285 | if self.train_dataset is None: 286 | raise ValueError("Trainer: training requires a train_dataset.") 287 | 288 | train_sampler = self._get_train_sampler() 289 | 290 | train_dataset = self.train_dataset 291 | if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): 292 | train_dataset = self._remove_unused_columns(train_dataset, description="training") 293 | 294 | return DataLoader( 295 | train_dataset, 296 | batch_sampler=train_sampler, 297 | collate_fn=self.data_collator, 298 | drop_last=self.args.dataloader_drop_last, 299 | num_workers=self.args.dataloader_num_workers, 300 | pin_memory=self.args.dataloader_pin_memory, 301 | ) 302 | -------------------------------------------------------------------------------- /src/dp_transformers/grad_sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dp-transformers/f9fae445b1d3bb28355dbaac6720c007abb974ce/src/dp_transformers/grad_sample/__init__.py -------------------------------------------------------------------------------- /src/dp_transformers/grad_sample/transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dp-transformers/f9fae445b1d3bb28355dbaac6720c007abb974ce/src/dp_transformers/grad_sample/transformers/__init__.py -------------------------------------------------------------------------------- /src/dp_transformers/grad_sample/transformers/conv_1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from typing import Dict 5 | 6 | import torch 7 | import torch.nn as nn 8 | from opt_einsum import contract 9 | from typing import List 10 | 11 | from opacus.grad_sample.utils import register_grad_sampler 12 | 13 | from transformers.modeling_utils import Conv1D 14 | 15 | 16 | @register_grad_sampler(Conv1D) 17 | def compute_transformers_conv1d_grad_sample( 18 | layer: Conv1D, activations: List[torch.Tensor], backprops: torch.Tensor 19 | ) -> Dict[nn.Parameter, torch.Tensor]: 20 | activations = activations[0] 21 | ret = {} 22 | if layer.weight.requires_grad: 23 | ret[layer.weight] = contract("n...i,n...j->nji", backprops, activations).contiguous() 24 | if layer.bias is not None and layer.bias.requires_grad: 25 | ret[layer.bias] = contract("n...k->nk", backprops) 26 | return ret 27 | -------------------------------------------------------------------------------- /src/dp_transformers/module_modification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import warnings 5 | import torch 6 | from transformers import GPT2Model, GPT2PreTrainedModel 7 | from typing import List 8 | 9 | 10 | def force_causal_attention(model: GPT2Model): 11 | """ 12 | Force a GPT2 model to use causal attention 13 | 14 | Some variants of GPT2 may use bi-directional attention for the context. 15 | This can cause issues when training in an auto-regressive fashion. This function forces causal attention 16 | """ 17 | if not isinstance(model, GPT2Model): 18 | raise TypeError("Requires a GPT2 model") 19 | 20 | if not hasattr(model, "h") and hasattr(model, "transformer"): 21 | warnings.warn("""It looks like you have a model with a classification or LM head. """ 22 | """If this is the case, pass `model.transformer` to `force_causal_attention` to avoid this warning. """, UserWarning) 23 | transformer = model.transformer 24 | else: 25 | transformer = model 26 | 27 | 28 | for h_i in transformer.h: 29 | h_i.attn.bias = torch.tril(h_i.attn.bias) 30 | 31 | return model 32 | 33 | 34 | -------------------------------------------------------------------------------- /src/dp_transformers/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
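# Illustrative usage sketch (the dataset, collator and column name here are assumptions made
# for the example): the samplers below yield whole batches of indices, so they are passed to a
# DataLoader via `batch_sampler`:
#
#     from torch.utils.data import DataLoader
#     from dp_transformers.dp_utils import create_author_mapping
#     from dp_transformers.sampler import ShuffledAuthorSampler
#
#     author_mapping = create_author_mapping(dataset, author="author")
#     batch_sampler = ShuffledAuthorSampler(author_mapping, batch_size=32, world_size=1)
#     loader = DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=data_collator)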
3 | 4 | import torch 5 | 6 | from typing import Sequence 7 | from torch.utils.data.sampler import Sampler, BatchSampler, SubsetRandomSampler, RandomSampler 8 | from torch.utils.data.distributed import DistributedSampler 9 | from opacus.utils.uniform_sampler import UniformWithReplacementSampler 10 | 11 | from typing import Iterator, List 12 | 13 | class AuthorSampler(Sampler): 14 | def __init__(self, author_sampler: Sampler, author_mapping: Sequence[Sequence[int]]): 15 | self.author_mapping = list(author_mapping) 16 | self.author_sampler = author_sampler 17 | self.indices = [0 for _ in range(len(self.author_mapping))] 18 | 19 | def __len__(self) -> int: 20 | return len(self.author_sampler) 21 | 22 | def __iter__(self) -> Iterator[List[int]]: 23 | for batch_author_ids in self.author_sampler: 24 | sample_ids = [self.indices[author_id] for author_id in batch_author_ids] 25 | for author_id in batch_author_ids: 26 | self.indices[author_id] += 1 27 | self.indices[author_id] = self.indices[author_id] % len(self.author_mapping[author_id]) 28 | yield [int(self.author_mapping[author_id][sample_id]) for author_id, sample_id in zip(batch_author_ids, sample_ids)] 29 | 30 | 31 | class PoissonAuthorSampler(AuthorSampler): 32 | def __init__(self, author_mapping: Sequence[Sequence[int]], sample_rate: float) -> None: 33 | """ 34 | Create batches by first sampling authors with uniform probability and then sampling a random element from each sampled author. 35 | 36 | :param author_mapping: A mapping where `dataset[author_mapping[i][j]]` produces the j-th sample of the i-th author in the dataset. 37 | :type author_mapping: Sequence[Sequence[int]] 38 | :param float sample_rate: Probability with which an author is sampled; the expected batch size is `sample_rate * len(author_mapping)`. 39 | """ 40 | author_sampler = UniformWithReplacementSampler( 41 | num_samples=len(author_mapping), 42 | sample_rate=sample_rate 43 | ) 44 | super().__init__(author_sampler, author_mapping) 45 | 46 | 47 | class ShuffledAuthorSampler(AuthorSampler): 48 | def __init__(self, author_mapping: Sequence[Sequence[int]], batch_size: int, world_size: int) -> None: 49 | """ 50 | Create batches by first shuffling the authors and then sampling the next element from each author. 51 | 52 | :param author_mapping: A mapping where `dataset[author_mapping[i][j]]` produces the j-th sample of the i-th author in the dataset. 53 | :type author_mapping: Sequence[Sequence[int]] 54 | :param int batch_size: Batch size of the output 55 | """ 56 | if world_size <= 1: 57 | author_sampler = BatchSampler(RandomSampler(author_mapping), batch_size=batch_size, drop_last=True) 58 | else: 59 | author_sampler = BatchSampler(DistributedSampler(author_mapping), batch_size=batch_size, drop_last=True) 60 | super().__init__(author_sampler, author_mapping) -------------------------------------------------------------------------------- /tests/test_dp_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. &#13;
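# These tests exercise find_noise_multiplier end to end against the PRV accountant. They can be
# run in isolation with, for example, `pytest tests/test_dp_utils.py`, assuming the package has
# been installed with its `test` extra.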
3 | 4 | from dp_transformers.arguments import find_noise_multiplier 5 | from prv_accountant import Accountant 6 | import pytest 7 | 8 | class TestFindNoiseMultiplier: 9 | def test_sensible_range(self): 10 | mu = find_noise_multiplier(2e-3, 10_000, 4.0, 1e-7) 11 | assert 0 < mu and mu < 2 # Check that mu is in a sensible interval 12 | 13 | def test_inverse(self): 14 | mu = find_noise_multiplier(2e-3, 10_000, 4.0, 1e-7) 15 | acc = Accountant(mu, 2e-3, 1e-7, 10_000, eps_error = 0.5) 16 | eps = acc.compute_epsilon(10_000) 17 | assert eps[2] == pytest.approx(4, abs=0.5) 18 | 19 | def test_robustness(self): 20 | with pytest.warns(None) as record: 21 | mu = find_noise_multiplier( 22 | sampling_probability=256/50_000, 23 | num_steps=int(50*50_000/256), 24 | target_epsilon=10.0, 25 | target_delta=1e-5 26 | ) 27 | assert len(record) == 0 28 | 29 | def test_robustness_2(self): 30 | mu = find_noise_multiplier( 31 | sampling_probability=0.26058631921824105, 32 | num_steps=18800, 33 | target_delta=0.00011448277499759097, 34 | target_epsilon=4.0 35 | ) 36 | # Just test that this doesn't cause a floating point overflow 37 | print(mu) -------------------------------------------------------------------------------- /tests/test_grad_sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dp-transformers/f9fae445b1d3bb28355dbaac6720c007abb974ce/tests/test_grad_sample/__init__.py -------------------------------------------------------------------------------- /tests/test_grad_sample/test_transformers_conv_1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import torch 5 | 6 | from opacus.tests.grad_samples.common import GradSampleHooks_test 7 | 8 | from transformers.modeling_utils import Conv1D 9 | 10 | from dp_transformers.grad_sample.transformers import conv_1d 11 | 12 | 13 | class TestConv1D(GradSampleHooks_test): 14 | def test_grad_sample(self): 15 | """ 16 | Verify that our custom implementation of the grad sample for huggingface's Conv1D 17 | layer works. We largely build on the test routines in opacus's library. 18 | """ 19 | x = torch.randn(16, 8) 20 | layer = Conv1D(4, 8) 21 | self.run_test(x, layer, batch_first=True, ew_compatible=False) 22 | 23 | self.run_test(torch.randn(24, 8, 8), Conv1D(4, 8), batch_first=True, ew_compatible=False) 24 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | 6 | from transformers import AutoModelForCausalLM 7 | from opacus.validators import ModuleValidator 8 | from opacus.validators.errors import UnsupportedModuleError 9 | 10 | 11 | @pytest.mark.xfail(reason='functorch can deal with module in Opacus 1.2') 12 | def test_gpt2_grad_sample_layers_registered(): 13 | """ 14 | Test whether all layers in GPT2 are registered in the grad sampler. 
15 | """ 16 | model = AutoModelForCausalLM.from_pretrained("distilgpt2") 17 | model.train() 18 | 19 | validator = ModuleValidator() 20 | 21 | # We haven't registered the grad samples yet so make sure that it actually fails 22 | with pytest.raises(UnsupportedModuleError): 23 | validator.validate(model, strict=True) 24 | 25 | # Register the grad samples 26 | from dp_transformers.grad_sample.transformers import conv_1d 27 | 28 | # Now make sure that it works 29 | validator.validate(model, strict=True) 30 | --------------------------------------------------------------------------------