├── .github ├── CODEOWNERS └── workflows │ ├── build-package.yml │ ├── codeql.yml │ ├── publish-package.yml │ ├── test-examples-env.yml │ └── test-examples.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── examples ├── nlg-reddit │ ├── author-level-dp │ │ ├── README.md │ │ ├── aml │ │ │ ├── fuft-eps_8.yml │ │ │ └── peft-eps_8.yml │ │ ├── environment.yml │ │ └── fine-tune-dp.py │ └── sample-level-dp │ │ ├── README.md │ │ ├── aml │ │ ├── fuft-eps_8.yml │ │ ├── fuft-eps_inf.yml │ │ ├── peft-eps_8-gpus_1.yml │ │ ├── peft-eps_8.yml │ │ └── peft-eps_inf.yml │ │ ├── environment.yml │ │ ├── fine-tune-dp.py │ │ └── fine-tune-nodp.py └── test_examples.py ├── research ├── fine_tune_llm_w_qlora │ ├── README.md │ ├── aml │ │ ├── cnn │ │ │ ├── peft-eps_8.yml │ │ │ └── peft-eps_inf.yml │ │ ├── qnli │ │ │ ├── peft-eps_8.yml │ │ │ └── peft-eps_inf.yml │ │ └── sst2 │ │ │ ├── peft-eps_8.yml │ │ │ └── peft-eps_inf.yml │ ├── data_utils.py │ ├── environment.yml │ ├── fine-tune-dp.py │ ├── fine-tune-nodp.py │ └── linear.py └── synthetic-text-generation-with-DP │ ├── NOTICE.txt │ ├── README.md │ ├── fine-tune-dp.py │ ├── fine-tune-nodp.py │ ├── generate-text.py │ ├── requirements.txt │ └── run-classification.py ├── setup.py ├── src └── dp_transformers │ ├── __init__.py │ ├── arguments.py │ ├── dp_utils.py │ ├── grad_sample │ ├── __init__.py │ └── transformers │ │ ├── __init__.py │ │ └── conv_1d.py │ ├── module_modification.py │ └── sampler.py └── tests ├── test_dp_utils.py ├── test_grad_sample ├── __init__.py └── test_transformers_conv_1d.py └── test_models.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @microsoft/ppml will be requested for 4 | # review when someone opens a pull request. 5 | * @microsoft/ppml 6 | -------------------------------------------------------------------------------- /.github/workflows/build-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8", "3.9"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | python -m pip install .[test] 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest tests 42 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # This is based on the standard CodeQL workflow provided by Github 2 | name: "CodeQL" 3 | 4 | on: 5 | push: 6 | branches: [ "main" ] 7 | pull_request: 8 | # The branches below must be a subset of the branches above 9 | branches: [ "main" ] 10 | schedule: 11 | - cron: '35 2 * * 3' 12 | 13 | jobs: 14 | analyze: 15 | name: Analyze 16 | runs-on: ubuntu-latest 17 | permissions: 18 | actions: read 19 | contents: read 20 | security-events: write 21 | 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | language: [ 'python' ] 26 | 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v3 30 | 31 | # Initializes the CodeQL tools for scanning. 32 | - name: Initialize CodeQL 33 | uses: github/codeql-action/init@v2 34 | with: 35 | languages: ${{ matrix.language }} 36 | 37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 38 | # If this step fails, then you should remove it and run the build manually (see below) 39 | - name: Autobuild 40 | uses: github/codeql-action/autobuild@v2 41 | 42 | - name: Perform CodeQL Analysis 43 | uses: github/codeql-action/analyze@v2 44 | -------------------------------------------------------------------------------- /.github/workflows/publish-package.yml: -------------------------------------------------------------------------------- 1 | name: Publish package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | python -m pip install setuptools wheel twine build 18 | - name: Build 19 | run: | 20 | python -m build --sdist --wheel --outdir dist/ . 21 | - name: Publish to PyPI 22 | uses: pypa/gh-action-pypi-publish@master 23 | with: 24 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/test-examples-env.yml: -------------------------------------------------------------------------------- 1 | name: Test examples environment 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | run: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | example: [ 16 | "examples/nlg-reddit/sample-level-dp", 17 | "examples/nlg-reddit/author-level-dp" 18 | ] 19 | steps: 20 | - uses: actions/checkout@v3 21 | - uses: conda-incubator/setup-miniconda@v2 22 | with: 23 | activate-environment: env 24 | environment-file: ${{ matrix.example }}/environment.yml 25 | auto-activate-base: false 26 | - name: Install package 27 | run: | 28 | /usr/share/miniconda/envs/env/bin/pip install -e . 
29 | -------------------------------------------------------------------------------- /.github/workflows/test-examples.yml: -------------------------------------------------------------------------------- 1 | name: Test examples 2 | 3 | on: 4 | workflow_dispatch 5 | 6 | jobs: 7 | submit: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 3.9 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: 3.9 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | python -m pip install azure-cli 20 | az extension add -n ml 21 | python -m pip install pytest pytest-xdist azureml-core 22 | - name: Set up Azure ML CLI 23 | run: | 24 | az login --service-principal -u "${{ secrets.AZ_CLIENT_ID }}" -p "${{ secrets.AZ_CLIENT_SECRET }}" --tenant "${{ secrets.AZ_TENANT_ID }}" 25 | az account set --subscription "${{ secrets.AZ_SUBSCRIPTION_ID }}" 26 | az configure --defaults group=${{ secrets.AZ_RESOURCE_GROUP }} workspace=${{ secrets.AZ_WORKSPACE_NAME }} 27 | - name: Run examples with pytest 28 | run: | 29 | pytest -n 16 -s examples -v --junitxml=junit/test-results.xml 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | /data 141 | /.vscode 142 | /.amltconfig 143 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dp-transformers 2 | 3 | :warning: This repo is intended for research projects and prototypes. 4 | While we try to provide tests for all the functionality, the repo has not (yet) undergone the detailed review process that is necessary for deploying a system of critical nature such as privacy. 5 | 6 | ## Introduction 7 | 8 | See [dp-transformers](https://www.microsoft.com/en-us/research/project/dp-transformers) for a brief introduction to our repository. 9 | 10 | ## Installation 11 | 12 | For installing the `dp-transformers` package, you can just type 13 | 14 | ``` 15 | pip install . 
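# Optionally include the test extras that the CI workflow installs as well (see .github/workflows/build-package.yml):
pip install .[test]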
16 | ```
17 | 
18 | ## Examples
19 | 
20 | See `./examples` for end-to-end examples of how to use the library.
21 | 
22 | A basic example can be found in `examples/nlg-reddit/sample-level-dp/fine-tune-dp.py`.
23 | First, create an Anaconda environment by doing `conda env create -f examples/nlg-reddit/sample-level-dp/environment.yml`.
24 | Then, you can run the example using the following command (here we assume there are 16 GPUs in the machine, and thus set `--nproc_per_node 16`):
25 | 
26 | ```
27 | python -m torch.distributed.run --nproc_per_node 16 examples/nlg-reddit/sample-level-dp/fine-tune-dp.py \
28 | --output_dir scratch \
29 | --model_name gpt2 \
30 | --sequence_len 128 \
31 | --per_device_train_batch_size 32 \
32 | --gradient_accumulation_steps 2 \
33 | --evaluation_strategy steps \
34 | --eval_steps 45 \
35 | --log_level info \
36 | --per_device_eval_batch_size 64 \
37 | --eval_accumulation_steps 1 \
38 | --seed 42 \
39 | --target_epsilon 8 \
40 | --per_sample_max_grad_norm 1.0 \
41 | --prediction_loss_only \
42 | --weight_decay 0.01 \
43 | --remove_unused_columns False \
44 | --num_train_epochs 3 \
45 | --logging_steps 5 \
46 | --max_grad_norm 0 \
47 | --lr_scheduler_type constant \
48 | --learning_rate 1e-4 \
49 | --disable_tqdm True \
50 | --dataloader_num_workers 2
51 | ```
52 | 
53 | ## 🤗 Transformers with Opacus
54 | 
55 | ### Trainer
56 | 
57 | Hugging Face's `Trainer` provides callback hooks, which we use to make sure the required methods of the privacy engine are called.
58 | 
59 | You can use the callback as demonstrated in the example below:
60 | 
61 | ``` python
62 | privacy_engine = opacus.PrivacyEngine(module=model, ...)
63 | 
64 | # No need to attach the privacy engine to the optimizer. The callback will automatically attach it to the optimizer.
65 | 
66 | trainer = transformers.Trainer(
67 |     model = model,
68 |     [...],
69 |     callbacks = [dp_transformers.PrivacyEngineCallback(privacy_engine)] # <-- Add this line to make sure the privacy engine is used in the trainer
70 |     [...]
71 | )
72 | ```
73 | 
74 | ### Data Collation
75 | 
76 | The 🤗 Transformers library often provides sensible default arguments.
77 | For example, when no `position_ids` are provided, the library will automatically use incrementing integers.
78 | The way this is implemented is by first creating a tensor of shape `[1, sequence_length]` filled with increasing integers.
79 | During a second step that tensor is replicated for the whole batch.
80 | However, the replication is part of the computational graph and hence Opacus cannot infer the batch size from this input tensor.
81 | 
82 | We have therefore implemented a custom data collator (see `dp_transformers.DataCollatorForPrivateCausalLanguageModeling`) which automatically creates the `position_ids` input tensor by using `torch.repeat`.
83 | This works with Opacus since the `position_ids` tensor then appears as batch-size many separate inputs in the computation graph.
84 | 
85 | ### GPT2
86 | 
87 | The 🤗 Transformers implementation for GPT2 uses a custom layer type, namely `Conv1D`.
88 | It is not quite clear why this was introduced, since it is essentially a regular linear layer.
89 | This causes problems with Opacus, however, since Opacus does not know how to apply the backward hooks for this layer.
90 | 
91 | In this repo we provide an implementation for handling this type of layer.
92 | See `dp_transformers.grad_sample.transformers.conv_1d`.
93 | 
94 | All necessary grad samplers can be registered by merely importing `conv_1d` before the model training.
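For example, a minimal sketch of the import order used by the example scripts in `./examples` (the import alone performs the registration; the rest of the training setup proceeds as usual):

``` python
import transformers
import dp_transformers

# Importing this module registers the grad sampler for Conv1D with Opacus.
# It must run before the model is wrapped for private training.
from dp_transformers.grad_sample.transformers import conv_1d  # noqa: F401

model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
# ... then build dp_transformers.dp_utils.OpacusDPTrainer as in the example scripts
```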
95 | See the Known Issues section below for more details.
96 | 
97 | ## General tips for DP training
98 | 
99 | In this section, we collect a few helpful strategies for training models with DP.
100 | Opacus's FAQ also has a few tips on how to get started with DP training (see [Opacus FAQ](https://opacus.ai/docs/faq)).
101 | 
102 | ### Hyper-parameters
103 | 
104 | Larger batch sizes help DP training.
105 | As a general rule, try starting with a batch size of $\sqrt{|D|}$, where $|D|$ is the size of the training dataset (for the 500,000-sample Reddit example above, that is roughly 700).
106 | Since Opacus increases memory consumption significantly, this is usually only possible using gradient accumulation.
107 | 
108 | We have found a surprisingly small dependence on the clipping norm.
109 | As a general rule of thumb, start with a clipping parameter of 0.1.
110 | 
111 | Fine-tuning the model longer is also helpful.
112 | 
113 | 
114 | ### Deploying DP trained models
115 | 
116 | Pay attention to which pseudo-random number generator (PRNG) was used.
117 | PyTorch's default (Mersenne Twister) might be attackable.
118 | See the [Opacus FAQ](https://opacus.ai/docs/faq#what-is-the-secure_rng-argument-in-privacyengine).
119 | Make sure to use a better PRNG before deploying models.
120 | 
121 | ## Known issues
122 | 
123 | ### Registering custom grad samplers late in the training process
124 | 
125 | When registering a custom grad sampler like `dp_transformers.grad_sample.transformers.conv_1d`, functions are added to a global dictionary that Opacus handles.
126 | This global dictionary is used to establish whether models are compatible with Opacus and how to handle the per-sample gradient computation.
127 | All grad samplers need to be registered as early as possible in the training process,
128 | definitely before the model is wrapped with `GradSampleModule`.
129 | 
130 | ## How to Cite
131 | 
132 | ```
133 | @misc{dp-transformers,
134 |   author = {Lukas Wutschitz and Huseyin A. Inan and Andre Manoel},
135 |   title = {dp-transformers: Training transformer models with differential privacy},
136 |   year = {2022},
137 |   month = {August},
138 |   howpublished = {\url{https://www.microsoft.com/en-us/research/project/dp-transformers}}
139 | }
140 | ```
141 | 
142 | ## Contributing
143 | 
144 | This project welcomes contributions and suggestions. Most contributions require you to
145 | agree to a Contributor License Agreement (CLA) declaring that you have the right to,
146 | and actually do, grant us the rights to use your contribution. For details, visit
147 | https://cla.microsoft.com.
148 | 
149 | When you submit a pull request, a CLA-bot will automatically determine whether you need
150 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
151 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
152 | 
153 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
154 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
155 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
156 | 
157 | For any other questions, feel free to open an issue on GitHub.
158 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/README.md: -------------------------------------------------------------------------------- 1 | # Author-level differentially private fine-tuning of a GPT-2 style model 2 | 3 | This example fine-tunes generative language models (such as GPT-2 series) with Author-level Differential Privacy on a text corpus. 4 | In this case 500,000 samples of Reddit comments belong to 304,279 authors in the dataset. 
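The example script in this directory groups training samples by author and passes the resulting mapping to the DP trainer. A minimal sketch of the relevant calls (the remaining objects, such as the model, data collator, and privacy arguments, are constructed exactly as in `fine-tune-dp.py` below):

``` python
import datasets
import dp_transformers

# Load the Reddit slice used in this example and group training samples by author,
# so that the DP guarantee holds per author rather than per sample.
# (In the full script the mapping is built after tokenization; shown here in simplified form.)
dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=42)
author_mapping = dp_transformers.dp_utils.create_author_mapping(dataset['train'], author="author")

# The mapping is then passed to the trainer, e.g.
# dp_transformers.dp_utils.OpacusDPTrainer(..., author_mapping=author_mapping, privacy_args=privacy_args)
```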
5 | 6 | **We point out that `Author` is only an abstraction that can represent any of the following: user, group, organization, etc.** 7 | 8 | We compare different fine-tuning techniques (full fine-tuning, LoRA) and also provide a data distributed implementation for faster training. 9 | These merely serve as examples as hyperparameters are not optimized and corresponding commands are presented below. 10 | 11 | # Results 12 | 13 | | Model (HF) | Fine-tuning Method | DP | GPUs | Epochs | Train Loss | Eval Loss | $\varepsilon$ | Run Time [s] | 14 | | ---------- | ------------------ | --- | ------- | ------ | ---------- | --------- | ------------- | ------------ | 15 | | gpt2 | Full | Yes | 16xV100 | 3 | 3.76 | 3.62 | 8.0 | 1167 | 16 | | gpt2 | LoRA | Yes | 16xV100 | 3 | 3.75 | 3.60 | 8.0 | 659 | 17 | 18 | ## Fine-tune the full model with DP 19 | 20 | ``` 21 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-dp.py \ 22 | --output_dir scratch \ 23 | --model_name gpt2 \ 24 | --sequence_len 128 \ 25 | --per_device_train_batch_size 32 \ 26 | --gradient_accumulation_steps 2 \ 27 | --evaluation_strategy steps \ 28 | --eval_steps 45 \ 29 | --log_level info \ 30 | --per_device_eval_batch_size 64 \ 31 | --eval_accumulation_steps 1 \ 32 | --seed 42 \ 33 | --target_epsilon 8 \ 34 | --per_sample_max_grad_norm 1.0 \ 35 | --prediction_loss_only \ 36 | --weight_decay 0.01 \ 37 | --remove_unused_columns False \ 38 | --num_train_epochs 3 \ 39 | --logging_steps 5 \ 40 | --max_grad_norm 0 \ 41 | --lr_scheduler_type constant \ 42 | --learning_rate 1e-4 \ 43 | --disable_tqdm True \ 44 | --label_names labels \ 45 | --dataloader_num_workers 2 46 | ``` 47 | 48 | ## Fine-tune only the LoRA layers introduced into the model with DP 49 | 50 | ``` 51 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-dp.py \ 52 | --output_dir scratch \ 53 | --model_name gpt2 \ 54 | --sequence_len 128 \ 55 | --per_device_train_batch_size 64 \ 56 | --gradient_accumulation_steps 1 \ 57 | --evaluation_strategy steps \ 58 | --eval_steps 45 \ 59 | --log_level info \ 60 | --per_device_eval_batch_size 64 \ 61 | --eval_accumulation_steps 1 \ 62 | --seed 42 \ 63 | --target_epsilon 8 \ 64 | --per_sample_max_grad_norm 1.0 \ 65 | --prediction_loss_only \ 66 | --weight_decay 0.01 \ 67 | --remove_unused_columns False \ 68 | --num_train_epochs 3 \ 69 | --logging_steps 5 \ 70 | --lora_dim 4 \ 71 | --lora_alpha 32 \ 72 | --lora_dropout 0.0 \ 73 | --max_grad_norm 0 \ 74 | --lr_scheduler_type constant \ 75 | --learning_rate 3e-4 \ 76 | --disable_tqdm True \ 77 | --dataloader_num_workers 2 \ 78 | --label_names labels \ 79 | --enable_lora 80 | ``` -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/aml/fuft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/author-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 32 \ 9 | --gradient_accumulation_steps 4 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --label_names labels 30 | environment: 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 32 | conda_file: ../environment.yml 33 | compute: azureml:ND40rsv2 34 | display_name: full_fine_tuning-epsilon_8 35 | experiment_name: dp-transformers-nlg-reddit-author-level-dp 36 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/aml/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/author-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 128 \ 9 | --gradient_accumulation_steps 1 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --lora_dim 4 \ 30 | --lora_alpha 32 \ 31 | --lora_dropout 0.0 \ 32 | --label_names labels \ 33 | --enable_lora 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../environment.yml 37 | compute: azureml:ND40rsv2 38 | display_name: parameter_efficient_fine_tuning-epsilon_8 39 | experiment_name: dp-transformers-nlg-reddit-author-level-dp 40 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/environment.yml: -------------------------------------------------------------------------------- 1 | name: dp-transformers 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1 8 | - _openmp_mutex=5.1 9 | - blas=1.0 10 | - bzip2=1.0.8 11 | - ca-certificates=2023.05.30 12 | - cuda-cudart=11.8.89 13 | - cuda-cupti=11.8.87 14 | - cuda-libraries=11.8.0 15 | - cuda-nvrtc=11.8.89 16 | - cuda-nvtx=11.8.86 17 | - cuda-runtime=11.8.0 18 | - filelock=3.9.0 19 | - gmp=6.2.1 20 | - gmpy2=2.1.2 21 | - 
intel-openmp=2023.1.0 22 | - jinja2=3.1.2 23 | - ld_impl_linux-64=2.38 24 | - libcublas=11.11.3.6 25 | - libcufft=10.9.0.58 26 | - libcufile=1.7.1.12 27 | - libcurand=10.3.3.129 28 | - libcusolver=11.4.1.48 29 | - libcusparse=11.7.5.86 30 | - libffi=3.4.4 31 | - libgcc-ng=11.2.0 32 | - libgomp=11.2.0 33 | - libnpp=11.8.0.86 34 | - libnvjpeg=11.9.0.86 35 | - libstdcxx-ng=11.2.0 36 | - libuuid=1.41.5 37 | - markupsafe=2.1.1 38 | - mkl=2023.1.0 39 | - mpc=1.1.0 40 | - mpfr=4.0.2 41 | - mpmath=1.3.0 42 | - ncurses=6.4 43 | - networkx=3.1 44 | - openssl=3.0.10 45 | - pip=23.2.1 46 | - python=3.10.12 47 | - pytorch=2.0.1 48 | - pytorch-cuda=11.8 49 | - pytorch-mutex=1.0 50 | - readline=8.2 51 | - setuptools=68.0.0 52 | - sqlite=3.41.2 53 | - sympy=1.11.1 54 | - tbb=2021.8.0 55 | - tk=8.6.12 56 | - torchtriton=2.0.0 57 | - typing_extensions=4.7.1 58 | - wheel=0.38.4 59 | - xz=5.4.2 60 | - zlib=1.2.13 61 | - pip: 62 | - accelerate==0.21.0 63 | - aiohttp==3.8.5 64 | - aiosignal==1.3.1 65 | - alembic==1.11.2 66 | - async-timeout==4.0.3 67 | - attrs==23.1.0 68 | - azure-common==1.1.28 69 | - azure-core==1.29.2 70 | - azure-identity==1.14.0 71 | - azure-mgmt-core==1.4.0 72 | - azure-storage-blob==12.13.0 73 | - azureml-mlflow==1.52.0 74 | - blinker==1.6.2 75 | - certifi==2023.7.22 76 | - cffi==1.15.1 77 | - charset-normalizer==3.2.0 78 | - click==8.1.6 79 | - cloudpickle==2.2.1 80 | - contourpy==1.1.0 81 | - cryptography==41.0.3 82 | - cycler==0.11.0 83 | - databricks-cli==0.17.7 84 | - datasets==2.14.4 85 | - dill==0.3.7 86 | - docker==6.1.3 87 | - entrypoints==0.4 88 | - exceptiongroup==1.1.3 89 | - flask==2.3.2 90 | - fonttools==4.42.0 91 | - frozenlist==1.4.0 92 | - fsspec==2023.6.0 93 | - gitdb==4.0.10 94 | - gitpython==3.1.32 95 | - greenlet==2.0.2 96 | - gunicorn==21.2.0 97 | - huggingface-hub==0.19.4 98 | - idna==3.4 99 | - importlib-metadata==6.8.0 100 | - iniconfig==2.0.0 101 | - isodate==0.6.1 102 | - itsdangerous==2.1.2 103 | - joblib==1.3.2 104 | - jsonpickle==3.0.2 105 | - kiwisolver==1.4.4 106 | - mako==1.2.4 107 | - markdown==3.4.4 108 | - matplotlib==3.7.2 109 | - mlflow==2.6.0 110 | - mlflow-skinny==2.6.0 111 | - msal==1.23.0 112 | - msal-extensions==1.0.0 113 | - msrest==0.7.1 114 | - multidict==6.0.4 115 | - multiprocess==0.70.15 116 | - numpy==1.25.2 117 | - oauthlib==3.2.2 118 | - opacus==1.4.0 119 | - opt-einsum==3.3.0 120 | - packaging==23.1 121 | - pandas==2.0.3 122 | - peft==0.4.0 123 | - pillow==10.0.0 124 | - pluggy==1.2.0 125 | - portalocker==2.7.0 126 | - protobuf==4.24.0 127 | - prv-accountant==0.1.1.post1 128 | - psutil==5.9.5 129 | - pyarrow==12.0.1 130 | - pycparser==2.21 131 | - pyjwt==2.8.0 132 | - pyparsing==3.0.9 133 | - pytest==7.4.0 134 | - python-dateutil==2.8.2 135 | - pytz==2023.3 136 | - pyyaml==6.0.1 137 | - querystring-parser==1.2.4 138 | - regex==2023.8.8 139 | - requests==2.31.0 140 | - requests-oauthlib==1.3.1 141 | - safetensors==0.3.2 142 | - scikit-learn==1.3.0 143 | - scipy==1.11.1 144 | - six==1.16.0 145 | - smmap==5.0.0 146 | - sqlalchemy==2.0.19 147 | - sqlparse==0.4.4 148 | - tabulate==0.9.0 149 | - threadpoolctl==3.2.0 150 | - tokenizers==0.15.0 151 | - tomli==2.0.1 152 | - tqdm==4.66.1 153 | - transformers==4.36.1 154 | - tzdata==2023.3 155 | - urllib3==1.26.16 156 | - websocket-client==1.6.1 157 | - werkzeug==2.3.7 158 | - xxhash==3.3.0 159 | - yarl==1.9.2 160 | - zipp==3.16.2 161 | -------------------------------------------------------------------------------- /examples/nlg-reddit/author-level-dp/fine-tune-dp.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series with author-level DP (w/ parameter-efficient approach LoRA when lora_dim > 0)''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | 12 | from dataclasses import dataclass, field, asdict 13 | from peft import get_peft_model, LoraConfig 14 | 15 | from dp_transformers.grad_sample.transformers import conv_1d 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class ModelArguments: 23 | model_name: str = field(default="gpt2", metadata={ 24 | "help": "Model name in HuggingFace, e.g. 'gpt2'" 25 | }) 26 | 27 | sequence_len: int = field(default=128, metadata={ 28 | "help": "Model sequence length" 29 | }) 30 | 31 | 32 | @dataclass 33 | class LoraArguments: 34 | enable_lora: bool = field(default=False, metadata={ 35 | "help": "Whether to enable LoRA" 36 | }) 37 | lora_dim: int = field(default=8, metadata={ 38 | "help": "LoRA dimension" 39 | }) 40 | lora_alpha: int = field(default=8, metadata={ 41 | "help": "LoRA alpha" 42 | }) 43 | lora_dropout: float = field(default=0.0, metadata={ 44 | "help": "LoRA dropout" 45 | }) 46 | 47 | def as_peft_config(self) -> LoraConfig: 48 | if not self.enable_lora: 49 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 50 | params = asdict(self) 51 | params.pop("enable_lora") 52 | params["r"] = params.pop("lora_dim") 53 | return LoraConfig(**params) 54 | 55 | 56 | @dataclass 57 | class Arguments: 58 | train: dp_transformers.TrainingArguments 59 | privacy: dp_transformers.PrivacyArguments 60 | model: ModelArguments 61 | lora: LoraArguments 62 | 63 | 64 | def main(args: Arguments): 65 | transformers.set_seed(args.train.seed) 66 | 67 | # Setup logging 68 | logging.basicConfig( 69 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 70 | datefmt="%m/%d/%Y %H:%M:%S", 71 | handlers=[logging.StreamHandler(sys.stdout)], 72 | ) 73 | 74 | log_level = train_args.get_process_log_level() 75 | logger.setLevel(log_level) 76 | datasets.utils.logging.set_verbosity(log_level) 77 | transformers.utils.logging.set_verbosity(log_level) 78 | transformers.utils.logging.enable_default_handler() 79 | transformers.utils.logging.enable_explicit_format() 80 | 81 | # Log on each process the small summary: 82 | logger.warning( 83 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 84 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}, " 85 | f"world size: {train_args.world_size}" 86 | ) 87 | logger.info(f"Training/evaluation parameters {train_args}") 88 | logger.info(f"Privacy parameters {privacy_args}") 89 | 90 | # Load model 91 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 92 | model = model.to(train_args.device) 93 | 94 | # Load data 95 | dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=args.train.seed) 96 | train_dataset = dataset['train'] 97 | test_dataset = dataset['test'] 98 | 99 | # Load tokenizer 100 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 101 | tokenizer.pad_token = tokenizer.eos_token 102 | 103 | # Tokenize data 104 | with train_args.main_process_first(desc="tokenizing dataset"): 105 | train_dataset = train_dataset.map( 106 | lambda batch: 
tokenizer(batch['content'], padding="max_length", truncation=True, max_length=args.model.sequence_len), 107 | batched=True, num_proc=8, desc="tokenizing dataset", 108 | remove_columns=[c for c in train_dataset.column_names if c != 'author'] 109 | ) 110 | test_dataset = test_dataset.map( 111 | lambda batch: tokenizer(batch['content'], padding="max_length", truncation=True, max_length=args.model.sequence_len), 112 | batched=True, num_proc=8, desc="tokenizing dataset", remove_columns=test_dataset.column_names 113 | ) 114 | 115 | author_mapping = dp_transformers.dp_utils.create_author_mapping(train_dataset, author="author") 116 | train_dataset = train_dataset.remove_columns('author') 117 | 118 | if train_args.local_rank == 0 or train_args.local_rank == -1: 119 | logger.info(f"Number of authors in the training set: {len(author_mapping)}") 120 | 121 | if args.lora.enable_lora: 122 | logger.info("Using LoRA") 123 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 124 | else: 125 | logger.info("Not using LoRA") 126 | 127 | if train_args.local_rank == 0 or train_args.local_rank == -1: 128 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 129 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 130 | 131 | model = model.cuda() 132 | model.train() 133 | 134 | 135 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 136 | 137 | trainer = dp_transformers.dp_utils.OpacusDPTrainer( 138 | args=train_args, 139 | model=model, 140 | train_dataset=train_dataset, 141 | eval_dataset=test_dataset, 142 | data_collator=data_collator, 143 | author_mapping=author_mapping, 144 | privacy_args=privacy_args, 145 | ) 146 | 147 | try: 148 | trainer.train() 149 | finally: 150 | eps_prv = trainer.get_prv_epsilon() 151 | eps_rdp = trainer.get_rdp_epsilon() 152 | trainer.log({ 153 | "final_epsilon_prv": eps_prv, 154 | "final_epsilon_rdp": eps_rdp 155 | }) 156 | 157 | if __name__ == "__main__": 158 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, dp_transformers.PrivacyArguments, ModelArguments, LoraArguments )) 159 | train_args, privacy_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 160 | main(Arguments(train=train_args, privacy=privacy_args, model=model_args, lora=lora_args)) 161 | -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/README.md: -------------------------------------------------------------------------------- 1 | # Differentially private fine-tuning of a GPT-2 style model 2 | 3 | This example fine-tunes generative language models (such as GPT-2 series) with Differential Privacy on a text corpus. 4 | In this case 500,000 samples of Reddit comments. 5 | We compare different fine-tuning techniques (full fine-tuning, LoRA) and also provide a data distributed implementation for faster training. 6 | These merely serve as examples as hyperparameters are not optimized and corresponding commands are presented below. 
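For reference, when `--enable_lora` is set the example script wraps the model with PEFT roughly as follows (a sketch mirroring `LoraArguments.as_peft_config()` in `fine-tune-dp.py`; `model` is assumed to be the GPT-2 model loaded as in the script, and the values correspond to the `--lora_dim 4 --lora_alpha 32 --lora_dropout 0.0` flags used below):

``` python
from peft import LoraConfig, get_peft_model

# --lora_dim maps to LoraConfig's `r`; only the injected LoRA parameters stay trainable.
peft_config = LoraConfig(r=4, lora_alpha=32, lora_dropout=0.0)
model = get_peft_model(model=model, peft_config=peft_config)
```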
7 | 
8 | # Results
9 | 
10 | | Model (HF) | Fine-tuning Method | DP | GPUs | Epochs | Train Loss | Eval Loss | $\varepsilon$ | Run Time [s] | AML Config |
11 | | ---------- | ------------------ | --- | ------ | ------ | ---------- | --------- | ------------- | ------------ | --------------------- |
12 | | gpt2 | Full | Yes | 8xV100 | 3 | 3.75 | 3.61 | 8.0 | 1944 | fuft-eps_8.yml |
13 | | gpt2 | Full | No | 8xV100 | 3 | 3.56 | 3.46 | - | 1227 | fuft-eps_inf.yml |
14 | | gpt2 | LoRA | Yes | 8xV100 | 3 | 3.74 | 3.60 | 8.0 | 1128 | peft-eps_8.yml |
15 | | gpt2 | LoRA | Yes | 1xV100 | 3 | 3.74 | 3.60 | 8.0 | 12248 | peft-eps_8-gpus_1.yml |
16 | | gpt2 | LoRA | No | 8xV100 | 3 | 3.70 | 3.58 | - | 1006 | peft-eps_inf.yml |
17 | 
18 | 
19 | ## Azure Machine Learning
20 | 
21 | We provide Azure Machine Learning (AML) configuration files for the above experiments (see the AML Config column in the table).
22 | 
23 | ```
24 | az ml job create --file aml/<config>.yml
25 | ```
26 | 
27 | 
28 | ## Local Training
29 | 
30 | Alternatively, you can run the training script directly on your local machine.
31 | 
32 | Install the environment (assuming CUDA 11.8, matching `environment.yml`) with
33 | 
34 | ```
35 | conda env create -f environment.yml
36 | conda activate dp-transformers
37 | ```
38 | 
39 | Then run one of the following training scripts.
40 | 
41 | ### Fine-tune the full model with DP
42 | 
43 | ```
44 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-dp.py \
45 | --output_dir scratch \
46 | --model_name gpt2 \
47 | --sequence_len 128 \
48 | --per_device_train_batch_size 32 \
49 | --gradient_accumulation_steps 2 \
50 | --evaluation_strategy steps \
51 | --eval_steps 45 \
52 | --log_level info \
53 | --per_device_eval_batch_size 64 \
54 | --eval_accumulation_steps 1 \
55 | --seed 42 \
56 | --target_epsilon 8 \
57 | --per_sample_max_grad_norm 1.0 \
58 | --prediction_loss_only \
59 | --weight_decay 0.01 \
60 | --remove_unused_columns False \
61 | --num_train_epochs 3 \
62 | --logging_steps 5 \
63 | --max_grad_norm 0 \
64 | --lr_scheduler_type constant \
65 | --learning_rate 1e-4 \
66 | --disable_tqdm True \
67 | --dataloader_num_workers 2 \
68 | --label_names labels
69 | ```
70 | 
71 | ### Fine-tune the full model without DP
72 | 
73 | ```
74 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-nodp.py \
75 | --output_dir scratch \
76 | --model_name gpt2 \
77 | --sequence_len 128 \
78 | --per_device_train_batch_size 64 \
79 | --gradient_accumulation_steps 1 \
80 | --evaluation_strategy steps \
81 | --eval_steps 45 \
82 | --log_level info \
83 | --per_device_eval_batch_size 64 \
84 | --eval_accumulation_steps 1 \
85 | --seed 42 \
86 | --prediction_loss_only \
87 | --weight_decay 0.01 \
88 | --remove_unused_columns False \
89 | --num_train_epochs 3 \
90 | --logging_steps 5 \
91 | --max_grad_norm 0 \
92 | --lr_scheduler_type constant \
93 | --learning_rate 2e-4 \
94 | --disable_tqdm True \
95 | --dataloader_num_workers 2 \
96 | --label_names labels
97 | ```
98 | 
99 | ### Fine-tune only the LoRA layers introduced into the model with DP
100 | 
101 | ```
102 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-dp.py \
103 | --output_dir scratch \
104 | --model_name gpt2 \
105 | --sequence_len 128 \
106 | --per_device_train_batch_size 64 \
107 | --gradient_accumulation_steps 1 \
108 | --evaluation_strategy steps \
109 | --eval_steps 45 \
110 | --log_level info \
111 | --per_device_eval_batch_size 64 \
112 | --eval_accumulation_steps 1 \
113 | --seed 42 \
114 | --target_epsilon 8 \
115 | --per_sample_max_grad_norm 1.0 \
116 | --prediction_loss_only \
117 | --weight_decay 
0.01 \ 118 | --remove_unused_columns False \ 119 | --num_train_epochs 3 \ 120 | --logging_steps 5 \ 121 | --lora_dim 4 \ 122 | --lora_alpha 32 \ 123 | --lora_dropout 0.0 \ 124 | --max_grad_norm 0 \ 125 | --lr_scheduler_type constant \ 126 | --learning_rate 3e-4 \ 127 | --disable_tqdm True \ 128 | --dataloader_num_workers 2 \ 129 | --label_names labels \ 130 | --enable_lora 131 | ``` 132 | 133 | ### Fine-tune only the LoRA layers introduced into the model without DP 134 | 135 | ``` 136 | python -m torch.distributed.run --nproc_per_node 16 fine-tune-nodp.py \ 137 | --output_dir scratch \ 138 | --model_name gpt2 \ 139 | --sequence_len 128 \ 140 | --per_device_train_batch_size 64 \ 141 | --gradient_accumulation_steps 1 \ 142 | --evaluation_strategy steps \ 143 | --eval_steps 45 \ 144 | --log_level info \ 145 | --per_device_eval_batch_size 64 \ 146 | --eval_accumulation_steps 1 \ 147 | --seed 42 \ 148 | --prediction_loss_only \ 149 | --weight_decay 0.01 \ 150 | --remove_unused_columns False \ 151 | --num_train_epochs 3 \ 152 | --logging_steps 5 \ 153 | --lora_dim 4 \ 154 | --lora_alpha 32 \ 155 | --lora_dropout 0.0 \ 156 | --max_grad_norm 0 \ 157 | --lr_scheduler_type constant \ 158 | --learning_rate 5e-4 \ 159 | --disable_tqdm True \ 160 | --dataloader_num_workers 2 \ 161 | --label_names labels \ 162 | --enable_lora 163 | ``` -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/fuft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/sample-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 16 \ 9 | --gradient_accumulation_steps 8 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --label_names labels 30 | environment: 31 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 32 | conda_file: ../environment.yml 33 | compute: azureml:ND40rsv2 34 | display_name: full_fine_tuning-epsilon_8 35 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 36 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/fuft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/sample-level-dp/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 64 \ 9 | --gradient_accumulation_steps 2 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --prediction_loss_only \ 17 | --weight_decay 0.01 \ 18 | --remove_unused_columns False \ 19 | --num_train_epochs 3 \ 20 | --logging_steps 5 \ 21 | --max_grad_norm 0 \ 22 | --lr_scheduler_type constant \ 23 | --learning_rate 1e-4 \ 24 | --disable_tqdm True \ 25 | --dataloader_num_workers 2 \ 26 | --label_names labels 27 | environment: 28 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 29 | conda_file: ../environment.yml 30 | compute: azureml:ND40rsv2 31 | display_name: full_fine_tuning-epsilon_inf 32 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 33 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/peft-eps_8-gpus_1.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 1 examples/nlg-reddit/sample-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 64 \ 9 | --gradient_accumulation_steps 16 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --lora_dim 4 \ 30 | --lora_alpha 32 \ 31 | --lora_dropout 0.0 \ 32 | --label_names labels \ 33 | --enable_lora 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../environment.yml 37 | compute: azureml:NC6v3 38 | display_name: parameter_efficient_fine_tuning-epsilon_8-gpus_1 39 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 40 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/sample-level-dp/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 128 \ 9 | --gradient_accumulation_steps 1 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --target_epsilon 8 \ 17 | --target_delta 2e-6 \ 18 | --per_sample_max_grad_norm 1.0 \ 19 | --prediction_loss_only \ 20 | --weight_decay 0.01 \ 21 | --remove_unused_columns False \ 22 | --num_train_epochs 3 \ 23 | --logging_steps 5 \ 24 | --max_grad_norm 0 \ 25 | --lr_scheduler_type constant \ 26 | --learning_rate 1e-4 \ 27 | --disable_tqdm True \ 28 | --dataloader_num_workers 2 \ 29 | --lora_dim 4 \ 30 | --lora_alpha 32 \ 31 | --lora_dropout 0.0 \ 32 | --enable_lora \ 33 | --label_names labels 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../environment.yml 37 | compute: azureml:ND40rsv2 38 | display_name: parameter_efficient_fine_tuning-epsilon_8 39 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 40 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/aml/peft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 examples/nlg-reddit/sample-level-dp/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name gpt2 \ 7 | --sequence_len 128 \ 8 | --per_device_train_batch_size 128 \ 9 | --gradient_accumulation_steps 1 \ 10 | --evaluation_strategy steps \ 11 | --eval_steps 128 \ 12 | --log_level info \ 13 | --per_device_eval_batch_size 64 \ 14 | --eval_accumulation_steps 1 \ 15 | --seed 42 \ 16 | --prediction_loss_only \ 17 | --weight_decay 0.01 \ 18 | --remove_unused_columns False \ 19 | --num_train_epochs 3 \ 20 | --logging_steps 5 \ 21 | --max_grad_norm 0 \ 22 | --lr_scheduler_type constant \ 23 | --learning_rate 1e-4 \ 24 | --disable_tqdm True \ 25 | --dataloader_num_workers 2 \ 26 | --lora_dim 4 \ 27 | --lora_alpha 32 \ 28 | --lora_dropout 0.0 \ 29 | --enable_lora \ 30 | --label_names labels 31 | environment: 32 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 33 | conda_file: ../environment.yml 34 | compute: azureml:ND40rsv2 35 | display_name: parameter_efficient_fine_tuning-epsilon_inf 36 | experiment_name: dp-transformers-nlg-reddit-sample-level-dp 37 | description: Train a model on the Reddit dataset using differential privacy -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/environment.yml: -------------------------------------------------------------------------------- 1 | name: dp-transformers 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1 8 | - _openmp_mutex=5.1 9 | - blas=1.0 10 | - bzip2=1.0.8 11 | - ca-certificates=2023.05.30 12 | - cuda-cudart=11.8.89 13 | - cuda-cupti=11.8.87 14 | - cuda-libraries=11.8.0 15 | - cuda-nvrtc=11.8.89 16 | - cuda-nvtx=11.8.86 17 | - cuda-runtime=11.8.0 18 | - filelock=3.9.0 19 | - gmp=6.2.1 20 | - 
gmpy2=2.1.2 21 | - intel-openmp=2023.1.0 22 | - jinja2=3.1.2 23 | - ld_impl_linux-64=2.38 24 | - libcublas=11.11.3.6 25 | - libcufft=10.9.0.58 26 | - libcufile=1.7.1.12 27 | - libcurand=10.3.3.129 28 | - libcusolver=11.4.1.48 29 | - libcusparse=11.7.5.86 30 | - libffi=3.4.4 31 | - libgcc-ng=11.2.0 32 | - libgomp=11.2.0 33 | - libnpp=11.8.0.86 34 | - libnvjpeg=11.9.0.86 35 | - libstdcxx-ng=11.2.0 36 | - libuuid=1.41.5 37 | - markupsafe=2.1.1 38 | - mkl=2023.1.0 39 | - mpc=1.1.0 40 | - mpfr=4.0.2 41 | - mpmath=1.3.0 42 | - ncurses=6.4 43 | - networkx=3.1 44 | - openssl=3.0.10 45 | - pip=23.2.1 46 | - python=3.10.12 47 | - pytorch=2.0.1 48 | - pytorch-cuda=11.8 49 | - pytorch-mutex=1.0 50 | - readline=8.2 51 | - setuptools=68.0.0 52 | - sqlite=3.41.2 53 | - sympy=1.11.1 54 | - tbb=2021.8.0 55 | - tk=8.6.12 56 | - torchtriton=2.0.0 57 | - typing_extensions=4.7.1 58 | - wheel=0.38.4 59 | - xz=5.4.2 60 | - zlib=1.2.13 61 | - pip: 62 | - accelerate==0.21.0 63 | - aiohttp==3.8.5 64 | - aiosignal==1.3.1 65 | - alembic==1.11.2 66 | - async-timeout==4.0.3 67 | - attrs==23.1.0 68 | - azure-common==1.1.28 69 | - azure-core==1.29.2 70 | - azure-identity==1.14.0 71 | - azure-mgmt-core==1.4.0 72 | - azure-storage-blob==12.13.0 73 | - azureml-mlflow==1.52.0 74 | - blinker==1.6.2 75 | - certifi==2023.7.22 76 | - cffi==1.15.1 77 | - charset-normalizer==3.2.0 78 | - click==8.1.6 79 | - cloudpickle==2.2.1 80 | - contourpy==1.1.0 81 | - cryptography==41.0.3 82 | - cycler==0.11.0 83 | - databricks-cli==0.17.7 84 | - datasets==2.14.4 85 | - dill==0.3.7 86 | - docker==6.1.3 87 | - entrypoints==0.4 88 | - exceptiongroup==1.1.3 89 | - flask==2.3.2 90 | - fonttools==4.42.0 91 | - frozenlist==1.4.0 92 | - fsspec==2023.6.0 93 | - gitdb==4.0.10 94 | - gitpython==3.1.32 95 | - greenlet==2.0.2 96 | - gunicorn==21.2.0 97 | - huggingface-hub==0.19.4 98 | - idna==3.4 99 | - importlib-metadata==6.8.0 100 | - iniconfig==2.0.0 101 | - isodate==0.6.1 102 | - itsdangerous==2.1.2 103 | - joblib==1.3.2 104 | - jsonpickle==3.0.2 105 | - kiwisolver==1.4.4 106 | - mako==1.2.4 107 | - markdown==3.4.4 108 | - matplotlib==3.7.2 109 | - mlflow==2.6.0 110 | - mlflow-skinny==2.6.0 111 | - msal==1.23.0 112 | - msal-extensions==1.0.0 113 | - msrest==0.7.1 114 | - multidict==6.0.4 115 | - multiprocess==0.70.15 116 | - numpy==1.25.2 117 | - oauthlib==3.2.2 118 | - opacus==1.4.0 119 | - opt-einsum==3.3.0 120 | - packaging==23.1 121 | - pandas==2.0.3 122 | - peft==0.4.0 123 | - pillow==10.0.0 124 | - pluggy==1.2.0 125 | - portalocker==2.7.0 126 | - protobuf==4.24.0 127 | - prv-accountant==0.1.1.post1 128 | - psutil==5.9.5 129 | - pyarrow==12.0.1 130 | - pycparser==2.21 131 | - pyjwt==2.8.0 132 | - pyparsing==3.0.9 133 | - pytest==7.4.0 134 | - python-dateutil==2.8.2 135 | - pytz==2023.3 136 | - pyyaml==6.0.1 137 | - querystring-parser==1.2.4 138 | - regex==2023.8.8 139 | - requests==2.31.0 140 | - requests-oauthlib==1.3.1 141 | - safetensors==0.3.2 142 | - scikit-learn==1.3.0 143 | - scipy==1.11.1 144 | - six==1.16.0 145 | - smmap==5.0.0 146 | - sqlalchemy==2.0.19 147 | - sqlparse==0.4.4 148 | - tabulate==0.9.0 149 | - threadpoolctl==3.2.0 150 | - tokenizers==0.15.0 151 | - tomli==2.0.1 152 | - tqdm==4.66.1 153 | - transformers==4.36.1 154 | - tzdata==2023.3 155 | - urllib3==1.26.16 156 | - websocket-client==1.6.1 157 | - werkzeug==2.3.7 158 | - xxhash==3.3.0 159 | - yarl==1.9.2 160 | - zipp==3.16.2 161 | -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/fine-tune-dp.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series with DP (w/ optional parameter-efficient approach LoRA)''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | 12 | from dataclasses import dataclass, field, asdict 13 | from peft import get_peft_model, LoraConfig 14 | 15 | from dp_transformers.grad_sample.transformers import conv_1d 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class ModelArguments: 23 | model_name: str = field(default="gpt2", metadata={ 24 | "help": "Model name in HuggingFace, e.g. 'gpt2'" 25 | }) 26 | sequence_len: int = field(default=128, metadata={ 27 | "help": "Maximum sequence length" 28 | }) 29 | 30 | 31 | @dataclass 32 | class LoraArguments: 33 | enable_lora: bool = field(default=False, metadata={ 34 | "help": "Whether to enable LoRA" 35 | }) 36 | lora_dim: int = field(default=8, metadata={ 37 | "help": "LoRA dimension" 38 | }) 39 | lora_alpha: int = field(default=8, metadata={ 40 | "help": "LoRA alpha" 41 | }) 42 | lora_dropout: float = field(default=0.0, metadata={ 43 | "help": "LoRA dropout" 44 | }) 45 | 46 | def as_peft_config(self) -> LoraConfig: 47 | if not self.enable_lora: 48 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 49 | params = asdict(self) 50 | params.pop("enable_lora") 51 | params["r"] = params.pop("lora_dim") 52 | return LoraConfig(**params) 53 | 54 | 55 | @dataclass 56 | class Arguments: 57 | train: dp_transformers.TrainingArguments 58 | privacy: dp_transformers.PrivacyArguments 59 | model: ModelArguments 60 | lora: LoraArguments 61 | 62 | 63 | def main(args: Arguments): 64 | transformers.set_seed(args.train.seed) 65 | 66 | # Setup logging 67 | logging.basicConfig( 68 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 69 | datefmt="%m/%d/%Y %H:%M:%S", 70 | handlers=[logging.StreamHandler(sys.stdout)], 71 | ) 72 | 73 | log_level = train_args.get_process_log_level() 74 | logger.setLevel(log_level) 75 | datasets.utils.logging.set_verbosity(log_level) 76 | transformers.utils.logging.set_verbosity(log_level) 77 | transformers.utils.logging.enable_default_handler() 78 | transformers.utils.logging.enable_explicit_format() 79 | 80 | # Log on each process the small summary: 81 | logger.warning( 82 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 83 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 84 | ) 85 | logger.info(f"Training/evaluation parameters {train_args}") 86 | logger.info(f"Privacy parameters {privacy_args}") 87 | 88 | # Load model 89 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 90 | model = model.to(train_args.device) 91 | 92 | # Load data 93 | dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=args.train.seed) 94 | 95 | # Load tokenizer 96 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 97 | tokenizer.pad_token = tokenizer.eos_token 98 | 99 | # Tokenize data 100 | with train_args.main_process_first(desc="tokenizing dataset"): 101 | dataset = dataset.map( 102 | lambda batch: tokenizer(batch['content'], padding="max_length", truncation=True, max_length=args.model.sequence_len), 103 | batched=True, num_proc=8, desc="tokenizing dataset", 
remove_columns=dataset.column_names['train'] 104 | ) 105 | 106 | if args.lora.enable_lora: 107 | logger.info("Using LoRA") 108 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 109 | else: 110 | logger.info("Not using LoRA") 111 | 112 | if train_args.local_rank == 0: 113 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 114 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 115 | 116 | model = model.cuda() 117 | model.train() 118 | 119 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 120 | 121 | trainer = dp_transformers.dp_utils.OpacusDPTrainer( 122 | args=train_args, 123 | model=model, 124 | train_dataset=dataset['train'], 125 | eval_dataset=dataset['test'], 126 | data_collator=data_collator, 127 | privacy_args=privacy_args, 128 | ) 129 | 130 | try: 131 | trainer.train() 132 | finally: 133 | eps_prv = trainer.get_prv_epsilon() 134 | eps_rdp = trainer.get_rdp_epsilon() 135 | trainer.log({ 136 | "final_epsilon_prv": eps_prv, 137 | "final_epsilon_rdp": eps_rdp 138 | }) 139 | 140 | if __name__ == "__main__": 141 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, dp_transformers.PrivacyArguments, ModelArguments, LoraArguments)) 142 | train_args, privacy_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 143 | main(Arguments(train=train_args, privacy=privacy_args, model=model_args, lora=lora_args)) 144 | -------------------------------------------------------------------------------- /examples/nlg-reddit/sample-level-dp/fine-tune-nodp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series without DP (w/ parameter-efficient approach LoRA when lora_dim > 0)''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | 12 | from dataclasses import dataclass, field 13 | from dataclasses import dataclass, field, asdict 14 | from peft import get_peft_model, LoraConfig 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | @dataclass 21 | class ModelArguments: 22 | model_name: str = field(default="gpt2", metadata={ 23 | "help": "Model name in HuggingFace, e.g. 
'gpt2'" 24 | }) 25 | sequence_len: int = field(default=128, metadata={ 26 | "help": "Maximum sequence length" 27 | }) 28 | 29 | 30 | @dataclass 31 | class LoraArguments: 32 | enable_lora: bool = field(default=False, metadata={ 33 | "help": "Whether to enable LoRA" 34 | }) 35 | lora_dim: int = field(default=8, metadata={ 36 | "help": "LoRA dimension" 37 | }) 38 | lora_alpha: int = field(default=8, metadata={ 39 | "help": "LoRA alpha" 40 | }) 41 | lora_dropout: float = field(default=0.0, metadata={ 42 | "help": "LoRA dropout" 43 | }) 44 | 45 | def as_peft_config(self) -> LoraConfig: 46 | if not self.enable_lora: 47 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 48 | params = asdict(self) 49 | params.pop("enable_lora") 50 | params["r"] = params.pop("lora_dim") 51 | return LoraConfig(**params) 52 | 53 | 54 | @dataclass 55 | class Arguments: 56 | train: dp_transformers.TrainingArguments 57 | model: ModelArguments 58 | lora: LoraArguments 59 | 60 | 61 | def main(args: Arguments): 62 | transformers.set_seed(args.train.seed) 63 | 64 | # Setup logging 65 | logging.basicConfig( 66 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 67 | datefmt="%m/%d/%Y %H:%M:%S", 68 | handlers=[logging.StreamHandler(sys.stdout)], 69 | ) 70 | 71 | log_level = train_args.get_process_log_level() 72 | logger.setLevel(log_level) 73 | datasets.utils.logging.set_verbosity(log_level) 74 | transformers.utils.logging.set_verbosity(log_level) 75 | transformers.utils.logging.enable_default_handler() 76 | transformers.utils.logging.enable_explicit_format() 77 | 78 | # Log on each process the small summary: 79 | logger.warning( 80 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 81 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 82 | ) 83 | logger.info(f"Training/evaluation parameters {train_args}") 84 | 85 | # Load model 86 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 87 | model = model.to(train_args.device) 88 | 89 | # Load data 90 | dataset = datasets.load_dataset('reddit', split="train[:500000]").train_test_split(0.02, seed=args.train.seed) 91 | 92 | # Load tokenizer 93 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 94 | tokenizer.pad_token = tokenizer.eos_token 95 | 96 | # Tokenize data 97 | with train_args.main_process_first(desc="tokenizing dataset"): 98 | dataset = dataset.map( 99 | lambda batch: tokenizer(batch['content'], padding="max_length", truncation=True, max_length=args.model.sequence_len), 100 | batched=True, num_proc=8, desc="tokenizing dataset", remove_columns=dataset.column_names['train'] 101 | ) 102 | 103 | if args.lora.enable_lora: 104 | logger.info("Using LoRA") 105 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 106 | else: 107 | logger.info("Not using LoRA") 108 | 109 | if train_args.local_rank == 0: 110 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 111 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 112 | 113 | model = model.cuda() 114 | model.train() 115 | 116 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 117 | 118 | trainer = transformers.Trainer( 119 | args=train_args, 120 | model=model, 121 | train_dataset=dataset['train'], 122 | eval_dataset=dataset['test'], 123 | data_collator=data_collator 124 | ) 
125 | 126 | trainer.train() 127 | 128 | if __name__ == "__main__": 129 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, ModelArguments, LoraArguments)) 130 | train_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 131 | main(Arguments(train=train_args, model=model_args, lora=lora_args)) 132 | -------------------------------------------------------------------------------- /examples/test_examples.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | import time 4 | 5 | from subprocess import check_output 6 | from pathlib import Path 7 | from typing import Dict, Union 8 | from azureml.core import Workspace, Run 9 | from dataclasses import dataclass 10 | from datetime import timedelta, datetime 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def az_workspace() -> Workspace: 15 | subscription_id = json.loads(check_output(["az", "account", "show", "--query", "id"])) 16 | 17 | output = json.loads(check_output(["az", "configure", "--list-defaults"])) 18 | resource_group = next(item for item in output if item["name"] == "group")["value"] 19 | workspace_name = next(item for item in output if item["name"] == "workspace")["value"] 20 | 21 | workspace = Workspace( 22 | subscription_id=subscription_id, 23 | resource_group=resource_group, 24 | workspace_name=workspace_name, 25 | ) 26 | 27 | return workspace 28 | 29 | 30 | def submit_example_and_wait_for_metrics(ws: Workspace, aml_config_path: Path) -> Dict[str, Union[float, int]]: 31 | raw_output = check_output(["az", "ml", "job", "create", "--file", aml_config_path]) 32 | output = json.loads(raw_output) 33 | run = Run.get(ws, run_id=output["name"]) 34 | print(f"Submitted run {run.get_portal_url()}") 35 | try: 36 | run.wait_for_completion() 37 | except KeyboardInterrupt as e: 38 | run.cancel() 39 | raise e 40 | except Exception as e: 41 | run.cancel() 42 | raise e 43 | 44 | waiting_for_details = True 45 | while waiting_for_details: 46 | details = run.get_details() 47 | if "endTimeUtc" in details: 48 | waiting_for_details = False 49 | else: 50 | time.sleep(30) 51 | 52 | if run.get_status() != "Completed": 53 | raise RuntimeError(f"Run did not complete successfully. 
Status: {run.get_status()}, AML URL: {run.get_portal_url()}") 54 | 55 | 56 | metrics = run.get_metrics() 57 | 58 | metrics["runtime"] = ( 59 | datetime.strptime(details["endTimeUtc"], '%Y-%m-%dT%H:%M:%S.%fZ') - 60 | datetime.strptime(details["startTimeUtc"], '%Y-%m-%dT%H:%M:%S.%fZ') 61 | ) 62 | 63 | return metrics 64 | 65 | 66 | @dataclass 67 | class ExampleTest: 68 | aml_config_path: Path 69 | expected_trn_loss: float 70 | expected_val_loss: float 71 | expected_time: timedelta 72 | 73 | def __str__(self): 74 | return f"Example({self.aml_config_path})" 75 | 76 | 77 | @pytest.mark.parametrize("example_test", 78 | [ 79 | ExampleTest( 80 | aml_config_path=Path("examples")/"nlg-reddit"/"author-level-dp"/"aml"/"fuft-eps_8.yml", 81 | expected_trn_loss=3.76, 82 | expected_val_loss=3.62, 83 | expected_time=timedelta(minutes=52, seconds=15), 84 | ), 85 | ExampleTest( 86 | aml_config_path=Path("examples")/"nlg-reddit"/"author-level-dp"/"aml"/"peft-eps_8.yml", 87 | expected_trn_loss=3.79, 88 | expected_val_loss=3.62, 89 | expected_time=timedelta(minutes=32, seconds=45), 90 | ), 91 | ExampleTest( 92 | aml_config_path=Path("examples")/"nlg-reddit"/"sample-level-dp"/"aml"/"fuft-eps_8.yml", 93 | expected_trn_loss=3.74, 94 | expected_val_loss=3.59, 95 | expected_time=timedelta(hours=1, minutes=15), 96 | ), 97 | ExampleTest( 98 | aml_config_path=Path("examples")/"nlg-reddit"/"sample-level-dp"/"aml"/"fuft-eps_inf.yml", 99 | expected_trn_loss=3.58, 100 | expected_val_loss=3.47, 101 | expected_time=timedelta(minutes=50, seconds=15), 102 | ), 103 | ExampleTest( 104 | aml_config_path=Path("examples")/"nlg-reddit"/"sample-level-dp"/"aml"/"peft-eps_8.yml", 105 | expected_trn_loss=3.76, 106 | expected_val_loss=3.60, 107 | expected_time=timedelta(minutes=42, seconds=30), 108 | ), 109 | ExampleTest( 110 | aml_config_path=Path("examples")/"nlg-reddit"/"sample-level-dp"/"aml"/"peft-eps_inf.yml", 111 | expected_trn_loss=3.72, 112 | expected_val_loss=3.60, 113 | expected_time=timedelta(minutes=42, seconds=0), 114 | ), 115 | ], 116 | ids=str 117 | ) 118 | def test_nlg_reddit(az_workspace, example_test: ExampleTest): 119 | metrics = submit_example_and_wait_for_metrics(az_workspace, aml_config_path=example_test.aml_config_path) 120 | 121 | print(f"Test {example_test.aml_config_path}: {metrics}") 122 | assert metrics["train_loss"] == pytest.approx(example_test.expected_trn_loss, abs=0.02) 123 | assert metrics["eval_loss"][-1] == pytest.approx(example_test.expected_val_loss, abs=0.02) 124 | allowed_time_delta = timedelta(minutes=5) 125 | if abs(metrics["runtime"] - example_test.expected_time) > allowed_time_delta: 126 | print(f"::warning file={__file__}:: {example_test.aml_config_path} took {metrics['runtime']} to run, expected " 127 | f"{example_test.expected_time} +- {allowed_time_delta}") 128 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/README.md: -------------------------------------------------------------------------------- 1 | # Differentially private fine-tuning of LLMs using QLoRA 2 | 3 | We demonstrate examples of fine-tuning Mistral 7B using QLoRA with and without DP. 4 | 5 | # Results 6 | 7 | | Dataset (HF) | DP | GPUs | Epochs | Max Eval Accuracy | $\varepsilon$ | Run Time [s] | AML Config | 8 | | ---------- | --- | ------ | ------ | --------- | ------------- | ------------ | --------------------- | 9 | | sst2 | Yes | 8xA100 | 3 | 96.44 | 8.0 | . | sst2/peft-eps_8.yml | 10 | | sst2 | No | 8xA100 | 3 | 97.25 | - | . 
| sst2/peft-eps_inf.yml | 11 | | qnli | Yes | 8xA100 | 3 | 94.80 | 8.0 | . | qnli/peft-eps_8.yml | 12 | | qnli | No | 8xA100 | 3 | 96.40 | - | . | qnli/peft-eps_inf.yml | 13 | 14 | | Dataset (HF) | DP | GPUs | Epochs | Min Eval Loss | Test ROUGE1 | Test ROUGE2 | Test ROUGEL | $\varepsilon$ | Run Time [s] | AML Config | 15 | | ------------ | --- | ------ | ------ | ------------- | ----------- | ----------- | ----------- | ------------- | ------------ | ---------- | 16 | | cnn | Yes | 8xA100 | 3 | 0.9624 | 44.16 | 22.16 | 30.99 | 8.0 | . | cnn/peft-eps_8.yml | 17 | | cnn | No | 8xA100 | 3 | 0.9188 | 45.05 | 22.99 | 31.69 | - | . | cnn/peft-eps_inf.yml | 18 | 19 | ## Azure Machine Learning 20 | 21 | We provide Azure Machine Learning (AML) configuration files for the above experiments. 22 | 23 | ``` 24 | az ml job create --file aml/ 25 | ``` 26 | 27 | 28 | ## Local Training 29 | 30 | Alternatively, you can run the training script directly on your local machine. 31 | 32 | Install the environment with 33 | 34 | ``` 35 | conda env create -f environment.yml 36 | conda activate dp-transformers 37 | ``` 38 | 39 | Follow the training scripts under aml folder. 40 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/cnn/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name cnn \ 8 | --sequence_len 1024 \ 9 | --per_device_train_batch_size 16 \ 10 | --gradient_accumulation_steps 8 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 10 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 32 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --target_epsilon 8 \ 19 | --target_delta 1e-5 \ 20 | --per_sample_max_grad_norm 1.0 \ 21 | --weight_decay 0.01 \ 22 | --remove_unused_columns False \ 23 | --num_train_epochs 3 \ 24 | --logging_steps 4 \ 25 | --max_grad_norm 0 \ 26 | --lr_scheduler_type constant \ 27 | --learning_rate 3e-4 \ 28 | --disable_tqdm True \ 29 | --dataloader_num_workers 2 \ 30 | --lora_dim 4 \ 31 | --lora_alpha 32 \ 32 | --lora_dropout 0.0 \ 33 | --enable_lora \ 34 | --target_modules "['q_proj', 'v_proj']" \ 35 | --label_names labels \ 36 | --bf16 \ 37 | --gradient_checkpointing 38 | environment: 39 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 40 | conda_file: ../../environment.yml 41 | compute: azureml:ND96asrv4 42 | display_name: mistral_7b_qlora_dp_cnn 43 | experiment_name: dp-transformers-mistral-7b-qlora-dp-cnn 44 | description: DP fine-tune Mistral 7B model with QLoRA on CNN dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/cnn/peft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name cnn \ 8 | --sequence_len 1024 \ 9 | --per_device_train_batch_size 4 \ 10 | --gradient_accumulation_steps 1 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 300 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 32 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --weight_decay 0.01 \ 19 | --remove_unused_columns False \ 20 | --num_train_epochs 3 \ 21 | --logging_steps 5 \ 22 | --max_grad_norm 0 \ 23 | --lr_scheduler_type constant \ 24 | --learning_rate 2.5e-5 \ 25 | --disable_tqdm True \ 26 | --dataloader_num_workers 2 \ 27 | --lora_dim 4 \ 28 | --lora_alpha 32 \ 29 | --lora_dropout 0.0 \ 30 | --enable_lora \ 31 | --target_modules "['q_proj', 'v_proj']" \ 32 | --label_names labels \ 33 | --ddp_find_unused_parameters False \ 34 | --bf16 \ 35 | --gradient_checkpointing 36 | environment: 37 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 38 | conda_file: ../../environment.yml 39 | compute: azureml:ND96asrv4 40 | display_name: mistral_7b_qlora_nodp_cnn 41 | experiment_name: dp-transformers-mistral-7b-qlora-nodp-cnn 42 | description: Fine-tune Mistral 7B model with QLoRA on CNN dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/qnli/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name qnli \ 8 | --sequence_len 256 \ 9 | --per_device_train_batch_size 8 \ 10 | --gradient_accumulation_steps 16 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 4 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 16 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --target_epsilon 8 \ 19 | --target_delta 1e-5 \ 20 | --per_sample_max_grad_norm 1.0 \ 21 | --weight_decay 0.01 \ 22 | --remove_unused_columns False \ 23 | --num_train_epochs 3 \ 24 | --logging_steps 4 \ 25 | --max_grad_norm 0 \ 26 | --lr_scheduler_type constant \ 27 | --learning_rate 3e-4 \ 28 | --disable_tqdm True \ 29 | --dataloader_num_workers 2 \ 30 | --lora_dim 4 \ 31 | --lora_alpha 32 \ 32 | --lora_dropout 0.0 \ 33 | --enable_lora \ 34 | --target_modules "['q_proj', 'v_proj']" \ 35 | --label_names labels \ 36 | --bf16 37 | environment: 38 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 39 | conda_file: ../../environment.yml 40 | compute: azureml:ND96asrv4 41 | display_name: mistral_7b_qlora_dp_qnli 42 | experiment_name: dp-transformers-mistral-7b-qlora-dp-qnli 43 | description: DP fine-tune Mistral 7B model with QLoRA on QNLI dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/qnli/peft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name qnli \ 8 | --sequence_len 256 \ 9 | --per_device_train_batch_size 4 \ 10 | --gradient_accumulation_steps 1 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 100 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 8 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --weight_decay 0.01 \ 19 | --remove_unused_columns False \ 20 | --num_train_epochs 3 \ 21 | --logging_steps 5 \ 22 | --max_grad_norm 0 \ 23 | --lr_scheduler_type constant \ 24 | --learning_rate 2.5e-5 \ 25 | --disable_tqdm True \ 26 | --dataloader_num_workers 2 \ 27 | --lora_dim 4 \ 28 | --lora_alpha 32 \ 29 | --lora_dropout 0.0 \ 30 | --enable_lora \ 31 | --target_modules "['q_proj', 'v_proj']" \ 32 | --label_names labels \ 33 | --bf16 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../../environment.yml 37 | compute: azureml:ND96asrv4 38 | display_name: mistral_7b_qlora_nodp_qnli 39 | experiment_name: dp-transformers-mistral-7b-qlora-nodp-qnli 40 | description: Fine-tune Mistral 7B model with QLoRA on QNLI dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/sst2/peft-eps_8.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . && python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-dp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name sst2 \ 8 | --sequence_len 128 \ 9 | --per_device_train_batch_size 8 \ 10 | --gradient_accumulation_steps 16 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 4 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 16 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --target_epsilon 8 \ 19 | --target_delta 1e-5 \ 20 | --per_sample_max_grad_norm 1.0 \ 21 | --weight_decay 0.01 \ 22 | --remove_unused_columns False \ 23 | --num_train_epochs 3 \ 24 | --logging_steps 4 \ 25 | --max_grad_norm 0 \ 26 | --lr_scheduler_type constant \ 27 | --learning_rate 3e-4 \ 28 | --disable_tqdm True \ 29 | --dataloader_num_workers 2 \ 30 | --lora_dim 4 \ 31 | --lora_alpha 32 \ 32 | --lora_dropout 0.0 \ 33 | --enable_lora \ 34 | --target_modules "['q_proj', 'v_proj']" \ 35 | --label_names labels \ 36 | --bf16 37 | environment: 38 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 39 | conda_file: ../../environment.yml 40 | compute: azureml:ND96asrv4 41 | display_name: mistral_7b_qlora_dp_sst2 42 | experiment_name: dp-transformers-mistral-7b-qlora-dp-sst2 43 | description: DP fine-tune Mistral 7B model with QLoRA on SST-2 dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/aml/sst2/peft-eps_inf.yml: -------------------------------------------------------------------------------- 1 | $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json 2 | code: ../../../../ 3 | command: >- 4 | python -m pip install -e . 
&& python -m torch.distributed.run --nproc_per_node 8 research/fine_tune_llm_w_qlora/fine-tune-nodp.py \ 5 | --output_dir outputs \ 6 | --model_name mistralai/Mistral-7B-v0.1 \ 7 | --dataset_name sst2 \ 8 | --sequence_len 128 \ 9 | --per_device_train_batch_size 4 \ 10 | --gradient_accumulation_steps 1 \ 11 | --evaluation_strategy steps \ 12 | --eval_steps 100 \ 13 | --save_strategy no \ 14 | --log_level info \ 15 | --per_device_eval_batch_size 8 \ 16 | --eval_accumulation_steps 1 \ 17 | --seed 42 \ 18 | --weight_decay 0.01 \ 19 | --remove_unused_columns False \ 20 | --num_train_epochs 3 \ 21 | --logging_steps 5 \ 22 | --max_grad_norm 0 \ 23 | --lr_scheduler_type constant \ 24 | --learning_rate 2.5e-5 \ 25 | --disable_tqdm True \ 26 | --dataloader_num_workers 2 \ 27 | --lora_dim 4 \ 28 | --lora_alpha 32 \ 29 | --lora_dropout 0.0 \ 30 | --enable_lora \ 31 | --target_modules "['q_proj', 'v_proj']" \ 32 | --label_names labels \ 33 | --bf16 34 | environment: 35 | image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 36 | conda_file: ../../environment.yml 37 | compute: azureml:ND96asrv4 38 | display_name: mistral_7b_qlora_nodp_sst2 39 | experiment_name: dp-transformers-mistral-7b-qlora-nodp-sst2 40 | description: Fine-tune Mistral 7B model with QLoRA on SST-2 dataset -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/data_utils.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import evaluate 3 | import torch 4 | import numpy as np 5 | 6 | 7 | # Modified from https://huggingface.co/docs/peft/task_guides/clm-prompt-tuning 8 | def main_preprocess_function(examples, tokenizer, text_field, prompt_begin, prompt_end, label_field, sequence_len, single_token=True): 9 | batch_size = len(examples[text_field]) 10 | 11 | # Prepare the context with the text in between of prompts, e.g. 
"Sentence : Label :" 12 | inputs = [prompt_begin + x + prompt_end for x in examples[text_field]] 13 | 14 | # Prepare the prediction part 15 | targets = [str(x) for x in examples[label_field]] 16 | 17 | model_inputs = tokenizer(inputs) 18 | labels = tokenizer(targets) 19 | 20 | # Concatenate the context and prediction parts as one input and set -100 to the labels of the context part 21 | # This is because only the label part will be used to calculate the loss 22 | for i in range(batch_size): 23 | sample_input_ids = model_inputs["input_ids"][i] 24 | if single_token: 25 | # Tokenizer adds to input_ids so just take the last id 26 | # NOTE THAT THIS ASSUMES THE LABEL IS SINGLE TOKEN 27 | label_input_ids = [labels["input_ids"][i][-1]] 28 | else: 29 | # Tokenizer adds to input_ids so just take the rest 30 | label_input_ids = labels["input_ids"][i][1:] 31 | model_inputs["input_ids"][i] = sample_input_ids + label_input_ids 32 | labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids 33 | model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i]) 34 | 35 | # Pad the samples with sequence_len and trim if longer than sequence_len 36 | # NOTE THAT IF CONTEXT IS LONGER THAN SEQUENCE_LEN, THERE WILL BE NOTHING TO PREDICT, LABEL IS ALL -100 37 | for i in range(batch_size): 38 | sample_input_ids = model_inputs["input_ids"][i] 39 | label_input_ids = labels["input_ids"][i] 40 | model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * ( 41 | sequence_len - len(sample_input_ids) 42 | ) + sample_input_ids 43 | model_inputs["attention_mask"][i] = [0] * (sequence_len - len(sample_input_ids)) + model_inputs[ 44 | "attention_mask" 45 | ][i] 46 | labels["input_ids"][i] = [-100] * (sequence_len - len(sample_input_ids)) + label_input_ids 47 | model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:sequence_len]) 48 | model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:sequence_len]) 49 | labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:sequence_len]) 50 | 51 | model_inputs["labels"] = labels["input_ids"] 52 | return model_inputs 53 | 54 | 55 | class Dataset: 56 | dataset = None 57 | classes = None # List of class labels 58 | text_field = None # Name of the field in the dataset that contains the text 59 | prompt_begin = None # Prompt to add to the beginning of the text, e.g. "Sentence : " 60 | prompt_end = None # Prompt to add to the end of the text, e.g. 
" Label :" 61 | label_field = None # Name of the field in the dataset that contains the label 62 | evaluate = None # Evaluation metric 63 | run_test = False # Whether to run test set evaluation 64 | 65 | def __init__(self, tokenizer, sequence_len): 66 | self.tokenizer = tokenizer 67 | self.sequence_len = sequence_len 68 | 69 | def target_max_len(self): 70 | target_lens = [len(self.tokenizer(class_label)["input_ids"]) for class_label in self.classes] 71 | target_max_len = max(target_lens) 72 | return target_max_len 73 | 74 | def preprocess_function(self, example): 75 | return main_preprocess_function(example, self.tokenizer, self.text_field, self.prompt_begin, 76 | self.prompt_end, self.label_field, self.sequence_len) 77 | 78 | # Define the evaluation metric (NOTE THAT THIS ASSUMES THE LABEL IS SINGLE TOKEN) 79 | def compute_metrics(self, eval_pred): 80 | predictions, labels = eval_pred 81 | # Only keep predictions for the last token shifted by 1 82 | predictions = predictions[..., -2] 83 | # Only keep labels for the last token 84 | labels = labels[..., -1] 85 | return self.evaluate.compute(predictions=predictions, references=labels) 86 | 87 | def preprocess_logits_for_metrics(self, logits, labels): 88 | """ 89 | Original Trainer may lead to a memory issue. 90 | This is a workaround to avoid storing too many tensors that are not needed. 91 | """ 92 | pred_ids = torch.argmax(logits, dim=-1) 93 | return pred_ids 94 | 95 | 96 | class SST2Dataset(Dataset): 97 | def __init__(self, tokenizer, sequence_len): 98 | # Load data 99 | self.dataset = datasets.load_dataset('sst2') 100 | # Map labels from 0/1 to negative/positive 101 | self.classes = ['negative', 'positive'] 102 | self.dataset = self.dataset.map( 103 | lambda x: {"text_label": [self.classes[label] for label in x["label"]]}, 104 | batched=True, 105 | num_proc=8, 106 | ) 107 | self.text_field = "sentence" 108 | self.prompt_begin = "Sentence : " 109 | self.prompt_end = " Label :" 110 | self.label_field = "text_label" 111 | self.evaluate = evaluate.load("accuracy") 112 | super().__init__(tokenizer, sequence_len) 113 | 114 | 115 | class QNLIDataset(Dataset): 116 | def __init__(self, tokenizer, sequence_len): 117 | # Load data 118 | self.dataset = datasets.load_dataset('glue', 'qnli') 119 | self.classes = ['0', '1'] 120 | self.dataset = self.dataset.map( 121 | lambda x: {"text_concat": [question + " ### " + sentence for question, sentence in zip(x["question"], x["sentence"])]}, 122 | batched=True, 123 | num_proc=8, 124 | ) 125 | # 5k eval samples too large, shuffle and reduce it to 1k 126 | self.dataset['validation'] = self.dataset['validation'].shuffle().select(range(1000)) 127 | self.text_field = "text_concat" 128 | self.prompt_begin = "Two sentences separated with ### : " 129 | self.prompt_end = " Label :" 130 | self.label_field = "label" 131 | self.evaluate = evaluate.load("accuracy") 132 | super().__init__(tokenizer, sequence_len) 133 | 134 | 135 | class CNNDataset(Dataset): 136 | def __init__(self, tokenizer, sequence_len): 137 | # Load data 138 | self.dataset = datasets.load_dataset("cnn_dailymail", "3.0.0") 139 | # 13.4k eval samples too large, shuffle and reduce it to 1k 140 | self.dataset['validation'] = self.dataset['validation'].shuffle().select(range(1000)) 141 | # Get rid of the test dataset 142 | del self.dataset['test'] 143 | self.text_field = "article" 144 | self.prompt_begin = "Article : " 145 | self.prompt_end = " Summary :" 146 | self.label_field = "highlights" 147 | self.evaluate = evaluate.load("rouge") 148 | self.run_test 
= True 149 | super().__init__(tokenizer, sequence_len) 150 | 151 | def preprocess_function(self, example): 152 | return main_preprocess_function(example, self.tokenizer, self.text_field, self.prompt_begin, 153 | self.prompt_end, self.label_field, self.sequence_len, single_token=False) 154 | 155 | def compute_metrics(self, eval_pred): 156 | predictions, labels = eval_pred 157 | # Only keep predictions up to last token 158 | predictions = predictions[..., :-1] 159 | # Only keep labels from the first token 160 | labels = labels[..., 1:] 161 | # Replace -100 of the labels as we don't want the content 162 | predictions = np.where(labels != -100, predictions, self.tokenizer.pad_token_id) 163 | # Decode generated summaries into text 164 | decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True) 165 | # Replace -100 in the labels as we can't decode them 166 | labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id) 167 | # Decode reference summaries into text 168 | decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) 169 | # Compute ROUGE scores 170 | result = self.evaluate.compute( 171 | predictions=decoded_preds, references=decoded_labels, use_stemmer=True 172 | ) 173 | return {k: round(v, 4) for k, v in result.items()} 174 | 175 | def compute_test_metrics(self, trainer): 176 | test_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split='test') 177 | # Filter out samples too long, e.g. more than 500 words 178 | test_dataset = test_dataset.filter(lambda x: len(x['article'].split()) < 500) 179 | # 11.4k test samples too large, shuffle and reduce it to 1k 180 | test_dataset = test_dataset.shuffle().select(range(1000)) 181 | # Add prompt_begin and prompt_end 182 | test_dataset = test_dataset.map( 183 | lambda x: {"article": [self.prompt_begin + article + self.prompt_end for article in x["article"]]}, 184 | batched=True, 185 | num_proc=None, 186 | ) 187 | 188 | # Tokenize data 189 | def test_preprocess_function(examples): 190 | model_inputs = trainer.tokenizer(examples['article'], padding=False) 191 | 192 | # 2. reserve the original article and summary for saving 193 | model_inputs['summary'] = examples['highlights'] 194 | return model_inputs 195 | 196 | with trainer.args.main_process_first(desc="tokenizing test dataset"): 197 | test_dataset = test_dataset.map( 198 | test_preprocess_function, 199 | batched=True, num_proc=None, desc="tokenizing dataset", 200 | remove_columns=test_dataset.column_names) 201 | 202 | # Filter out samples too long, e.g. 
more than 750 tokens 203 | test_dataset = test_dataset.filter(lambda x: len(x['input_ids']) < 750) 204 | 205 | test_dataset.set_format(type="torch") 206 | 207 | def generate_batched( 208 | model, 209 | tokenizer, 210 | device, 211 | query_tensors, 212 | batch_size: int = 4, 213 | return_prompt: bool = True, 214 | pad_to_multiple_of: int = None, 215 | **generation_kwargs, 216 | ): 217 | outputs = [] 218 | 219 | tokenizer.padding_side = "left" 220 | 221 | # handle distributed case and distribute query_tensors among gpus 222 | query_tensors = query_tensors[device.index::trainer.args.world_size] 223 | 224 | # in case we have fewer examples than bs 225 | batch_size = min(len(query_tensors), batch_size) 226 | 227 | for i in range(0, len(query_tensors), batch_size): 228 | # prevent overflow if query tensors are not even multiple of bs 229 | end_index = min(len(query_tensors), i + batch_size) 230 | 231 | batch = query_tensors[i:end_index] 232 | batch_mask = [torch.ones_like(element) for element in batch] 233 | inputs = {"input_ids": batch, "attention_mask": batch_mask} 234 | 235 | padded_inputs = tokenizer.pad( 236 | inputs, 237 | padding=True, 238 | max_length=None, 239 | pad_to_multiple_of=pad_to_multiple_of, 240 | return_tensors="pt", 241 | ).to(device) 242 | 243 | with torch.no_grad(): 244 | generations = model.generate(**padded_inputs, **generation_kwargs) 245 | 246 | for generation, mask in zip(generations, padded_inputs["attention_mask"]): 247 | output = generation[(1 - mask).sum() :] # remove padding 248 | 249 | if not return_prompt: 250 | output = output[(mask).sum() :] # remove prompt 251 | outputs.append(output) 252 | 253 | return outputs 254 | 255 | if hasattr(trainer.model, "generate"): 256 | model = trainer.model 257 | # The following is for GradSampleModule wrapping 258 | elif hasattr(trainer.model._module, "generate"): 259 | model = trainer.model._module 260 | # The following is for GradSampleModule and DPDDP wrapping 261 | elif hasattr(trainer.model._module.module, "generate"): 262 | model = trainer.model._module.module 263 | else: 264 | raise ValueError("Cannot find generate function in the model.") 265 | 266 | model.eval() 267 | generation_kwargs = {"max_new_tokens": 100, "pad_token_id": trainer.tokenizer.pad_token_id, 268 | "eos_token_id": trainer.tokenizer.eos_token_id,} 269 | 270 | response_tensors = generate_batched( 271 | model, trainer.tokenizer, trainer.args.device, 272 | test_dataset["input_ids"], 273 | batch_size=trainer.args.eval_batch_size, return_prompt=False, 274 | **generation_kwargs 275 | ) 276 | 277 | responses = [trainer.tokenizer.decode(r.squeeze(), skip_special_tokens=True) 278 | for r in response_tensors] 279 | 280 | result = self.evaluate.compute( 281 | predictions=responses, references=test_dataset["summary"][trainer.args.device.index::trainer.args.world_size], 282 | use_stemmer=True 283 | ) 284 | 285 | r1 = trainer.accelerator.reduce(torch.tensor(result['rouge1']).to(trainer.args.device), reduction="mean") 286 | r2 = trainer.accelerator.reduce(torch.tensor(result['rouge2']).to(trainer.args.device), reduction="mean") 287 | rl = trainer.accelerator.reduce(torch.tensor(result['rougeL']).to(trainer.args.device), reduction="mean") 288 | 289 | result = {'rouge1': r1.item(), 'rouge2': r2.item(), 'rougeL': rl.item()} 290 | return {k: round(v, 4) for k, v in result.items()} 291 | 292 | 293 | ALL_DATASETS = {"sst2": SST2Dataset, "qnli": QNLIDataset, "cnn": CNNDataset} -------------------------------------------------------------------------------- 
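The `main_preprocess_function` in data_utils.py above implements a prompt-completion scheme: the tokenized prompt and the tokenized label are concatenated into one input, the prompt positions in `labels` are masked with -100 (the index that the cross-entropy loss ignores) so only the label tokens are trained on, and everything is left-padded and trimmed to `sequence_len`. The sketch below illustrates just that masking logic in isolation; the toy whitespace tokenizer, `VOCAB`, and `PAD_ID` are illustrative assumptions and not part of the repository, which uses a HuggingFace tokenizer instead.

```python
# Minimal sketch of the prompt-completion masking used in main_preprocess_function.
# The toy whitespace tokenizer, VOCAB and PAD_ID are illustrative assumptions.

PAD_ID = 0
VOCAB = {"Sentence": 1, ":": 2, "great": 3, "movie": 4, "Label": 5, "positive": 6}


def toy_tokenize(text):
    return [VOCAB[tok] for tok in text.split()]


def build_example(sentence, label, sequence_len=12):
    # Context "Sentence : <text> Label :" followed by the label to be predicted
    prompt_ids = toy_tokenize(f"Sentence : {sentence} Label :")
    label_ids = toy_tokenize(label)

    input_ids = prompt_ids + label_ids
    # Only the label positions carry a target; the prompt is masked with -100
    labels = [-100] * len(prompt_ids) + label_ids
    attention_mask = [1] * len(input_ids)

    # Left-pad to sequence_len, then trim (mirrors the original padding logic)
    pad = sequence_len - len(input_ids)
    input_ids = ([PAD_ID] * pad + input_ids)[:sequence_len]
    attention_mask = ([0] * pad + attention_mask)[:sequence_len]
    labels = ([-100] * pad + labels)[:sequence_len]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


print(build_example("great movie", "positive"))
# labels is -100 everywhere except the last position, which holds the id of "positive"
```

Because every position except the label carries -100, the causal LM loss reduces to predicting the label token(s) given the prompt, which is what compute_metrics above exploits when it reads off the prediction at the final position.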
/research/fine_tune_llm_w_qlora/environment.yml: -------------------------------------------------------------------------------- 1 | name: dp-transformers 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1 8 | - _openmp_mutex=5.1 9 | - blas=1.0 10 | - bzip2=1.0.8 11 | - ca-certificates=2023.05.30 12 | - cuda-cudart=11.8.89 13 | - cuda-cupti=11.8.87 14 | - cuda-libraries=11.8.0 15 | - cuda-nvrtc=11.8.89 16 | - cuda-nvtx=11.8.86 17 | - cuda-runtime=11.8.0 18 | - filelock=3.9.0 19 | - gmp=6.2.1 20 | - gmpy2=2.1.2 21 | - intel-openmp=2023.1.0 22 | - jinja2=3.1.2 23 | - ld_impl_linux-64=2.38 24 | - libcublas=11.11.3.6 25 | - libcufft=10.9.0.58 26 | - libcufile=1.7.1.12 27 | - libcurand=10.3.3.129 28 | - libcusolver=11.4.1.48 29 | - libcusparse=11.7.5.86 30 | - libffi=3.4.4 31 | - libgcc-ng=11.2.0 32 | - libgomp=11.2.0 33 | - libnpp=11.8.0.86 34 | - libnvjpeg=11.9.0.86 35 | - libstdcxx-ng=11.2.0 36 | - libuuid=1.41.5 37 | - markupsafe=2.1.1 38 | - mkl=2023.1.0 39 | - mpc=1.1.0 40 | - mpfr=4.0.2 41 | - mpmath=1.3.0 42 | - ncurses=6.4 43 | - networkx=3.1 44 | - openssl=3.0.10 45 | - pip=23.2.1 46 | - python=3.10.12 47 | - pytorch=2.0.1 48 | - pytorch-cuda=11.8 49 | - pytorch-mutex=1.0 50 | - readline=8.2 51 | - setuptools=68.0.0 52 | - sqlite=3.41.2 53 | - sympy=1.11.1 54 | - tbb=2021.8.0 55 | - tk=8.6.12 56 | - torchtriton=2.0.0 57 | - typing_extensions=4.7.1 58 | - wheel=0.38.4 59 | - xz=5.4.2 60 | - zlib=1.2.13 61 | - pip: 62 | - absl-py==2.0.0 63 | - accelerate==0.21.0 64 | - aiohttp==3.8.5 65 | - aiosignal==1.3.1 66 | - alembic==1.11.2 67 | - async-timeout==4.0.3 68 | - attrs==23.1.0 69 | - azure-common==1.1.28 70 | - azure-core==1.29.2 71 | - azure-identity==1.14.0 72 | - azure-mgmt-core==1.4.0 73 | - azure-storage-blob==12.13.0 74 | - azureml-mlflow==1.52.0 75 | - bitsandbytes==0.41.1 76 | - blinker==1.6.2 77 | - certifi==2023.7.22 78 | - cffi==1.15.1 79 | - charset-normalizer==3.2.0 80 | - click==8.1.6 81 | - cloudpickle==2.2.1 82 | - contourpy==1.1.0 83 | - cryptography==41.0.3 84 | - cycler==0.11.0 85 | - databricks-cli==0.17.7 86 | - datasets==2.14.4 87 | - dill==0.3.7 88 | - docker==6.1.3 89 | - entrypoints==0.4 90 | - evaluate==0.4.1 91 | - exceptiongroup==1.1.3 92 | - flask==2.3.2 93 | - fonttools==4.42.0 94 | - frozenlist==1.4.0 95 | - fsspec==2023.6.0 96 | - gitdb==4.0.10 97 | - gitpython==3.1.32 98 | - greenlet==2.0.2 99 | - gunicorn==21.2.0 100 | - huggingface-hub==0.16.4 101 | - idna==3.4 102 | - importlib-metadata==6.8.0 103 | - iniconfig==2.0.0 104 | - isodate==0.6.1 105 | - itsdangerous==2.1.2 106 | - joblib==1.3.2 107 | - jsonpickle==3.0.2 108 | - kiwisolver==1.4.4 109 | - mako==1.2.4 110 | - markdown==3.4.4 111 | - matplotlib==3.7.2 112 | - mlflow==2.6.0 113 | - mlflow-skinny==2.6.0 114 | - msal==1.23.0 115 | - msal-extensions==1.0.0 116 | - msrest==0.7.1 117 | - multidict==6.0.4 118 | - multiprocess==0.70.15 119 | - nltk==3.8.1 120 | - numpy==1.25.2 121 | - nvidia-ml-py3==7.352.0 122 | - oauthlib==3.2.2 123 | - opacus==1.4.0 124 | - opt-einsum==3.3.0 125 | - packaging==23.1 126 | - pandas==2.0.3 127 | - peft==0.4.0 128 | - pillow==10.0.0 129 | - pluggy==1.2.0 130 | - portalocker==2.7.0 131 | - protobuf==4.24.0 132 | - prv-accountant==0.1.1.post1 133 | - psutil==5.9.5 134 | - pyarrow==12.0.1 135 | - pycparser==2.21 136 | - pyjwt==2.8.0 137 | - pynvml==11.5.0 138 | - pyparsing==3.0.9 139 | - pytest==7.4.0 140 | - python-dateutil==2.8.2 141 | - pytz==2023.3 142 | - pyyaml==6.0.1 143 | - querystring-parser==1.2.4 144 | - 
regex==2023.8.8 145 | - requests==2.31.0 146 | - requests-oauthlib==1.3.1 147 | - responses==0.18.0 148 | - rouge-score==0.1.2 149 | - safetensors==0.3.2 150 | - scikit-learn==1.3.0 151 | - scipy==1.11.1 152 | - six==1.16.0 153 | - smmap==5.0.0 154 | - sqlalchemy==2.0.19 155 | - sqlparse==0.4.4 156 | - tabulate==0.9.0 157 | - threadpoolctl==3.2.0 158 | - tokenizers==0.14.1 159 | - tomli==2.0.1 160 | - tqdm==4.66.1 161 | - transformers==4.35.1 162 | - tzdata==2023.3 163 | - urllib3==1.26.16 164 | - websocket-client==1.6.1 165 | - werkzeug==2.3.7 166 | - xxhash==3.3.0 167 | - yarl==1.9.2 168 | - zipp==3.16.2 169 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/fine-tune-dp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train LLMs with DP using QLoRA''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | import torch 12 | import ast 13 | import linear 14 | import data_utils 15 | 16 | from dataclasses import dataclass, field, asdict 17 | from typing import List, Optional, Tuple, Union 18 | from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training 19 | 20 | from pynvml import * 21 | 22 | def print_gpu_utilization(): 23 | nvmlInit() 24 | handle = nvmlDeviceGetHandleByIndex(0) 25 | info = nvmlDeviceGetMemoryInfo(handle) 26 | print(f"GPU memory occupied: {info.used//1024**2} MB.") 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | @dataclass 32 | class ModelArguments: 33 | model_name: str = field(default="gpt2", metadata={ 34 | "help": "Model name in HuggingFace, e.g. 'gpt2'" 35 | }) 36 | dataset_name: str = field(default="sst2", metadata={ 37 | "help": "Dataset name in HuggingFace, e.g. 'sst2'" 38 | }) 39 | sequence_len: int = field(default=128, metadata={ 40 | "help": "Maximum sequence length" 41 | }) 42 | 43 | 44 | @dataclass 45 | class LoraArguments: 46 | enable_lora: bool = field(default=False, metadata={ 47 | "help": "Whether to enable LoRA" 48 | }) 49 | lora_dim: int = field(default=8, metadata={ 50 | "help": "LoRA dimension" 51 | }) 52 | lora_alpha: int = field(default=8, metadata={ 53 | "help": "LoRA alpha" 54 | }) 55 | lora_dropout: float = field(default=0.0, metadata={ 56 | "help": "LoRA dropout" 57 | }) 58 | 59 | target_modules: List[str] = field( 60 | default_factory=list, 61 | metadata={ 62 | "help": "List of module names or regex expression of the module names to replace with Lora." 
63 | "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " 64 | }, 65 | ) 66 | 67 | def as_peft_config(self) -> LoraConfig: 68 | if not self.enable_lora: 69 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 70 | params = asdict(self) 71 | params.pop("enable_lora") 72 | params["r"] = params.pop("lora_dim") 73 | params["target_modules"] = ast.literal_eval(params["target_modules"][0]) 74 | return LoraConfig(**params) 75 | 76 | 77 | @dataclass 78 | class Arguments: 79 | train: dp_transformers.TrainingArguments 80 | privacy: dp_transformers.PrivacyArguments 81 | model: ModelArguments 82 | lora: LoraArguments 83 | 84 | 85 | def main(args: Arguments): 86 | transformers.set_seed(args.train.seed) 87 | 88 | # Setup logging 89 | logging.basicConfig( 90 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 91 | datefmt="%m/%d/%Y %H:%M:%S", 92 | handlers=[logging.StreamHandler(sys.stdout)], 93 | ) 94 | 95 | log_level = train_args.get_process_log_level() 96 | logger.setLevel(log_level) 97 | datasets.utils.logging.set_verbosity(log_level) 98 | transformers.utils.logging.set_verbosity(log_level) 99 | transformers.utils.logging.enable_default_handler() 100 | transformers.utils.logging.enable_explicit_format() 101 | 102 | # Log on each process the small summary: 103 | logger.warning( 104 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 105 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 106 | ) 107 | logger.info(f"Training/evaluation parameters {train_args}") 108 | logger.info(f"Privacy parameters {privacy_args}") 109 | 110 | # Load tokenizer 111 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 112 | if tokenizer.pad_token_id is None: 113 | tokenizer.pad_token_id = tokenizer.eos_token_id 114 | 115 | # Load dataset 116 | dataset = data_utils.ALL_DATASETS[args.model.dataset_name](tokenizer, args.model.sequence_len) 117 | 118 | if dataset.classes is not None: 119 | target_max_len = dataset.target_max_len() 120 | logger.info(f"Labels tokenized into max length: {target_max_len}") 121 | 122 | # Tokenize data 123 | with train_args.main_process_first(desc="tokenizing dataset"): 124 | dataset.dataset = dataset.dataset.map( 125 | dataset.preprocess_function, batched=True, num_proc=8, desc="tokenizing dataset", 126 | remove_columns=dataset.dataset.column_names['train'] 127 | ) 128 | 129 | bnb_config = transformers.BitsAndBytesConfig( 130 | load_in_4bit=True, 131 | bnb_4bit_use_double_quant=True, 132 | bnb_4bit_quant_type="nf4", 133 | bnb_4bit_compute_dtype=torch.bfloat16 134 | ) 135 | 136 | # Load model 137 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name, quantization_config=bnb_config) 138 | model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=train_args.gradient_checkpointing) 139 | 140 | if args.lora.enable_lora: 141 | logger.info("Using LoRA") 142 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 143 | else: 144 | logger.info("Not using LoRA") 145 | 146 | if train_args.local_rank == 0: 147 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 148 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 149 | 150 | trainer = dp_transformers.dp_utils.OpacusDPTrainer( 151 | args=train_args, 152 | model=model, 153 | 
train_dataset=dataset.dataset['train'], 154 | eval_dataset=dataset.dataset['validation'], 155 | tokenizer=tokenizer, 156 | compute_metrics=dataset.compute_metrics, 157 | preprocess_logits_for_metrics=dataset.preprocess_logits_for_metrics, 158 | privacy_args=privacy_args, 159 | ) 160 | 161 | if hasattr(trainer.model._module, "config"): 162 | # The following is for GradSampleModule wrapping 163 | ignore_keys = getattr(trainer.model._module.config, "keys_to_ignore_at_inference", []) 164 | elif hasattr(trainer.model._module.module, "config"): 165 | # The following is for GradSampleModule and DPDDP wrapping 166 | ignore_keys = getattr(trainer.model._module.module.config, "keys_to_ignore_at_inference", []) 167 | else: 168 | ignore_keys = [] 169 | 170 | try: 171 | # A workaround to avoid the following error: 172 | # AttributeError: 'GradSampleModule' object has no attribute 'gradient_checkpointing_enable' 173 | # inside Trainer _inner_training_loop. Already done by prepare_model_for_kbit_training 174 | trainer.args.gradient_checkpointing = False 175 | result = trainer.train(ignore_keys_for_eval=ignore_keys) 176 | finally: 177 | eps_prv = trainer.get_prv_epsilon() 178 | eps_rdp = trainer.get_rdp_epsilon() 179 | trainer.log({ 180 | "final_epsilon_prv": eps_prv, 181 | "final_epsilon_rdp": eps_rdp 182 | }) 183 | 184 | if dataset.run_test: 185 | logger.info("Running test set evaluation after training") 186 | test_metrics = dataset.compute_test_metrics(trainer) 187 | trainer.log(test_metrics) 188 | 189 | def print_summary(result): 190 | print(f"Time: {result.metrics['train_runtime']:.2f}") 191 | print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") 192 | print_gpu_utilization() 193 | 194 | print_summary(result) 195 | 196 | if __name__ == "__main__": 197 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, dp_transformers.PrivacyArguments, ModelArguments, LoraArguments)) 198 | train_args, privacy_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 199 | main(Arguments(train=train_args, privacy=privacy_args, model=model_args, lora=lora_args)) 200 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/fine-tune-nodp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train LLMs without DP using QLoRA''' 5 | 6 | import datasets 7 | import dp_transformers 8 | import transformers 9 | import sys 10 | import logging 11 | import torch 12 | import ast 13 | import data_utils 14 | 15 | from dataclasses import dataclass, field, asdict 16 | from typing import List, Optional, Tuple, Union 17 | from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training 18 | 19 | from pynvml import * 20 | 21 | def print_gpu_utilization(): 22 | nvmlInit() 23 | handle = nvmlDeviceGetHandleByIndex(0) 24 | info = nvmlDeviceGetMemoryInfo(handle) 25 | print(f"GPU memory occupied: {info.used//1024**2} MB.") 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | @dataclass 31 | class ModelArguments: 32 | model_name: str = field(default="gpt2", metadata={ 33 | "help": "Model name in HuggingFace, e.g. 'gpt2'" 34 | }) 35 | dataset_name: str = field(default="sst2", metadata={ 36 | "help": "Dataset name in HuggingFace, e.g. 
'sst2'" 37 | }) 38 | sequence_len: int = field(default=128, metadata={ 39 | "help": "Maximum sequence length" 40 | }) 41 | 42 | 43 | @dataclass 44 | class LoraArguments: 45 | enable_lora: bool = field(default=False, metadata={ 46 | "help": "Whether to enable LoRA" 47 | }) 48 | lora_dim: int = field(default=8, metadata={ 49 | "help": "LoRA dimension" 50 | }) 51 | lora_alpha: int = field(default=8, metadata={ 52 | "help": "LoRA alpha" 53 | }) 54 | lora_dropout: float = field(default=0.0, metadata={ 55 | "help": "LoRA dropout" 56 | }) 57 | 58 | target_modules: List[str] = field( 59 | default_factory=list, 60 | metadata={ 61 | "help": "List of module names or regex expression of the module names to replace with Lora." 62 | "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " 63 | }, 64 | ) 65 | 66 | def as_peft_config(self) -> LoraConfig: 67 | if not self.enable_lora: 68 | raise ValueError("LoRA is not enabled, cannot convert to LoRA config") 69 | params = asdict(self) 70 | params.pop("enable_lora") 71 | params["r"] = params.pop("lora_dim") 72 | params["target_modules"] = ast.literal_eval(params["target_modules"][0]) 73 | return LoraConfig(**params) 74 | 75 | 76 | @dataclass 77 | class Arguments: 78 | train: dp_transformers.TrainingArguments 79 | model: ModelArguments 80 | lora: LoraArguments 81 | 82 | 83 | def main(args: Arguments): 84 | transformers.set_seed(args.train.seed) 85 | 86 | # Setup logging 87 | logging.basicConfig( 88 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 89 | datefmt="%m/%d/%Y %H:%M:%S", 90 | handlers=[logging.StreamHandler(sys.stdout)], 91 | ) 92 | 93 | log_level = train_args.get_process_log_level() 94 | logger.setLevel(log_level) 95 | datasets.utils.logging.set_verbosity(log_level) 96 | transformers.utils.logging.set_verbosity(log_level) 97 | transformers.utils.logging.enable_default_handler() 98 | transformers.utils.logging.enable_explicit_format() 99 | 100 | # Log on each process the small summary: 101 | logger.warning( 102 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 103 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 104 | ) 105 | logger.info(f"Training/evaluation parameters {train_args}") 106 | 107 | # Load tokenizer 108 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 109 | if tokenizer.pad_token_id is None: 110 | tokenizer.pad_token_id = tokenizer.eos_token_id 111 | 112 | # Load dataset 113 | dataset = data_utils.ALL_DATASETS[args.model.dataset_name](tokenizer, args.model.sequence_len) 114 | 115 | if dataset.classes is not None: 116 | target_max_len = dataset.target_max_len() 117 | logger.info(f"Labels tokenized into max length: {target_max_len}") 118 | 119 | # Tokenize data 120 | with train_args.main_process_first(desc="tokenizing dataset"): 121 | dataset.dataset = dataset.dataset.map( 122 | dataset.preprocess_function, batched=True, num_proc=8, desc="tokenizing dataset", 123 | remove_columns=dataset.dataset.column_names['train'] 124 | ) 125 | 126 | bnb_config = transformers.BitsAndBytesConfig( 127 | load_in_4bit=True, 128 | bnb_4bit_use_double_quant=True, 129 | bnb_4bit_quant_type="nf4", 130 | bnb_4bit_compute_dtype=torch.bfloat16 131 | ) 132 | 133 | # Load model 134 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name, quantization_config=bnb_config) 135 | model = prepare_model_for_kbit_training(model, 
use_gradient_checkpointing=train_args.gradient_checkpointing) 136 | 137 | if args.lora.enable_lora: 138 | logger.info("Using LoRA") 139 | model = get_peft_model(model=model, peft_config=args.lora.as_peft_config()) 140 | else: 141 | logger.info("Not using LoRA") 142 | 143 | if train_args.local_rank == 0: 144 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 145 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 146 | 147 | trainer = transformers.Trainer( 148 | args=train_args, 149 | model=model, 150 | train_dataset=dataset.dataset['train'], 151 | eval_dataset=dataset.dataset['validation'], 152 | tokenizer=tokenizer, 153 | compute_metrics=dataset.compute_metrics, 154 | preprocess_logits_for_metrics=dataset.preprocess_logits_for_metrics, 155 | ) 156 | 157 | result = trainer.train() 158 | 159 | if dataset.run_test: 160 | logger.info("Running test set evaluation after training") 161 | test_metrics = dataset.compute_test_metrics(trainer) 162 | trainer.log(test_metrics) 163 | 164 | def print_summary(result): 165 | print(f"Time: {result.metrics['train_runtime']:.2f}") 166 | print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") 167 | print_gpu_utilization() 168 | 169 | print_summary(result) 170 | 171 | if __name__ == "__main__": 172 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, ModelArguments, LoraArguments)) 173 | train_args, model_args, lora_args = arg_parser.parse_args_into_dataclasses() 174 | main(Arguments(train=train_args, model=model_args, lora=lora_args)) 175 | -------------------------------------------------------------------------------- /research/fine_tune_llm_w_qlora/linear.py: -------------------------------------------------------------------------------- 1 | ### Convert activations and backprops to float for per-sample gradient computation 2 | ### During mixed precision training it is possible that the activations and/or backprops are not in full precision 3 | 4 | from typing import Dict, List 5 | 6 | import torch 7 | import torch.nn as nn 8 | from opt_einsum import contract 9 | 10 | from opacus.grad_sample.utils import register_grad_sampler 11 | 12 | 13 | @register_grad_sampler(nn.Linear) 14 | def compute_linear_grad_sample( 15 | layer: nn.Linear, activations: List[torch.Tensor], backprops: torch.Tensor 16 | ) -> Dict[nn.Parameter, torch.Tensor]: 17 | """ 18 | Computes per sample gradients for ``nn.Linear`` layer 19 | 20 | Args: 21 | layer: Layer 22 | activations: Activations 23 | backprops: Backpropagations 24 | """ 25 | activations = activations[0] 26 | ret = {} 27 | if layer.weight.requires_grad: 28 | gs = contract("n...i,n...j->nij", backprops.float(), activations.float()) 29 | ret[layer.weight] = gs 30 | if layer.bias is not None and layer.bias.requires_grad: 31 | ret[layer.bias] = contract("n...k->nk", backprops.float()) 32 | return ret 33 | -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/NOTICE.txt: -------------------------------------------------------------------------------- 1 | THIRD-PARTY SOFTWARE NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This software incorporates components from the projects listed below. The original copyright notices 5 | and the licenses under which Microsoft received such components are set forth below and are provided for 6 | informational purposes only. 
Microsoft reserves all rights not expressly granted herein, whether by 7 | implication, estoppel or otherwise. 8 | 9 | This software includes parts of the Huggingface/Transformers Library (https://github.com/huggingface/transformers). 10 | State-of-the-art of Natural Language Processing for Jax, PyTorch and TensorFlow. Huggingface/Transformers library is 11 | licensed under Apache License 2.0, you can find a copy of this license at https://github.com/huggingface/transformers/blob/master/LICENSE 12 | -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/README.md: -------------------------------------------------------------------------------- 1 | We present the code of our paper "Synthetic Text Generation with Differential Privacy: A Simple and Practical Recipe" at ACL 2023. 2 | 3 | ## Fine-tuning with DP 4 | 5 | The following script assumes distributed training on 8 GPUs. 6 | 7 | ```console 8 | python -m torch.distributed.run --nproc_per_node 8 fine-tune-dp.py \ 9 | --data_dir $DATA \ 10 | --output_dir $OUTPUT_DIR \ 11 | --model_name gpt2 \ 12 | --per_device_train_batch_size 32 \ 13 | --gradient_accumulation_steps 16 \ 14 | --evaluation_strategy epoch \ 15 | --save_strategy epoch \ 16 | --log_level info \ 17 | --per_device_eval_batch_size 64 \ 18 | --eval_accumulation_steps 1 \ 19 | --seed 42 \ 20 | --target_epsilon 4.0 \ 21 | --per_sample_max_grad_norm 1.0 \ 22 | --weight_decay 0.01 \ 23 | --remove_unused_columns False \ 24 | --num_train_epochs 50 \ 25 | --logging_steps 10 \ 26 | --max_grad_norm 0 \ 27 | --sequence_len 128 \ 28 | --learning_rate 0.0001 \ 29 | --lr_scheduler_type constant \ 30 | --dataloader_num_workers 2 \ 31 | --disable_tqdm True \ 32 | --load_best_model_at_end True \ 33 | ``` 34 | 35 | ## Fine-tuning without DP 36 | 37 | The following script assumes distributed training on 8 GPUs. 38 | 39 | ```console 40 | python -m torch.distributed.run --nproc_per_node 8 fine-tune-nodp.py \ 41 | --data_dir $DATA \ 42 | --output_dir $OUTPUT_DIR \ 43 | --model_name gpt2 \ 44 | --per_device_train_batch_size 4 \ 45 | --gradient_accumulation_steps 1 \ 46 | --evaluation_strategy epoch \ 47 | --save_strategy epoch \ 48 | --log_level info \ 49 | --per_device_eval_batch_size 64 \ 50 | --eval_accumulation_steps 1 \ 51 | --seed 42 \ 52 | --weight_decay 0.01 \ 53 | --remove_unused_columns False \ 54 | --num_train_epochs 5 \ 55 | --logging_steps 2400 \ 56 | --max_grad_norm 0 \ 57 | --sequence_len 128 \ 58 | --learning_rate 0.00005 \ 59 | --lr_scheduler_type constant \ 60 | --dataloader_num_workers 2 \ 61 | --disable_tqdm True \ 62 | --load_best_model_at_end True \ 63 | ``` 64 | 65 | ## Synthetic Text Generation 66 | 67 | The following script generates synthetic data from a fine-tuned model on a single GPU. 68 | 69 | ```console 70 | python generate-text.py \ 71 | --model_type gpt2 \ 72 | --model_name_or_path $CHECKPOINT_FOLDER \ 73 | --input_training_file $TRAINING_DATA_FILE \ 74 | --output_dir $OUTPUT_DIR \ 75 | --length 128 \ 76 | --total_sequences 100000 \ 77 | --do_sample \ 78 | --batch_size 8 \ 79 | ``` 80 | 81 | ## Classification model 82 | 83 | The following script assumes distributed training on 8 GPUs. 84 | Set --sample_dataset True to train the classifier on the original data to sample 100000 data points. 
85 | 86 | ```console 87 | python -m torch.distributed.run --nproc_per_node 8 run-classification.py \ 88 | --model_name_or_path roberta-base \ 89 | --output_dir $OUTPUT_DIR \ 90 | --train_file $TRAINING_DATA_FILE \ 91 | --validation_file $VAL_DATA_FILE \ 92 | --test_file $TEST_DATA_FILE \ 93 | --do_train \ 94 | --do_eval \ 95 | --do_predict \ 96 | --max_seq_length 512 \ 97 | --per_device_train_batch_size 4 \ 98 | --per_device_eval_batch_size 64 \ 99 | --learning_rate 3e-5 \ 100 | --num_train_epochs 3 \ 101 | --logging_steps 100 \ 102 | --overwrite_output_dir \ 103 | --overwrite_cache True \ 104 | --evaluation_strategy steps \ 105 | --eval_steps 31 \ 106 | --save_steps 31 \ 107 | --load_best_model_at_end True \ 108 | --label_column_name "label1" \ 109 | --sample_dataset False \ 110 | --disable_tqdm True 111 | ``` 112 | 113 | ## Using LoRA during fine-tuning 114 | 115 | Although not used in the paper, LoRA fine-tuning significantly improves the runtime by allowing much larger 116 | batch sizes to fit in each GPU. A starting point could be to add `--lora_dim 4 --lora_alpha 32 --lora_dropout 0.0` 117 | and use larger learning rates such as `--learning_rate 3e-4` or `4e-4`. 118 | 119 | ## Third Party Notice 120 | 121 | This software includes the files listed below from the Huggingface/Transformers Library (https://github.com/huggingface/transformers) 122 | as part of text generation and task performance. 123 | 124 | research/synthetic-text-generation-with-DP 125 | ├── generate-text.py 126 | └── run-classification.py 127 | -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/fine-tune-dp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series with DP (w/ parameter-efficient approach LoRA when lora_dim > 0)''' 5 | 6 | import os 7 | import datasets 8 | import dp_transformers 9 | import transformers 10 | import sys 11 | import logging 12 | 13 | from dataclasses import dataclass, field 14 | from dp_transformers.layers.dp_merged_linear import mark_only_lora_as_trainable 15 | from dp_transformers.module_modification import convert_gpt2_attention_to_lora 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class ModelArguments: 23 | data_dir: str = field(default="./", metadata={ 24 | "help": "Path to training data" 25 | }) 26 | 27 | model_name: str = field(default="gpt2", metadata={ 28 | "help": "Model name in HuggingFace, e.g. 
'gpt2'" 29 | }) 30 | 31 | lora_dim: int = field(default=0, metadata={ 32 | "help": "LoRA dimension; 0 means LoRA is disabled" 33 | }) 34 | 35 | sequence_len: int = field(default=128, metadata={ 36 | "help": "Model sequence length" 37 | }) 38 | 39 | lora_dropout: float = field(default=0.0, metadata={ 40 | "help": "Dropout probability for LoRA layers" 41 | }) 42 | 43 | lora_alpha: int = field(default=32, metadata={ 44 | "help": "LoRA attention alpha" 45 | }) 46 | 47 | @dataclass 48 | class Arguments: 49 | train: dp_transformers.TrainingArguments 50 | privacy: dp_transformers.PrivacyArguments 51 | model: ModelArguments 52 | 53 | 54 | def main(args: Arguments): 55 | 56 | transformers.set_seed(args.train.seed) 57 | 58 | # Setup logging 59 | logging.basicConfig( 60 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 61 | datefmt="%m/%d/%Y %H:%M:%S", 62 | handlers=[logging.StreamHandler(sys.stdout)], 63 | ) 64 | 65 | log_level = train_args.get_process_log_level() 66 | logger.setLevel(log_level) 67 | datasets.utils.logging.set_verbosity(log_level) 68 | transformers.utils.logging.set_verbosity(log_level) 69 | transformers.utils.logging.enable_default_handler() 70 | transformers.utils.logging.enable_explicit_format() 71 | 72 | # Log on each process the small summary: 73 | logger.warning( 74 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 75 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 76 | ) 77 | logger.info(f"Training/evaluation parameters {train_args}") 78 | logger.info(f"Privacy parameters {privacy_args}") 79 | 80 | # Load model 81 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 82 | model = model.to(train_args.device) 83 | 84 | # Load data 85 | data_path_train = os.path.join(args.model.data_dir, "train.csv") 86 | data_path_val = os.path.join(args.model.data_dir, "val.csv") 87 | dataset = datasets.load_dataset('csv', data_files={'train': data_path_train, 'validation': data_path_val}) 88 | 89 | # Load tokenizer 90 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 91 | num_added_toks = tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 92 | mean_tok_emb = model.transformer.wte.weight.data.mean(dim=0) 93 | model.resize_token_embeddings(len(tokenizer)) 94 | 95 | # Initialize the newly-added token embedding to the mean of all token embeddings 96 | for i in range(num_added_toks): 97 | model.transformer.wte.weight.data[-(i + 1), :] = mean_tok_emb 98 | 99 | label_column_names = [name for name in dataset["train"].column_names if "label" in name] 100 | 101 | # Tokenize data 102 | def preprocess_function(examples): 103 | batch = [] 104 | for t in range(len(examples['text'])): 105 | text = "\t".join([examples[name][t] for name in label_column_names]) + "\n\n" + examples['text'][t] + tokenizer.eos_token 106 | batch.append(text) 107 | 108 | result = tokenizer(batch, padding="max_length", truncation=True, 109 | max_length=args.model.sequence_len) 110 | 111 | return result 112 | 113 | # Tokenize data 114 | with train_args.main_process_first(desc="tokenizing dataset"): 115 | dataset = dataset.map( 116 | preprocess_function, batched=True, desc="tokenizing dataset", remove_columns=dataset.column_names['train'] 117 | ) 118 | 119 | if args.model.lora_dim > 0: 120 | model = convert_gpt2_attention_to_lora( 121 | model, r=args.model.lora_dim, lora_alpha=args.model.lora_alpha, lora_dropout=args.model.lora_dropout, 122 | 
enable_lora=[True, False, True], merge_weights=False 123 | ) 124 | mark_only_lora_as_trainable(model) 125 | 126 | if train_args.local_rank == 0: 127 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 128 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 129 | 130 | model = model.cuda() 131 | model.train() 132 | 133 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 134 | 135 | trainer = dp_transformers.dp_utils.OpacusDPTrainer( 136 | args=train_args, 137 | model=model, 138 | train_dataset=dataset['train'], 139 | eval_dataset=dataset['validation'], 140 | data_collator=data_collator, 141 | privacy_args=privacy_args, 142 | tokenizer=tokenizer 143 | ) 144 | 145 | try: 146 | train_result = trainer.train() 147 | finally: 148 | eps_prv = trainer.get_prv_epsilon() 149 | eps_rdp = trainer.get_rdp_epsilon() 150 | trainer.log({ 151 | "final_epsilon_prv": eps_prv, 152 | "final_epsilon_rdp": eps_rdp 153 | }) 154 | 155 | if train_args.local_rank == 0 or train_args.local_rank == -1: 156 | metrics = train_result.metrics 157 | trainer.save_model() 158 | trainer.log_metrics("train", metrics) 159 | trainer.save_metrics("train", metrics) 160 | 161 | 162 | if __name__ == "__main__": 163 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, dp_transformers.PrivacyArguments, ModelArguments)) 164 | train_args, privacy_args, model_args = arg_parser.parse_args_into_dataclasses() 165 | main(Arguments(train=train_args, privacy=privacy_args, model=model_args)) -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/fine-tune-nodp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | '''Train GPT2 model series without DP (w/ parameter-efficient approach LoRA when lora_dim > 0)''' 5 | 6 | import os 7 | import datasets 8 | import dp_transformers 9 | import transformers 10 | import sys 11 | import logging 12 | 13 | from dataclasses import dataclass, field 14 | from dp_transformers.layers.dp_merged_linear import mark_only_lora_as_trainable 15 | from dp_transformers.module_modification import convert_gpt2_attention_to_lora 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class ModelArguments: 23 | data_dir: str = field(default="./", metadata={ 24 | "help": "Path to training data" 25 | }) 26 | 27 | model_name: str = field(default="gpt2", metadata={ 28 | "help": "Model name in HuggingFace, e.g. 
'gpt2'" 29 | }) 30 | 31 | lora_dim: int = field(default=0, metadata={ 32 | "help": "LoRA dimension; 0 means LoRA is disabled" 33 | }) 34 | 35 | sequence_len: int = field(default=128, metadata={ 36 | "help": "Model sequence length" 37 | }) 38 | 39 | lora_dropout: float = field(default=0.0, metadata={ 40 | "help": "Dropout probability for LoRA layers" 41 | }) 42 | 43 | lora_alpha: int = field(default=32, metadata={ 44 | "help": "LoRA attention alpha" 45 | }) 46 | 47 | 48 | @dataclass 49 | class Arguments: 50 | train: dp_transformers.TrainingArguments 51 | model: ModelArguments 52 | 53 | 54 | def main(args: Arguments): 55 | transformers.set_seed(args.train.seed) 56 | 57 | # Setup logging 58 | logging.basicConfig( 59 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 60 | datefmt="%m/%d/%Y %H:%M:%S", 61 | handlers=[logging.StreamHandler(sys.stdout)], 62 | ) 63 | 64 | log_level = train_args.get_process_log_level() 65 | logger.setLevel(log_level) 66 | datasets.utils.logging.set_verbosity(log_level) 67 | transformers.utils.logging.set_verbosity(log_level) 68 | transformers.utils.logging.enable_default_handler() 69 | transformers.utils.logging.enable_explicit_format() 70 | 71 | # Log on each process the small summary: 72 | logger.warning( 73 | f"Process rank: {train_args.local_rank}, device: {train_args.device}, n_gpu: {train_args.n_gpu}, " 74 | f"distributed training: {bool(train_args.local_rank != -1)}, 16-bits training: {train_args.fp16}" 75 | ) 76 | logger.info(f"Training/evaluation parameters {train_args}") 77 | 78 | # Load model 79 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model.model_name) 80 | model = model.to(train_args.device) 81 | 82 | # Load data 83 | data_path_train = os.path.join(args.model.data_dir, "train.csv") 84 | data_path_val = os.path.join(args.model.data_dir, "val.csv") 85 | dataset = datasets.load_dataset('csv', data_files={'train': data_path_train, 'validation': data_path_val}) 86 | 87 | # Load tokenizer 88 | tokenizer = transformers.AutoTokenizer.from_pretrained(args.model.model_name) 89 | num_added_toks = tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 90 | mean_tok_emb = model.transformer.wte.weight.data.mean(dim=0) 91 | model.resize_token_embeddings(len(tokenizer)) 92 | 93 | # Initialize the newly-added token embedding to the mean of all token embeddings 94 | for i in range(num_added_toks): 95 | model.transformer.wte.weight.data[-(i + 1), :] = mean_tok_emb 96 | 97 | label_column_names = [name for name in dataset["train"].column_names if "label" in name] 98 | 99 | # Tokenize data 100 | def preprocess_function(examples): 101 | batch = [] 102 | for t in range(len(examples['text'])): 103 | text = "\t".join([examples[name][t] for name in label_column_names]) + "\n\n" + examples['text'][t] + tokenizer.eos_token 104 | batch.append(text) 105 | 106 | result = tokenizer(batch, padding="max_length", truncation=True, 107 | max_length=args.model.sequence_len) 108 | 109 | return result 110 | 111 | # Tokenize data 112 | with train_args.main_process_first(desc="tokenizing dataset"): 113 | dataset = dataset.map( 114 | preprocess_function, batched=True, desc="tokenizing dataset", remove_columns=dataset.column_names['train'] 115 | ) 116 | 117 | if args.model.lora_dim > 0: 118 | model = convert_gpt2_attention_to_lora( 119 | model, r=args.model.lora_dim, lora_alpha=args.model.lora_alpha, lora_dropout=args.model.lora_dropout, 120 | enable_lora=[True, False, True], merge_weights=False 121 | ) 122 | mark_only_lora_as_trainable(model) 123 | 124 | if 
train_args.local_rank == 0: 125 | logger.info(f"Total number of parameters of the model: {model.num_parameters(only_trainable=False)}") 126 | logger.info(f"Fine-tuned number of parameters of the model: {model.num_parameters(only_trainable=True)}") 127 | 128 | model = model.cuda() 129 | model.train() 130 | 131 | data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer) 132 | 133 | trainer = transformers.Trainer( 134 | args=train_args, 135 | model=model, 136 | train_dataset=dataset['train'], 137 | eval_dataset=dataset['validation'], 138 | data_collator=data_collator, 139 | tokenizer=tokenizer 140 | ) 141 | 142 | train_result = trainer.train() 143 | 144 | if train_args.local_rank == 0 or train_args.local_rank == -1: 145 | metrics = train_result.metrics 146 | trainer.save_model() 147 | trainer.log_metrics("train", metrics) 148 | trainer.save_metrics("train", metrics) 149 | 150 | if __name__ == "__main__": 151 | arg_parser = transformers.HfArgumentParser((dp_transformers.TrainingArguments, ModelArguments)) 152 | train_args, model_args = arg_parser.parse_args_into_dataclasses() 153 | main(Arguments(train=train_args, model=model_args)) -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/generate-text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | '''Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)''' 17 | 18 | import argparse 19 | import collections 20 | import csv 21 | import os.path 22 | import random 23 | import sys 24 | import numpy as np 25 | import torch 26 | import transformers 27 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 28 | from tqdm import tqdm 29 | 30 | from dp_transformers.module_modification import convert_gpt2_attention_to_lora 31 | 32 | import logging 33 | logger = logging.getLogger(__name__) 34 | logger.setLevel(logging.INFO) 35 | 36 | 37 | MODEL_CLASSES = { 38 | "distilgpt2": (GPT2LMHeadModel, GPT2Tokenizer), 39 | "gpt2": (GPT2LMHeadModel, GPT2Tokenizer), 40 | "gpt2-medium": (GPT2LMHeadModel, GPT2Tokenizer), 41 | "gpt2-large": (GPT2LMHeadModel, GPT2Tokenizer), 42 | "gpt2-xl": (GPT2LMHeadModel, GPT2Tokenizer), 43 | } 44 | 45 | 46 | def set_seed(args): 47 | np.random.seed(args.seed) 48 | torch.manual_seed(args.seed) 49 | if args.n_gpu > 0: 50 | torch.cuda.manual_seed_all(args.seed) 51 | 52 | def calc_perplexity(encodings, cur_model): 53 | max_length = cur_model.config.n_positions 54 | stride = 512 55 | device = 'cuda' if torch.cuda.is_available() else "cpu" 56 | nlls_cur = [] 57 | 58 | for i in range(0, encodings.size(1), stride): 59 | begin_loc = max(i + stride - max_length, 0) 60 | end_loc = min(i + stride, encodings.size(1)) 61 | trg_len = end_loc - i # may be different from stride on last loop 62 | input_ids = encodings[:, begin_loc:end_loc].to(device) 63 | target_ids = input_ids.clone() 64 | target_ids[:, :-trg_len] = -100 65 | target_ids[target_ids==cur_model.config.pad_token_id] = -100 66 | 67 | with torch.no_grad(): 68 | outputs = cur_model(input_ids, labels=target_ids) 69 | nlls_cur.append(outputs[0] * trg_len) 70 | 71 | ppl_cur = torch.exp(torch.stack(nlls_cur).sum() / end_loc) 72 | 73 | return ppl_cur.item() 74 | 75 | def convert_model(checkpoint_path): 76 | sd = torch.load(os.path.join(checkpoint_path, "pytorch_model.bin"), map_location="cpu") 77 | state_dict = {} 78 | for key, value in sd.items(): 79 | key = key.replace("_module.module.", "") 80 | state_dict[key] = value 81 | return state_dict 82 | 83 | 84 | def main(): 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument( 87 | "--model_type", 88 | default=None, 89 | type=str, 90 | required=True, 91 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), 92 | ) 93 | parser.add_argument( 94 | "--model_name_or_path", 95 | default=None, 96 | type=str, 97 | required=True, 98 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()), 99 | ) 100 | 101 | parser.add_argument( 102 | "--input_training_file", 103 | default=None, 104 | type=str, 105 | required=True, 106 | ) 107 | 108 | parser.add_argument( 109 | "--output_dir", 110 | default=None, 111 | type=str, 112 | required=True, 113 | ) 114 | 115 | parser.add_argument("--length", type=int, default=128) 116 | parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") 117 | 118 | parser.add_argument( 119 | "--temperature", 120 | type=float, 121 | default=1.0, 122 | help="temperature of 1.0 has no effect, lower tend toward greedy sampling", 123 | ) 124 | parser.add_argument( 125 | "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" 126 | ) 127 | parser.add_argument("--k", type=int, default=50) 128 | parser.add_argument("--p", 
type=float, default=0.9) 129 | parser.add_argument("--num_beams", type=int, default=5) 130 | parser.add_argument("--batch_size", type=int, default=32) 131 | 132 | parser.add_argument("--do_sample", action="store_true", help="sampling when generation") 133 | 134 | parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") 135 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 136 | parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.") 137 | parser.add_argument("--total_sequences", type=int, default=100000, help="The number of total samples to generate.") 138 | 139 | parser.add_argument( 140 | "--fp16", 141 | action="store_true", 142 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", 143 | ) 144 | 145 | parser.add_argument("--lora_dim", type=int, default=0) 146 | parser.add_argument("--lora_alpha", type=int, default=32) 147 | parser.add_argument("--lora_dropout", type=float, default=0.0) 148 | 149 | args = parser.parse_args() 150 | 151 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 152 | args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() 153 | 154 | logger.warning(f"device: {args.device}, n_gpu: {args.n_gpu}, 16-bits training: {args.fp16}") 155 | 156 | set_seed(args) 157 | 158 | # Initialize the model and tokenizer 159 | try: 160 | args.model_type = args.model_type.lower() 161 | model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 162 | except KeyError: 163 | raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)") 164 | 165 | tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) 166 | 167 | if tokenizer.pad_token_id: 168 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model_type, pad_token_id=tokenizer.pad_token_id) 169 | else: 170 | model = transformers.AutoModelForCausalLM.from_pretrained(args.model_type, pad_token_id=tokenizer.eos_token_id) 171 | 172 | model.resize_token_embeddings(len(tokenizer)) 173 | 174 | if args.lora_dim > 0: 175 | model = convert_gpt2_attention_to_lora( 176 | model, r=args.lora_dim, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout, 177 | enable_lora=[True, False, True], merge_weights=False 178 | ) 179 | 180 | state_dict = convert_model(args.model_name_or_path) 181 | model, *_ = model_class._load_pretrained_model( 182 | model, 183 | state_dict, 184 | [k for k in state_dict.keys()], # XXX: rename? 
185 | os.path.join(args.model_name_or_path, "pytorch_model.bin"), 186 | args.model_name_or_path, 187 | ) 188 | 189 | # Make sure token embedding weights are still tied if needed 190 | model.tie_weights() 191 | 192 | model.eval() 193 | model.to(args.device) 194 | 195 | if args.fp16: 196 | model.half() 197 | 198 | logger.info(args) 199 | 200 | def generate_text(prompt,seq_num,prompt_length): 201 | ppls_cur = [] 202 | all_data = [] 203 | 204 | for _ in tqdm(range(seq_num // args.batch_size + 1)): 205 | input_ids = torch.tensor(prompt, device=args.device).repeat(args.batch_size, 1) 206 | output_sequences = model.generate( 207 | input_ids=input_ids, 208 | max_length=args.length, 209 | temperature=args.temperature, 210 | top_k=args.k, 211 | top_p=args.p, 212 | early_stopping=True, 213 | repetition_penalty=args.repetition_penalty, 214 | do_sample=args.do_sample, 215 | num_return_sequences=2, # overgenerate to ensure we have enough non-empty generated sequences 216 | no_repeat_ngram_size=2, 217 | ) 218 | 219 | ppl = calc_perplexity(output_sequences, model) 220 | ppls_cur.append(ppl) 221 | 222 | generated_sequences = tokenizer.batch_decode(output_sequences, skip_special_tokens=True, 223 | clean_up_tokenization_spaces=True) 224 | 225 | for g in generated_sequences: 226 | labels, seq = g[:prompt_length], g[prompt_length:] 227 | seq = " ".join(seq.split()) 228 | labels = labels.strip().split("\t") 229 | if seq: 230 | all_data.append([seq]+labels) 231 | 232 | if len(all_data) >seq_num: 233 | all_data = random.sample(all_data,seq_num) 234 | return all_data,ppls_cur 235 | 236 | with torch.no_grad(): 237 | prompt_counter = collections.Counter() 238 | with open(args.input_training_file,encoding='utf-8') as rf: 239 | csv_reader = csv.reader(rf) 240 | title = next(csv_reader) 241 | 242 | label_column_index = [i for i,name in enumerate(title) if "label" in name] 243 | 244 | for line in csv_reader: 245 | prompt = "\t".join([line[idx] for idx in label_column_index]) + "\n\n" 246 | prompt_counter[prompt] += 1 247 | 248 | ratio_generation_training = args.total_sequences / sum(prompt_counter.values()) 249 | all_sequences = [] 250 | ppls_cur = [] 251 | 252 | for prompt_text in tqdm(prompt_counter): 253 | prompt = tokenizer(prompt_text)['input_ids'] 254 | num_seq_to_generate = round(prompt_counter[prompt_text] * ratio_generation_training) 255 | if num_seq_to_generate>0: 256 | sequences, ppls = generate_text(prompt, num_seq_to_generate, len(prompt_text)) 257 | all_sequences += sequences 258 | ppls_cur += ppls 259 | 260 | logger.info(f"Current PPL: %.2f±%.2f", np.mean(ppls_cur),np.std(ppls_cur)) 261 | logger.info(f"Total generated sequences: %d", len(all_sequences)) 262 | random.shuffle(all_sequences) 263 | 264 | #prefix = list(filter(None, args.model_name_or_path.split("/"))).pop() 265 | os.makedirs(args.output_dir, exist_ok=True) 266 | output_path = os.path.join(args.output_dir, str(args.length) + ".generations.csv") 267 | with open(output_path, 'w', newline='', encoding="utf-8") as wf: 268 | csv_writer = csv.writer(wf) 269 | csv_writer.writerow(title) 270 | for obj in all_sequences: 271 | if obj[0]: # remove empty sequences 272 | csv_writer.writerow(obj) 273 | 274 | 275 | if __name__ == "__main__": 276 | main() -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas==1.5.3 3 | scikit-learn 4 | torch==1.12.1 5 | 
transformers==4.20.1 6 | datasets==2.0.0 7 | prv-accountant==0.1.1.post1 8 | opacus==1.2.0 9 | git+https://github.com/microsoft/dp-transformers.git@39fb6878623594cb0ab1c9a273058487b8f8a710 -------------------------------------------------------------------------------- /research/synthetic-text-generation-with-DP/run-classification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE.""" 17 | # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 18 | 19 | import logging 20 | import os 21 | import random 22 | import sys 23 | from dataclasses import dataclass, field 24 | from typing import Optional 25 | from numpy import expand_dims 26 | from numpy import log 27 | from numpy import mean,std 28 | from numpy import exp 29 | import datasets 30 | import numpy as np 31 | from datasets import load_dataset, load_metric 32 | from math import floor 33 | import transformers 34 | from transformers import ( 35 | AutoConfig, 36 | AutoModelForSequenceClassification, 37 | AutoTokenizer, 38 | DataCollatorWithPadding, 39 | EvalPrediction, 40 | HfArgumentParser, 41 | PretrainedConfig, 42 | Trainer, 43 | TrainingArguments, 44 | default_data_collator, 45 | set_seed, 46 | ) 47 | from transformers.trainer_utils import get_last_checkpoint 48 | from transformers.utils import check_min_version 49 | from transformers.utils.versions import require_version 50 | from sklearn.metrics import confusion_matrix 51 | from scipy.special import softmax 52 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 53 | check_min_version("4.18.0") 54 | 55 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") 56 | 57 | task_to_keys = { 58 | "cola": ("sentence", None), 59 | "mnli": ("premise", "hypothesis"), 60 | "mrpc": ("sentence1", "sentence2"), 61 | "qnli": ("question", "sentence"), 62 | "qqp": ("question1", "question2"), 63 | "rte": ("sentence1", "sentence2"), 64 | "sst2": ("sentence", None), 65 | "stsb": ("sentence1", "sentence2"), 66 | "wnli": ("sentence1", "sentence2"), 67 | } 68 | 69 | logger = logging.getLogger(__name__) 70 | 71 | 72 | @dataclass 73 | class DataTrainingArguments: 74 | """ 75 | Arguments pertaining to what data we are going to input our model for training and eval. 76 | Using `HfArgumentParser` we can turn this class 77 | into argparse arguments to be able to specify them on 78 | the command line. 
79 | """ 80 | 81 | task_name: Optional[str] = field( 82 | default=None, 83 | metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, 84 | ) 85 | dataset_name: Optional[str] = field( 86 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 87 | ) 88 | label_column_name: Optional[str] = field( 89 | default=None, metadata={"help": "The name of the label column"} 90 | ) 91 | dataset_config_name: Optional[str] = field( 92 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 93 | ) 94 | max_seq_length: int = field( 95 | default=128, 96 | metadata={ 97 | "help": ( 98 | "The maximum total input sequence length after tokenization. Sequences longer " 99 | "than this will be truncated, sequences shorter will be padded." 100 | ) 101 | }, 102 | ) 103 | overwrite_cache: bool = field( 104 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 105 | ) 106 | pad_to_max_length: bool = field( 107 | default=True, 108 | metadata={ 109 | "help": ( 110 | "Whether to pad all samples to `max_seq_length`. " 111 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 112 | ) 113 | }, 114 | ) 115 | max_train_samples: Optional[int] = field( 116 | default=None, 117 | metadata={ 118 | "help": ( 119 | "For debugging purposes or quicker training, truncate the number of training examples to this " 120 | "value if set." 121 | ) 122 | }, 123 | ) 124 | max_eval_samples: Optional[int] = field( 125 | default=None, 126 | metadata={ 127 | "help": ( 128 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 129 | "value if set." 130 | ) 131 | }, 132 | ) 133 | max_predict_samples: Optional[int] = field( 134 | default=None, 135 | metadata={ 136 | "help": ( 137 | "For debugging purposes or quicker training, truncate the number of prediction examples to this " 138 | "value if set." 139 | ) 140 | }, 141 | ) 142 | train_file: Optional[str] = field( 143 | default=None, metadata={"help": "A csv or a json file containing the training data."} 144 | ) 145 | validation_file: Optional[str] = field( 146 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 147 | ) 148 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) 149 | 150 | def __post_init__(self): 151 | if self.task_name is not None: 152 | self.task_name = self.task_name.lower() 153 | if self.task_name not in task_to_keys.keys(): 154 | raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) 155 | elif self.dataset_name is not None: 156 | pass 157 | elif self.train_file is None or self.validation_file is None: 158 | raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") 159 | else: 160 | train_extension = self.train_file.split(".")[-1] 161 | assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." 162 | validation_extension = self.validation_file.split(".")[-1] 163 | assert ( 164 | validation_extension == train_extension 165 | ), "`validation_file` should have the same extension (csv or json) as `train_file`." 166 | 167 | 168 | @dataclass 169 | class ModelArguments: 170 | """ 171 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
172 | """ 173 | 174 | model_name_or_path: str = field( 175 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 176 | ) 177 | config_name: Optional[str] = field( 178 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 179 | ) 180 | tokenizer_name: Optional[str] = field( 181 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 182 | ) 183 | cache_dir: Optional[str] = field( 184 | default=None, 185 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 186 | ) 187 | use_fast_tokenizer: bool = field( 188 | default=True, 189 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 190 | ) 191 | model_revision: str = field( 192 | default="main", 193 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 194 | ) 195 | use_auth_token: bool = field( 196 | default=False, 197 | metadata={ 198 | "help": ( 199 | "Will use the token generated when running `transformers-cli login` (necessary to use this script " 200 | "with private models)." 201 | ) 202 | }, 203 | ) 204 | ignore_mismatched_sizes: bool = field( 205 | default=False, 206 | metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, 207 | ) 208 | 209 | binary_classification: bool = field( 210 | default=False, 211 | ) 212 | 213 | sample_dataset: bool = field( 214 | default=False, 215 | ) 216 | 217 | def main(): 218 | # See all possible arguments in src/transformers/training_args.py 219 | # or by passing the --help flag to this script. 220 | # We now keep distinct sets of args, for a cleaner separation of concerns. 221 | 222 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 223 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 224 | # If we pass only one argument to the script and it's the path to a json file, 225 | # let's parse it to get our arguments. 226 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 227 | else: 228 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 229 | 230 | # Setup logging 231 | logging.basicConfig( 232 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 233 | datefmt="%m/%d/%Y %H:%M:%S", 234 | handlers=[logging.StreamHandler(sys.stdout)], 235 | ) 236 | 237 | log_level = training_args.get_process_log_level() 238 | logger.setLevel(log_level) 239 | datasets.utils.logging.set_verbosity(log_level) 240 | transformers.utils.logging.set_verbosity(log_level) 241 | transformers.utils.logging.enable_default_handler() 242 | transformers.utils.logging.enable_explicit_format() 243 | 244 | # Log on each process the small summary: 245 | logger.warning( 246 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 247 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 248 | ) 249 | logger.info(f"Training/evaluation parameters {training_args}") 250 | 251 | # Detecting last checkpoint. 
252 | last_checkpoint = None 253 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 254 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 255 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 256 | raise ValueError( 257 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 258 | "Use --overwrite_output_dir to overcome." 259 | ) 260 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 261 | logger.info( 262 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 263 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 264 | ) 265 | 266 | # Set seed before initializing model. 267 | set_seed(training_args.seed) 268 | 269 | def sample_dataset(dataset,label_column_name,sample_size=100000): 270 | training_dataset = dataset['train'] 271 | sample_indices = [] 272 | label_list = training_dataset.unique(label_column_name) 273 | for label in label_list: 274 | indices = np.where(np.array(training_dataset[label_column_name])==label)[0] 275 | sample_num = round(sample_size * (len(indices)/len(training_dataset))) 276 | sample_indices.append(np.random.choice(indices,size=sample_num,replace=False)) 277 | sample_indices = np.concatenate(sample_indices) 278 | np.random.shuffle(sample_indices) 279 | training_dataset = training_dataset.select(sample_indices) 280 | dataset['train'] = training_dataset 281 | return dataset 282 | 283 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) 284 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 285 | # 286 | # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the 287 | # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named 288 | # label if at least two columns are provided. 289 | # 290 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this 291 | # single column. You can easily tweak this behavior (see below) 292 | # 293 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 294 | # download the dataset. 295 | if data_args.task_name is not None: 296 | # Downloading and loading a dataset from the hub. 297 | raw_datasets = load_dataset( 298 | "glue", 299 | data_args.task_name, 300 | cache_dir=model_args.cache_dir, 301 | use_auth_token=True if model_args.use_auth_token else None, 302 | ) 303 | elif data_args.dataset_name is not None: 304 | # Downloading and loading a dataset from the hub. 305 | raw_datasets = load_dataset( 306 | data_args.dataset_name, 307 | data_args.dataset_config_name, 308 | cache_dir=model_args.cache_dir, 309 | use_auth_token=True if model_args.use_auth_token else None, 310 | ) 311 | else: 312 | # Loading a dataset from your local files. 313 | # CSV/JSON training and evaluation files are needed. 314 | data_files = {"train": data_args.train_file, "validation": data_args.validation_file} 315 | 316 | # Get the test dataset: you can provide your own CSV/JSON test file (see below) 317 | # when you use `do_predict` without specifying a GLUE benchmark task. 
318 | if training_args.do_predict: 319 | if data_args.test_file is not None: 320 | train_extension = data_args.train_file.split(".")[-1] 321 | test_extension = data_args.test_file.split(".")[-1] 322 | assert ( 323 | test_extension == train_extension 324 | ), "`test_file` should have the same extension (csv or json) as `train_file`." 325 | data_files["test"] = data_args.test_file 326 | else: 327 | raise ValueError("Need either a GLUE task or a test file for `do_predict`.") 328 | 329 | for key in data_files.keys(): 330 | logger.info(f"load a local file for {key}: {data_files[key]}") 331 | 332 | if data_args.train_file.endswith(".csv"): 333 | # Loading a dataset from local csv files 334 | raw_datasets = load_dataset( 335 | "csv", 336 | data_files=data_files, 337 | cache_dir=model_args.cache_dir, 338 | use_auth_token=True if model_args.use_auth_token else None, 339 | ) 340 | 341 | else: 342 | # Loading a dataset from local json files 343 | raw_datasets = load_dataset( 344 | "json", 345 | data_files=data_files, 346 | cache_dir=model_args.cache_dir, 347 | use_auth_token=True if model_args.use_auth_token else None, 348 | ) 349 | # See more about loading any type of standard or custom dataset at 350 | # https://huggingface.co/docs/datasets/loading_datasets.html. 351 | if not data_args.label_column_name: 352 | label_column_name = [name for name in raw_datasets["train"].column_names if "label" in name][-1] 353 | else: 354 | label_column_name = data_args.label_column_name 355 | 356 | # Labels 357 | if data_args.task_name is not None: 358 | is_regression = data_args.task_name == "stsb" 359 | if not is_regression: 360 | label_list = raw_datasets["train"].features[label_column_name].names 361 | num_labels = len(label_list) 362 | else: 363 | num_labels = 1 364 | else: 365 | # Trying to have good defaults here, don't hesitate to tweak to your needs. 366 | # is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] 367 | is_regression = False 368 | if is_regression: 369 | num_labels = 1 370 | else: 371 | # A useful fast method: 372 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique 373 | label_list = raw_datasets["train"].unique(label_column_name) 374 | label_list.sort() # Let's sort it for determinism 375 | num_labels = len(label_list) 376 | if model_args.sample_dataset: 377 | raw_datasets = sample_dataset(raw_datasets,label_column_name) 378 | 379 | # Load pretrained model and tokenizer 380 | # 381 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 382 | # download model & vocab. 
383 | config = AutoConfig.from_pretrained( 384 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 385 | num_labels=num_labels, 386 | finetuning_task=data_args.task_name, 387 | cache_dir=model_args.cache_dir, 388 | revision=model_args.model_revision, 389 | use_auth_token=True if model_args.use_auth_token else None, 390 | ) 391 | tokenizer = AutoTokenizer.from_pretrained( 392 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 393 | cache_dir=model_args.cache_dir, 394 | use_fast=model_args.use_fast_tokenizer, 395 | revision=model_args.model_revision, 396 | use_auth_token=True if model_args.use_auth_token else None, 397 | ) 398 | model = AutoModelForSequenceClassification.from_pretrained( 399 | model_args.model_name_or_path, 400 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 401 | config=config, 402 | cache_dir=model_args.cache_dir, 403 | revision=model_args.model_revision, 404 | use_auth_token=True if model_args.use_auth_token else None, 405 | ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, 406 | ) 407 | 408 | # Preprocessing the raw_datasets 409 | if data_args.task_name is not None: 410 | sentence1_key, sentence2_key = task_to_keys[data_args.task_name] 411 | else: 412 | # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 413 | non_label_column_names = [name for name in raw_datasets["train"].column_names if "label" not in name] 414 | if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: 415 | sentence1_key, sentence2_key = "sentence1", "sentence2" 416 | else: 417 | if len(non_label_column_names) >= 2: 418 | sentence1_key, sentence2_key = non_label_column_names[:2] 419 | else: 420 | sentence1_key, sentence2_key = non_label_column_names[0], None 421 | 422 | # Padding strategy 423 | if data_args.pad_to_max_length: 424 | padding = "max_length" 425 | else: 426 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 427 | padding = False 428 | 429 | # Some models have set the order of the labels to use, so let's make sure we do use it. 430 | label_to_id = None 431 | if ( 432 | model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id 433 | and data_args.task_name is not None 434 | and not is_regression 435 | ): 436 | # Some have all caps in their config, some don't. 437 | label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} 438 | if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): 439 | label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} 440 | else: 441 | logger.warning( 442 | "Your model seems to have been trained with labels, but they don't match the dataset: ", 443 | f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
444 | "\nIgnoring the model labels as a result.", 445 | ) 446 | elif data_args.task_name is None and not is_regression: 447 | label_to_id = {v: i for i, v in enumerate(label_list)} 448 | 449 | if label_to_id is not None: 450 | model.config.label2id = label_to_id 451 | model.config.id2label = {id: label for label, id in config.label2id.items()} 452 | elif data_args.task_name is not None and not is_regression: 453 | model.config.label2id = {l: i for i, l in enumerate(label_list)} 454 | model.config.id2label = {id: label for label, id in config.label2id.items()} 455 | 456 | if data_args.max_seq_length > tokenizer.model_max_length: 457 | logger.warning( 458 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 459 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 460 | ) 461 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 462 | 463 | def preprocess_function(examples): 464 | # Tokenize the texts 465 | args = ( 466 | (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) 467 | ) 468 | result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) 469 | 470 | # Map labels to IDs (not necessary for GLUE tasks) 471 | if label_to_id is not None and label_column_name in examples: 472 | result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples[label_column_name]] 473 | return result 474 | 475 | with training_args.main_process_first(desc="dataset map pre-processing"): 476 | raw_datasets = raw_datasets.map( 477 | preprocess_function, 478 | batched=True, 479 | load_from_cache_file=not data_args.overwrite_cache, 480 | desc="Running tokenizer on dataset", 481 | ) 482 | if training_args.do_train: 483 | if "train" not in raw_datasets: 484 | raise ValueError("--do_train requires a train dataset") 485 | train_dataset = raw_datasets["train"] 486 | if data_args.max_train_samples is not None: 487 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 488 | train_dataset = train_dataset.select(range(max_train_samples)) 489 | 490 | if training_args.do_eval: 491 | if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: 492 | raise ValueError("--do_eval requires a validation dataset") 493 | eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] 494 | if data_args.max_eval_samples is not None: 495 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 496 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 497 | 498 | if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: 499 | if "test" not in raw_datasets and "test_matched" not in raw_datasets: 500 | raise ValueError("--do_predict requires a test dataset") 501 | predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"] 502 | if data_args.max_predict_samples is not None: 503 | max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) 504 | predict_dataset = predict_dataset.select(range(max_predict_samples)) 505 | 506 | # Log a few random samples from the training set: 507 | if training_args.do_train: 508 | for index in random.sample(range(len(train_dataset)), 3): 509 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 510 | 511 | # Get the metric function 512 | if data_args.task_name is not None: 513 | metric = 
load_metric("glue", data_args.task_name) 514 | else: 515 | metric = load_metric("accuracy") 516 | 517 | # assumes images have the shape 299x299x3, pixels in [0,255] 518 | def calculate_inception_score(yhat, n_split=5, eps=1E-16): 519 | # enumerate splits of images/predictions 520 | scores = list() 521 | n_part = floor(yhat.shape[0] / n_split) 522 | for i in range(n_split): 523 | # retrieve p(y|x) 524 | ix_start, ix_end = i * n_part, i * n_part + n_part 525 | p_yx = yhat[ix_start:ix_end] 526 | # calculate p(y) 527 | p_y = expand_dims(p_yx.mean(axis=0), 0) 528 | # calculate KL divergence using log probabilities 529 | kl_d = p_yx * (log(p_yx + eps) - log(p_y + eps)) 530 | # sum over classes 531 | sum_kl_d = kl_d.sum(axis=1) 532 | # average over images 533 | avg_kl_d = mean(sum_kl_d) 534 | # undo the log 535 | is_score = exp(avg_kl_d) 536 | # store 537 | scores.append(is_score) 538 | # average across images 539 | is_avg, is_std = mean(scores), std(scores) 540 | return is_avg 541 | 542 | # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a 543 | # predictions and label_ids field) and has to return a dictionary string to float. 544 | def compute_metrics(p: EvalPrediction): 545 | preds_prob = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 546 | preds = np.squeeze(preds_prob) if is_regression else np.argmax(preds_prob, axis=1) 547 | if data_args.task_name is not None: 548 | result = metric.compute(predictions=preds, references=p.label_ids) 549 | if len(result) > 1: 550 | result["combined_score"] = np.mean(list(result.values())).item() 551 | return result 552 | elif is_regression: 553 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 554 | else: 555 | matrix = confusion_matrix(p.label_ids, preds) 556 | per_class_acc = matrix.diagonal() / matrix.sum(axis=1) 557 | results = {"accuracy_class_" + str(k+1):per_class_acc[k] for k in range(len(per_class_acc))} 558 | results["accuracy_all"] = (preds == p.label_ids).astype(np.float32).mean().item() 559 | results['inception_score'] = calculate_inception_score(softmax(preds_prob,axis=1)) 560 | return results 561 | 562 | # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if 563 | # we already did the padding. 
564 | if data_args.pad_to_max_length: 565 | data_collator = default_data_collator 566 | elif training_args.fp16: 567 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 568 | else: 569 | data_collator = None 570 | 571 | # Initialize our Trainer 572 | trainer = Trainer( 573 | model=model, 574 | args=training_args, 575 | train_dataset=train_dataset if training_args.do_train else None, 576 | eval_dataset=eval_dataset if training_args.do_eval else None, 577 | compute_metrics=compute_metrics, 578 | tokenizer=tokenizer, 579 | data_collator=data_collator, 580 | ) 581 | 582 | # Training 583 | if training_args.do_train: 584 | checkpoint = None 585 | if training_args.resume_from_checkpoint is not None: 586 | checkpoint = training_args.resume_from_checkpoint 587 | elif last_checkpoint is not None: 588 | checkpoint = last_checkpoint 589 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 590 | metrics = train_result.metrics 591 | max_train_samples = ( 592 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 593 | ) 594 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 595 | 596 | #trainer.save_model() # Saves the tokenizer too for easy upload 597 | 598 | trainer.log_metrics("train", metrics) 599 | trainer.save_metrics("train", metrics) 600 | #trainer.save_state() 601 | 602 | # Evaluation 603 | if training_args.do_eval: 604 | logger.info("*** Evaluate ***") 605 | 606 | # Loop to handle MNLI double evaluation (matched, mis-matched) 607 | tasks = [data_args.task_name] 608 | eval_datasets = [eval_dataset] 609 | if data_args.task_name == "mnli": 610 | tasks.append("mnli-mm") 611 | eval_datasets.append(raw_datasets["validation_mismatched"]) 612 | combined = {} 613 | 614 | for eval_dataset, task in zip(eval_datasets, tasks): 615 | metrics = trainer.evaluate(eval_dataset=eval_dataset) 616 | 617 | max_eval_samples = ( 618 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 619 | ) 620 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 621 | 622 | if task == "mnli-mm": 623 | metrics = {k + "_mm": v for k, v in metrics.items()} 624 | if task is not None and "mnli" in task: 625 | combined.update(metrics) 626 | 627 | trainer.log_metrics("eval", metrics) 628 | trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics) 629 | 630 | if training_args.do_predict: 631 | logger.info("*** Predict ***") 632 | 633 | # Loop to handle MNLI double evaluation (matched, mis-matched) 634 | tasks = [data_args.task_name] 635 | predict_datasets = [predict_dataset] 636 | if data_args.task_name == "mnli": 637 | tasks.append("mnli-mm") 638 | predict_datasets.append(raw_datasets["test_mismatched"]) 639 | 640 | for predict_dataset, task in zip(predict_datasets, tasks): 641 | metrics = trainer.evaluate(eval_dataset=predict_dataset) 642 | trainer.log_metrics("test", metrics) 643 | trainer.save_metrics("test", combined if task is not None and "mnli" in task else metrics) 644 | 645 | # Removing the `label` columns because it contains -1 and Trainer won't like that. 
646 | predict_dataset = predict_dataset.remove_columns("label") 647 | predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions 648 | predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) 649 | 650 | output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") 651 | if trainer.is_world_process_zero(): 652 | with open(output_predict_file, "w") as writer: 653 | logger.info(f"***** Predict results {task} *****") 654 | writer.write("index\tprediction\n") 655 | for index, item in enumerate(predictions): 656 | if is_regression: 657 | writer.write(f"{index}\t{item:3.3f}\n") 658 | else: 659 | item = label_list[item] 660 | writer.write(f"{index}\t{item}\n") 661 | 662 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} 663 | if data_args.task_name is not None: 664 | kwargs["language"] = "en" 665 | kwargs["dataset_tags"] = "glue" 666 | kwargs["dataset_args"] = data_args.task_name 667 | kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}" 668 | 669 | if training_args.push_to_hub: 670 | trainer.push_to_hub(**kwargs) 671 | else: 672 | trainer.create_model_card(**kwargs) 673 | 674 | 675 | def _mp_fn(index): 676 | # For xla_spawn (TPUs) 677 | main() 678 | 679 | 680 | if __name__ == "__main__": 681 | main() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | from setuptools import setup, find_packages 6 | 7 | version = '1.0.1' 8 | 9 | with open('README.md') as f: 10 | long_description = f.read() 11 | 12 | setup( 13 | name='dp-transformers', 14 | version=version, 15 | description='Differentially-private transformers using HuggingFace and Opacus', 16 | long_description=long_description, 17 | long_description_content_type='text/markdown', 18 | url="https://www.github.com/microsoft/dp-transformers", 19 | author='Microsoft', 20 | packages=find_packages('src'), 21 | package_dir={'': 'src'}, 22 | python_requires=">=3.7.0", 23 | include_package_data=True, 24 | extras_require={ 25 | "test": [ 26 | "pytest", 27 | ] 28 | }, 29 | install_requires=[ 30 | "transformers>=4.30.0", 31 | "datasets>=2.0.0", 32 | "opacus>=1.3.0", 33 | "peft", 34 | "prv_accountant<0.2.0", 35 | "torch>=1.13.1", 36 | ], 37 | test_suite="tests", 38 | zip_safe=False 39 | ) 40 | -------------------------------------------------------------------------------- /src/dp_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from .arguments import PrivacyArguments, TrainingArguments # noqa: F401 5 | from .dp_utils import DPCallback, DataCollatorForPrivateCausalLanguageModeling # noqa: F401 6 | from .sampler import PoissonAuthorSampler, ShuffledAuthorSampler # noqa: F401 7 | -------------------------------------------------------------------------------- /src/dp_transformers/arguments.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from typing import Optional 5 | 6 | import numpy as np 7 | from scipy import optimize 8 | from transformers import TrainingArguments as HfTrainingArguments 9 | from transformers import IntervalStrategy, logging 10 | from dataclasses import dataclass, field 11 | from datasets.utils import disable_progress_bar 12 | from prv_accountant import Accountant 13 | 14 | logger = logging.get_logger(__name__) 15 | 16 | 17 | @dataclass 18 | class PrivacyArguments: 19 | per_sample_max_grad_norm: Optional[float] = field(default=None, metadata={"help": "Max per sample clip norm"}) 20 | noise_multiplier: Optional[float] = field(default=None, metadata={"help": "Noise multiplier for DP training"}) 21 | target_epsilon: Optional[float] = field(default=None, metadata={ 22 | "help": "Target epsilon at end of training (mutually exclusive with noise multiplier)" 23 | }) 24 | target_delta: Optional[float] = field(default=None, metadata={ 25 | "help": "Target delta, defaults to 1/N" 26 | }) 27 | disable_dp: bool = field(default=False, metadata={ 28 | "help": "Disable DP training." 29 | }) 30 | 31 | def initialize(self, sampling_probability: float, num_steps: int, num_samples: int) -> None: 32 | if self.target_delta is None: 33 | self.target_delta = 1.0/num_samples 34 | logger.info(f"The target delta is set to be: {self.target_delta}") 35 | 36 | # Set up noise multiplier 37 | if self.noise_multiplier is None: 38 | self.noise_multiplier = find_noise_multiplier( 39 | sampling_probability=sampling_probability, 40 | num_steps=num_steps, 41 | target_delta=self.target_delta, 42 | target_epsilon=self.target_epsilon 43 | ) 44 | logger.info(f"The noise multiplier is set to be: {self.noise_multiplier}") 45 | 46 | @property 47 | def is_initialized(self) -> bool: 48 | return ( 49 | self.per_sample_max_grad_norm is not None and 50 | self.noise_multiplier is not None and 51 | self.target_delta is not None 52 | ) 53 | 54 | def __post_init__(self): 55 | if self.disable_dp: 56 | logger.warning("Disabling differentially private training...") 57 | self.noise_multiplier = 0.0 58 | self.per_sample_max_grad_norm = float('inf') 59 | self.target_epsilon = None 60 | else: 61 | if bool(self.target_epsilon) == bool(self.noise_multiplier): 62 | raise ValueError("Exactly one of the arguments --target_epsilon and --noise_multiplier must be used.") 63 | if self.per_sample_max_grad_norm is None: 64 | raise ValueError("DP training requires --per_sample_max_grad_norm argument.") 65 | 66 | 67 | @dataclass 68 | class TrainingArguments(HfTrainingArguments): 69 | dry_run: bool = field( 70 | default=False, 71 | metadata={"help": "Option for reducing training steps (2) and logging intervals (1) for quick sanity checking of arguments."} 72 | ) 73 | 74 | def __post_init__(self): 75 | super().__post_init__() 76 | if self.dry_run: 77 | logger.warning("--dry_run was specified. Reducing number of training steps to 2 and logging intervals to 1...") 78 | self.logging_steps = 1 79 | self.logging_strategy = IntervalStrategy.STEPS 80 | self.eval_steps = 1 81 | self.evaluation_strategy = IntervalStrategy.STEPS 82 | 83 | self.max_steps = 2 84 | 85 | if self.disable_tqdm: 86 | disable_progress_bar() 87 | 88 | 89 | def find_noise_multiplier(sampling_probability: float, num_steps: int, target_epsilon: float, target_delta: float, 90 | eps_error: float=0.1) -> float: 91 | """ 92 | Find a noise multiplier that satisfies a given target epsilon. 
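    A rough usage sketch (the concrete numbers are placeholders, not recommendations)::

        noise_multiplier = find_noise_multiplier(
            sampling_probability=256 / 50_000,
            num_steps=int(3 * 50_000 / 256),
            target_epsilon=8.0,
            target_delta=1e-5,
        )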
93 | 94 | :param float sampling_probability: Probability of a record being in batch for Poisson sampling 95 | :param int num_steps: Number of optimisation steps 96 | :param float target_epsilon: Desired target epsilon 97 | :param float target_delta: Value of DP delta 98 | :param float eps_error: Error allowed for final epsilon 99 | """ 100 | def compute_epsilon(mu: float) -> float: 101 | acc = Accountant( 102 | noise_multiplier=mu, 103 | sampling_probability=sampling_probability, 104 | delta=target_delta, 105 | max_compositions=num_steps, 106 | eps_error=eps_error/2 107 | ) 108 | return acc.compute_epsilon(num_steps) 109 | 110 | mu_max = 100.0 111 | 112 | mu_R = 1.0 113 | eps_R = float('inf') 114 | while eps_R > target_epsilon: 115 | mu_R *= np.sqrt(2) 116 | try: 117 | eps_R = compute_epsilon(mu_R)[2] 118 | except (OverflowError, RuntimeError): 119 | pass 120 | if mu_R > mu_max: 121 | raise RuntimeError("Finding a suitable noise multiplier has not converged. " 122 | "Try increasing target epsilon or decreasing sampling probability.") 123 | 124 | mu_L = mu_R 125 | eps_L = eps_R 126 | while eps_L < target_epsilon: 127 | mu_L /= np.sqrt(2) 128 | eps_L = compute_epsilon(mu_L)[0] 129 | 130 | has_converged = False 131 | bracket = [mu_L, mu_R] 132 | while not has_converged: 133 | mu_err = (bracket[1]-bracket[0])*0.01 134 | mu_guess = optimize.root_scalar(lambda mu: compute_epsilon(mu)[2]-target_epsilon, bracket=bracket, xtol=mu_err).root 135 | bracket = [mu_guess-mu_err, mu_guess+mu_err] 136 | eps_up = compute_epsilon(mu_guess-mu_err)[2] 137 | eps_low = compute_epsilon(mu_guess+mu_err)[0] 138 | has_converged = (eps_up - eps_low) < 2*eps_error 139 | assert compute_epsilon(bracket[1])[2] < target_epsilon + eps_error 140 | 141 | return bracket[1] -------------------------------------------------------------------------------- /src/dp_transformers/dp_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pandas as pd 5 | import datasets 6 | from datasets import Dataset 7 | import torch 8 | from torch import nn 9 | from torch.utils.data import DataLoader 10 | from transformers import ( 11 | Trainer, TrainerCallback, TrainerState, TrainerControl, logging, 12 | DataCollatorForLanguageModeling, PreTrainedTokenizer, training_args, modeling_utils 13 | ) 14 | from transformers.file_utils import is_sagemaker_mp_enabled, is_datasets_available 15 | import opacus 16 | from opacus.accountants import RDPAccountant 17 | from prv_accountant import Accountant as PRVAccountant 18 | from contextlib import contextmanager 19 | from typing import Any, Callable, List, Optional, Union, Dict, Sequence 20 | from accelerate.optimizer import AcceleratedOptimizer 21 | 22 | from dp_transformers import sampler, arguments 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | 27 | class DPCallback(TrainerCallback): 28 | """ 29 | This class registers all the necessary callbacks to make transformers.Trainer compatible with opacus. 
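    In typical use it is not constructed by hand: `OpacusDPTrainer` further down in this module
    instantiates it with the matching accountants and registers it through the Trainer's
    `callbacks` argument.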
30 | """ 31 | def __init__( 32 | self, 33 | noise_multiplier: float, 34 | target_delta: float, 35 | sampling_probability: float, 36 | rdp_accountant: RDPAccountant, 37 | prv_accountant: PRVAccountant, 38 | max_epsilon: float = float('inf') 39 | ) -> None: 40 | 41 | self.noise_multiplier = noise_multiplier 42 | self.target_delta = target_delta 43 | self.sampling_probability = sampling_probability 44 | self.rdp_accountant = rdp_accountant 45 | self.prv_accountant = prv_accountant 46 | 47 | self.max_epsilon = max_epsilon 48 | self.on_substep_end_was_called = False 49 | self.compute_rdp_epsilon = lambda: self.rdp_accountant.get_epsilon(self.target_delta) 50 | self.compute_prv_epsilon = lambda s: self.prv_accountant.compute_epsilon(s)[2] 51 | 52 | def on_substep_end(self, args: training_args.TrainingArguments, state: TrainerState, control: TrainerControl, optimizer=None, **kwargs): 53 | if optimizer is None: 54 | raise RuntimeError("Impossible to access optimizer from inside callback") 55 | if isinstance(optimizer, AcceleratedOptimizer): 56 | dp_optimizer = optimizer.optimizer 57 | else: 58 | dp_optimizer = optimizer 59 | dp_optimizer.signal_skip_step(do_skip=True) 60 | dp_optimizer.step() 61 | dp_optimizer.zero_grad() 62 | 63 | self.on_substep_end_was_called = True 64 | 65 | def on_step_end(self, args: training_args.TrainingArguments, state: TrainerState, control: TrainerControl, optimizer=None, **kwargs): 66 | if not ( 67 | args.gradient_accumulation_steps <= 1 or 68 | self.on_substep_end_was_called 69 | ): 70 | raise RuntimeError( 71 | "Gradient accumulation was specified but `on_substep_end` wasn't called. " 72 | "Make sure you're using a recent version of transformers (>=4.10.0) " 73 | "which has an appropriate callback in the trainer." 74 | ) 75 | 76 | if optimizer is None: 77 | raise RuntimeError("Impossible to access optimizer from inside callback") 78 | optimizer.zero_grad() # Opacus is bothered that HF does not call .zero_grad() on the optimizer 79 | 80 | self.rdp_accountant.step(noise_multiplier=self.noise_multiplier, sample_rate=self.sampling_probability) 81 | 82 | def on_save(self, args: training_args.TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): 83 | return self._check_max_epsilon_exceeded(state, control) 84 | 85 | def on_evaluate(self, args: training_args.TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): 86 | return self._check_max_epsilon_exceeded(state, control) 87 | 88 | def _check_max_epsilon_exceeded(self, state: TrainerState, control: TrainerControl) -> TrainerControl: 89 | eps_rdp = self.compute_rdp_epsilon() 90 | eps_prv = self.compute_prv_epsilon(state.global_step) 91 | if eps_rdp > self.max_epsilon or eps_prv > self.max_epsilon: 92 | logger.error("Max epsilon exceeded. Stopping training...") 93 | control.should_training_stop = True 94 | return control 95 | 96 | 97 | class DataCollatorForPrivateCausalLanguageModeling(DataCollatorForLanguageModeling): 98 | def __init__(self, tokenizer: PreTrainedTokenizer): 99 | super().__init__(tokenizer=tokenizer, mlm=False) 100 | 101 | def __call__(self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]: 102 | batch = super().__call__(examples) 103 | 104 | # Huggingface's default way of constructing position_ids is not compatible with Opacus 105 | # since Opacus is not able to deduce the batch size from the input. 
Here we manually 106 | # generate a position_ids tensor which has the same values as Huggingface's default tensor 107 | # but it is constructed in a way that is compatible with Opacus by repeating it along the batch dimension. 108 | if "position_ids" not in batch: 109 | input_ids = batch["input_ids"] 110 | batch["position_ids"] = torch.arange( 111 | input_ids.shape[1], dtype=torch.long, device=input_ids.device 112 | ).repeat(input_ids.shape[0], 1) 113 | return batch 114 | 115 | 116 | class GradSampleModule(opacus.GradSampleModule): 117 | """ 118 | Little wrapper to provide the `no_sync` context which is assumed by the Huggingface trainer. 119 | We don't need to do anything in addition here. 120 | """ 121 | @contextmanager 122 | def no_sync(self): 123 | yield 124 | 125 | 126 | def create_author_mapping(dataset: Dataset, author: str) -> Sequence[Sequence[int]]: 127 | """ 128 | Creates a mapping from authors to samples in a dataset. 129 | """ 130 | with dataset.formatted_as(type="pandas"): 131 | authors = pd.DataFrame(data={"author": dataset[author]}) 132 | author_mapping = [g.index.values for _, g in authors.groupby("author")] 133 | return author_mapping 134 | 135 | 136 | class OpacusDPTrainer(Trainer): 137 | """ 138 | Wrapper to modify the Huggingface Trainer to: 139 | (i) remove the "loss = loss / self.args.gradient_accumulation_steps" operation in training_step 140 | as this is already handled by the Opacus package. 141 | (ii) enable author-level DP training by modifying the sampler and the dataloader. In the case 142 | of sample-level DP, each sample can be represented by a unique author. 143 | (iii) wrap the optimizer with Opacus' DPOptimizer/DistributedDPOptimizer 144 | """ 145 | def __init__( 146 | self, 147 | model: Union[modeling_utils.PreTrainedModel, torch.nn.modules.module.Module] = None, 148 | args: arguments.TrainingArguments = None, 149 | train_dataset: Optional[torch.utils.data.dataset.Dataset] = None, 150 | privacy_args: arguments.PrivacyArguments = None, 151 | author_mapping: Optional[Sequence[Sequence[int]]] = None, 152 | **kwargs: Dict 153 | ) -> None: 154 | 155 | self.train_args = args 156 | self.privacy_args = privacy_args 157 | 158 | # Sample-level DP is equivalent to mapping each sample to a unique author. &#13;
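# In that case the default mapping built below is simply [[0], [1], ..., [N-1]], so the
# author-level machinery reduces to ordinary example-level DP-SGD.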
159 | if author_mapping is None: 160 | author_mapping = [[i] for i in range(len(train_dataset))] 161 | self.author_mapping = author_mapping 162 | 163 | if not self.privacy_args.is_initialized: 164 | self.privacy_args.initialize( 165 | sampling_probability=self.sampling_probability, 166 | num_steps=self.num_steps, 167 | num_samples=len(self.author_mapping), 168 | ) 169 | 170 | # Wrap model in DDP and GradSampleModule 171 | if args.parallel_mode == training_args.ParallelMode.DISTRIBUTED: 172 | logger.info(f"Wrapping the model with DPDDP in distributed training.") 173 | model = opacus.distributed.DifferentiallyPrivateDistributedDataParallel(model) 174 | 175 | model = GradSampleModule(model) 176 | 177 | # Instantiate privacy accountants 178 | self.rdp_accountant = RDPAccountant() 179 | self.prv_accountant = PRVAccountant( 180 | noise_multiplier=self.privacy_args.noise_multiplier, 181 | sampling_probability=self.sampling_probability, 182 | delta=self.privacy_args.target_delta, 183 | eps_error=0.1, 184 | max_compositions=self.num_steps 185 | ) 186 | 187 | # Set up callback for accounting and handling grad acc 188 | self.dp_callback = DPCallback( 189 | noise_multiplier=self.privacy_args.noise_multiplier, 190 | target_delta=self.privacy_args.target_delta, 191 | sampling_probability=self.sampling_probability, 192 | rdp_accountant=self.rdp_accountant, 193 | prv_accountant=self.prv_accountant 194 | ) 195 | super().__init__(model=model, args=args, train_dataset=train_dataset, callbacks=[self.dp_callback], **kwargs) 196 | 197 | self.get_rdp_epsilon = lambda: self.rdp_accountant.get_epsilon(self.privacy_args.target_delta) # RDP epsilon 198 | self.get_prv_epsilon = lambda: self.prv_accountant.compute_epsilon(self.state.global_step)[2] 199 | 200 | @property 201 | def sampling_probability(self) -> float: 202 | return self.train_args.per_device_train_batch_size * self.train_args.world_size * \ 203 | self.train_args.gradient_accumulation_steps / len(self.author_mapping) 204 | 205 | @property 206 | def num_steps(self) -> int: 207 | return int(self.train_args.num_train_epochs * (1 / self.sampling_probability + 1)) 208 | 209 | def create_optimizer(self): 210 | _ = super().create_optimizer() 211 | 212 | if self.args.parallel_mode == training_args.ParallelMode.DISTRIBUTED: 213 | optimizer_generator = opacus.optimizers.DistributedDPOptimizer 214 | else: 215 | optimizer_generator = opacus.optimizers.DPOptimizer 216 | 217 | self.optimizer = optimizer_generator( 218 | optimizer=self.optimizer, 219 | noise_multiplier=self.privacy_args.noise_multiplier, 220 | max_grad_norm=self.privacy_args.per_sample_max_grad_norm, 221 | expected_batch_size=self.args.per_device_train_batch_size * self.args.gradient_accumulation_steps, 222 | ) 223 | 224 | return self.optimizer 225 | 226 | def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: 227 | """ 228 | Perform a training step on a batch of inputs. 229 | 230 | Subclass and override to inject custom behavior. 231 | 232 | Args: 233 | model (:obj:`nn.Module`): 234 | The model to train. 235 | inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): 236 | The inputs and targets of the model. 237 | 238 | The dictionary will be unpacked before being fed to the model. Most models expect the targets under the 239 | argument :obj:`labels`. Check your model's documentation for all accepted arguments. 240 | 241 | Return: 242 | :obj:`torch.Tensor`: The tensor with training loss on this batch. 
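        Note that the returned value is divided by `gradient_accumulation_steps` purely so that
        the Trainer's logging stays comparable to non-DP runs; the gradients themselves are not
        rescaled here because Opacus already accounts for gradient accumulation.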
243 | """ 244 | model.train() 245 | inputs = self._prepare_inputs(inputs) 246 | 247 | if is_sagemaker_mp_enabled(): 248 | raise NotImplementedError("DP currently doesn't support this") 249 | 250 | with self.compute_loss_context_manager(): 251 | loss = self.compute_loss(model, inputs) 252 | 253 | if self.args.n_gpu > 1: 254 | loss = loss.mean() # mean() to average on multi-gpu parallel training 255 | 256 | # Compared to the original HF implementation, we have to remove the loss scaling by the number of gradient 257 | # accumulation steps since opacus scales the gradients accordingly. However, we still need to scale the loss 258 | # that is returned in order for the logging to work correctly. Hence we scale the loss after the call to 259 | # loss.backward() 260 | 261 | if self.use_apex: 262 | raise NotImplementedError("DP currently doesn't support this") 263 | else: 264 | loss.backward() 265 | 266 | return loss.detach()/self.args.gradient_accumulation_steps 267 | 268 | def _get_train_sampler(self): 269 | """ 270 | Provides author sampler. 271 | """ 272 | train_sampler = sampler.ShuffledAuthorSampler( 273 | author_mapping=self.author_mapping, 274 | batch_size=self.args.per_device_train_batch_size, 275 | world_size=self.args.world_size 276 | ) 277 | return train_sampler 278 | 279 | def get_train_dataloader(self) -> DataLoader: 280 | """ 281 | Returns the training :class:`~torch.utils.data.DataLoader`. 282 | 283 | Will use the author-level sampler from dp_transformers. 284 | """ 285 | if self.train_dataset is None: 286 | raise ValueError("Trainer: training requires a train_dataset.") 287 | 288 | train_sampler = self._get_train_sampler() 289 | 290 | train_dataset = self.train_dataset 291 | if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): 292 | train_dataset = self._remove_unused_columns(train_dataset, description="training") 293 | 294 | return DataLoader( 295 | train_dataset, 296 | batch_sampler=train_sampler, 297 | collate_fn=self.data_collator, 298 | drop_last=self.args.dataloader_drop_last, 299 | num_workers=self.args.dataloader_num_workers, 300 | pin_memory=self.args.dataloader_pin_memory, 301 | ) 302 | -------------------------------------------------------------------------------- /src/dp_transformers/grad_sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dp-transformers/f9fae445b1d3bb28355dbaac6720c007abb974ce/src/dp_transformers/grad_sample/__init__.py -------------------------------------------------------------------------------- /src/dp_transformers/grad_sample/transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dp-transformers/f9fae445b1d3bb28355dbaac6720c007abb974ce/src/dp_transformers/grad_sample/transformers/__init__.py -------------------------------------------------------------------------------- /src/dp_transformers/grad_sample/transformers/conv_1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from typing import Dict 5 | 6 | import torch 7 | import torch.nn as nn 8 | from opt_einsum import contract 9 | from typing import List 10 | 11 | from opacus.grad_sample.utils import register_grad_sampler 12 | 13 | from transformers.modeling_utils import Conv1D 14 | 15 | 16 | @register_grad_sampler(Conv1D) 17 | def compute_transformers_conv1d_grad_sample( 18 | layer: Conv1D, activations: List[torch.Tensor], backprops: torch.Tensor 19 | ) -> Dict[nn.Parameter, torch.Tensor]: 20 | activations = activations[0] 21 | ret = {} 22 | if layer.weight.requires_grad: 23 | ret[layer.weight] = contract("n...i,n...j->nji", backprops, activations).contiguous() 24 | if layer.bias is not None and layer.bias.requires_grad: 25 | ret[layer.bias] = contract("n...k->nk", backprops) 26 | return ret 27 | -------------------------------------------------------------------------------- /src/dp_transformers/module_modification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import warnings 5 | import torch 6 | from transformers import GPT2Model, GPT2PreTrainedModel 7 | from typing import List 8 | 9 | 10 | def force_causal_attention(model: GPT2Model): 11 | """ 12 | Force a GPT2 model to use causal attention 13 | 14 | Some variants of GPT2 may use bi-directional attention for the context. 15 | This can cause issues when training in an auto-regressive fashion. This function forces causal attention 16 | """ 17 | if not isinstance(model, GPT2Model): 18 | raise TypeError("Requires a GPT2 model") 19 | 20 | if not hasattr(model, "h") and hasattr(model, "transformer"): 21 | warnings.warn("""It looks like you have a model with a classification or LM head. """ 22 | """If this is the case, pass `model.transformer` to `force_causal_attention` to avoid this warning. """, UserWarning) 23 | transformer = model.transformer 24 | else: 25 | transformer = model 26 | 27 | 28 | for h_i in transformer.h: 29 | h_i.attn.bias = torch.tril(h_i.attn.bias) 30 | 31 | return model 32 | 33 | 34 | -------------------------------------------------------------------------------- /src/dp_transformers/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
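# Illustrative usage sketch (the dataset, collator and column name here are assumptions made
# for the example): the samplers below yield whole batches of indices, so they are passed to a
# DataLoader via `batch_sampler`:
#
#     from torch.utils.data import DataLoader
#     from dp_transformers.dp_utils import create_author_mapping
#     from dp_transformers.sampler import ShuffledAuthorSampler
#
#     author_mapping = create_author_mapping(dataset, author="author")
#     batch_sampler = ShuffledAuthorSampler(author_mapping, batch_size=32, world_size=1)
#     loader = DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=data_collator)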
3 | 4 | import torch 5 | 6 | from typing import Sequence 7 | from torch.utils.data.sampler import Sampler, BatchSampler, SubsetRandomSampler, RandomSampler 8 | from torch.utils.data.distributed import DistributedSampler 9 | from opacus.utils.uniform_sampler import UniformWithReplacementSampler 10 | 11 | from typing import Iterator, List 12 | 13 | class AuthorSampler(Sampler): 14 | def __init__(self, author_sampler: Sampler, author_mapping: Sequence[Sequence[int]]): 15 | self.author_mapping = list(author_mapping) 16 | self.author_sampler = author_sampler 17 | self.indices = [0 for _ in range(len(self.author_mapping))] 18 | 19 | def __len__(self) -> int: 20 | return len(self.author_sampler) 21 | 22 | def __iter__(self) -> Iterator[List[int]]: 23 | for batch_author_ids in self.author_sampler: 24 | sample_ids = [self.indices[author_id] for author_id in batch_author_ids] 25 | for author_id in batch_author_ids: 26 | self.indices[author_id] += 1 27 | self.indices[author_id] = self.indices[author_id] % len(self.author_mapping[author_id]) 28 | yield [int(self.author_mapping[author_id][sample_id]) for author_id, sample_id in zip(batch_author_ids, sample_ids)] 29 | 30 | 31 | class PoissonAuthorSampler(AuthorSampler): 32 | def __init__(self, author_mapping: Sequence[Sequence[int]], sample_rate: float) -> None: 33 | """ 34 | Create batches by first sampling authors with uniform probability and then sampling a random element from each sampled author. 35 | 36 | :param author_mapping: A mapping where `dataset[author_mapping[i][j]]` produces the j-th sample of the i-th author in the dataset. 37 | :type author_mapping: Sequence[Sequence[int]] 38 | :param float sample_rate: Probability with which an author is sampled; the expected batch size is `sample_rate * len(author_mapping)`. 39 | """ 40 | author_sampler = UniformWithReplacementSampler( 41 | num_samples=len(author_mapping), 42 | sample_rate=sample_rate 43 | ) 44 | super().__init__(author_sampler, author_mapping) 45 | 46 | 47 | class ShuffledAuthorSampler(AuthorSampler): 48 | def __init__(self, author_mapping: Sequence[Sequence[int]], batch_size: int, world_size: int) -> None: 49 | """ 50 | Create batches by first shuffling the authors and then sampling the next element from each author. 51 | 52 | :param author_mapping: A mapping where `dataset[author_mapping[i][j]]` produces the j-th sample of the i-th author in the dataset. 53 | :type author_mapping: Sequence[Sequence[int]] 54 | :param int batch_size: Batch size of the output 55 | """ 56 | if world_size <= 1: 57 | author_sampler = BatchSampler(RandomSampler(author_mapping), batch_size=batch_size, drop_last=True) 58 | else: 59 | author_sampler = BatchSampler(DistributedSampler(author_mapping), batch_size=batch_size, drop_last=True) 60 | super().__init__(author_sampler, author_mapping) -------------------------------------------------------------------------------- /tests/test_dp_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. &#13;
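# These tests exercise find_noise_multiplier end to end against the PRV accountant. They can be
# run in isolation with, for example, `pytest tests/test_dp_utils.py`, assuming the package has
# been installed with its `test` extra.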
3 | 4 | from dp_transformers.arguments import find_noise_multiplier 5 | from prv_accountant import Accountant 6 | import pytest 7 | 8 | class TestFindNoiseMultiplier: 9 | def test_sensible_range(self): 10 | mu = find_noise_multiplier(2e-3, 10_000, 4.0, 1e-7) 11 | assert 0 < mu and mu < 2 # Check that mu is in a sensible interval 12 | 13 | def test_inverse(self): 14 | mu = find_noise_multiplier(2e-3, 10_000, 4.0, 1e-7) 15 | acc = Accountant(mu, 2e-3, 1e-7, 10_000, eps_error = 0.5) 16 | eps = acc.compute_epsilon(10_000) 17 | assert eps[2] == pytest.approx(4, abs=0.5) 18 | 19 | def test_robustness(self): 20 | with pytest.warns(None) as record: 21 | mu = find_noise_multiplier( 22 | sampling_probability=256/50_000, 23 | num_steps=int(50*50_000/256), 24 | target_epsilon=10.0, 25 | target_delta=1e-5 26 | ) 27 | assert len(record) == 0 28 | 29 | def test_robustness_2(self): 30 | mu = find_noise_multiplier( 31 | sampling_probability=0.26058631921824105, 32 | num_steps=18800, 33 | target_delta=0.00011448277499759097, 34 | target_epsilon=4.0 35 | ) 36 | # Just test that this doesn't cause a floating point overflow 37 | print(mu) -------------------------------------------------------------------------------- /tests/test_grad_sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/dp-transformers/f9fae445b1d3bb28355dbaac6720c007abb974ce/tests/test_grad_sample/__init__.py -------------------------------------------------------------------------------- /tests/test_grad_sample/test_transformers_conv_1d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import torch 5 | 6 | from opacus.tests.grad_samples.common import GradSampleHooks_test 7 | 8 | from transformers.modeling_utils import Conv1D 9 | 10 | from dp_transformers.grad_sample.transformers import conv_1d 11 | 12 | 13 | class TestConv1D(GradSampleHooks_test): 14 | def test_grad_sample(self): 15 | """ 16 | Verify that our custom implementation of the grad sample for huggingface's Conv1D 17 | layer works. We largely build on the test routines in opacus's library. 18 | """ 19 | x = torch.randn(16, 8) 20 | layer = Conv1D(4, 8) 21 | self.run_test(x, layer, batch_first=True, ew_compatible=False) 22 | 23 | self.run_test(torch.randn(24, 8, 8), Conv1D(4, 8), batch_first=True, ew_compatible=False) 24 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | 6 | from transformers import AutoModelForCausalLM 7 | from opacus.validators import ModuleValidator 8 | from opacus.validators.errors import UnsupportedModuleError 9 | 10 | 11 | @pytest.mark.xfail(reason='functorch can deal with module in Opacus 1.2') 12 | def test_gpt2_grad_sample_layers_registered(): 13 | """ 14 | Test whether all layers in GPT2 are registered in the grad sampler. 
15 | """ 16 | model = AutoModelForCausalLM.from_pretrained("distilgpt2") 17 | model.train() 18 | 19 | validator = ModuleValidator() 20 | 21 | # We haven't registered the grad samples yet so make sure that it actually fails 22 | with pytest.raises(UnsupportedModuleError): 23 | validator.validate(model, strict=True) 24 | 25 | # Register the grad samples 26 | from dp_transformers.grad_sample.transformers import conv_1d 27 | 28 | # Now make sure that it works 29 | validator.validate(model, strict=True) 30 | --------------------------------------------------------------------------------