├── .gitignore
├── LICENSE
├── README.md
├── adapter
    ├── distilbert-movie-review
    │   ├── 1_finetune-last-layers.ipynb
    │   ├── 2_finetune-using-adapter-layers.ipynb
    │   ├── 3_finetune-all-layers.ipynb
    │   ├── 4_finetune-all-layers-and-adapter-layers.ipynb
    │   ├── README.md
    │   ├── figures
    │   │   └── result-summary.png
    │   └── local_dataset_utilities.py
    └── lora-from-scratch
    │   └── lora-dora-mlp.ipynb
├── conventional
    └── distilbert-movie-review
    │   ├── 1_feature-extractor.ipynb
    │   ├── 2_finetune-last-layers.ipynb
    │   ├── 3_finetuning-all-layers.ipynb
    │   ├── figures
    │       ├── 1_feature-based.png
    │       ├── 2_finetune-last.png
    │       └── 3_finetune-all.png
    │   ├── layerwise-experiment
    │       ├── README.md
    │       ├── layerwise-experiment-results-clean.txt
    │       ├── layerwise-experiment-run.py
    │       ├── layerwise-experiment.ipynb
    │       ├── layerwise-experiment.py
    │       ├── layerwise-results.png
    │       ├── local_dataset_utilities.py
    │       └── results.txt
    │   ├── local_dataset_utilities.py
    │   └── mixed-precision-experiment
    │       ├── README.md
    │       ├── bfloat16-mixed-high.py
    │       ├── bfloat16-mixed-medium.py
    │       ├── bfloat16-mixed.py
    │       ├── bfloat16-regular.py
    │       ├── figures
    │           ├── 1.png
    │           ├── 2.png
    │           └── 3.png
    │       ├── float16-mixed-high.py
    │       ├── float16-mixed-medium.py
    │       ├── float16-mixed.py
    │       ├── float16-regular.py
    │       ├── float32-regular-high.py
    │       ├── float32-regular-medium.py
    │       ├── float32-regular.py
    │       ├── float64-regular.py
    │       └── local_dataset_utilities.py
└── lit-benchmarks
    └── falcon-7b
        ├── README.md
        ├── figures
            ├── lit-parrot.png
            ├── memory-requirements.png
            └── training-time.png
        └── finetune
            ├── adapter.py
            ├── adapter_v2.py
            ├── full.py
            └── lora.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | .DS_Store
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | pip-wheel-metadata/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | # pyenv
 87 | .python-version
 88 | 
 89 | # pipenv
 90 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 91 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 92 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 93 | #   install all needed dependencies.
 94 | #Pipfile.lock
 95 | 
 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 97 | __pypackages__/
 98 | 
 99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 | 
103 | # SageMath parsed files
104 | *.sage.py
105 | 
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 | 
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 | 
119 | # Rope project settings
120 | .ropeproject
121 | 
122 | # mkdocs documentation
123 | /site
124 | 
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 | 
130 | # Pyre type checker
131 | .pyre/
132 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LLM-finetuning-scripts


--------------------------------------------------------------------------------
/adapter/distilbert-movie-review/README.md:
--------------------------------------------------------------------------------
1 | # Result Summary
2 | 
3 | 
4 | 
5 | ![result-summary](figures/result-summary.png)


--------------------------------------------------------------------------------
/adapter/distilbert-movie-review/figures/result-summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/adapter/distilbert-movie-review/figures/result-summary.png


--------------------------------------------------------------------------------
/adapter/distilbert-movie-review/local_dataset_utilities.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import tarfile
  4 | import time
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | from packaging import version
  9 | from torch.utils.data import Dataset
 10 | from tqdm import tqdm
 11 | import urllib
 12 | 
 13 | 
 14 | def reporthook(count, block_size, total_size):
 15 |     global start_time
 16 |     if count == 0:
 17 |         start_time = time.time()
 18 |         return
 19 |     duration = time.time() - start_time
 20 |     progress_size = int(count * block_size)
 21 |     speed = progress_size / (1024.0**2 * duration)
 22 |     percent = count * block_size * 100.0 / total_size
 23 | 
 24 |     sys.stdout.write(
 25 |         f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
 26 |         f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
 27 |     )
 28 |     sys.stdout.flush()
 29 | 
 30 | 
 31 | def download_dataset():
 32 |     source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
 33 |     target = "aclImdb_v1.tar.gz"
 34 | 
 35 |     if os.path.exists(target):
 36 |         os.remove(target)
 37 | 
 38 |     if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
 39 |         urllib.request.urlretrieve(source, target, reporthook)
 40 | 
 41 |     if not os.path.isdir("aclImdb"):
 42 | 
 43 |         with tarfile.open(target, "r:gz") as tar:
 44 |             tar.extractall()
 45 | 
 46 | 
 47 | def load_dataset_into_to_dataframe():
 48 |     basepath = "aclImdb"
 49 | 
 50 |     labels = {"pos": 1, "neg": 0}
 51 | 
 52 |     df = pd.DataFrame()
 53 | 
 54 |     with tqdm(total=50000) as pbar:
 55 |         for s in ("test", "train"):
 56 |             for l in ("pos", "neg"):
 57 |                 path = os.path.join(basepath, s, l)
 58 |                 for file in sorted(os.listdir(path)):
 59 |                     with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
 60 |                         txt = infile.read()
 61 | 
 62 |                     if version.parse(pd.__version__) >= version.parse("1.3.2"):
 63 |                         x = pd.DataFrame(
 64 |                             [[txt, labels[l]]], columns=["review", "sentiment"]
 65 |                         )
 66 |                         df = pd.concat([df, x], ignore_index=False)
 67 | 
 68 |                     else:
 69 |                         df = df.append([[txt, labels[l]]], ignore_index=True)
 70 |                     pbar.update()
 71 |     df.columns = ["text", "label"]
 72 | 
 73 |     np.random.seed(0)
 74 |     df = df.reindex(np.random.permutation(df.index))
 75 | 
 76 |     print("Class distribution:")
 77 |     np.bincount(df["label"].values)
 78 | 
 79 |     return df
 80 | 
 81 | 
 82 | def partition_dataset(df):
 83 |     df_shuffled = df.sample(frac=1, random_state=1).reset_index()
 84 | 
 85 |     df_train = df_shuffled.iloc[:35_000]
 86 |     df_val = df_shuffled.iloc[35_000:40_000]
 87 |     df_test = df_shuffled.iloc[40_000:]
 88 | 
 89 |     df_train.to_csv("train.csv", index=False, encoding="utf-8")
 90 |     df_val.to_csv("val.csv", index=False, encoding="utf-8")
 91 |     df_test.to_csv("test.csv", index=False, encoding="utf-8")
 92 | 
 93 | 
 94 | class IMDBDataset(Dataset):
 95 |     def __init__(self, dataset_dict, partition_key="train"):
 96 |         self.partition = dataset_dict[partition_key]
 97 | 
 98 |     def __getitem__(self, index):
 99 |         return self.partition[index]
100 | 
101 |     def __len__(self):
102 |         return self.partition.num_rows


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/1_feature-extractor.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "3c5d72f4",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# LLM as Feature Extractor"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "bb9d0299-8fc0-48f0-9b02-4c19214d479a",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "In this feature-based approach, we are using the embeddings from a pretrained transormer to train a random forest and logistic regression model in scikit-learn:\n",
 17 |     "\n",
 18 |     "<img src=\"figures/1_feature-based.png\" width=500>"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 1,
 24 |    "id": "6fd9cda8",
 25 |    "metadata": {
 26 |     "tags": []
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "# pip install transformers datasets"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 2,
 36 |    "id": "df18e3de-577a-43c5-8b9d-868397a6d7da",
 37 |    "metadata": {
 38 |     "tags": []
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "# conda install sklearn --yes"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 3,
 48 |    "id": "033b75c5",
 49 |    "metadata": {
 50 |     "tags": []
 51 |    },
 52 |    "outputs": [
 53 |     {
 54 |      "name": "stdout",
 55 |      "output_type": "stream",
 56 |      "text": [
 57 |       "torch       : 2.0.0\n",
 58 |       "transformers: 4.27.4\n",
 59 |       "datasets    : 2.11.0\n",
 60 |       "sklearn     : 1.2.2\n",
 61 |       "\n",
 62 |       "conda environment: finetuning-blog\n",
 63 |       "\n"
 64 |      ]
 65 |     }
 66 |    ],
 67 |    "source": [
 68 |     "%load_ext watermark\n",
 69 |     "%watermark --conda -p torch,transformers,datasets,sklearn"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": 4,
 75 |    "id": "602ba8a0",
 76 |    "metadata": {
 77 |     "tags": []
 78 |    },
 79 |    "outputs": [
 80 |     {
 81 |      "name": "stdout",
 82 |      "output_type": "stream",
 83 |      "text": [
 84 |       "cuda:0\n"
 85 |      ]
 86 |     }
 87 |    ],
 88 |    "source": [
 89 |     "import torch\n",
 90 |     "\n",
 91 |     "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
 92 |     "print(device)"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "id": "4cfd724d",
 98 |    "metadata": {
 99 |     "tags": []
100 |    },
101 |    "source": [
102 |     "# 1 Loading the Dataset"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 5,
108 |    "id": "e39e2228-5f0b-4fb9-b762-df26c2052b45",
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": [
112 |     "# pip install datasets\n",
113 |     "\n",
114 |     "import os.path as op\n",
115 |     "\n",
116 |     "from datasets import load_dataset\n",
117 |     "\n",
118 |     "import lightning as L\n",
119 |     "from lightning.pytorch.loggers import CSVLogger\n",
120 |     "from lightning.pytorch.callbacks import ModelCheckpoint\n",
121 |     "\n",
122 |     "import numpy as np\n",
123 |     "import pandas as pd\n",
124 |     "import torch\n",
125 |     "\n",
126 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
127 |     "\n",
128 |     "from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset\n",
129 |     "from local_dataset_utilities import IMDBDataset"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 6,
135 |    "id": "fb31ac90-9e3a-41d0-baf1-8e613043924b",
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "name": "stderr",
140 |      "output_type": "stream",
141 |      "text": [
142 |       "100%|███████████████████████████████████████████| 50000/50000 [00:25<00:00, 1973.05it/s]\n"
143 |      ]
144 |     },
145 |     {
146 |      "name": "stdout",
147 |      "output_type": "stream",
148 |      "text": [
149 |       "Class distribution:\n"
150 |      ]
151 |     }
152 |    ],
153 |    "source": [
154 |     "download_dataset()\n",
155 |     "\n",
156 |     "df = load_dataset_into_to_dataframe()\n",
157 |     "partition_dataset(df)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 7,
163 |    "id": "221f30a1-b433-4304-a18d-8d03abd42b58",
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "df_train = pd.read_csv(\"train.csv\")\n",
168 |     "df_val = pd.read_csv(\"val.csv\")\n",
169 |     "df_test = pd.read_csv(\"test.csv\")"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "markdown",
174 |    "id": "846d83b1",
175 |    "metadata": {},
176 |    "source": [
177 |     "# 2 Tokenization and Numericalization"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 8,
183 |    "id": "21114d27-2697-4132-9714-b259bd63f5a1",
184 |    "metadata": {},
185 |    "outputs": [
186 |     {
187 |      "name": "stdout",
188 |      "output_type": "stream",
189 |      "text": [
190 |       "Downloading and preparing dataset csv/default to /home/sebastian/.cache/huggingface/datasets/csv/default-2417067d5b75d213/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...\n"
191 |      ]
192 |     },
193 |     {
194 |      "data": {
195 |       "application/vnd.jupyter.widget-view+json": {
196 |        "model_id": "0f3dbdca454a4e7d8ebfe80e8e946e7d",
197 |        "version_major": 2,
198 |        "version_minor": 0
199 |       },
200 |       "text/plain": [
201 |        "Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]"
202 |       ]
203 |      },
204 |      "metadata": {},
205 |      "output_type": "display_data"
206 |     },
207 |     {
208 |      "data": {
209 |       "application/vnd.jupyter.widget-view+json": {
210 |        "model_id": "1d913db1678e4636849970ff87653992",
211 |        "version_major": 2,
212 |        "version_minor": 0
213 |       },
214 |       "text/plain": [
215 |        "Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]"
216 |       ]
217 |      },
218 |      "metadata": {},
219 |      "output_type": "display_data"
220 |     },
221 |     {
222 |      "data": {
223 |       "application/vnd.jupyter.widget-view+json": {
224 |        "model_id": "",
225 |        "version_major": 2,
226 |        "version_minor": 0
227 |       },
228 |       "text/plain": [
229 |        "Generating train split: 0 examples [00:00, ? examples/s]"
230 |       ]
231 |      },
232 |      "metadata": {},
233 |      "output_type": "display_data"
234 |     },
235 |     {
236 |      "data": {
237 |       "application/vnd.jupyter.widget-view+json": {
238 |        "model_id": "",
239 |        "version_major": 2,
240 |        "version_minor": 0
241 |       },
242 |       "text/plain": [
243 |        "Generating validation split: 0 examples [00:00, ? examples/s]"
244 |       ]
245 |      },
246 |      "metadata": {},
247 |      "output_type": "display_data"
248 |     },
249 |     {
250 |      "data": {
251 |       "application/vnd.jupyter.widget-view+json": {
252 |        "model_id": "",
253 |        "version_major": 2,
254 |        "version_minor": 0
255 |       },
256 |       "text/plain": [
257 |        "Generating test split: 0 examples [00:00, ? examples/s]"
258 |       ]
259 |      },
260 |      "metadata": {},
261 |      "output_type": "display_data"
262 |     },
263 |     {
264 |      "name": "stdout",
265 |      "output_type": "stream",
266 |      "text": [
267 |       "Dataset csv downloaded and prepared to /home/sebastian/.cache/huggingface/datasets/csv/default-2417067d5b75d213/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.\n"
268 |      ]
269 |     },
270 |     {
271 |      "data": {
272 |       "application/vnd.jupyter.widget-view+json": {
273 |        "model_id": "0afb64f5f1f945248ac5ed71b2bda5d7",
274 |        "version_major": 2,
275 |        "version_minor": 0
276 |       },
277 |       "text/plain": [
278 |        "  0%|          | 0/3 [00:00<?, ?it/s]"
279 |       ]
280 |      },
281 |      "metadata": {},
282 |      "output_type": "display_data"
283 |     },
284 |     {
285 |      "name": "stdout",
286 |      "output_type": "stream",
287 |      "text": [
288 |       "DatasetDict({\n",
289 |       "    train: Dataset({\n",
290 |       "        features: ['index', 'text', 'label'],\n",
291 |       "        num_rows: 35000\n",
292 |       "    })\n",
293 |       "    validation: Dataset({\n",
294 |       "        features: ['index', 'text', 'label'],\n",
295 |       "        num_rows: 5000\n",
296 |       "    })\n",
297 |       "    test: Dataset({\n",
298 |       "        features: ['index', 'text', 'label'],\n",
299 |       "        num_rows: 10000\n",
300 |       "    })\n",
301 |       "})\n"
302 |      ]
303 |     }
304 |    ],
305 |    "source": [
306 |     "imdb_dataset = load_dataset(\n",
307 |     "    \"csv\",\n",
308 |     "    data_files={\n",
309 |     "        \"train\": \"train.csv\",\n",
310 |     "        \"validation\": \"val.csv\",\n",
311 |     "        \"test\": \"test.csv\",\n",
312 |     "    },\n",
313 |     ")\n",
314 |     "\n",
315 |     "print(imdb_dataset)"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": 9,
321 |    "id": "5ea762ba",
322 |    "metadata": {},
323 |    "outputs": [
324 |     {
325 |      "name": "stdout",
326 |      "output_type": "stream",
327 |      "text": [
328 |       "Tokenizer input max length: 512\n",
329 |       "Tokenizer vocabulary size: 30522\n"
330 |      ]
331 |     }
332 |    ],
333 |    "source": [
334 |     "from transformers import AutoTokenizer\n",
335 |     "\n",
336 |     "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
337 |     "print(\"Tokenizer input max length:\", tokenizer.model_max_length)\n",
338 |     "print(\"Tokenizer vocabulary size:\", tokenizer.vocab_size)"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 10,
344 |    "id": "8432c15c",
345 |    "metadata": {},
346 |    "outputs": [],
347 |    "source": [
348 |     "def tokenize_text(batch):\n",
349 |     "    return tokenizer(batch[\"text\"], truncation=True, padding=True)"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": 11,
355 |    "id": "0bb392cf",
356 |    "metadata": {},
357 |    "outputs": [
358 |     {
359 |      "data": {
360 |       "application/vnd.jupyter.widget-view+json": {
361 |        "model_id": "",
362 |        "version_major": 2,
363 |        "version_minor": 0
364 |       },
365 |       "text/plain": [
366 |        "Map:   0%|          | 0/35000 [00:00<?, ? examples/s]"
367 |       ]
368 |      },
369 |      "metadata": {},
370 |      "output_type": "display_data"
371 |     },
372 |     {
373 |      "data": {
374 |       "application/vnd.jupyter.widget-view+json": {
375 |        "model_id": "",
376 |        "version_major": 2,
377 |        "version_minor": 0
378 |       },
379 |       "text/plain": [
380 |        "Map:   0%|          | 0/5000 [00:00<?, ? examples/s]"
381 |       ]
382 |      },
383 |      "metadata": {},
384 |      "output_type": "display_data"
385 |     },
386 |     {
387 |      "data": {
388 |       "application/vnd.jupyter.widget-view+json": {
389 |        "model_id": "",
390 |        "version_major": 2,
391 |        "version_minor": 0
392 |       },
393 |       "text/plain": [
394 |        "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
395 |       ]
396 |      },
397 |      "metadata": {},
398 |      "output_type": "display_data"
399 |     }
400 |    ],
401 |    "source": [
402 |     "imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 12,
408 |    "id": "6d4103c3",
409 |    "metadata": {},
410 |    "outputs": [],
411 |    "source": [
412 |     "del imdb_dataset"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "markdown",
417 |    "id": "bfeb1553",
418 |    "metadata": {},
419 |    "source": [
420 |     "# 3 Using DistilBERT as a Feature Extractor"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "code",
425 |    "execution_count": 13,
426 |    "id": "9f2c474d",
427 |    "metadata": {},
428 |    "outputs": [
429 |     {
430 |      "name": "stderr",
431 |      "output_type": "stream",
432 |      "text": [
433 |       "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']\n",
434 |       "- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
435 |       "- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
436 |      ]
437 |     }
438 |    ],
439 |    "source": [
440 |     "from transformers import AutoModel\n",
441 |     "\n",
442 |     "model = AutoModel.from_pretrained(\"distilbert-base-uncased\")\n",
443 |     "model.to(device);"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": 14,
449 |    "id": "c6686adc",
450 |    "metadata": {},
451 |    "outputs": [],
452 |    "source": [
453 |     "imdb_tokenized.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"])"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": 15,
459 |    "id": "07122e49",
460 |    "metadata": {},
461 |    "outputs": [
462 |     {
463 |      "data": {
464 |       "text/plain": [
465 |        "torch.Size([3, 512, 768])"
466 |       ]
467 |      },
468 |      "execution_count": 15,
469 |      "metadata": {},
470 |      "output_type": "execute_result"
471 |     }
472 |    ],
473 |    "source": [
474 |     "test_batch = {\"attention_mask\": imdb_tokenized[\"train\"][:3][\"attention_mask\"].to(device),\n",
475 |     "              \"input_ids\": imdb_tokenized[\"train\"][:3][\"input_ids\"].to(device)}\n",
476 |     "\n",
477 |     "with torch.inference_mode():\n",
478 |     "    test_output = model(**test_batch)\n",
479 |     "    \n",
480 |     "test_output.last_hidden_state.shape"
481 |    ]
482 |   },
483 |   {
484 |    "cell_type": "code",
485 |    "execution_count": 16,
486 |    "id": "083e61f1",
487 |    "metadata": {},
488 |    "outputs": [
489 |     {
490 |      "data": {
491 |       "text/plain": [
492 |        "torch.Size([3, 768])"
493 |       ]
494 |      },
495 |      "execution_count": 16,
496 |      "metadata": {},
497 |      "output_type": "execute_result"
498 |     }
499 |    ],
500 |    "source": [
501 |     "cls_token_output = test_output.last_hidden_state[:, 0]\n",
502 |     "cls_token_output.shape"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": 17,
508 |    "id": "316d0450",
509 |    "metadata": {},
510 |    "outputs": [],
511 |    "source": [
512 |     "@torch.inference_mode()\n",
513 |     "def get_output_embeddings(batch):\n",
514 |     "    output = model(\n",
515 |     "        batch[\"input_ids\"].to(device),\n",
516 |     "        attention_mask=batch[\"attention_mask\"].to(device)).last_hidden_state[:, 0]\n",
517 |     "    return {\"features\": output.cpu().numpy()}"
518 |    ]
519 |   },
520 |   {
521 |    "cell_type": "code",
522 |    "execution_count": 18,
523 |    "id": "2629aaa3",
524 |    "metadata": {},
525 |    "outputs": [
526 |     {
527 |      "data": {
528 |       "application/vnd.jupyter.widget-view+json": {
529 |        "model_id": "",
530 |        "version_major": 2,
531 |        "version_minor": 0
532 |       },
533 |       "text/plain": [
534 |        "Map:   0%|          | 0/35000 [00:00<?, ? examples/s]"
535 |       ]
536 |      },
537 |      "metadata": {},
538 |      "output_type": "display_data"
539 |     },
540 |     {
541 |      "data": {
542 |       "application/vnd.jupyter.widget-view+json": {
543 |        "model_id": "",
544 |        "version_major": 2,
545 |        "version_minor": 0
546 |       },
547 |       "text/plain": [
548 |        "Map:   0%|          | 0/5000 [00:00<?, ? examples/s]"
549 |       ]
550 |      },
551 |      "metadata": {},
552 |      "output_type": "display_data"
553 |     },
554 |     {
555 |      "data": {
556 |       "application/vnd.jupyter.widget-view+json": {
557 |        "model_id": "",
558 |        "version_major": 2,
559 |        "version_minor": 0
560 |       },
561 |       "text/plain": [
562 |        "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
563 |       ]
564 |      },
565 |      "metadata": {},
566 |      "output_type": "display_data"
567 |     }
568 |    ],
569 |    "source": [
570 |     "import time\n",
571 |     "start = time.time()\n",
572 |     "\n",
573 |     "imdb_features = imdb_tokenized.map(get_output_embeddings, batched=True, batch_size=10)"
574 |    ]
575 |   },
576 |   {
577 |    "cell_type": "code",
578 |    "execution_count": 19,
579 |    "id": "0fe91178",
580 |    "metadata": {},
581 |    "outputs": [],
582 |    "source": [
583 |     "X_train = np.array(imdb_features[\"train\"][\"features\"])\n",
584 |     "y_train = np.array(imdb_features[\"train\"][\"label\"])\n",
585 |     "\n",
586 |     "X_val = np.array(imdb_features[\"validation\"][\"features\"])\n",
587 |     "y_val = np.array(imdb_features[\"validation\"][\"label\"])\n",
588 |     "\n",
589 |     "X_test = np.array(imdb_features[\"test\"][\"features\"])\n",
590 |     "y_test = np.array(imdb_features[\"test\"][\"label\"])"
591 |    ]
592 |   },
593 |   {
594 |    "cell_type": "markdown",
595 |    "id": "e76e2e95-e9b3-4a54-b778-0bdcef59f098",
596 |    "metadata": {},
597 |    "source": [
598 |     "# 4 Train Model on Embeddings (Extracted Features)"
599 |    ]
600 |   },
601 |   {
602 |    "cell_type": "code",
603 |    "execution_count": 20,
604 |    "id": "81c31cf9-ec66-41a9-aa54-3e5b6ca33cf6",
605 |    "metadata": {},
606 |    "outputs": [
607 |     {
608 |      "name": "stdout",
609 |      "output_type": "stream",
610 |      "text": [
611 |       "Training accuracy 0.8866285714285714\n",
612 |       "Validation accuracy 0.883\n",
613 |       "test accuracy 0.8795\n",
614 |       "Time elapsed 3.28 min\n"
615 |      ]
616 |     }
617 |    ],
618 |    "source": [
619 |     "from sklearn.linear_model import LogisticRegression\n",
620 |     "\n",
621 |     "clf = LogisticRegression(max_iter=1000)\n",
622 |     "clf.fit(X_train, y_train)\n",
623 |     "\n",
624 |     "print(\"Training accuracy\", clf.score(X_train, y_train))\n",
625 |     "print(\"Validation accuracy\", clf.score(X_val, y_val))\n",
626 |     "print(\"test accuracy\", clf.score(X_test, y_test))\n",
627 |     "\n",
628 |     "end = time.time()\n",
629 |     "elapsed = end - start\n",
630 |     "print(f\"Time elapsed {elapsed/60:.2f} min\")"
631 |    ]
632 |   },
633 |   {
634 |    "cell_type": "code",
635 |    "execution_count": 21,
636 |    "id": "201a4329-7a91-4501-9c75-4d18f4646fa5",
637 |    "metadata": {},
638 |    "outputs": [
639 |     {
640 |      "name": "stdout",
641 |      "output_type": "stream",
642 |      "text": [
643 |       "Training accuracy 1.0\n",
644 |       "Validation accuracy 0.8408\n",
645 |       "test accuracy 0.8324\n"
646 |      ]
647 |     }
648 |    ],
649 |    "source": [
650 |     "from sklearn.ensemble import RandomForestClassifier\n",
651 |     "\n",
652 |     "clf = RandomForestClassifier()\n",
653 |     "clf.fit(X_train, y_train)\n",
654 |     "\n",
655 |     "print(\"Training accuracy\", clf.score(X_train, y_train))\n",
656 |     "print(\"Validation accuracy\", clf.score(X_val, y_val))\n",
657 |     "print(\"test accuracy\", clf.score(X_test, y_test))"
658 |    ]
659 |   }
660 |  ],
661 |  "metadata": {
662 |   "kernelspec": {
663 |    "display_name": "Python 3 (ipykernel)",
664 |    "language": "python",
665 |    "name": "python3"
666 |   },
667 |   "language_info": {
668 |    "codemirror_mode": {
669 |     "name": "ipython",
670 |     "version": 3
671 |    },
672 |    "file_extension": ".py",
673 |    "mimetype": "text/x-python",
674 |    "name": "python",
675 |    "nbconvert_exporter": "python",
676 |    "pygments_lexer": "ipython3",
677 |    "version": "3.10.6"
678 |   }
679 |  },
680 |  "nbformat": 4,
681 |  "nbformat_minor": 5
682 | }
683 | 


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/figures/1_feature-based.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/figures/1_feature-based.png


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/figures/2_finetune-last.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/figures/2_finetune-last.png


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/figures/3_finetune-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/figures/3_finetune-all.png


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/README.md:
--------------------------------------------------------------------------------
 1 | # Layerwise experiment
 2 | 
 3 | Run
 4 | 
 5 | ```bash
 6 | python layerwise-experiment-run.py
 7 | ```
 8 | 
 9 | to produce the `layerwise-experiment-results-clean.txt` files. The first `Test metric` in each section represents the training accuracy, the second instance the validation accuracy, and the third instance the test accuracy.  The `results.txt` file is a more readable, annotated version. 
10 | 
11 | The results are visualized below (plotting code note included.)
12 | 
13 | ![](layerwise-results.png)


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/layerwise-experiment-results-clean.txt:
--------------------------------------------------------------------------------
  1 | Class distribution:
  2 | Downloading and preparing dataset csv/default to /home/sebastian/.cache/huggingface/datasets/csv/default-8c97c4f49e71f1f6/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...
  3 | Dataset csv downloaded and prepared to /home/sebastian/.cache/huggingface/datasets/csv/default-8c97c4f49e71f1f6/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.
  4 | DatasetDict({
  5 |     train: Dataset({
  6 |         features: ['index', 'text', 'label'],
  7 |         num_rows: 35000
  8 |     })
  9 |     validation: Dataset({
 10 |         features: ['index', 'text', 'label'],
 11 |         num_rows: 5000
 12 |     })
 13 |     test: Dataset({
 14 |         features: ['index', 'text', 'label'],
 15 |         num_rows: 10000
 16 |     })
 17 | })
 18 | Tokenizer input max length: 512
 19 | Tokenizer vocabulary size: 30522
 20 | Training: 0it [00:00, ?it/s]
 21 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
 22 |                                                                           [A
 23 |                                                                           [A
 24 |                                                                           [A
 25 | Time elapsed 6.99 min
 26 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 27 | ┃        Test metric        ┃       DataLoader 0        ┃
 28 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 29 | │         accuracy          │    0.9666571617126465     │
 30 | └───────────────────────────┴───────────────────────────┘
 31 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 32 | ┃        Test metric        ┃       DataLoader 0        ┃
 33 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 34 | │         accuracy          │    0.9301999807357788     │
 35 | └───────────────────────────┴───────────────────────────┘
 36 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 37 | ┃        Test metric        ┃       DataLoader 0        ┃
 38 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 39 | │         accuracy          │    0.9254999756813049     │
 40 | └───────────────────────────┴───────────────────────────┘
 41 | 1 -- Last Layer
 42 | Training: 0it [00:00, ?it/s]
 43 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
 44 |                                                                           [A
 45 |                                                                           [A
 46 |                                                                           [A
 47 | Time elapsed 2.77 min
 48 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 49 | ┃        Test metric        ┃       DataLoader 0        ┃
 50 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 51 | │         accuracy          │    0.7889142632484436     │
 52 | └───────────────────────────┴───────────────────────────┘
 53 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 54 | ┃        Test metric        ┃       DataLoader 0        ┃
 55 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 56 | │         accuracy          │    0.7942000031471252     │
 57 | └───────────────────────────┴───────────────────────────┘
 58 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 59 | ┃        Test metric        ┃       DataLoader 0        ┃
 60 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 61 | │         accuracy          │    0.7871000170707703     │
 62 | └───────────────────────────┴───────────────────────────┘
 63 | 2 -- Last 2 Layers
 64 | Training: 0it [00:00, ?it/s]
 65 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
 66 |                                                                           [A
 67 |                                                                           [A
 68 |                                                                           [A
 69 | Time elapsed 2.78 min
 70 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 71 | ┃        Test metric        ┃       DataLoader 0        ┃
 72 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 73 | │         accuracy          │     0.868228554725647     │
 74 | └───────────────────────────┴───────────────────────────┘
 75 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 76 | ┃        Test metric        ┃       DataLoader 0        ┃
 77 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 78 | │         accuracy          │    0.8712000250816345     │
 79 | └───────────────────────────┴───────────────────────────┘
 80 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 81 | ┃        Test metric        ┃       DataLoader 0        ┃
 82 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 83 | │         accuracy          │    0.8644999861717224     │
 84 | └───────────────────────────┴───────────────────────────┘
 85 | 3 -- Last 2 Layers + Last Tranformer Block
 86 | Training: 0it [00:00, ?it/s]
 87 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
 88 |                                                                           [A
 89 |                                                                           [A
 90 |                                                                           [A
 91 | Time elapsed 3.39 min
 92 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 93 | ┃        Test metric        ┃       DataLoader 0        ┃
 94 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
 95 | │         accuracy          │    0.9498000144958496     │
 96 | └───────────────────────────┴───────────────────────────┘
 97 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 98 | ┃        Test metric        ┃       DataLoader 0        ┃
 99 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
100 | │         accuracy          │    0.9272000193595886     │
101 | └───────────────────────────┴───────────────────────────┘
102 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
103 | ┃        Test metric        ┃       DataLoader 0        ┃
104 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
105 | │         accuracy          │     0.921999990940094     │
106 | └───────────────────────────┴───────────────────────────┘
107 | 4 -- Last 2 Layers + Last 2 Transformer Blocks
108 | Training: 0it [00:00, ?it/s]
109 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
110 |                                                                           [A
111 |                                                                           [A
112 |                                                                           [A
113 | Time elapsed 4.06 min
114 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
115 | ┃        Test metric        ┃       DataLoader 0        ┃
116 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
117 | │         accuracy          │    0.9771142601966858     │
118 | └───────────────────────────┴───────────────────────────┘
119 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
120 | ┃        Test metric        ┃       DataLoader 0        ┃
121 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
122 | │         accuracy          │    0.9300000071525574     │
123 | └───────────────────────────┴───────────────────────────┘
124 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
125 | ┃        Test metric        ┃       DataLoader 0        ┃
126 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
127 | │         accuracy          │    0.9240000247955322     │
128 | └───────────────────────────┴───────────────────────────┘
129 | 5 -- Last 2 Layers + Last 3 Transformer Blocks
130 | Training: 0it [00:00, ?it/s]
131 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
132 |                                                                           [A
133 |                                                                           [A
134 |                                                                           [A
135 | Time elapsed 4.63 min
136 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
137 | ┃        Test metric        ┃       DataLoader 0        ┃
138 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
139 | │         accuracy          │    0.9864857196807861     │
140 | └───────────────────────────┴───────────────────────────┘
141 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
142 | ┃        Test metric        ┃       DataLoader 0        ┃
143 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
144 | │         accuracy          │    0.9333999752998352     │
145 | └───────────────────────────┴───────────────────────────┘
146 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
147 | ┃        Test metric        ┃       DataLoader 0        ┃
148 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
149 | │         accuracy          │    0.9265000224113464     │
150 | └───────────────────────────┴───────────────────────────┘
151 | 6 -- Last 2 Layers + Last 4 Transformer Blocks
152 | Training: 0it [00:00, ?it/s]
153 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
154 |                                                                           [A
155 |                                                                           [A
156 |                                                                           [A
157 | Time elapsed 5.15 min
158 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
159 | ┃        Test metric        ┃       DataLoader 0        ┃
160 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
161 | │         accuracy          │    0.9763428568840027     │
162 | └───────────────────────────┴───────────────────────────┘
163 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
164 | ┃        Test metric        ┃       DataLoader 0        ┃
165 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
166 | │         accuracy          │    0.9279999732971191     │
167 | └───────────────────────────┴───────────────────────────┘
168 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
169 | ┃        Test metric        ┃       DataLoader 0        ┃
170 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
171 | │         accuracy          │    0.9262999892234802     │
172 | └───────────────────────────┴───────────────────────────┘
173 | ## 7 -- Last 2 Layers + Last 5 Transformer Blocks
174 | Training: 0it [00:00, ?it/s]
175 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
176 |                                                                           [A
177 |                                                                           [A
178 |                                                                           [A
179 | Time elapsed 6.99 min
180 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
181 | ┃        Test metric        ┃       DataLoader 0        ┃
182 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
183 | │         accuracy          │    0.9947142601013184     │
184 | └───────────────────────────┴───────────────────────────┘
185 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
186 | ┃        Test metric        ┃       DataLoader 0        ┃
187 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
188 | │         accuracy          │    0.9258000254631042     │
189 | └───────────────────────────┴───────────────────────────┘
190 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
191 | ┃        Test metric        ┃       DataLoader 0        ┃
192 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
193 | │         accuracy          │    0.9251999855041504     │
194 | └───────────────────────────┴───────────────────────────┘
195 | 8 -- Last 2 Layers + Last 6 Transformer Blocks
196 | Training: 0it [00:00, ?it/s]
197 | Training:   0%|          | 0/2917 [00:00<?, ?it/s]
198 |                                                                           [A
199 |                                                                           [A
200 |                                                                           [A
201 | Time elapsed 7.01 min
202 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
203 | ┃        Test metric        ┃       DataLoader 0        ┃
204 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
205 | │         accuracy          │    0.9925428628921509     │
206 | └───────────────────────────┴───────────────────────────┘
207 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
208 | ┃        Test metric        ┃       DataLoader 0        ┃
209 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
210 | │         accuracy          │    0.9277999997138977     │
211 | └───────────────────────────┴───────────────────────────┘
212 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
213 | ┃        Test metric        ┃       DataLoader 0        ┃
214 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
215 | │         accuracy          │    0.9262999892234802     │
216 | └───────────────────────────┴───────────────────────────┘
217 | 


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/layerwise-experiment-run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import subprocess
 3 | 
 4 | with open("layerwise-experiment-results.txt", "w+") as output:
 5 |     subprocess.call(["python", "layerwise-experiment.py"], stdout=output);
 6 | 
 7 | ########
 8 | 
 9 | s = ("Sanity", "Testing", "Validation", "Epoch")
10 | 
11 | out = []
12 | with open("layerwise-experiment-results.txt", "r") as f:
13 |     for line in f.readlines():
14 |         if line.startswith(s) or not line.strip():
15 |             continue
16 | 
17 |         else:
18 |             out.append(line)
19 | with open("layerwise-experiment-results-clean.txt", "w") as f:
20 |     for line in out:
21 |         f.write(line)


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/layerwise-experiment.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | 
  4 | # # Finetuning All Layers
  5 | 
  6 | # <img src="figures/3_finetune-all.png" width=500>
  7 | 
  8 | # In[ ]:
  9 | 
 10 | 
 11 | # pip install transformers
 12 | 
 13 | 
 14 | # In[ ]:
 15 | 
 16 | 
 17 | # pip install datasets
 18 | 
 19 | 
 20 | # In[ ]:
 21 | 
 22 | 
 23 | # pip install lightning
 24 | 
 25 | 
 26 | # In[ ]:
 27 | 
 28 | 
 29 | # get_ipython().run_line_magic('load_ext', 'watermark')
 30 | # get_ipython().run_line_magic('watermark', '--conda -p torch,transformers,datasets,lightning')
 31 | 
 32 | 
 33 | # # 1 Loading the dataset into DataFrames
 34 | 
 35 | # In[ ]:
 36 | 
 37 | 
 38 | # pip install datasets
 39 | 
 40 | import shutil
 41 | 
 42 | from datasets import load_dataset
 43 | 
 44 | import lightning as L
 45 | from lightning.pytorch.loggers import CSVLogger
 46 | from lightning.pytorch.callbacks import ModelCheckpoint
 47 | 
 48 | import numpy as np
 49 | import pandas as pd
 50 | import torch
 51 | 
 52 | from sklearn.feature_extraction.text import CountVectorizer
 53 | 
 54 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 55 | from local_dataset_utilities import IMDBDataset
 56 | 
 57 | 
 58 | # In[ ]:
 59 | 
 60 | 
 61 | download_dataset()
 62 | 
 63 | df = load_dataset_into_to_dataframe()
 64 | partition_dataset(df)
 65 | 
 66 | 
 67 | # In[ ]:
 68 | 
 69 | 
 70 | df_train = pd.read_csv("train.csv")
 71 | df_val = pd.read_csv("val.csv")
 72 | df_test = pd.read_csv("test.csv")
 73 | 
 74 | 
 75 | # # 2 Tokenization and Numericalization
 76 | 
 77 | # **Load the dataset via `load_dataset`**
 78 | 
 79 | # In[ ]:
 80 | 
 81 | 
 82 | imdb_dataset = load_dataset(
 83 |     "csv",
 84 |     data_files={
 85 |         "train": "train.csv",
 86 |         "validation": "val.csv",
 87 |         "test": "test.csv",
 88 |     },
 89 | )
 90 | 
 91 | print(imdb_dataset)
 92 | 
 93 | 
 94 | # **Tokenize the dataset**
 95 | 
 96 | # In[ ]:
 97 | 
 98 | 
 99 | from transformers import AutoTokenizer
100 | 
101 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
102 | print("Tokenizer input max length:", tokenizer.model_max_length)
103 | print("Tokenizer vocabulary size:", tokenizer.vocab_size)
104 | 
105 | 
106 | # In[ ]:
107 | 
108 | 
109 | def tokenize_text(batch):
110 |     return tokenizer(batch["text"], truncation=True, padding=True)
111 | 
112 | 
113 | # In[ ]:
114 | 
115 | 
116 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
117 | 
118 | 
119 | # In[ ]:
120 | 
121 | 
122 | del imdb_dataset
123 | 
124 | 
125 | # In[ ]:
126 | 
127 | 
128 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
129 | 
130 | 
131 | # In[ ]:
132 | 
133 | 
134 | import os
135 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
136 | 
137 | 
138 | # # 3 Set Up DataLoaders
139 | 
140 | # In[ ]:
141 | 
142 | 
143 | from torch.utils.data import DataLoader, Dataset
144 | 
145 | 
146 | class IMDBDataset(Dataset):
147 |     def __init__(self, dataset_dict, partition_key="train"):
148 |         self.partition = dataset_dict[partition_key]
149 | 
150 |     def __getitem__(self, index):
151 |         return self.partition[index]
152 | 
153 |     def __len__(self):
154 |         return self.partition.num_rows
155 | 
156 | 
157 | # In[ ]:
158 | 
159 | 
160 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
161 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
162 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
163 | 
164 | train_loader = DataLoader(
165 |     dataset=train_dataset,
166 |     batch_size=12,
167 |     shuffle=True, 
168 |     num_workers=4
169 | )
170 | 
171 | val_loader = DataLoader(
172 |     dataset=val_dataset,
173 |     batch_size=12,
174 |     num_workers=4
175 | )
176 | 
177 | test_loader = DataLoader(
178 |     dataset=test_dataset,
179 |     batch_size=12,
180 |     num_workers=4
181 | )
182 | 
183 | 
184 | # # 4 Initializing Modules
185 | 
186 | # **Wrap in LightningModule for Training**
187 | 
188 | # In[ ]:
189 | 
190 | 
191 | import lightning as L
192 | import torch
193 | import torchmetrics
194 | 
195 | 
196 | class CustomLightningModule(L.LightningModule):
197 |     def __init__(self, model, learning_rate=5e-5):
198 |         super().__init__()
199 | 
200 |         self.learning_rate = learning_rate
201 |         self.model = model
202 | 
203 |         self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
204 |         self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
205 | 
206 |     def forward(self, input_ids, attention_mask, labels):
207 |         return self.model(input_ids, attention_mask=attention_mask, labels=labels)
208 |         
209 |     def training_step(self, batch, batch_idx):
210 |         outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
211 |                        labels=batch["label"])        
212 |         self.log("train_loss", outputs["loss"])
213 |         return outputs["loss"]  # this is passed to the optimizer for training
214 | 
215 |     def validation_step(self, batch, batch_idx):
216 |         outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
217 |                        labels=batch["label"])        
218 |         self.log("val_loss", outputs["loss"], prog_bar=True)
219 |         
220 |         logits = outputs["logits"]
221 |         predicted_labels = torch.argmax(logits, 1)
222 |         self.val_acc(predicted_labels, batch["label"])
223 |         self.log("val_acc", self.val_acc, prog_bar=True)
224 |         
225 |     def test_step(self, batch, batch_idx):
226 |         outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
227 |                        labels=batch["label"])        
228 |         
229 |         logits = outputs["logits"]
230 |         predicted_labels = torch.argmax(logits, 1)
231 |         self.test_acc(predicted_labels, batch["label"])
232 |         self.log("accuracy", self.test_acc, prog_bar=True)
233 | 
234 |     def configure_optimizers(self):
235 |         optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
236 |         return optimizer
237 | 
238 | 
239 | # In[ ]:
240 | 
241 | 
242 | from lightning.pytorch.callbacks import ModelCheckpoint
243 | from lightning.pytorch.loggers import CSVLogger
244 | 
245 | 
246 | callbacks = [
247 |     ModelCheckpoint(
248 |         save_top_k=1, mode="max", monitor="val_acc"
249 |     )  # save top 1 model
250 | ]
251 | logger = CSVLogger(save_dir="logs/", name="my-model")
252 | 
253 | 
254 | # # 5 Finetuning
255 | 
256 | # ## All layers
257 | 
258 | # In[ ]:
259 | 
260 | 
261 | from transformers import AutoModelForSequenceClassification
262 | 
263 | model = AutoModelForSequenceClassification.from_pretrained(
264 |     "distilbert-base-uncased", num_labels=2)
265 | 
266 | lightning_model = CustomLightningModule(model)
267 | 
268 | 
269 | # In[ ]:
270 | 
271 | 
272 | trainer = L.Trainer(
273 |     max_epochs=3,
274 |     callbacks=callbacks,
275 |     accelerator="gpu",
276 |     precision="16-mixed",
277 |     devices=1,
278 |     logger=logger,
279 |     log_every_n_steps=100,
280 | )
281 | 
282 | 
283 | # In[ ]:
284 | 
285 | 
286 | import time
287 | start = time.time()
288 | 
289 | trainer.fit(model=lightning_model,
290 |             train_dataloaders=train_loader,
291 |             val_dataloaders=val_loader)
292 | 
293 | end = time.time()
294 | elapsed = end - start
295 | print(f"Time elapsed {elapsed/60:.2f} min")
296 | 
297 | 
298 | # In[ ]:
299 | 
300 | 
301 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
302 | 
303 | 
304 | # In[ ]:
305 | 
306 | 
307 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
308 | 
309 | 
310 | # In[ ]:
311 | 
312 | 
313 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
314 | shutil.rmtree("logs")
315 | logger = CSVLogger(save_dir="logs/", name="my-model")
316 | 
317 | 
318 | # ## 1 -- Last Layer
319 | 
320 | # In[ ]:
321 | 
322 | print("1 -- Last Layer")
323 | 
324 | model = AutoModelForSequenceClassification.from_pretrained(
325 |     "distilbert-base-uncased", num_labels=2)
326 | 
327 | lightning_model = CustomLightningModule(model)
328 | 
329 | 
330 | # In[ ]:
331 | 
332 | 
333 | for param in model.parameters():
334 |     param.requires_grad = False
335 |     
336 | for param in model.classifier.parameters():
337 |     param.requires_grad = True
338 | 
339 | 
340 | # In[ ]:
341 | 
342 | 
343 | trainer = L.Trainer(
344 |     max_epochs=3,
345 |     callbacks=callbacks,
346 |     accelerator="gpu",
347 |     precision="16-mixed",
348 |     devices=1,
349 |     logger=logger,
350 |     log_every_n_steps=100,
351 | )
352 | 
353 | 
354 | # In[ ]:
355 | 
356 | 
357 | start = time.time()
358 | 
359 | trainer.fit(model=lightning_model,
360 |             train_dataloaders=train_loader,
361 |             val_dataloaders=val_loader)
362 | 
363 | end = time.time()
364 | elapsed = end - start
365 | print(f"Time elapsed {elapsed/60:.2f} min")
366 | 
367 | 
368 | # In[ ]:
369 | 
370 | 
371 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
372 | 
373 | 
374 | # In[ ]:
375 | 
376 | 
377 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
378 | 
379 | 
380 | # In[ ]:
381 | 
382 | 
383 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
384 | shutil.rmtree("logs")
385 | logger = CSVLogger(save_dir="logs/", name="my-model")
386 | 
387 | 
388 | # ## 2 -- Last 2 Layers
389 | 
390 | # In[ ]:
391 | 
392 | print("2 -- Last 2 Layers")
393 | 
394 | model = AutoModelForSequenceClassification.from_pretrained(
395 |     "distilbert-base-uncased", num_labels=2)
396 | 
397 | lightning_model = CustomLightningModule(model)
398 | 
399 | 
400 | # In[ ]:
401 | 
402 | 
403 | for param in model.parameters():
404 |     param.requires_grad = False
405 |     
406 | for param in model.pre_classifier.parameters():
407 |     param.requires_grad = True
408 |     
409 | for param in model.classifier.parameters():
410 |     param.requires_grad = True
411 | 
412 | 
413 | # In[ ]:
414 | 
415 | 
416 | trainer = L.Trainer(
417 |     max_epochs=3,
418 |     callbacks=callbacks,
419 |     accelerator="gpu",
420 |     precision="16-mixed",
421 |     devices=1,
422 |     logger=logger,
423 |     log_every_n_steps=100,
424 | )
425 | 
426 | 
427 | # In[ ]:
428 | 
429 | 
430 | start = time.time()
431 | 
432 | trainer.fit(model=lightning_model,
433 |             train_dataloaders=train_loader,
434 |             val_dataloaders=val_loader)
435 | 
436 | end = time.time()
437 | elapsed = end - start
438 | print(f"Time elapsed {elapsed/60:.2f} min")
439 | 
440 | 
441 | # In[ ]:
442 | 
443 | 
444 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
445 | 
446 | 
447 | # In[ ]:
448 | 
449 | 
450 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
451 | 
452 | 
453 | # In[ ]:
454 | 
455 | 
456 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
457 | shutil.rmtree("logs")
458 | logger = CSVLogger(save_dir="logs/", name="my-model")
459 | 
460 | 
461 | # ## 3 -- Last 2 Layers + Last Tranformer Block
462 | 
463 | print("3 -- Last 2 Layers + Last Tranformer Block")
464 | 
465 | # In[ ]:
466 | 
467 | 
468 | 
469 | model = AutoModelForSequenceClassification.from_pretrained(
470 |     "distilbert-base-uncased", num_labels=2)
471 | 
472 | lightning_model = CustomLightningModule(model)
473 | 
474 | 
475 | # In[ ]:
476 | 
477 | 
478 | for param in model.parameters():
479 |     param.requires_grad = False
480 |     
481 | for param in model.pre_classifier.parameters():
482 |     param.requires_grad = True
483 |     
484 | for param in model.classifier.parameters():
485 |     param.requires_grad = True
486 | 
487 | for param in model.distilbert.transformer.layer[5].parameters():
488 |     param.requires_grad = True
489 | 
490 | 
491 | # In[ ]:
492 | 
493 | 
494 | trainer = L.Trainer(
495 |     max_epochs=3,
496 |     callbacks=callbacks,
497 |     accelerator="gpu",
498 |     precision="16-mixed",
499 |     devices=1,
500 |     logger=logger,
501 |     log_every_n_steps=100,
502 | )
503 | 
504 | 
505 | # In[ ]:
506 | 
507 | 
508 | start = time.time()
509 | 
510 | trainer.fit(model=lightning_model,
511 |             train_dataloaders=train_loader,
512 |             val_dataloaders=val_loader)
513 | 
514 | end = time.time()
515 | elapsed = end - start
516 | print(f"Time elapsed {elapsed/60:.2f} min")
517 | 
518 | 
519 | # In[ ]:
520 | 
521 | 
522 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
523 | 
524 | 
525 | # In[ ]:
526 | 
527 | 
528 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
529 | 
530 | 
531 | # In[ ]:
532 | 
533 | 
534 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
535 | shutil.rmtree("logs")
536 | logger = CSVLogger(save_dir="logs/", name="my-model")
537 | 
538 | 
539 | # ## 4 -- Last 2 Layers + Last 2 Transformer Blocks
540 | 
541 | # In[ ]:
542 | 
543 | print("4 -- Last 2 Layers + Last 2 Transformer Blocks")
544 | 
545 | model = AutoModelForSequenceClassification.from_pretrained(
546 |     "distilbert-base-uncased", num_labels=2)
547 | 
548 | lightning_model = CustomLightningModule(model)
549 | 
550 | 
551 | # In[ ]:
552 | 
553 | 
554 | for param in model.parameters():
555 |     param.requires_grad = False
556 |     
557 | for param in model.pre_classifier.parameters():
558 |     param.requires_grad = True
559 |     
560 | for param in model.classifier.parameters():
561 |     param.requires_grad = True
562 | 
563 | for param in model.distilbert.transformer.layer[5].parameters():
564 |     param.requires_grad = True
565 |     
566 | for param in model.distilbert.transformer.layer[4].parameters():
567 |     param.requires_grad = True
568 | 
569 | 
570 | # In[ ]:
571 | 
572 | 
573 | trainer = L.Trainer(
574 |     max_epochs=3,
575 |     callbacks=callbacks,
576 |     accelerator="gpu",
577 |     precision="16-mixed",
578 |     devices=1,
579 |     logger=logger,
580 |     log_every_n_steps=100,
581 | )
582 | 
583 | 
584 | # In[ ]:
585 | 
586 | 
587 | start = time.time()
588 | 
589 | trainer.fit(model=lightning_model,
590 |             train_dataloaders=train_loader,
591 |             val_dataloaders=val_loader)
592 | 
593 | end = time.time()
594 | elapsed = end - start
595 | print(f"Time elapsed {elapsed/60:.2f} min")
596 | 
597 | 
598 | # In[ ]:
599 | 
600 | 
601 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
602 | 
603 | 
604 | # In[ ]:
605 | 
606 | 
607 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
608 | 
609 | 
610 | # In[ ]:
611 | 
612 | 
613 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
614 | shutil.rmtree("logs")
615 | logger = CSVLogger(save_dir="logs/", name="my-model")
616 | 
617 | 
618 | # ## 5 -- Last 2 Layers + Last 3 Transformer Blocks
619 | 
620 | # In[ ]:
621 | 
622 | print("5 -- Last 2 Layers + Last 3 Transformer Blocks")
623 | 
624 | model = AutoModelForSequenceClassification.from_pretrained(
625 |     "distilbert-base-uncased", num_labels=2)
626 | 
627 | lightning_model = CustomLightningModule(model)
628 | 
629 | 
630 | # In[ ]:
631 | 
632 | 
633 | for param in model.parameters():
634 |     param.requires_grad = False
635 |     
636 | for param in model.pre_classifier.parameters():
637 |     param.requires_grad = True
638 |     
639 | for param in model.classifier.parameters():
640 |     param.requires_grad = True
641 | 
642 | for param in model.distilbert.transformer.layer[5].parameters():
643 |     param.requires_grad = True
644 |     
645 | for param in model.distilbert.transformer.layer[4].parameters():
646 |     param.requires_grad = True
647 |     
648 | for param in model.distilbert.transformer.layer[3].parameters():
649 |     param.requires_grad = True
650 | 
651 | 
652 | # In[ ]:
653 | 
654 | 
655 | trainer = L.Trainer(
656 |     max_epochs=3,
657 |     callbacks=callbacks,
658 |     accelerator="gpu",
659 |     precision="16-mixed",
660 |     devices=1,
661 |     logger=logger,
662 |     log_every_n_steps=100,
663 | )
664 | 
665 | 
666 | # In[ ]:
667 | 
668 | 
669 | start = time.time()
670 | 
671 | trainer.fit(model=lightning_model,
672 |             train_dataloaders=train_loader,
673 |             val_dataloaders=val_loader)
674 | 
675 | end = time.time()
676 | elapsed = end - start
677 | print(f"Time elapsed {elapsed/60:.2f} min")
678 | 
679 | 
680 | # In[ ]:
681 | 
682 | 
683 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
684 | 
685 | 
686 | # In[ ]:
687 | 
688 | 
689 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
690 | 
691 | 
692 | # In[ ]:
693 | 
694 | 
695 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
696 | shutil.rmtree("logs")
697 | logger = CSVLogger(save_dir="logs/", name="my-model")
698 | 
699 | 
700 | 
701 | ## 6 -- Last 2 Layers + Last 4 Transformer Blocks
702 | 
703 | print("6 -- Last 2 Layers + Last 4 Transformer Blocks")
704 | 
705 | for param in model.parameters():
706 |     param.requires_grad = False
707 |     
708 | for param in model.pre_classifier.parameters():
709 |     param.requires_grad = True
710 |     
711 | for param in model.classifier.parameters():
712 |     param.requires_grad = True
713 | 
714 | for param in model.distilbert.transformer.layer[5].parameters():
715 |     param.requires_grad = True
716 |     
717 | for param in model.distilbert.transformer.layer[4].parameters():
718 |     param.requires_grad = True
719 |     
720 | for param in model.distilbert.transformer.layer[3].parameters():
721 |     param.requires_grad = True
722 |     
723 | for param in model.distilbert.transformer.layer[2].parameters():
724 |     param.requires_grad = True
725 | 
726 | 
727 | # In[ ]:
728 | 
729 | 
730 | trainer = L.Trainer(
731 |     max_epochs=3,
732 |     callbacks=callbacks,
733 |     accelerator="gpu",
734 |     precision="16-mixed",
735 |     devices=1,
736 |     logger=logger,
737 |     log_every_n_steps=100,
738 | )
739 | 
740 | 
741 | # In[ ]:
742 | 
743 | 
744 | start = time.time()
745 | 
746 | trainer.fit(model=lightning_model,
747 |             train_dataloaders=train_loader,
748 |             val_dataloaders=val_loader)
749 | 
750 | end = time.time()
751 | elapsed = end - start
752 | print(f"Time elapsed {elapsed/60:.2f} min")
753 | 
754 | 
755 | # In[ ]:
756 | 
757 | 
758 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
759 | 
760 | 
761 | # In[ ]:
762 | 
763 | 
764 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
765 | 
766 | 
767 | # In[ ]:
768 | 
769 | 
770 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
771 | shutil.rmtree("logs")
772 | logger = CSVLogger(save_dir="logs/", name="my-model")
773 | 
774 | 
775 | # ## 7 -- Last 2 Layers + Last 5 Transformer Blocks
776 | 
777 | # In[ ]:
778 | 
779 | print("## 7 -- Last 2 Layers + Last 5 Transformer Blocks")
780 | 
781 | model = AutoModelForSequenceClassification.from_pretrained(
782 |     "distilbert-base-uncased", num_labels=2)
783 | 
784 | lightning_model = CustomLightningModule(model)
785 | 
786 | 
787 | # In[ ]:
788 | 
789 | 
790 | for param in model.distilbert.transformer.layer[5].parameters():
791 |     param.requires_grad = True
792 |     
793 | for param in model.distilbert.transformer.layer[4].parameters():
794 |     param.requires_grad = True
795 |     
796 | for param in model.distilbert.transformer.layer[3].parameters():
797 |     param.requires_grad = True
798 |     
799 | for param in model.distilbert.transformer.layer[2].parameters():
800 |     param.requires_grad = True
801 | 
802 | for param in model.distilbert.transformer.layer[1].parameters():
803 |     param.requires_grad = True
804 | 
805 | 
806 | # In[ ]:
807 | 
808 | 
809 | trainer = L.Trainer(
810 |     max_epochs=3,
811 |     callbacks=callbacks,
812 |     accelerator="gpu",
813 |     precision="16-mixed",
814 |     devices=1,
815 |     logger=logger,
816 |     log_every_n_steps=100,
817 | )
818 | 
819 | 
820 | # In[ ]:
821 | 
822 | 
823 | start = time.time()
824 | 
825 | trainer.fit(model=lightning_model,
826 |             train_dataloaders=train_loader,
827 |             val_dataloaders=val_loader)
828 | 
829 | end = time.time()
830 | elapsed = end - start
831 | print(f"Time elapsed {elapsed/60:.2f} min")
832 | 
833 | 
834 | # In[ ]:
835 | 
836 | 
837 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
838 | 
839 | 
840 | # In[ ]:
841 | 
842 | 
843 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
844 | 
845 | 
846 | # In[ ]:
847 | 
848 | 
849 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
850 | shutil.rmtree("logs")
851 | logger = CSVLogger(save_dir="logs/", name="my-model")
852 | 
853 | 
854 | # ## 8 -- Last 2 Layers + Last 6 Transformer Blocks
855 | 
856 | # In[ ]:
857 | 
858 | print("8 -- Last 2 Layers + Last 6 Transformer Blocks")
859 | 
860 | model = AutoModelForSequenceClassification.from_pretrained(
861 |     "distilbert-base-uncased", num_labels=2)
862 | 
863 | lightning_model = CustomLightningModule(model)
864 | 
865 | 
866 | # In[ ]:
867 | 
868 | 
869 | for param in model.distilbert.transformer.layer[5].parameters():
870 |     param.requires_grad = True
871 |     
872 | for param in model.distilbert.transformer.layer[4].parameters():
873 |     param.requires_grad = True
874 |     
875 | for param in model.distilbert.transformer.layer[3].parameters():
876 |     param.requires_grad = True
877 |     
878 | for param in model.distilbert.transformer.layer[2].parameters():
879 |     param.requires_grad = True
880 | 
881 | for param in model.distilbert.transformer.layer[1].parameters():
882 |     param.requires_grad = True
883 | 
884 | for param in model.distilbert.transformer.layer[0].parameters():
885 |     param.requires_grad = True
886 | 
887 | 
888 | # In[ ]:
889 | 
890 | 
891 | trainer = L.Trainer(
892 |     max_epochs=3,
893 |     callbacks=callbacks,
894 |     accelerator="gpu",
895 |     precision="16-mixed",
896 |     devices=1,
897 |     logger=logger,
898 |     log_every_n_steps=100,
899 | )
900 | 
901 | 
902 | # In[ ]:
903 | 
904 | 
905 | start = time.time()
906 | 
907 | trainer.fit(model=lightning_model,
908 |             train_dataloaders=train_loader,
909 |             val_dataloaders=val_loader)
910 | 
911 | end = time.time()
912 | elapsed = end - start
913 | print(f"Time elapsed {elapsed/60:.2f} min")
914 | 
915 | 
916 | # In[ ]:
917 | 
918 | 
919 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
920 | 
921 | 
922 | # In[ ]:
923 | 
924 | 
925 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
926 | 
927 | 
928 | # In[ ]:
929 | 
930 | 
931 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
932 | shutil.rmtree("logs")
933 | logger = CSVLogger(save_dir="logs/", name="my-model")
934 | 
935 | 


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/layerwise-results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/layerwise-experiment/layerwise-results.png


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/local_dataset_utilities.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import tarfile
  4 | import time
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | from packaging import version
  9 | from torch.utils.data import Dataset
 10 | from tqdm import tqdm
 11 | import urllib
 12 | 
 13 | 
 14 | def reporthook(count, block_size, total_size):
 15 |     global start_time
 16 |     if count == 0:
 17 |         start_time = time.time()
 18 |         return
 19 |     duration = time.time() - start_time
 20 |     progress_size = int(count * block_size)
 21 |     speed = progress_size / (1024.0**2 * duration)
 22 |     percent = count * block_size * 100.0 / total_size
 23 | 
 24 |     sys.stdout.write(
 25 |         f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
 26 |         f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
 27 |     )
 28 |     sys.stdout.flush()
 29 | 
 30 | 
 31 | def download_dataset():
 32 |     source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
 33 |     target = "aclImdb_v1.tar.gz"
 34 | 
 35 |     if os.path.exists(target):
 36 |         os.remove(target)
 37 | 
 38 |     if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
 39 |         urllib.request.urlretrieve(source, target, reporthook)
 40 | 
 41 |     if not os.path.isdir("aclImdb"):
 42 | 
 43 |         with tarfile.open(target, "r:gz") as tar:
 44 |             tar.extractall()
 45 | 
 46 | 
 47 | def load_dataset_into_to_dataframe():
 48 |     basepath = "aclImdb"
 49 | 
 50 |     labels = {"pos": 1, "neg": 0}
 51 | 
 52 |     df = pd.DataFrame()
 53 | 
 54 |     with tqdm(total=50000) as pbar:
 55 |         for s in ("test", "train"):
 56 |             for l in ("pos", "neg"):
 57 |                 path = os.path.join(basepath, s, l)
 58 |                 for file in sorted(os.listdir(path)):
 59 |                     with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
 60 |                         txt = infile.read()
 61 | 
 62 |                     if version.parse(pd.__version__) >= version.parse("1.3.2"):
 63 |                         x = pd.DataFrame(
 64 |                             [[txt, labels[l]]], columns=["review", "sentiment"]
 65 |                         )
 66 |                         df = pd.concat([df, x], ignore_index=False)
 67 | 
 68 |                     else:
 69 |                         df = df.append([[txt, labels[l]]], ignore_index=True)
 70 |                     pbar.update()
 71 |     df.columns = ["text", "label"]
 72 | 
 73 |     np.random.seed(0)
 74 |     df = df.reindex(np.random.permutation(df.index))
 75 | 
 76 |     print("Class distribution:")
 77 |     np.bincount(df["label"].values)
 78 | 
 79 |     return df
 80 | 
 81 | 
 82 | def partition_dataset(df):
 83 |     df_shuffled = df.sample(frac=1, random_state=1).reset_index()
 84 | 
 85 |     df_train = df_shuffled.iloc[:35_000]
 86 |     df_val = df_shuffled.iloc[35_000:40_000]
 87 |     df_test = df_shuffled.iloc[40_000:]
 88 | 
 89 |     df_train.to_csv("train.csv", index=False, encoding="utf-8")
 90 |     df_val.to_csv("val.csv", index=False, encoding="utf-8")
 91 |     df_test.to_csv("test.csv", index=False, encoding="utf-8")
 92 | 
 93 | 
 94 | class IMDBDataset(Dataset):
 95 |     def __init__(self, dataset_dict, partition_key="train"):
 96 |         self.partition = dataset_dict[partition_key]
 97 | 
 98 |     def __getitem__(self, index):
 99 |         return self.partition[index]
100 | 
101 |     def __len__(self):
102 |         return self.partition.num_rows


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/results.txt:
--------------------------------------------------------------------------------
 1 | ## All layers
 2 | 
 3 | Time elapsed 6.99 min
 4 | 
 5 | Train: 0.9666571617126465
 6 | Val: 0.9301999807357788 
 7 | Test: 0.9254999756813049  
 8 | 
 9 | ## 1 -- Last Layer
10 | 
11 | 2.77 min
12 | 
13 | Train: 0.7889142632484436
14 | Val: 0.7942000031471252  
15 | Test: 0.7871000170707703 
16 | 
17 | ## 2 -- Last 2 Layers	
18 | 
19 | 2.78 min
20 | 
21 | Train: 0.868228554725647 
22 | Val: 0.8712000250816345
23 | Test: 0.8644999861717224
24 | 
25 | 
26 | 
27 | ## 3 -- Last 2 Layers + Last Transformer Block
28 | 
29 | 3.39 min
30 | 
31 | Train: 0.9498000144958496
32 | Val: 0.9272000193595886
33 | Test: 0.921999990940094  
34 | 
35 | 
36 | ## 4 -- Last 2 Layers + Last 2 Transformer Blocks
37 | 
38 | 4.06 min
39 | 
40 | Train: 0.9771142601966858
41 | Val: 0.9300000071525574
42 | Test: 0.9240000247955322
43 | 
44 | ## 5 -- Last 2 Layers + Last 3 Transformer Blocks
45 | 
46 | 4.63 min
47 | 
48 | Train: 0.9864857196807861
49 | Val: 0.9333999752998352
50 | Test: 0.9265000224113464
51 | 
52 | ## 6 -- Last 2 Layers + Last 4 Transformer Blocks
53 | 
54 | 5.15 min
55 | 
56 | Train: 0.9763428568840027
57 | Val: 0.9279999732971191
58 | Test: 0.9262999892234802
59 | 
60 | ## 7 -- Last 2 Layers + Last 5 Transformer Blocks
61 | 
62 | 6.99 min
63 | 
64 | Train: 0.9947142601013184
65 | Val: 0.9258000254631042
66 | Test: 0.9251999855041504
67 | 
68 | ## 8 -- Last 2 Layers + Last 6 Transformer Blocks
69 | 
70 | Train: 0.9925428628921509
71 | Val: 0.9277999997138977
72 | Test: 0.9262999892234802


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/local_dataset_utilities.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import tarfile
  4 | import time
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | from packaging import version
  9 | from torch.utils.data import Dataset
 10 | from tqdm import tqdm
 11 | import urllib
 12 | 
 13 | 
 14 | def reporthook(count, block_size, total_size):
 15 |     global start_time
 16 |     if count == 0:
 17 |         start_time = time.time()
 18 |         return
 19 |     duration = time.time() - start_time
 20 |     progress_size = int(count * block_size)
 21 |     speed = progress_size / (1024.0**2 * duration)
 22 |     percent = count * block_size * 100.0 / total_size
 23 | 
 24 |     sys.stdout.write(
 25 |         f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
 26 |         f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
 27 |     )
 28 |     sys.stdout.flush()
 29 | 
 30 | 
 31 | def download_dataset():
 32 |     source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
 33 |     target = "aclImdb_v1.tar.gz"
 34 | 
 35 |     if os.path.exists(target):
 36 |         os.remove(target)
 37 | 
 38 |     if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
 39 |         urllib.request.urlretrieve(source, target, reporthook)
 40 | 
 41 |     if not os.path.isdir("aclImdb"):
 42 | 
 43 |         with tarfile.open(target, "r:gz") as tar:
 44 |             tar.extractall()
 45 | 
 46 | 
 47 | def load_dataset_into_to_dataframe():
 48 |     basepath = "aclImdb"
 49 | 
 50 |     labels = {"pos": 1, "neg": 0}
 51 | 
 52 |     df = pd.DataFrame()
 53 | 
 54 |     with tqdm(total=50000) as pbar:
 55 |         for s in ("test", "train"):
 56 |             for l in ("pos", "neg"):
 57 |                 path = os.path.join(basepath, s, l)
 58 |                 for file in sorted(os.listdir(path)):
 59 |                     with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
 60 |                         txt = infile.read()
 61 | 
 62 |                     if version.parse(pd.__version__) >= version.parse("1.3.2"):
 63 |                         x = pd.DataFrame(
 64 |                             [[txt, labels[l]]], columns=["review", "sentiment"]
 65 |                         )
 66 |                         df = pd.concat([df, x], ignore_index=False)
 67 | 
 68 |                     else:
 69 |                         df = df.append([[txt, labels[l]]], ignore_index=True)
 70 |                     pbar.update()
 71 |     df.columns = ["text", "label"]
 72 | 
 73 |     np.random.seed(0)
 74 |     df = df.reindex(np.random.permutation(df.index))
 75 | 
 76 |     print("Class distribution:")
 77 |     np.bincount(df["label"].values)
 78 | 
 79 |     return df
 80 | 
 81 | 
 82 | def partition_dataset(df):
 83 |     df_shuffled = df.sample(frac=1, random_state=1).reset_index()
 84 | 
 85 |     df_train = df_shuffled.iloc[:35_000]
 86 |     df_val = df_shuffled.iloc[35_000:40_000]
 87 |     df_test = df_shuffled.iloc[40_000:]
 88 | 
 89 |     df_train.to_csv("train.csv", index=False, encoding="utf-8")
 90 |     df_val.to_csv("val.csv", index=False, encoding="utf-8")
 91 |     df_test.to_csv("test.csv", index=False, encoding="utf-8")
 92 | 
 93 | 
 94 | class IMDBDataset(Dataset):
 95 |     def __init__(self, dataset_dict, partition_key="train"):
 96 |         self.partition = dataset_dict[partition_key]
 97 | 
 98 |     def __getitem__(self, index):
 99 |         return self.partition[index]
100 | 
101 |     def __len__(self):
102 |         return self.partition.num_rows


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/README.md:
--------------------------------------------------------------------------------
  1 | All results from training on a single A100 GPU.
  2 | 
  3 | 
  4 | 
  5 | # Summary
  6 | 
  7 | 
  8 | 
  9 | ![](figures/1.png)
 10 | 
 11 | ![](figures/2.png)
 12 | 
 13 | ![](figures/3.png)
 14 | 
 15 | ---
 16 | 
 17 | # Raw results
 18 | 
 19 | 
 20 | 
 21 | # torch.set_float32_matmul_precision("highest")
 22 | 
 23 | 
 24 | ## Float16-mixed
 25 | 
 26 | ```python
 27 | python float16-mixed.py
 28 | ```
 29 | 
 30 | ```
 31 | Python implementation: CPython
 32 | Python version       : 3.9.16
 33 | 
 34 | torch       : 2.0.0
 35 | lightning   : 2.0.2
 36 | transformers: 4.28.1
 37 | Torch CUDA available? True
 38 | ...
 39 | Epoch: 0002/0003 | Train acc.: 95.19% | Val acc.: 92.55%
 40 | Epoch: 0003/0003 | Batch 0000/2916 | Loss: 0.0083
 41 | Epoch: 0003/0003 | Batch 0300/2916 | Loss: 0.1804
 42 | Epoch: 0003/0003 | Batch 0600/2916 | Loss: 0.0056
 43 | Epoch: 0003/0003 | Batch 0900/2916 | Loss: 0.0197
 44 | Epoch: 0003/0003 | Batch 1200/2916 | Loss: 0.0146
 45 | Epoch: 0003/0003 | Batch 1500/2916 | Loss: 0.0085
 46 | Epoch: 0003/0003 | Batch 1800/2916 | Loss: 0.0166
 47 | Epoch: 0003/0003 | Batch 2100/2916 | Loss: 0.0034
 48 | Epoch: 0003/0003 | Batch 2400/2916 | Loss: 0.0271
 49 | Epoch: 0003/0003 | Batch 2700/2916 | Loss: 0.0537
 50 | Epoch: 0003/0003 | Train acc.: 97.39% | Val acc.: 92.21%
 51 | Time elapsed 7.25 min
 52 | Memory used: 4.31 GB
 53 | Test accuracy 92.15%
 54 | ```
 55 | 
 56 | ## Bfloat16-mixed
 57 | 
 58 | ```python
 59 | python bfloat16-mixed.py
 60 | ```
 61 | 
 62 | ```
 63 | Python implementation: CPython
 64 | Python version       : 3.9.16
 65 | 
 66 | torch       : 2.0.0
 67 | lightning   : 2.0.2
 68 | transformers: 4.28.1
 69 | 
 70 | Torch CUDA available? True
 71 | GPU supports bfloat16: True
 72 | ...
 73 | Time elapsed 7.45 min
 74 | Memory used: 4.46 GB
 75 | Test accuracy 92.61%
 76 | ```
 77 | 
 78 | ## Float16-regular
 79 | 
 80 | ```
 81 | Epoch: 0003/0003 | Batch 2700/2916 | Loss: nan
 82 | Epoch: 0003/0003 | Train acc.: 49.86% | Val acc.: 50.80%
 83 | Time elapsed 5.23 min
 84 | Memory used: 2.87 GB
 85 | Test accuracy 50.08%
 86 | ```
 87 | 
 88 | ## Bfloat16-regular
 89 | 
 90 | ```
 91 | Train acc.: 96.55% | Val acc.: 92.59%
 92 | Time elapsed 5.22 min
 93 | Memory used: 2.87 GB
 94 | Test accuracy 92.69%
 95 | ```
 96 | 
 97 | ## Float32-regular
 98 | 
 99 | ```
100 | Epoch: 0003/0003 | Train acc.: 97.28% | Val acc.: 89.88%
101 | Time elapsed 21.75 min
102 | Memory used: 5.37 GB
103 | Test accuracy 89.92%
104 | ```
105 | 
106 | ## Float64-regular
107 | 
108 | ```
109 | Time elapsed 24.59 min
110 | Memory used: 10.42 GB
111 | Test accuracy 92.14%
112 | ```
113 | 
114 | ---
115 | 
116 | # torch.set_float32_matmul_precision("high")
117 | 
118 | ## float-32
119 | 
120 | ```
121 | Epoch: 0003/0003 | Train acc.: 97.41% | Val acc.: 92.75%
122 | Time elapsed 8.11 min
123 | Memory used: 5.37 GB
124 | Test accuracy 92.50%
125 | ```
126 | 
127 | ## float-16 mixed
128 | 
129 | ```
130 | ...
131 | Time elapsed 7.10 min
132 | Memory used: 4.31 GB
133 | Test accuracy 92.15%
134 | ```
135 | 
136 | ## bfloat-16 mixed
137 | 
138 | 
139 | ```
140 | Time elapsed 7.43 min
141 | Memory used: 4.46 GB
142 | Test accuracy 92.61%
143 | ```
144 | 
145 | ---
146 | 
147 | # torch.set_float32_matmul_precision("medium")
148 | 
149 | ## float-32
150 | 
151 | ```
152 | ...
153 | Epoch: 0003/0003 | Train acc.: 97.41% | Val acc.: 92.75%
154 | Time elapsed 8.14 min
155 | Memory used: 5.37 GB
156 | Test accuracy 92.50%
157 | ```
158 | 
159 | ## float-16 mixed
160 | 
161 | ```
162 | ...
163 | Time elapsed 7.07 min
164 | Memory used: 4.31 GB
165 | Test accuracy 92.15%
166 | 
167 | ```
168 | 
169 | ## bfloat-16 mixed
170 | 
171 | ```
172 | ...
173 | Epoch: 0003/0003 | Train acc.: 97.41% | Val acc.: 92.97%
174 | Time elapsed 7.44 min
175 | Memory used: 4.46 GB
176 | Test accuracy 92.61%
177 | ```


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/bfloat16-mixed-high.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.set_float32_matmul_precision("high")
 80 |     torch.manual_seed(123)
 81 | 
 82 |     ##########################
 83 |     ### 1 Loading the Dataset
 84 |     ##########################
 85 |     download_dataset()
 86 |     df = load_dataset_into_to_dataframe()
 87 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 88 |         partition_dataset(df)
 89 | 
 90 |     imdb_dataset = load_dataset(
 91 |         "csv",
 92 |         data_files={
 93 |             "train": "train.csv",
 94 |             "validation": "val.csv",
 95 |             "test": "test.csv",
 96 |         },
 97 |     )
 98 | 
 99 |     #########################################
100 |     ### 2 Tokenization and Numericalization
101 |     #########################################
102 | 
103 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 | 
107 |     print("Tokenizing ...", flush=True)
108 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 |     del imdb_dataset
110 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 | 
113 |     #########################################
114 |     ### 3 Set Up DataLoaders
115 |     #########################################
116 | 
117 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 | 
121 |     train_loader = DataLoader(
122 |         dataset=train_dataset,
123 |         batch_size=12,
124 |         shuffle=True, 
125 |         num_workers=4,
126 |         drop_last=True,
127 |     )
128 | 
129 |     val_loader = DataLoader(
130 |         dataset=val_dataset,
131 |         batch_size=12,
132 |         num_workers=4,
133 |         drop_last=True,
134 |     )
135 | 
136 |     test_loader = DataLoader(
137 |         dataset=test_dataset,
138 |         batch_size=12,
139 |         num_workers=2,
140 |         drop_last=True,
141 |     )
142 | 
143 | 
144 |     #########################################
145 |     ### 4 Initializing the Model
146 |     #########################################
147 | 
148 |     fabric = Fabric(accelerator="cuda", devices=[4], precision="bf16-mixed")
149 |     fabric.launch()
150 | 
151 |     model = AutoModelForSequenceClassification.from_pretrained(
152 |         "distilbert-base-uncased", num_labels=2)
153 | 
154 |     # model.to(device)
155 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 | 
157 |     model, optimizer = fabric.setup(model, optimizer)
158 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 |     fabric.barrier()
160 | 
161 |     #########################################
162 |     ### 5 Finetuning
163 |     #########################################
164 | 
165 |     start = time.time()
166 |     train(
167 |         num_epochs=3,
168 |         model=model,
169 |         optimizer=optimizer,
170 |         train_loader=train_loader,
171 |         val_loader=val_loader,
172 |         fabric=fabric
173 |     )
174 | 
175 |     end = time.time()
176 |     elapsed = end-start
177 |     print(f"Time elapsed {elapsed/60:.2f} min")
178 | 
179 |     with torch.no_grad():
180 |         model.eval()
181 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 |         for batch in test_loader:
183 | 
184 |             #for s in ["input_ids", "attention_mask", "label"]:
185 |             #    batch[s] = batch[s].to(device)
186 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 |             predicted_labels = torch.argmax(outputs["logits"], 1)
188 |             test_acc.update(predicted_labels, batch["label"])
189 | 
190 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/bfloat16-mixed-medium.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.set_float32_matmul_precision("medium")
 80 |     torch.manual_seed(123)
 81 | 
 82 |     ##########################
 83 |     ### 1 Loading the Dataset
 84 |     ##########################
 85 |     download_dataset()
 86 |     df = load_dataset_into_to_dataframe()
 87 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 88 |         partition_dataset(df)
 89 | 
 90 |     imdb_dataset = load_dataset(
 91 |         "csv",
 92 |         data_files={
 93 |             "train": "train.csv",
 94 |             "validation": "val.csv",
 95 |             "test": "test.csv",
 96 |         },
 97 |     )
 98 | 
 99 |     #########################################
100 |     ### 2 Tokenization and Numericalization
101 |     #########################################
102 | 
103 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 | 
107 |     print("Tokenizing ...", flush=True)
108 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 |     del imdb_dataset
110 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 | 
113 |     #########################################
114 |     ### 3 Set Up DataLoaders
115 |     #########################################
116 | 
117 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 | 
121 |     train_loader = DataLoader(
122 |         dataset=train_dataset,
123 |         batch_size=12,
124 |         shuffle=True, 
125 |         num_workers=4,
126 |         drop_last=True,
127 |     )
128 | 
129 |     val_loader = DataLoader(
130 |         dataset=val_dataset,
131 |         batch_size=12,
132 |         num_workers=4,
133 |         drop_last=True,
134 |     )
135 | 
136 |     test_loader = DataLoader(
137 |         dataset=test_dataset,
138 |         batch_size=12,
139 |         num_workers=2,
140 |         drop_last=True,
141 |     )
142 | 
143 | 
144 |     #########################################
145 |     ### 4 Initializing the Model
146 |     #########################################
147 | 
148 |     fabric = Fabric(accelerator="cuda", devices=[5], precision="bf16-mixed")
149 |     fabric.launch()
150 | 
151 |     model = AutoModelForSequenceClassification.from_pretrained(
152 |         "distilbert-base-uncased", num_labels=2)
153 | 
154 |     # model.to(device)
155 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 | 
157 |     model, optimizer = fabric.setup(model, optimizer)
158 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 |     fabric.barrier()
160 | 
161 |     #########################################
162 |     ### 5 Finetuning
163 |     #########################################
164 | 
165 |     start = time.time()
166 |     train(
167 |         num_epochs=3,
168 |         model=model,
169 |         optimizer=optimizer,
170 |         train_loader=train_loader,
171 |         val_loader=val_loader,
172 |         fabric=fabric
173 |     )
174 | 
175 |     end = time.time()
176 |     elapsed = end-start
177 |     print(f"Time elapsed {elapsed/60:.2f} min")
178 | 
179 |     with torch.no_grad():
180 |         model.eval()
181 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 |         for batch in test_loader:
183 | 
184 |             #for s in ["input_ids", "attention_mask", "label"]:
185 |             #    batch[s] = batch[s].to(device)
186 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 |             predicted_labels = torch.argmax(outputs["logits"], 1)
188 |             test_acc.update(predicted_labels, batch["label"])
189 | 
190 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/bfloat16-mixed.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 |     print("GPU supports bfloat16:", torch.cuda.is_bf16_supported())
 79 | 
 80 |     torch.manual_seed(123)
 81 | 
 82 |     ##########################
 83 |     ### 1 Loading the Dataset
 84 |     ##########################
 85 |     download_dataset()
 86 |     df = load_dataset_into_to_dataframe()
 87 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 88 |         partition_dataset(df)
 89 | 
 90 |     imdb_dataset = load_dataset(
 91 |         "csv",
 92 |         data_files={
 93 |             "train": "train.csv",
 94 |             "validation": "val.csv",
 95 |             "test": "test.csv",
 96 |         },
 97 |     )
 98 | 
 99 |     #########################################
100 |     ### 2 Tokenization and Numericalization
101 |     #########################################
102 | 
103 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 | 
107 |     print("Tokenizing ...", flush=True)
108 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 |     del imdb_dataset
110 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 | 
113 |     #########################################
114 |     ### 3 Set Up DataLoaders
115 |     #########################################
116 | 
117 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 | 
121 |     train_loader = DataLoader(
122 |         dataset=train_dataset,
123 |         batch_size=12,
124 |         shuffle=True, 
125 |         num_workers=4,
126 |         drop_last=True,
127 |     )
128 | 
129 |     val_loader = DataLoader(
130 |         dataset=val_dataset,
131 |         batch_size=12,
132 |         num_workers=4,
133 |         drop_last=True,
134 |     )
135 | 
136 |     test_loader = DataLoader(
137 |         dataset=test_dataset,
138 |         batch_size=12,
139 |         num_workers=2,
140 |         drop_last=True,
141 |     )
142 | 
143 | 
144 |     #########################################
145 |     ### 4 Initializing the Model
146 |     #########################################
147 | 
148 |     fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-mixed")
149 |     fabric.launch()
150 | 
151 |     model = AutoModelForSequenceClassification.from_pretrained(
152 |         "distilbert-base-uncased", num_labels=2)
153 | 
154 |     # model.to(device)
155 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 | 
157 |     model, optimizer = fabric.setup(model, optimizer)
158 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 |     fabric.barrier()
160 | 
161 |     #########################################
162 |     ### 5 Finetuning
163 |     #########################################
164 | 
165 |     start = time.time()
166 |     train(
167 |         num_epochs=3,
168 |         model=model,
169 |         optimizer=optimizer,
170 |         train_loader=train_loader,
171 |         val_loader=val_loader,
172 |         fabric=fabric
173 |     )
174 | 
175 |     end = time.time()
176 |     elapsed = end-start
177 |     print(f"Time elapsed {elapsed/60:.2f} min")
178 | 
179 |     with torch.no_grad():
180 |         model.eval()
181 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 |         for batch in test_loader:
183 | 
184 |             #for s in ["input_ids", "attention_mask", "label"]:
185 |             #    batch[s] = batch[s].to(device)
186 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 |             predicted_labels = torch.argmax(outputs["logits"], 1)
188 |             test_acc.update(predicted_labels, batch["label"])
189 | 
190 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/bfloat16-regular.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.manual_seed(123)
 80 | 
 81 |     ##########################
 82 |     ### 1 Loading the Dataset
 83 |     ##########################
 84 |     download_dataset()
 85 |     df = load_dataset_into_to_dataframe()
 86 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 87 |         partition_dataset(df)
 88 | 
 89 |     imdb_dataset = load_dataset(
 90 |         "csv",
 91 |         data_files={
 92 |             "train": "train.csv",
 93 |             "validation": "val.csv",
 94 |             "test": "test.csv",
 95 |         },
 96 |     )
 97 | 
 98 |     #########################################
 99 |     ### 2 Tokenization and Numericalization
100 |     #########################################
101 | 
102 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 | 
106 |     print("Tokenizing ...", flush=True)
107 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 |     del imdb_dataset
109 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 | 
112 |     #########################################
113 |     ### 3 Set Up DataLoaders
114 |     #########################################
115 | 
116 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 | 
120 |     train_loader = DataLoader(
121 |         dataset=train_dataset,
122 |         batch_size=12,
123 |         shuffle=True, 
124 |         num_workers=4,
125 |         drop_last=True,
126 |     )
127 | 
128 |     val_loader = DataLoader(
129 |         dataset=val_dataset,
130 |         batch_size=12,
131 |         num_workers=4,
132 |         drop_last=True,
133 |     )
134 | 
135 |     test_loader = DataLoader(
136 |         dataset=test_dataset,
137 |         batch_size=12,
138 |         num_workers=2,
139 |         drop_last=True,
140 |     )
141 | 
142 | 
143 |     #########################################
144 |     ### 4 Initializing the Model
145 |     #########################################
146 | 
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-true")
148 |     fabric.launch()
149 | 
150 |     model = AutoModelForSequenceClassification.from_pretrained(
151 |         "distilbert-base-uncased", num_labels=2)
152 | 
153 |     # model.to(device)
154 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 | 
156 |     model, optimizer = fabric.setup(model, optimizer)
157 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 |     fabric.barrier()
159 | 
160 |     #########################################
161 |     ### 5 Finetuning
162 |     #########################################
163 | 
164 |     start = time.time()
165 |     train(
166 |         num_epochs=3,
167 |         model=model,
168 |         optimizer=optimizer,
169 |         train_loader=train_loader,
170 |         val_loader=val_loader,
171 |         fabric=fabric
172 |     )
173 | 
174 |     end = time.time()
175 |     elapsed = end-start
176 |     print(f"Time elapsed {elapsed/60:.2f} min")
177 | 
178 |     with torch.no_grad():
179 |         model.eval()
180 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 |         for batch in test_loader:
182 | 
183 |             #for s in ["input_ids", "attention_mask", "label"]:
184 |             #    batch[s] = batch[s].to(device)
185 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 |             predicted_labels = torch.argmax(outputs["logits"], 1)
187 |             test_acc.update(predicted_labels, batch["label"])
188 | 
189 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/figures/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/mixed-precision-experiment/figures/1.png


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/figures/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/mixed-precision-experiment/figures/2.png


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/figures/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/mixed-precision-experiment/figures/3.png


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float16-mixed-high.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.set_float32_matmul_precision("high")
 80 |     torch.manual_seed(123)
 81 | 
 82 |     ##########################
 83 |     ### 1 Loading the Dataset
 84 |     ##########################
 85 |     download_dataset()
 86 |     df = load_dataset_into_to_dataframe()
 87 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 88 |         partition_dataset(df)
 89 | 
 90 |     imdb_dataset = load_dataset(
 91 |         "csv",
 92 |         data_files={
 93 |             "train": "train.csv",
 94 |             "validation": "val.csv",
 95 |             "test": "test.csv",
 96 |         },
 97 |     )
 98 | 
 99 |     #########################################
100 |     ### 2 Tokenization and Numericalization
101 |     #########################################
102 | 
103 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 | 
107 |     print("Tokenizing ...", flush=True)
108 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 |     del imdb_dataset
110 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 | 
113 |     #########################################
114 |     ### 3 Set Up DataLoaders
115 |     #########################################
116 | 
117 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 | 
121 |     train_loader = DataLoader(
122 |         dataset=train_dataset,
123 |         batch_size=12,
124 |         shuffle=True, 
125 |         num_workers=4,
126 |         drop_last=True,
127 |     )
128 | 
129 |     val_loader = DataLoader(
130 |         dataset=val_dataset,
131 |         batch_size=12,
132 |         num_workers=4,
133 |         drop_last=True,
134 |     )
135 | 
136 |     test_loader = DataLoader(
137 |         dataset=test_dataset,
138 |         batch_size=12,
139 |         num_workers=2,
140 |         drop_last=True,
141 |     )
142 | 
143 | 
144 |     #########################################
145 |     ### 4 Initializing the Model
146 |     #########################################
147 | 
148 |     fabric = Fabric(accelerator="cuda", devices=[4], precision="16-mixed")
149 |     fabric.launch()
150 | 
151 |     model = AutoModelForSequenceClassification.from_pretrained(
152 |         "distilbert-base-uncased", num_labels=2)
153 | 
154 |     # model.to(device)
155 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 | 
157 |     model, optimizer = fabric.setup(model, optimizer)
158 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 |     fabric.barrier()
160 | 
161 |     #########################################
162 |     ### 5 Finetuning
163 |     #########################################
164 | 
165 |     start = time.time()
166 |     train(
167 |         num_epochs=3,
168 |         model=model,
169 |         optimizer=optimizer,
170 |         train_loader=train_loader,
171 |         val_loader=val_loader,
172 |         fabric=fabric
173 |     )
174 | 
175 |     end = time.time()
176 |     elapsed = end-start
177 |     print(f"Time elapsed {elapsed/60:.2f} min")
178 | 
179 |     with torch.no_grad():
180 |         model.eval()
181 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 |         for batch in test_loader:
183 | 
184 |             #for s in ["input_ids", "attention_mask", "label"]:
185 |             #    batch[s] = batch[s].to(device)
186 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 |             predicted_labels = torch.argmax(outputs["logits"], 1)
188 |             test_acc.update(predicted_labels, batch["label"])
189 | 
190 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float16-mixed-medium.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.set_float32_matmul_precision("medium")
 80 |     torch.manual_seed(123)
 81 | 
 82 |     ##########################
 83 |     ### 1 Loading the Dataset
 84 |     ##########################
 85 |     download_dataset()
 86 |     df = load_dataset_into_to_dataframe()
 87 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 88 |         partition_dataset(df)
 89 | 
 90 |     imdb_dataset = load_dataset(
 91 |         "csv",
 92 |         data_files={
 93 |             "train": "train.csv",
 94 |             "validation": "val.csv",
 95 |             "test": "test.csv",
 96 |         },
 97 |     )
 98 | 
 99 |     #########################################
100 |     ### 2 Tokenization and Numericalization
101 |     #########################################
102 | 
103 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 | 
107 |     print("Tokenizing ...", flush=True)
108 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 |     del imdb_dataset
110 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 | 
113 |     #########################################
114 |     ### 3 Set Up DataLoaders
115 |     #########################################
116 | 
117 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 | 
121 |     train_loader = DataLoader(
122 |         dataset=train_dataset,
123 |         batch_size=12,
124 |         shuffle=True, 
125 |         num_workers=4,
126 |         drop_last=True,
127 |     )
128 | 
129 |     val_loader = DataLoader(
130 |         dataset=val_dataset,
131 |         batch_size=12,
132 |         num_workers=4,
133 |         drop_last=True,
134 |     )
135 | 
136 |     test_loader = DataLoader(
137 |         dataset=test_dataset,
138 |         batch_size=12,
139 |         num_workers=2,
140 |         drop_last=True,
141 |     )
142 | 
143 | 
144 |     #########################################
145 |     ### 4 Initializing the Model
146 |     #########################################
147 | 
148 |     fabric = Fabric(accelerator="cuda", devices=[5], precision="16-mixed")
149 |     fabric.launch()
150 | 
151 |     model = AutoModelForSequenceClassification.from_pretrained(
152 |         "distilbert-base-uncased", num_labels=2)
153 | 
154 |     # model.to(device)
155 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 | 
157 |     model, optimizer = fabric.setup(model, optimizer)
158 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 |     fabric.barrier()
160 | 
161 |     #########################################
162 |     ### 5 Finetuning
163 |     #########################################
164 | 
165 |     start = time.time()
166 |     train(
167 |         num_epochs=3,
168 |         model=model,
169 |         optimizer=optimizer,
170 |         train_loader=train_loader,
171 |         val_loader=val_loader,
172 |         fabric=fabric
173 |     )
174 | 
175 |     end = time.time()
176 |     elapsed = end-start
177 |     print(f"Time elapsed {elapsed/60:.2f} min")
178 | 
179 |     with torch.no_grad():
180 |         model.eval()
181 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 |         for batch in test_loader:
183 | 
184 |             #for s in ["input_ids", "attention_mask", "label"]:
185 |             #    batch[s] = batch[s].to(device)
186 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 |             predicted_labels = torch.argmax(outputs["logits"], 1)
188 |             test_acc.update(predicted_labels, batch["label"])
189 | 
190 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float16-mixed.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.manual_seed(123)
 80 | 
 81 |     ##########################
 82 |     ### 1 Loading the Dataset
 83 |     ##########################
 84 |     download_dataset()
 85 |     df = load_dataset_into_to_dataframe()
 86 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 87 |         partition_dataset(df)
 88 | 
 89 |     imdb_dataset = load_dataset(
 90 |         "csv",
 91 |         data_files={
 92 |             "train": "train.csv",
 93 |             "validation": "val.csv",
 94 |             "test": "test.csv",
 95 |         },
 96 |     )
 97 | 
 98 |     #########################################
 99 |     ### 2 Tokenization and Numericalization
100 |     #########################################
101 | 
102 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 | 
106 |     print("Tokenizing ...", flush=True)
107 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 |     del imdb_dataset
109 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 | 
112 |     #########################################
113 |     ### 3 Set Up DataLoaders
114 |     #########################################
115 | 
116 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 | 
120 |     train_loader = DataLoader(
121 |         dataset=train_dataset,
122 |         batch_size=12,
123 |         shuffle=True, 
124 |         num_workers=4,
125 |         drop_last=True,
126 |     )
127 | 
128 |     val_loader = DataLoader(
129 |         dataset=val_dataset,
130 |         batch_size=12,
131 |         num_workers=4,
132 |         drop_last=True,
133 |     )
134 | 
135 |     test_loader = DataLoader(
136 |         dataset=test_dataset,
137 |         batch_size=12,
138 |         num_workers=2,
139 |         drop_last=True,
140 |     )
141 | 
142 | 
143 |     #########################################
144 |     ### 4 Initializing the Model
145 |     #########################################
146 | 
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="16-mixed")
148 |     fabric.launch()
149 | 
150 |     model = AutoModelForSequenceClassification.from_pretrained(
151 |         "distilbert-base-uncased", num_labels=2)
152 | 
153 |     # model.to(device)
154 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 | 
156 |     model, optimizer = fabric.setup(model, optimizer)
157 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 |     fabric.barrier()
159 | 
160 |     #########################################
161 |     ### 5 Finetuning
162 |     #########################################
163 | 
164 |     start = time.time()
165 |     train(
166 |         num_epochs=3,
167 |         model=model,
168 |         optimizer=optimizer,
169 |         train_loader=train_loader,
170 |         val_loader=val_loader,
171 |         fabric=fabric
172 |     )
173 | 
174 |     end = time.time()
175 |     elapsed = end-start
176 |     print(f"Time elapsed {elapsed/60:.2f} min")
177 | 
178 |     with torch.no_grad():
179 |         model.eval()
180 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 |         for batch in test_loader:
182 | 
183 |             #for s in ["input_ids", "attention_mask", "label"]:
184 |             #    batch[s] = batch[s].to(device)
185 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 |             predicted_labels = torch.argmax(outputs["logits"], 1)
187 |             test_acc.update(predicted_labels, batch["label"])
188 | 
189 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float16-regular.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.manual_seed(123)
 80 | 
 81 |     ##########################
 82 |     ### 1 Loading the Dataset
 83 |     ##########################
 84 |     download_dataset()
 85 |     df = load_dataset_into_to_dataframe()
 86 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 87 |         partition_dataset(df)
 88 | 
 89 |     imdb_dataset = load_dataset(
 90 |         "csv",
 91 |         data_files={
 92 |             "train": "train.csv",
 93 |             "validation": "val.csv",
 94 |             "test": "test.csv",
 95 |         },
 96 |     )
 97 | 
 98 |     #########################################
 99 |     ### 2 Tokenization and Numericalization
100 |     #########################################
101 | 
102 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 | 
106 |     print("Tokenizing ...", flush=True)
107 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 |     del imdb_dataset
109 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 | 
112 |     #########################################
113 |     ### 3 Set Up DataLoaders
114 |     #########################################
115 | 
116 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 | 
120 |     train_loader = DataLoader(
121 |         dataset=train_dataset,
122 |         batch_size=12,
123 |         shuffle=True, 
124 |         num_workers=4,
125 |         drop_last=True,
126 |     )
127 | 
128 |     val_loader = DataLoader(
129 |         dataset=val_dataset,
130 |         batch_size=12,
131 |         num_workers=4,
132 |         drop_last=True,
133 |     )
134 | 
135 |     test_loader = DataLoader(
136 |         dataset=test_dataset,
137 |         batch_size=12,
138 |         num_workers=2,
139 |         drop_last=True,
140 |     )
141 | 
142 | 
143 |     #########################################
144 |     ### 4 Initializing the Model
145 |     #########################################
146 | 
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="16-true")
148 |     fabric.launch()
149 | 
150 |     model = AutoModelForSequenceClassification.from_pretrained(
151 |         "distilbert-base-uncased", num_labels=2)
152 | 
153 |     # model.to(device)
154 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 | 
156 |     model, optimizer = fabric.setup(model, optimizer)
157 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 |     fabric.barrier()
159 | 
160 |     #########################################
161 |     ### 5 Finetuning
162 |     #########################################
163 | 
164 |     start = time.time()
165 |     train(
166 |         num_epochs=3,
167 |         model=model,
168 |         optimizer=optimizer,
169 |         train_loader=train_loader,
170 |         val_loader=val_loader,
171 |         fabric=fabric
172 |     )
173 | 
174 |     end = time.time()
175 |     elapsed = end-start
176 |     print(f"Time elapsed {elapsed/60:.2f} min")
177 | 
178 |     with torch.no_grad():
179 |         model.eval()
180 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 |         for batch in test_loader:
182 | 
183 |             #for s in ["input_ids", "attention_mask", "label"]:
184 |             #    batch[s] = batch[s].to(device)
185 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 |             predicted_labels = torch.argmax(outputs["logits"], 1)
187 |             test_acc.update(predicted_labels, batch["label"])
188 | 
189 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float32-regular-high.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.set_float32_matmul_precision("high")
 80 |     torch.manual_seed(123)
 81 | 
 82 |     ##########################
 83 |     ### 1 Loading the Dataset
 84 |     ##########################
 85 |     download_dataset()
 86 |     df = load_dataset_into_to_dataframe()
 87 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 88 |         partition_dataset(df)
 89 | 
 90 |     imdb_dataset = load_dataset(
 91 |         "csv",
 92 |         data_files={
 93 |             "train": "train.csv",
 94 |             "validation": "val.csv",
 95 |             "test": "test.csv",
 96 |         },
 97 |     )
 98 | 
 99 |     #########################################
100 |     ### 2 Tokenization and Numericalization
101 |     #########################################
102 | 
103 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 | 
107 |     print("Tokenizing ...", flush=True)
108 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 |     del imdb_dataset
110 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 | 
113 |     #########################################
114 |     ### 3 Set Up DataLoaders
115 |     #########################################
116 | 
117 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 | 
121 |     train_loader = DataLoader(
122 |         dataset=train_dataset,
123 |         batch_size=12,
124 |         shuffle=True, 
125 |         num_workers=4,
126 |         drop_last=True,
127 |     )
128 | 
129 |     val_loader = DataLoader(
130 |         dataset=val_dataset,
131 |         batch_size=12,
132 |         num_workers=4,
133 |         drop_last=True,
134 |     )
135 | 
136 |     test_loader = DataLoader(
137 |         dataset=test_dataset,
138 |         batch_size=12,
139 |         num_workers=2,
140 |         drop_last=True,
141 |     )
142 | 
143 | 
144 |     #########################################
145 |     ### 4 Initializing the Model
146 |     #########################################
147 | 
148 |     fabric = Fabric(accelerator="cuda", devices=[7], precision="32-true")
149 |     fabric.launch()
150 | 
151 |     model = AutoModelForSequenceClassification.from_pretrained(
152 |         "distilbert-base-uncased", num_labels=2)
153 | 
154 |     # model.to(device)
155 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 | 
157 |     model, optimizer = fabric.setup(model, optimizer)
158 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 |     fabric.barrier()
160 | 
161 |     #########################################
162 |     ### 5 Finetuning
163 |     #########################################
164 | 
165 |     start = time.time()
166 |     train(
167 |         num_epochs=3,
168 |         model=model,
169 |         optimizer=optimizer,
170 |         train_loader=train_loader,
171 |         val_loader=val_loader,
172 |         fabric=fabric
173 |     )
174 | 
175 |     end = time.time()
176 |     elapsed = end-start
177 |     print(f"Time elapsed {elapsed/60:.2f} min")
178 | 
179 |     with torch.no_grad():
180 |         model.eval()
181 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 |         for batch in test_loader:
183 | 
184 |             #for s in ["input_ids", "attention_mask", "label"]:
185 |             #    batch[s] = batch[s].to(device)
186 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 |             predicted_labels = torch.argmax(outputs["logits"], 1)
188 |             test_acc.update(predicted_labels, batch["label"])
189 | 
190 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float32-regular-medium.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.set_float32_matmul_precision("medium")
 80 |     torch.manual_seed(123)
 81 | 
 82 |     ##########################
 83 |     ### 1 Loading the Dataset
 84 |     ##########################
 85 |     download_dataset()
 86 |     df = load_dataset_into_to_dataframe()
 87 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 88 |         partition_dataset(df)
 89 | 
 90 |     imdb_dataset = load_dataset(
 91 |         "csv",
 92 |         data_files={
 93 |             "train": "train.csv",
 94 |             "validation": "val.csv",
 95 |             "test": "test.csv",
 96 |         },
 97 |     )
 98 | 
 99 |     #########################################
100 |     ### 2 Tokenization and Numericalization
101 |     #########################################
102 | 
103 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 | 
107 |     print("Tokenizing ...", flush=True)
108 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 |     del imdb_dataset
110 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 | 
113 |     #########################################
114 |     ### 3 Set Up DataLoaders
115 |     #########################################
116 | 
117 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 | 
121 |     train_loader = DataLoader(
122 |         dataset=train_dataset,
123 |         batch_size=12,
124 |         shuffle=True, 
125 |         num_workers=4,
126 |         drop_last=True,
127 |     )
128 | 
129 |     val_loader = DataLoader(
130 |         dataset=val_dataset,
131 |         batch_size=12,
132 |         num_workers=4,
133 |         drop_last=True,
134 |     )
135 | 
136 |     test_loader = DataLoader(
137 |         dataset=test_dataset,
138 |         batch_size=12,
139 |         num_workers=2,
140 |         drop_last=True,
141 |     )
142 | 
143 | 
144 |     #########################################
145 |     ### 4 Initializing the Model
146 |     #########################################
147 | 
148 |     fabric = Fabric(accelerator="cuda", devices=[6], precision="32-true")
149 |     fabric.launch()
150 | 
151 |     model = AutoModelForSequenceClassification.from_pretrained(
152 |         "distilbert-base-uncased", num_labels=2)
153 | 
154 |     # model.to(device)
155 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 | 
157 |     model, optimizer = fabric.setup(model, optimizer)
158 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 |     fabric.barrier()
160 | 
161 |     #########################################
162 |     ### 5 Finetuning
163 |     #########################################
164 | 
165 |     start = time.time()
166 |     train(
167 |         num_epochs=3,
168 |         model=model,
169 |         optimizer=optimizer,
170 |         train_loader=train_loader,
171 |         val_loader=val_loader,
172 |         fabric=fabric
173 |     )
174 | 
175 |     end = time.time()
176 |     elapsed = end-start
177 |     print(f"Time elapsed {elapsed/60:.2f} min")
178 | 
179 |     with torch.no_grad():
180 |         model.eval()
181 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 |         for batch in test_loader:
183 | 
184 |             #for s in ["input_ids", "attention_mask", "label"]:
185 |             #    batch[s] = batch[s].to(device)
186 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 |             predicted_labels = torch.argmax(outputs["logits"], 1)
188 |             test_acc.update(predicted_labels, batch["label"])
189 | 
190 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float32-regular.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.manual_seed(123)
 80 | 
 81 |     ##########################
 82 |     ### 1 Loading the Dataset
 83 |     ##########################
 84 |     download_dataset()
 85 |     df = load_dataset_into_to_dataframe()
 86 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 87 |         partition_dataset(df)
 88 | 
 89 |     imdb_dataset = load_dataset(
 90 |         "csv",
 91 |         data_files={
 92 |             "train": "train.csv",
 93 |             "validation": "val.csv",
 94 |             "test": "test.csv",
 95 |         },
 96 |     )
 97 | 
 98 |     #########################################
 99 |     ### 2 Tokenization and Numericalization
100 |     #########################################
101 | 
102 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 | 
106 |     print("Tokenizing ...", flush=True)
107 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 |     del imdb_dataset
109 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 | 
112 |     #########################################
113 |     ### 3 Set Up DataLoaders
114 |     #########################################
115 | 
116 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 | 
120 |     train_loader = DataLoader(
121 |         dataset=train_dataset,
122 |         batch_size=12,
123 |         shuffle=True, 
124 |         num_workers=4,
125 |         drop_last=True,
126 |     )
127 | 
128 |     val_loader = DataLoader(
129 |         dataset=val_dataset,
130 |         batch_size=12,
131 |         num_workers=4,
132 |         drop_last=True,
133 |     )
134 | 
135 |     test_loader = DataLoader(
136 |         dataset=test_dataset,
137 |         batch_size=12,
138 |         num_workers=2,
139 |         drop_last=True,
140 |     )
141 | 
142 | 
143 |     #########################################
144 |     ### 4 Initializing the Model
145 |     #########################################
146 | 
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="32-true")
148 |     fabric.launch()
149 | 
150 |     model = AutoModelForSequenceClassification.from_pretrained(
151 |         "distilbert-base-uncased", num_labels=2)
152 | 
153 |     # model.to(device)
154 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 | 
156 |     model, optimizer = fabric.setup(model, optimizer)
157 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 |     fabric.barrier()
159 | 
160 |     #########################################
161 |     ### 5 Finetuning
162 |     #########################################
163 | 
164 |     start = time.time()
165 |     train(
166 |         num_epochs=3,
167 |         model=model,
168 |         optimizer=optimizer,
169 |         train_loader=train_loader,
170 |         val_loader=val_loader,
171 |         fabric=fabric
172 |     )
173 | 
174 |     end = time.time()
175 |     elapsed = end-start
176 |     print(f"Time elapsed {elapsed/60:.2f} min")
177 | 
178 |     with torch.no_grad():
179 |         model.eval()
180 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 |         for batch in test_loader:
182 | 
183 |             #for s in ["input_ids", "attention_mask", "label"]:
184 |             #    batch[s] = batch[s].to(device)
185 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 |             predicted_labels = torch.argmax(outputs["logits"], 1)
187 |             test_acc.update(predicted_labels, batch["label"])
188 | 
189 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float64-regular.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import os.path as op
  4 | import time
  5 | 
  6 | from datasets import load_dataset
  7 | from lightning import Fabric
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | import torchmetrics
 11 | from transformers import AutoTokenizer
 12 | from transformers import AutoModelForSequenceClassification
 13 | from watermark import watermark
 14 | 
 15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
 16 | from local_dataset_utilities import IMDBDataset
 17 | 
 18 | 
 19 | def tokenize_text(batch):
 20 |     return tokenizer(batch["text"], truncation=True, padding=True)
 21 | 
 22 | 
 23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 24 | 
 25 |     for epoch in range(num_epochs):
 26 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 27 | 
 28 |         model.train()
 29 |         for batch_idx, batch in enumerate(train_loader):
 30 |             model.train()
 31 | 
 32 |             # For non-Fabric PyTorch:
 33 |             #for s in ["input_ids", "attention_mask", "label"]:
 34 |             #    batch[s] = batch[s].to(device)
 35 | 
 36 |             ### FORWARD AND BACK PROP   
 37 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"]) 
 38 |             optimizer.zero_grad()
 39 |             
 40 |             # For non-Fabric PyTorch:
 41 |             #outputs["loss"].backward()
 42 |             fabric.backward(outputs["loss"])
 43 | 
 44 |             ### UPDATE MODEL PARAMETERS
 45 |             optimizer.step()
 46 | 
 47 |             ### LOGGING
 48 |             if not batch_idx % 300:
 49 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
 50 | 
 51 |             model.eval()
 52 |             with torch.no_grad():
 53 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 54 |                 train_acc.update(predicted_labels, batch["label"])
 55 | 
 56 |         ### MORE LOGGING
 57 |         model.eval()
 58 |         with torch.no_grad():
 59 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
 60 |             for batch in val_loader:
 61 | 
 62 |                 # For non-Fabric PyTorch:
 63 |                 #for s in ["input_ids", "attention_mask", "label"]:
 64 |                 #    batch[s] = batch[s].to(device)
 65 |                 outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
 66 |                 predicted_labels = torch.argmax(outputs["logits"], 1)
 67 |                 val_acc.update(predicted_labels, batch["label"])
 68 | 
 69 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 70 |             train_acc.reset(), val_acc.reset()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 
 75 |     print(watermark(packages="torch,lightning,transformers", python=True))
 76 |     print("Torch CUDA available?", torch.cuda.is_available())
 77 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 78 | 
 79 |     torch.manual_seed(123)
 80 | 
 81 |     ##########################
 82 |     ### 1 Loading the Dataset
 83 |     ##########################
 84 |     download_dataset()
 85 |     df = load_dataset_into_to_dataframe()
 86 |     if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
 87 |         partition_dataset(df)
 88 | 
 89 |     imdb_dataset = load_dataset(
 90 |         "csv",
 91 |         data_files={
 92 |             "train": "train.csv",
 93 |             "validation": "val.csv",
 94 |             "test": "test.csv",
 95 |         },
 96 |     )
 97 | 
 98 |     #########################################
 99 |     ### 2 Tokenization and Numericalization
100 |     #########################################
101 | 
102 |     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 |     print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 |     print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 | 
106 |     print("Tokenizing ...", flush=True)
107 |     imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 |     del imdb_dataset
109 |     imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 |     os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 | 
112 |     #########################################
113 |     ### 3 Set Up DataLoaders
114 |     #########################################
115 | 
116 |     train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 |     val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 |     test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 | 
120 |     train_loader = DataLoader(
121 |         dataset=train_dataset,
122 |         batch_size=12,
123 |         shuffle=True, 
124 |         num_workers=4,
125 |         drop_last=True,
126 |     )
127 | 
128 |     val_loader = DataLoader(
129 |         dataset=val_dataset,
130 |         batch_size=12,
131 |         num_workers=4,
132 |         drop_last=True,
133 |     )
134 | 
135 |     test_loader = DataLoader(
136 |         dataset=test_dataset,
137 |         batch_size=12,
138 |         num_workers=2,
139 |         drop_last=True,
140 |     )
141 | 
142 | 
143 |     #########################################
144 |     ### 4 Initializing the Model
145 |     #########################################
146 | 
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="64-true")
148 |     fabric.launch()
149 | 
150 |     model = AutoModelForSequenceClassification.from_pretrained(
151 |         "distilbert-base-uncased", num_labels=2)
152 | 
153 |     # model.to(device)
154 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 | 
156 |     model, optimizer = fabric.setup(model, optimizer)
157 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 |     fabric.barrier()
159 | 
160 |     #########################################
161 |     ### 5 Finetuning
162 |     #########################################
163 | 
164 |     start = time.time()
165 |     train(
166 |         num_epochs=3,
167 |         model=model,
168 |         optimizer=optimizer,
169 |         train_loader=train_loader,
170 |         val_loader=val_loader,
171 |         fabric=fabric
172 |     )
173 | 
174 |     end = time.time()
175 |     elapsed = end-start
176 |     print(f"Time elapsed {elapsed/60:.2f} min")
177 | 
178 |     with torch.no_grad():
179 |         model.eval()
180 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 |         for batch in test_loader:
182 | 
183 |             #for s in ["input_ids", "attention_mask", "label"]:
184 |             #    batch[s] = batch[s].to(device)
185 |             outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 |             predicted_labels = torch.argmax(outputs["logits"], 1)
187 |             test_acc.update(predicted_labels, batch["label"])
188 | 
189 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/local_dataset_utilities.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import tarfile
  4 | import time
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | from packaging import version
  9 | from torch.utils.data import Dataset
 10 | from tqdm import tqdm
 11 | import urllib
 12 | 
 13 | 
 14 | def reporthook(count, block_size, total_size):
 15 |     global start_time
 16 |     if count == 0:
 17 |         start_time = time.time()
 18 |         return
 19 |     duration = time.time() - start_time
 20 |     progress_size = int(count * block_size)
 21 |     speed = progress_size / (1024.0**2 * duration)
 22 |     percent = count * block_size * 100.0 / total_size
 23 | 
 24 |     sys.stdout.write(
 25 |         f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
 26 |         f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
 27 |     )
 28 |     sys.stdout.flush()
 29 | 
 30 | 
 31 | def download_dataset():
 32 |     source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
 33 |     target = "aclImdb_v1.tar.gz"
 34 | 
 35 |     if os.path.exists(target):
 36 |         os.remove(target)
 37 | 
 38 |     if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
 39 |         urllib.request.urlretrieve(source, target, reporthook)
 40 | 
 41 |     if not os.path.isdir("aclImdb"):
 42 | 
 43 |         with tarfile.open(target, "r:gz") as tar:
 44 |             tar.extractall()
 45 | 
 46 | 
 47 | def load_dataset_into_to_dataframe():
 48 |     basepath = "aclImdb"
 49 | 
 50 |     labels = {"pos": 1, "neg": 0}
 51 | 
 52 |     df = pd.DataFrame()
 53 | 
 54 |     with tqdm(total=50000) as pbar:
 55 |         for s in ("test", "train"):
 56 |             for l in ("pos", "neg"):
 57 |                 path = os.path.join(basepath, s, l)
 58 |                 for file in sorted(os.listdir(path)):
 59 |                     with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
 60 |                         txt = infile.read()
 61 | 
 62 |                     if version.parse(pd.__version__) >= version.parse("1.3.2"):
 63 |                         x = pd.DataFrame(
 64 |                             [[txt, labels[l]]], columns=["review", "sentiment"]
 65 |                         )
 66 |                         df = pd.concat([df, x], ignore_index=False)
 67 | 
 68 |                     else:
 69 |                         df = df.append([[txt, labels[l]]], ignore_index=True)
 70 |                     pbar.update()
 71 |     df.columns = ["text", "label"]
 72 | 
 73 |     np.random.seed(0)
 74 |     df = df.reindex(np.random.permutation(df.index))
 75 | 
 76 |     print("Class distribution:")
 77 |     np.bincount(df["label"].values)
 78 | 
 79 |     return df
 80 | 
 81 | 
 82 | def partition_dataset(df):
 83 |     df_shuffled = df.sample(frac=1, random_state=1).reset_index()
 84 | 
 85 |     df_train = df_shuffled.iloc[:35_000]
 86 |     df_val = df_shuffled.iloc[35_000:40_000]
 87 |     df_test = df_shuffled.iloc[40_000:]
 88 | 
 89 |     df_train.to_csv("train.csv", index=False, encoding="utf-8")
 90 |     df_val.to_csv("val.csv", index=False, encoding="utf-8")
 91 |     df_test.to_csv("test.csv", index=False, encoding="utf-8")
 92 | 
 93 | 
 94 | class IMDBDataset(Dataset):
 95 |     def __init__(self, dataset_dict, partition_key="train"):
 96 |         self.partition = dataset_dict[partition_key]
 97 | 
 98 |     def __getitem__(self, index):
 99 |         return self.partition[index]
100 | 
101 |     def __len__(self):
102 |         return self.partition.num_rows


--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/README.md:
--------------------------------------------------------------------------------
 1 | # Hyperparameter settings for finetuning Falcon 7B
 2 | 
 3 | 
 4 | 
 5 | These need to be used in combination with the https://github.com/Lightning-AI/lit-parrot repository.
 6 | 
 7 | 
 8 | 
 9 | **Preparing the model and dataset**
10 | 
11 | For this benchmark, we will be using the [Lit-Parrot](https://github.com/Lightning-AI/lit-parrot) open-source library, which provides efficient implementations for training and using various LLMs.
12 | 
13 | ![lit-parrot](figures/lit-parrot.png)
14 | 
15 | Title: The Lit-Parrot repository (https://github.com/Lightning-AI/lit-parrot)
16 | 
17 | 
18 | 
19 | The first  step is to download the model:
20 | 
21 | ```
22 | python scripts/download.py --repo_id tiiuae/falcon-7b
23 | ```
24 | 
25 | (This requires approximately 20 Gb of storage.)
26 | 
27 | 
28 | Second, we convert the weights into a standardized form:
29 | 
30 | ```
31 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/tiiuae/falcon-7b
32 | ```
33 | 
34 | Third, we have to download the dataset. For this example, we will be using the Alpaca dataset [link] consisting of 52 instruction pairs: 
35 | 
36 | ```
37 | python scripts/prepare_alpaca.py
38 | ```
39 | 
40 | (More on using custom datasets later.)
41 | 
42 | **Running the code**
43 | 
44 | Now, we are running the finetuning scripts for the Falcon 7B model. We are going to compare 4 different methods below.  For now, we are going to focus on the finetuning results. And we will discuss how these methods work later in this article.
45 | 
46 | Adapter: 
47 | 
48 | ```python finetune/adapter.py --checkpoint_dir checkpoints/tiiuae/falcon-7b/
49 | python finetune/adapter.py  --checkpoint_dir checkpoints/tiiuae/falcon-7b/
50 | ```
51 | 
52 | Adapter v2: 
53 | 
54 | ```
55 | python finetune/adapter_v2.py  --checkpoint_dir checkpoints/tiiuae/falcon-7b/
56 | ```
57 | 
58 | LoRA: 
59 | 
60 | ```
61 | python finetune/lora.py  --checkpoint_dir checkpoints/tiiuae/falcon-7b/
62 | ```
63 | 
64 | Full finetuning (updating all layers):
65 | 
66 | ```
67 | python finetune/lora.py  --checkpoint_dir checkpoints/tiiuae/falcon-7b/
68 | ```
69 | 
70 | Let's take a look at the time it takes to finetune the LLM first:
71 | 
72 | 
73 | <img src="figures/training-time.png" alt="training-time" style="zoom:25%;" />
74 | 
75 | As we can see in the chart above, using a parameter-efficient finetuning method is about 9 times faster than finetuning all layers ("full"). Moreover, finetuning all layers required 6 GPUs due to memory constraints, whereas **the Adapter methods and LoRA could be used on a single GPU**.
76 | 
77 | So, speaking of GPU memory requirements, the peak memory requirements are plotted below:
78 | 
79 | 
80 | 
81 | <img src="figures/memory-requirements.png" alt="memory-requirements" style="zoom:25%;" />
82 | 
83 | Finetuning all layers of Falcon 7B required ~40 GB on each of the 6 GPUs (here, via tensor sharding using DeepSpeed). So, that's 240 Gb in total. In contrast, the parameter-efficient finetuning methods only required ~16 GB RAM, which allows users to even finetune these models on a single consumer-grade GPU.
84 | 
85 | By the way, note that the memory requirements are directly related to the number of parameters that are required to be updated for each method:
86 | 
87 | - Full finetuning: 7,217,189,760
88 | - Adapter: 1,365,330
89 | - Adapter v2: 3,839,186
90 | - LoRA: 3,506,176
91 | 


--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/figures/lit-parrot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/lit-benchmarks/falcon-7b/figures/lit-parrot.png


--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/figures/memory-requirements.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/lit-benchmarks/falcon-7b/figures/memory-requirements.png


--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/figures/training-time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/lit-benchmarks/falcon-7b/figures/training-time.png


--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/finetune/adapter.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import sys
  4 | import time
  5 | import warnings
  6 | from pathlib import Path
  7 | from typing import Optional
  8 | 
  9 | import lightning as L
 10 | import numpy as np
 11 | import torch
 12 | from lightning.fabric.strategies import DeepSpeedStrategy, XLAStrategy
 13 | 
 14 | # support running without installing as a package
 15 | wd = Path(__file__).parent.parent.resolve()
 16 | sys.path.append(str(wd))
 17 | 
 18 | from generate.base import generate
 19 | from lit_parrot.adapter import Parrot, Config, mark_only_adapter_as_trainable, adapter_state_from_state_dict
 20 | from lit_parrot.tokenizer import Tokenizer
 21 | from lit_parrot.utils import lazy_load, check_valid_checkpoint_dir
 22 | from scripts.prepare_alpaca import generate_prompt
 23 | 
 24 | eval_interval = 600
 25 | save_interval = 1000
 26 | eval_iters = 100
 27 | log_interval = 1
 28 | devices = 1
 29 | 
 30 | # Hyperparameters
 31 | learning_rate = 9e-3
 32 | batch_size = 128 / devices
 33 | micro_batch_size = 1
 34 | gradient_accumulation_steps = batch_size // micro_batch_size
 35 | assert gradient_accumulation_steps > 0
 36 | epoch_size = 52000  # train dataset size
 37 | num_epochs = 1
 38 | max_iters = num_epochs * (epoch_size // micro_batch_size) // devices
 39 | weight_decay = 0.02
 40 | warmup_iters = 2 * (epoch_size // micro_batch_size) // devices  # 2 epochs
 41 | 
 42 | 
 43 | 
 44 | ds_config = {
 45 |     "train_micro_batch_size_per_gpu": micro_batch_size,
 46 |     "gradient_accumulation_steps": gradient_accumulation_steps,
 47 |     "zero_optimization": {"stage": 2},
 48 | }
 49 | 
 50 | 
 51 | def setup(
 52 |     data_dir: Path = Path("data/alpaca"),
 53 |     checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
 54 |     out_dir: Path = Path("out/adapter/alpaca"),
 55 |     precision: Optional[str] = None,
 56 |     tpu: bool = False,
 57 | ):
 58 |     if precision is None:
 59 |         precision = "32-true" if tpu else "bf16-true"
 60 |     strategy = (
 61 |         "auto"
 62 |         if devices <= 1
 63 |         else XLAStrategy(sync_module_states=False) if tpu else DeepSpeedStrategy(config=ds_config)
 64 |     )
 65 |     # For multi-host TPU training, the device count for Fabric is limited to the count on a single host.
 66 |     fabric_devices = "auto" if (tpu and devices > 1) else devices
 67 |     fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision)
 68 |     fabric.launch(main, data_dir, checkpoint_dir, out_dir)
 69 | 
 70 | 
 71 | def main(
 72 |     fabric: L.Fabric = None,
 73 |     data_dir: Path = Path("data/alpaca"),
 74 |     checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
 75 |     out_dir: Path = Path("out/adapter/alpaca"),
 76 | ):
 77 |     check_valid_checkpoint_dir(checkpoint_dir)
 78 |     fabric.seed_everything(1337 + fabric.global_rank)
 79 | 
 80 |     if fabric.global_rank == 0:
 81 |         os.makedirs(out_dir, exist_ok=True)
 82 | 
 83 |     train_data, val_data = load_datasets(data_dir=data_dir)
 84 | 
 85 |     config = Config.from_name(name=checkpoint_dir.name)
 86 |     checkpoint_path = checkpoint_dir / "lit_model.pth"
 87 |     fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
 88 |     with fabric.init_module():
 89 |         model = Parrot(config)
 90 |     with lazy_load(checkpoint_path) as checkpoint:
 91 |         model.load_state_dict(checkpoint, strict=False)
 92 | 
 93 |     mark_only_adapter_as_trainable(model)
 94 | 
 95 |     num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
 96 |     fabric.print(f"Number of trainable parameters: {num_params}")
 97 | 
 98 |     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
 99 |     model, optimizer = fabric.setup(model, optimizer)
100 | 
101 |     train_time = time.time()
102 |     train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir)
103 |     print(f"Training time: {(time.time()-train_time):.2f}s")
104 | 
105 |     # Save the final checkpoint at the end of training
106 |     save_path = out_dir / "lit_model_adapter_finetuned.pth"
107 |     fabric.print(f"Saving adapter weights to {str(save_path)!r}")
108 |     save_model_checkpoint(fabric, model, save_path)
109 | 
110 | 
111 | def train(
112 |     fabric: L.Fabric,
113 |     model: torch.nn.Module,
114 |     optimizer: torch.optim.Optimizer,
115 |     train_data: np.ndarray,
116 |     val_data: np.ndarray,
117 |     checkpoint_dir: Path,
118 |     out_dir: Path,
119 | ) -> None:
120 |     """The training loop.
121 | 
122 |     Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
123 |     """
124 |     step_count = 0
125 | 
126 |     tokenizer = Tokenizer(checkpoint_dir / "tokenizer.json", checkpoint_dir / "tokenizer_config.json")
127 | 
128 |     if fabric.device.type == "xla":
129 |         import torch_xla.core.xla_model as xm
130 | 
131 |         xm.mark_step()
132 |     for iter_num in range(max_iters):
133 |         if step_count <= warmup_iters:
134 |             # linear warmup
135 |             lr = learning_rate * step_count / warmup_iters
136 |             for param_group in optimizer.param_groups:
137 |                 param_group["lr"] = lr
138 | 
139 |         t0 = time.time()
140 | 
141 |         input_ids, targets = get_batch(fabric, train_data)
142 | 
143 |         with fabric.no_backward_sync(model, enabled=((iter_num + 1) % gradient_accumulation_steps != 0)):
144 |             logits = model(input_ids)
145 |             loss = loss_fn(logits, targets)
146 |             fabric.backward(loss / gradient_accumulation_steps)
147 | 
148 |         if (iter_num + 1) % gradient_accumulation_steps == 0:
149 |             optimizer.step()
150 |             if fabric.device.type == "xla":
151 |                 xm.mark_step()
152 |             optimizer.zero_grad()
153 |             step_count += 1
154 | 
155 |             if step_count % eval_interval == 0:
156 |                 val_loss = validate(fabric, model, val_data, tokenizer)
157 |                 fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
158 |                 fabric.barrier()
159 | 
160 |             if step_count % save_interval == 0:
161 |                 save_path = out_dir / f"iter-{iter_num:06d}.pth"
162 |                 fabric.print(f"Saving adapter weights to {str(save_path)!r}")
163 |                 # TODO: Provide a function/script to merge the adapter weights with pretrained weights
164 |                 save_model_checkpoint(fabric, model, save_path)
165 |         else:
166 |             if fabric.device.type == "xla":
167 |                 xm.mark_step()
168 | 
169 |         dt = time.time() - t0
170 |         if iter_num % log_interval == 0:
171 |             fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
172 | 
173 | 
174 | @torch.no_grad()
175 | def validate(
176 |     fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray, tokenizer: Tokenizer
177 | ) -> torch.Tensor:
178 |     fabric.print("Validating ...")
179 |     model.eval()
180 |     losses = torch.zeros(eval_iters)
181 |     for k in range(eval_iters):
182 |         input_ids, targets = get_batch(fabric, val_data)
183 |         logits = model(input_ids)
184 |         loss = loss_fn(logits, targets)
185 |         losses[k] = loss.item()
186 |     val_loss = losses.mean()
187 | 
188 |     # produce an example:
189 |     instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
190 |     fabric.print(instruction)
191 |     sample = {"instruction": instruction, "input": ""}
192 |     prompt = generate_prompt(sample)
193 |     encoded = tokenizer.encode(prompt, device=model.device)
194 |     output = generate(
195 |         model, idx=encoded, max_returned_tokens=len(encoded) + 100, max_seq_length=model.config.block_size, temperature=0.8
196 |     )
197 |     output = tokenizer.decode(output)
198 |     fabric.print(output)
199 | 
200 |     model.train()
201 |     return val_loss.item()
202 | 
203 | 
204 | def loss_fn(logits, targets):
205 |     # shift the targets such that output n predicts token n+1
206 |     logits = logits[..., :-1, :].contiguous()
207 |     targets = targets[..., 1:].contiguous()
208 |     loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
209 |     return loss
210 | 
211 | 
212 | def get_batch(fabric: L.Fabric, data: list):
213 |     ix = torch.randint(len(data), (micro_batch_size,))
214 | 
215 |     input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
216 |     labels = [data[i]["labels"].type(torch.int64) for i in ix]
217 | 
218 |     max_len = max(len(s) for s in input_ids) if fabric.device.type != "xla" else max_seq_length
219 | 
220 |     def pad_right(x, pad_id):
221 |         # pad right based on the longest sequence
222 |         n = max_len - len(x)
223 |         return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
224 | 
225 |     x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
226 |     y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
227 | 
228 |     if fabric.device.type in ("mps", "xla"):
229 |         x, y = fabric.to_device((x, y))
230 |     else:
231 |         x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
232 | 
233 |     return x, y
234 | 
235 | 
236 | def load_datasets(data_dir: Path):
237 |     train_data = torch.load(data_dir / "train.pt")
238 |     val_data = torch.load(data_dir / "test.pt")
239 |     return train_data, val_data
240 | 
241 | 
242 | def save_model_checkpoint(fabric, model, file_path: Path):
243 |     file_path = Path(file_path)
244 | 
245 |     if isinstance(fabric.strategy, DeepSpeedStrategy):
246 |         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
247 | 
248 |         tmp_path = file_path.with_suffix(".tmp")
249 |         fabric.save(tmp_path, {"model": model})
250 |         fabric.barrier()
251 |         if fabric.global_rank == 0:
252 |             # Create a consolidated checkpoint with the same name next to the deepspeed checkpoint
253 |             # and only keep the adapter weights
254 |             state_dict = get_fp32_state_dict_from_zero_checkpoint(tmp_path)
255 |             state_dict = adapter_state_from_state_dict(state_dict)
256 |             torch.save(state_dict, file_path)
257 |             shutil.rmtree(tmp_path)
258 |     else:
259 |         state_dict = adapter_state_from_state_dict(model.state_dict())
260 |         if fabric.global_rank == 0:
261 |             torch.save(state_dict, file_path)
262 |         fabric.barrier()
263 | 
264 | 
265 | if __name__ == "__main__":
266 |     # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
267 |     # torch.backends.cuda.enable_flash_sdp(False)
268 |     torch.set_float32_matmul_precision("high")
269 | 
270 |     from jsonargparse.cli import CLI
271 | 
272 |     warnings.filterwarnings(
273 |         # false positive using deepspeed: https://github.com/Lightning-AI/lightning/pull/17761#discussion_r1219705307
274 |         "ignore",
275 |         message="Remove `.no_backward_sync()` from your code",
276 |     )
277 | 
278 | 
279 |     import datetime
280 |     started = datetime.datetime.now()
281 | 
282 |     CLI(setup)
283 | 
284 |     now = datetime.datetime.now()
285 |     print("started:", started)
286 |     print("finished:", now)
287 | 
288 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)


--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/finetune/adapter_v2.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import sys
  4 | import time
  5 | import warnings
  6 | from pathlib import Path
  7 | from typing import Optional
  8 | 
  9 | import lightning as L
 10 | import numpy as np
 11 | import torch
 12 | from lightning.fabric.strategies import DeepSpeedStrategy, XLAStrategy
 13 | 
 14 | # support running without installing as a package
 15 | wd = Path(__file__).parent.parent.resolve()
 16 | sys.path.append(str(wd))
 17 | 
 18 | from generate.base import generate
 19 | from lit_parrot.adapter import Parrot, Config
 20 | from lit_parrot.adapter_v2 import (
 21 |     mark_only_adapter_v2_as_trainable,
 22 |     add_adapter_v2_parameters_to_linear_layers,
 23 |     adapter_v2_state_from_state_dict,
 24 | )
 25 | from lit_parrot.tokenizer import Tokenizer
 26 | from lit_parrot.utils import lazy_load, check_valid_checkpoint_dir
 27 | from scripts.prepare_alpaca import generate_prompt
 28 | 
 29 | eval_interval = 600
 30 | save_interval = 1000
 31 | eval_iters = 100
 32 | log_interval = 1
 33 | devices = 1
 34 | 
 35 | # Hyperparameters
 36 | learning_rate = 9e-3
 37 | batch_size = 128 / devices
 38 | micro_batch_size = 1  # set to 2 because this is fit into 12GB Vram
 39 | gradient_accumulation_iters = batch_size // micro_batch_size
 40 | assert gradient_accumulation_iters > 0
 41 | epoch_size = 52000  # train dataset size
 42 | num_epochs = 1
 43 | max_iters = num_epochs * (epoch_size // micro_batch_size) // devices
 44 | weight_decay = 0.02
 45 | warmup_iters = 2 * (epoch_size // micro_batch_size) // devices  # 2 epochs
 46 | 
 47 | ds_config = {
 48 |     "train_micro_batch_size_per_gpu": micro_batch_size,
 49 |     "gradient_accumulation_steps": gradient_accumulation_iters,
 50 |     "zero_optimization": {"stage": 2},
 51 | }
 52 | 
 53 | 
 54 | def setup(
 55 |     data_dir: Path = Path("data/alpaca"),
 56 |     checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
 57 |     out_dir: Path = Path("out/adapter_v2/alpaca"),
 58 |     precision: Optional[str] = None,
 59 |     tpu: bool = False,
 60 | ):
 61 |     if precision is None:
 62 |         precision = "32-true" if tpu else "bf16-true"
 63 |     strategy = (
 64 |         "auto"
 65 |         if devices <= 1
 66 |         else XLAStrategy(sync_module_states=False) if tpu else DeepSpeedStrategy(config=ds_config)
 67 |     )
 68 |     # For multi-host TPU training, the device count for Fabric is limited to the count on a single host.
 69 |     fabric_devices = "auto" if (tpu and devices > 1) else devices
 70 |     fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision)
 71 |     fabric.launch(main, data_dir, checkpoint_dir, out_dir)
 72 | 
 73 | 
 74 | def main(
 75 |     fabric: L.Fabric = None,
 76 |     data_dir: Path = Path("data/alpaca"),
 77 |     checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
 78 |     out_dir: Path = Path("out/adapter_v2/alpaca"),
 79 | ):
 80 |     check_valid_checkpoint_dir(checkpoint_dir)
 81 |     fabric.seed_everything(1337 + fabric.global_rank)
 82 | 
 83 |     if fabric.global_rank == 0:
 84 |         os.makedirs(out_dir, exist_ok=True)
 85 | 
 86 |     train_data, val_data = load_datasets(data_dir=data_dir)
 87 | 
 88 |     config = Config.from_name(name=checkpoint_dir.name)
 89 |     checkpoint_path = checkpoint_dir / "lit_model.pth"
 90 |     fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
 91 |     with fabric.init_module():
 92 |         model = Parrot(config)
 93 |     with lazy_load(checkpoint_dir / "lit_model.pth") as checkpoint:
 94 |         # strict=False because missing keys due to adapter weights not contained in state dict
 95 |         model.load_state_dict(checkpoint, strict=False)
 96 | 
 97 |     add_adapter_v2_parameters_to_linear_layers(model)
 98 |     mark_only_adapter_v2_as_trainable(model)
 99 | 
100 |     num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
101 |     fabric.print(f"Number of trainable parameters: {num_params}")
102 | 
103 |     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
104 |     model, optimizer = fabric.setup(model, optimizer)
105 |     train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir)
106 | 
107 |     # Save the final checkpoint at the end of training
108 |     save_path = out_dir / "lit_model_adapter_finetuned.pth"
109 |     fabric.print(f"Saving adapter weights to {str(save_path)!r}")
110 |     save_model_checkpoint(fabric, model, save_path)
111 | 
112 | 
113 | def train(
114 |     fabric: L.Fabric,
115 |     model: torch.nn.Module,
116 |     optimizer: torch.optim.Optimizer,
117 |     train_data: np.ndarray,
118 |     val_data: np.ndarray,
119 |     checkpoint_dir: Path,
120 |     out_dir: Path,
121 | ) -> None:
122 |     """The training loop.
123 | 
124 |     Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
125 |     """
126 |     step_count = 0
127 | 
128 |     tokenizer = Tokenizer(checkpoint_dir / "tokenizer.json", checkpoint_dir / "tokenizer_config.json")
129 | 
130 |     if fabric.device.type == "xla":
131 |         import torch_xla.core.xla_model as xm
132 | 
133 |         xm.mark_step()
134 |     for iter_num in range(max_iters):
135 |         if step_count <= warmup_iters:
136 |             # linear warmup
137 |             lr = learning_rate * step_count / warmup_iters
138 |             for param_group in optimizer.param_groups:
139 |                 param_group["lr"] = lr
140 | 
141 |         t0 = time.time()
142 | 
143 |         input_ids, targets = get_batch(fabric, train_data)
144 | 
145 |         with fabric.no_backward_sync(model, enabled=((iter_num + 1) % gradient_accumulation_iters != 0)):
146 |             logits = model(input_ids)
147 |             loss = loss_fn(logits, targets)
148 |             fabric.backward(loss / gradient_accumulation_iters)
149 | 
150 |         if (iter_num + 1) % gradient_accumulation_iters == 0:
151 |             optimizer.step()
152 |             if fabric.device.type == "xla":
153 |                 xm.mark_step()
154 |             optimizer.zero_grad()
155 |             step_count += 1
156 | 
157 |             if step_count % eval_interval == 0:
158 |                 val_loss = validate(fabric, model, val_data, tokenizer)
159 |                 fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
160 |                 fabric.barrier()
161 | 
162 |             if step_count % save_interval == 0:
163 |                 save_path = out_dir / f"iter-{iter_num:06d}.pth"
164 |                 fabric.print(f"Saving adapter weights to {str(save_path)!r}")
165 |                 # TODO: Provide a function/script to merge the adapter weights with pretrained weights
166 |                 save_model_checkpoint(fabric, model, save_path)
167 |         else:
168 |             if fabric.device.type == "xla":
169 |                 xm.mark_step()
170 | 
171 |         dt = time.time() - t0
172 |         if iter_num % log_interval == 0:
173 |             fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
174 | 
175 | 
176 | @torch.no_grad()
177 | def validate(
178 |     fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray, tokenizer: Tokenizer
179 | ) -> torch.Tensor:
180 |     fabric.print("Validating ...")
181 |     model.eval()
182 |     losses = torch.zeros(eval_iters)
183 |     for k in range(eval_iters):
184 |         input_ids, targets = get_batch(fabric, val_data)
185 |         logits = model(input_ids)
186 |         loss = loss_fn(logits, targets)
187 |         losses[k] = loss.item()
188 |     val_loss = losses.mean()
189 | 
190 |     # produce an example:
191 |     instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
192 |     fabric.print(instruction)
193 |     sample = {"instruction": instruction, "input": ""}
194 |     prompt = generate_prompt(sample)
195 |     encoded = tokenizer.encode(prompt, device=model.device)
196 |     output = generate(
197 |         model, idx=encoded, max_returned_tokens=len(encoded) + 100, max_seq_length=model.config.block_size, temperature=0.8
198 |     )
199 |     output = tokenizer.decode(output)
200 |     fabric.print(output)
201 | 
202 |     model.train()
203 |     return val_loss.item()
204 | 
205 | 
206 | def loss_fn(logits, targets):
207 |     # shift the targets such that output n predicts token n+1
208 |     logits = logits[..., :-1, :].contiguous()
209 |     targets = targets[..., 1:].contiguous()
210 |     loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
211 |     return loss
212 | 
213 | 
214 | def get_batch(fabric: L.Fabric, data: list):
215 |     ix = torch.randint(len(data), (micro_batch_size,))
216 | 
217 |     input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
218 |     labels = [data[i]["labels"].type(torch.int64) for i in ix]
219 | 
220 |     max_len = max(len(s) for s in input_ids) if fabric.device.type != "xla" else max_seq_length
221 | 
222 |     def pad_right(x, pad_id):
223 |         # pad right based on the longest sequence
224 |         n = max_len - len(x)
225 |         return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
226 | 
227 |     x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
228 |     y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
229 | 
230 |     if fabric.device.type in ("mps", "xla"):
231 |         x, y = fabric.to_device((x, y))
232 |     else:
233 |         x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
234 | 
235 |     return x, y
236 | 
237 | 
238 | def load_datasets(data_dir: Path):
239 |     train_data = torch.load(data_dir / "train.pt")
240 |     val_data = torch.load(data_dir / "test.pt")
241 |     return train_data, val_data
242 | 
243 | 
244 | def save_model_checkpoint(fabric, model, file_path: Path):
245 |     file_path = Path(file_path)
246 | 
247 |     if isinstance(fabric.strategy, DeepSpeedStrategy):
248 |         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
249 | 
250 |         tmp_path = file_path.with_suffix(".tmp")
251 |         fabric.save(tmp_path, {"model": model})
252 |         fabric.barrier()
253 |         if fabric.global_rank == 0:
254 |             # Create a consolidated checkpoint with the same name next to the deepspeed checkpoint
255 |             # and only keep the adapter weights
256 |             state_dict = get_fp32_state_dict_from_zero_checkpoint(tmp_path)
257 |             state_dict = adapter_v2_state_from_state_dict(state_dict)
258 |             torch.save(state_dict, file_path)
259 |             shutil.rmtree(tmp_path)
260 |     else:
261 |         state_dict = adapter_v2_state_from_state_dict(model.state_dict())
262 |         if fabric.global_rank == 0:
263 |             torch.save(state_dict, file_path)
264 |         fabric.barrier()
265 | 
266 | 
267 | if __name__ == "__main__":
268 |     # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
269 |     # torch.backends.cuda.enable_flash_sdp(False)
270 |     torch.set_float32_matmul_precision("high")
271 | 
272 |     from jsonargparse.cli import CLI
273 | 
274 |     warnings.filterwarnings(
275 |         # false positive using deepspeed: https://github.com/Lightning-AI/lightning/pull/17761#discussion_r1219705307
276 |         "ignore",
277 |         message="Remove `.no_backward_sync()` from your code",
278 |     )
279 |     import datetime
280 |     started = datetime.datetime.now()
281 | 
282 |     CLI(setup)
283 | 
284 |     now = datetime.datetime.now()
285 |     print("started:", started)
286 |     print("finished:", now)
287 | 
288 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)


--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/finetune/full.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import sys
  4 | import time
  5 | import warnings
  6 | from pathlib import Path
  7 | from typing import Literal
  8 | 
  9 | import lightning as L
 10 | import numpy as np
 11 | import torch
 12 | from lightning.fabric.accelerators.mps import MPSAccelerator
 13 | from lightning.fabric.strategies import DeepSpeedStrategy
 14 | 
 15 | # support running without installing as a package
 16 | wd = Path(__file__).parent.parent.resolve()
 17 | sys.path.append(str(wd))
 18 | 
 19 | from generate.base import generate
 20 | from lit_parrot.config import Config
 21 | from lit_parrot.model import Parrot
 22 | from lit_parrot.tokenizer import Tokenizer
 23 | from lit_parrot.utils import lazy_load, check_valid_checkpoint_dir
 24 | from scripts.prepare_alpaca import generate_prompt
 25 | 
 26 | eval_interval = 600
 27 | save_interval = 1000
 28 | eval_iters = 100
 29 | log_interval = 1
 30 | devices = 6
 31 | 
 32 | # Hyperparameters
 33 | learning_rate = 9e-3
 34 | batch_size = 128 / devices
 35 | micro_batch_size = 1
 36 | gradient_accumulation_steps = batch_size // micro_batch_size
 37 | assert gradient_accumulation_steps > 0
 38 | epoch_size = 50000 * 6 # train dataset size
 39 | num_epochs = 1
 40 | max_iters = num_epochs * (epoch_size // micro_batch_size) // devices
 41 | weight_decay = 0.02
 42 | max_seq_length = 256  # see scripts/prepare_alpaca.py
 43 | warmup_iters = 2 * (epoch_size // micro_batch_size) // devices  # 2 epochs
 44 | 
 45 | ds_config = {
 46 |     "train_micro_batch_size_per_gpu": micro_batch_size,
 47 |     "gradient_accumulation_steps": gradient_accumulation_steps,
 48 |     "zero_optimization": {"stage": 3},
 49 | }
 50 | 
 51 | 
 52 | def main(
 53 |     data_dir: Path = Path("data/alpaca"),
 54 |     checkpoint_dir: Path = Path("checkpoints/tiiuae/falcon-7b"),
 55 |     out_dir: Path = Path("out/full/alpaca"),
 56 |     precision: Literal["bf16-true", "32-true", "bf16-mixed"] = "bf16-true",
 57 | ):
 58 |     check_valid_checkpoint_dir(checkpoint_dir)
 59 | 
 60 |     fabric = L.Fabric(
 61 |         devices=devices, strategy=(DeepSpeedStrategy(config=ds_config) if devices > 1 else "auto"), precision=precision
 62 |     )
 63 |     fabric.launch()
 64 |     fabric.seed_everything(1337 + fabric.global_rank)
 65 | 
 66 |     if fabric.global_rank == 0:
 67 |         os.makedirs(out_dir, exist_ok=True)
 68 | 
 69 |     train_data, val_data = load_datasets(data_dir=data_dir)
 70 | 
 71 |     config = Config.from_name(name=checkpoint_dir.name, block_size=max_seq_length)
 72 |     checkpoint_path = checkpoint_dir / "lit_model.pth"
 73 |     fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
 74 |     
 75 |     
 76 |     #with fabric.init_module():
 77 |     #    model = Parrot(config)
 78 | 
 79 |     #with lazy_load(checkpoint_path) as checkpoint:
 80 |     #    model.load_state_dict(checkpoint, strict=False)
 81 | 
 82 |     checkpoint = torch.load(checkpoint_path)
 83 |     with fabric.device:
 84 |         torch.set_default_tensor_type(torch.HalfTensor)
 85 |         model = Parrot(config).bfloat16()
 86 |         torch.set_default_tensor_type(torch.FloatTensor)
 87 |         model.load_state_dict(checkpoint, strict=False) 
 88 | 
 89 | 
 90 |     num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
 91 |     fabric.print(f"Number of trainable parameters: {num_params}")
 92 | 
 93 |     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
 94 |     model, optimizer = fabric.setup(model, optimizer)
 95 |     train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir)
 96 | 
 97 |     # Save the final checkpoint at the end of training
 98 |     save_path = out_dir / "lit_model_full_finetuned.pth"
 99 |     fabric.print(f"Saving weights to {str(save_path)!r}")
100 |     save_model_checkpoint(fabric, model, save_path)
101 | 
102 | 
103 | def train(
104 |     fabric: L.Fabric,
105 |     model: torch.nn.Module,
106 |     optimizer: torch.optim.Optimizer,
107 |     train_data: np.ndarray,
108 |     val_data: np.ndarray,
109 |     checkpoint_dir: Path,
110 |     out_dir: Path,
111 | ) -> None:
112 |     """The training loop.
113 | 
114 |     Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
115 |     """
116 |     step_count = 0
117 | 
118 |     tokenizer = Tokenizer(checkpoint_dir / "tokenizer.json", checkpoint_dir / "tokenizer_config.json")
119 | 
120 |     for iter_num in range(max_iters):
121 |         if step_count <= warmup_iters:
122 |             # linear warmup
123 |             lr = learning_rate * step_count / warmup_iters
124 |             for param_group in optimizer.param_groups:
125 |                 param_group["lr"] = lr
126 | 
127 |         t0 = time.time()
128 | 
129 |         input_ids, targets = get_batch(fabric, train_data)
130 | 
131 |         with fabric.no_backward_sync(model, enabled=((iter_num + 1) % gradient_accumulation_steps != 0)):
132 |             logits = model(input_ids)
133 |             loss = loss_fn(logits, targets)
134 |             fabric.backward(loss / gradient_accumulation_steps)
135 | 
136 |         if (iter_num + 1) % gradient_accumulation_steps == 0:
137 |             optimizer.step()
138 |             optimizer.zero_grad()
139 |             step_count += 1
140 | 
141 |             if step_count % eval_interval == 0:
142 |                 val_loss = validate(fabric, model, val_data, tokenizer)
143 |                 fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
144 |                 fabric.barrier()
145 | 
146 |             if step_count % save_interval == 0:
147 |                 save_path = out_dir / f"iter-{iter_num:06d}.pth"
148 |                 fabric.print(f"Saving weights to {str(save_path)!r}")
149 |                 save_model_checkpoint(fabric, model, save_path)
150 | 
151 |         dt = time.time() - t0
152 |         if iter_num % log_interval == 0:
153 |             fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
154 | 
155 | 
156 | @torch.no_grad()
157 | def validate(fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray, tokenizer: Tokenizer) -> torch.Tensor:
158 |     fabric.print("Validating ...")
159 |     model.eval()
160 |     losses = torch.zeros(eval_iters)
161 |     for k in range(eval_iters):
162 |         input_ids, targets = get_batch(fabric, val_data)
163 |         logits = model(input_ids)
164 |         loss = loss_fn(logits, targets)
165 |         losses[k] = loss.item()
166 |     val_loss = losses.mean()
167 | 
168 |     # produce an example:
169 |     instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
170 |     fabric.print(instruction)
171 |     sample = {"instruction": instruction, "input": ""}
172 |     prompt = generate_prompt(sample)
173 |     encoded = tokenizer.encode(prompt, device=model.device)
174 |     output = generate(
175 |         model, idx=encoded, max_returned_tokens=len(encoded) + 100, max_seq_length=max_seq_length, temperature=0.8
176 |     )
177 |     output = tokenizer.decode(output)
178 |     fabric.print(output)
179 | 
180 |     model.train()
181 |     return val_loss.item()
182 | 
183 | 
184 | def loss_fn(logits, targets):
185 |     # shift the targets such that output n predicts token n+1
186 |     logits = logits[..., :-1, :].contiguous()
187 |     targets = targets[..., 1:].contiguous()
188 |     loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
189 |     return loss
190 | 
191 | 
192 | def get_batch(fabric: L.Fabric, data: list):
193 |     ix = torch.randint(len(data), (micro_batch_size,))
194 | 
195 |     input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
196 |     labels = [data[i]["labels"].type(torch.int64) for i in ix]
197 | 
198 |     max_len = max(len(s) for s in input_ids)
199 | 
200 |     def pad_right(x, pad_id):
201 |         # pad right based on the longest sequence
202 |         n = max_len - len(x)
203 |         return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
204 | 
205 |     x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
206 |     y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
207 | 
208 |     if isinstance(fabric.accelerator, MPSAccelerator):
209 |         x, y = fabric.to_device((x, y))
210 |     else:
211 |         x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
212 | 
213 |     return x, y
214 | 
215 | 
216 | def load_datasets(data_dir: Path):
217 |     train_data = torch.load(data_dir / "train.pt")
218 |     val_data = torch.load(data_dir / "test.pt")
219 |     return train_data, val_data
220 | 
221 | 
222 | def save_model_checkpoint(fabric, model, file_path: Path):
223 |     file_path = Path(file_path)
224 | 
225 |     if isinstance(fabric.strategy, DeepSpeedStrategy):
226 |         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
227 | 
228 |         tmp_path = file_path.with_suffix(".tmp")
229 |         fabric.save(tmp_path, {"model": model})
230 |         fabric.barrier()
231 |         if fabric.global_rank == 0:
232 |             state_dict = get_fp32_state_dict_from_zero_checkpoint(tmp_path)
233 |             torch.save(state_dict, file_path)
234 |             shutil.rmtree(tmp_path)
235 |     else:
236 |         if fabric.global_rank == 0:
237 |             torch.save(model.state_dict(), file_path)
238 |         fabric.barrier()
239 | 
240 | 
241 | if __name__ == "__main__":
242 |     # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
243 |     # torch.backends.cuda.enable_flash_sdp(False)
244 |     torch.set_float32_matmul_precision("high")
245 | 
246 |     from jsonargparse.cli import CLI
247 |     warnings.filterwarnings(
248 |         # false positive using deepspeed: https://github.com/Lightning-AI/lightning/pull/17761#discussion_r1219705307
249 |         "ignore", message="Remove `.no_backward_sync()` from your code",
250 |     )
251 |     import datetime
252 |     started = datetime.datetime.now()
253 | 
254 |     CLI(main)
255 | 
256 |     now = datetime.datetime.now()
257 |     print("started:", started)
258 |     print("finished:", now)
259 | 
260 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)


--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/finetune/lora.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Instruction-tuning with LoRA on the Alpaca dataset.
  3 | 
  4 | Note: If you run into a CUDA error "Expected is_sm80 to be true, but got false", uncomment the line
  5 | `torch.backends.cuda.enable_flash_sdp(False)` in the script below (see https://github.com/Lightning-AI/lit-llama/issues/101).
  6 | """
  7 | import os
  8 | import sys
  9 | import time
 10 | import warnings
 11 | from pathlib import Path
 12 | from typing import Optional
 13 | 
 14 | import lightning as L
 15 | import numpy as np
 16 | import torch
 17 | from lightning.fabric.strategies import DeepSpeedStrategy, XLAStrategy
 18 | 
 19 | # support running without installing as a package
 20 | wd = Path(__file__).parent.parent.resolve()
 21 | sys.path.append(str(wd))
 22 | 
 23 | from generate.base import generate
 24 | from lit_parrot.lora import mark_only_lora_as_trainable, lora, lora_state_dict
 25 | from lit_parrot.model import Parrot, Config
 26 | from lit_parrot.tokenizer import Tokenizer
 27 | from lit_parrot.utils import lazy_load, check_valid_checkpoint_dir
 28 | from scripts.prepare_alpaca import generate_prompt
 29 | 
 30 | 
 31 | eval_interval = 100
 32 | save_interval = 100
 33 | eval_iters = 100
 34 | log_interval = 1
 35 | devices = 1
 36 | 
 37 | # Hyperparameters
 38 | learning_rate = 3e-4
 39 | batch_size = 128
 40 | micro_batch_size = 1
 41 | gradient_accumulation_iters = batch_size // micro_batch_size
 42 | assert gradient_accumulation_iters > 0
 43 | max_iters = 52000  # train dataset size
 44 | weight_decay = 0.01
 45 | lora_r = 8
 46 | lora_alpha = 16
 47 | lora_dropout = 0.05
 48 | warmup_iters = 100
 49 | 
 50 | ds_config = {
 51 |     "train_micro_batch_size_per_gpu": micro_batch_size,
 52 |     "gradient_accumulation_steps": gradient_accumulation_iters,
 53 |     "zero_optimization": {"stage": 2},
 54 | }
 55 | 
 56 | 
 57 | def setup(
 58 |     data_dir: Path = Path("data/alpaca"),
 59 |     checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
 60 |     out_dir: Path = Path("out/lora/alpaca"),
 61 |     precision: Optional[str] = None,
 62 |     tpu: bool = False,
 63 | ):
 64 |     if precision is None:
 65 |         precision = "32-true" if tpu else "bf16-true"
 66 |     strategy = (
 67 |         "auto"
 68 |         if devices <= 1
 69 |         else XLAStrategy(sync_module_states=False) if tpu else DeepSpeedStrategy(config=ds_config)
 70 |     )
 71 |     # For multi-host TPU training, the device count for Fabric is limited to the count on a single host.
 72 |     fabric_devices = "auto" if (tpu and devices > 1) else devices
 73 |     fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision)
 74 |     fabric.launch(main, data_dir, checkpoint_dir, out_dir)
 75 | 
 76 | 
 77 | def main(
 78 |     fabric: L.Fabric = None,
 79 |     data_dir: Path = Path("data/alpaca"),
 80 |     checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
 81 |     out_dir: Path = Path("out/lora/alpaca"),
 82 | ):
 83 |     check_valid_checkpoint_dir(checkpoint_dir)
 84 |     fabric.seed_everything(1337 + fabric.global_rank)
 85 | 
 86 |     if fabric.global_rank == 0:
 87 |         os.makedirs(out_dir, exist_ok=True)
 88 | 
 89 |     train_data, val_data = load_datasets(data_dir=data_dir)
 90 | 
 91 |     config = Config.from_name(name=checkpoint_dir.name)
 92 |     checkpoint_path = checkpoint_dir / "lit_model.pth"
 93 |     fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
 94 |     with fabric.init_module(), lora(r=lora_r, alpha=lora_alpha, dropout=lora_dropout, enabled=True):
 95 |         model = Parrot(config)
 96 |     with lazy_load(checkpoint_path) as checkpoint:
 97 |         # strict=False because missing keys due to LoRA weights not contained in state dict
 98 |         model.load_state_dict(checkpoint, strict=False)
 99 | 
100 |     mark_only_lora_as_trainable(model)
101 |     num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
102 |     fabric.print(f"Number of trainable parameters: {num_params}")
103 | 
104 |     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
105 |     model, optimizer = fabric.setup(model, optimizer)
106 |     train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir)
107 | 
108 |     # Save the final LoRA checkpoint at the end of training
109 |     save_path = out_dir / "lit_model_lora_finetuned.pth"
110 |     save_lora_checkpoint(fabric, model, path=save_path)
111 | 
112 | 
113 | def train(
114 |     fabric: L.Fabric,
115 |     model: torch.nn.Module,
116 |     optimizer: torch.optim.Optimizer,
117 |     train_data: np.ndarray,
118 |     val_data: np.ndarray,
119 |     checkpoint_dir: Path,
120 |     out_dir: Path,
121 | ) -> None:
122 |     """The training loop.
123 | 
124 |     Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
125 |     """
126 |     step_count = 0
127 | 
128 |     tokenizer = Tokenizer(checkpoint_dir / "tokenizer.json", checkpoint_dir / "tokenizer_config.json")
129 | 
130 |     if fabric.device.type == "xla":
131 |         import torch_xla.core.xla_model as xm
132 | 
133 |         xm.mark_step()
134 |     for iter_num in range(max_iters):
135 |         if step_count <= warmup_iters:
136 |             # linear warmup
137 |             lr = learning_rate * step_count / warmup_iters
138 |             for param_group in optimizer.param_groups:
139 |                 param_group["lr"] = lr
140 | 
141 |         t0 = time.time()
142 | 
143 |         input_ids, targets = get_batch(fabric, train_data)
144 | 
145 |         with fabric.no_backward_sync(model, enabled=((iter_num + 1) % gradient_accumulation_iters != 0)):
146 |             logits = model(input_ids)
147 |             loss = loss_fn(logits, targets)
148 |             fabric.backward(loss / gradient_accumulation_iters)
149 | 
150 |         if (iter_num + 1) % gradient_accumulation_iters == 0:
151 |             optimizer.step()
152 |             if fabric.device.type == "xla":
153 |                 xm.mark_step()
154 |             optimizer.zero_grad()
155 |             step_count += 1
156 | 
157 |             if step_count % eval_interval == 0:
158 |                 val_loss = validate(fabric, model, val_data, tokenizer)
159 |                 fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
160 |                 fabric.barrier()
161 | 
162 |             if step_count % save_interval == 0:
163 |                 # We are only saving the LoRA weights
164 |                 save_path = out_dir / f"iter-{iter_num:06d}.pth"
165 |                 save_lora_checkpoint(fabric, model, save_path)
166 |         else:
167 |             if fabric.device.type == "xla":
168 |                 xm.mark_step()
169 | 
170 |         dt = time.time() - t0
171 |         if iter_num % log_interval == 0:
172 |             fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
173 | 
174 | 
175 | @torch.no_grad()
176 | def validate(fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray, tokenizer: Tokenizer) -> torch.Tensor:
177 |     fabric.print("Validating ...")
178 |     model.eval()
179 |     losses = torch.zeros(eval_iters)
180 |     for k in range(eval_iters):
181 |         input_ids, targets = get_batch(fabric, val_data)
182 |         logits = model(input_ids)
183 |         loss = loss_fn(logits, targets)
184 |         losses[k] = loss.item()
185 |     val_loss = losses.mean()
186 | 
187 |     # produce an example:
188 |     instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
189 |     fabric.print(instruction)
190 |     sample = {"instruction": instruction, "input": ""}
191 |     prompt = generate_prompt(sample)
192 |     encoded = tokenizer.encode(prompt, device=model.device)
193 |     output = generate(
194 |         model,
195 |         idx=encoded,
196 |         max_returned_tokens=len(encoded) + 100,
197 |         max_seq_length=model.config.block_size,
198 |         temperature=0.8,
199 |     )
200 |     output = tokenizer.decode(output)
201 |     fabric.print(output)
202 | 
203 |     model.train()
204 |     return val_loss.item()
205 | 
206 | 
207 | def loss_fn(logits, targets):
208 |     # shift the targets such that output n predicts token n+1
209 |     logits = logits[..., :-1, :].contiguous()
210 |     targets = targets[..., 1:].contiguous()
211 |     loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
212 |     return loss
213 | 
214 | 
215 | def get_batch(fabric: L.Fabric, data: list):
216 |     ix = torch.randint(len(data), (micro_batch_size,))
217 | 
218 |     input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
219 |     labels = [data[i]["labels"].type(torch.int64) for i in ix]
220 | 
221 |     max_len = max(len(s) for s in input_ids) if fabric.device.type != "xla" else max_seq_length
222 | 
223 |     def pad_right(x, pad_id):
224 |         # pad right based on the longest sequence
225 |         n = max_len - len(x)
226 |         return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
227 | 
228 |     x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
229 |     y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
230 | 
231 |     if fabric.device.type in ("mps", "xla"):
232 |         x, y = fabric.to_device((x, y))
233 |     else:
234 |         x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
235 |     return x, y
236 | 
237 | 
238 | def load_datasets(data_dir: Path):
239 |     train_data = torch.load(data_dir / "train.pt")
240 |     val_data = torch.load(data_dir / "test.pt")
241 |     return train_data, val_data
242 | 
243 | 
244 | def save_lora_checkpoint(fabric, model, path):
245 |     fabric.print(f"Saving LoRA weights to {str(path)!r}")
246 |     checkpoint = lora_state_dict(model)
247 |     torch.save(checkpoint, path)
248 | 
249 | 
250 | if __name__ == "__main__":
251 |     # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
252 |     # torch.backends.cuda.enable_flash_sdp(False)
253 |     torch.set_float32_matmul_precision("high")
254 | 
255 |     from jsonargparse.cli import CLI
256 | 
257 |     warnings.filterwarnings(
258 |         # false positive using deepspeed: https://github.com/Lightning-AI/lightning/pull/17761#discussion_r1219705307
259 |         "ignore",
260 |         message="Remove `.no_backward_sync()` from your code",
261 |     )
262 | 
263 |     import datetime
264 |     started = datetime.datetime.now()
265 | 
266 |     CLI(setup)
267 | 
268 |     now = datetime.datetime.now()
269 |     print("started:", started)
270 |     print("finished:", now)
271 | 
272 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)


--------------------------------------------------------------------------------