├── .gitignore
├── LICENSE
├── README.md
├── adapter
│   ├── distilbert-movie-review
│   │   ├── 1_finetune-last-layers.ipynb
│   │   ├── 2_finetune-using-adapter-layers.ipynb
│   │   ├── 3_finetune-all-layers.ipynb
│   │   ├── 4_finetune-all-layers-and-adapter-layers.ipynb
│   │   ├── README.md
│   │   ├── figures
│   │   │   └── result-summary.png
│   │   └── local_dataset_utilities.py
│   └── lora-from-scratch
│       └── lora-dora-mlp.ipynb
├── conventional
│   └── distilbert-movie-review
│       ├── 1_feature-extractor.ipynb
│       ├── 2_finetune-last-layers.ipynb
│       ├── 3_finetuning-all-layers.ipynb
│       ├── figures
│       │   ├── 1_feature-based.png
│       │   ├── 2_finetune-last.png
│       │   └── 3_finetune-all.png
│       ├── layerwise-experiment
│       │   ├── README.md
│       │   ├── layerwise-experiment-results-clean.txt
│       │   ├── layerwise-experiment-run.py
│       │   ├── layerwise-experiment.ipynb
│       │   ├── layerwise-experiment.py
│       │   ├── layerwise-results.png
│       │   ├── local_dataset_utilities.py
│       │   └── results.txt
│       ├── local_dataset_utilities.py
│       └── mixed-precision-experiment
│           ├── README.md
│           ├── bfloat16-mixed-high.py
│           ├── bfloat16-mixed-medium.py
│           ├── bfloat16-mixed.py
│           ├── bfloat16-regular.py
│           ├── figures
│           │   ├── 1.png
│           │   ├── 2.png
│           │   └── 3.png
│           ├── float16-mixed-high.py
│           ├── float16-mixed-medium.py
│           ├── float16-mixed.py
│           ├── float16-regular.py
│           ├── float32-regular-high.py
│           ├── float32-regular-medium.py
│           ├── float32-regular.py
│           ├── float64-regular.py
│           └── local_dataset_utilities.py
└── lit-benchmarks
    └── falcon-7b
        ├── README.md
        ├── figures
        │   ├── lit-parrot.png
        │   ├── memory-requirements.png
        │   └── training-time.png
        └── finetune
            ├── adapter.py
            ├── adapter_v2.py
            ├── full.py
            └── lora.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # pipenv
90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
93 | # install all needed dependencies.
94 | #Pipfile.lock
95 |
96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
97 | __pypackages__/
98 |
99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LLM-finetuning-scripts
--------------------------------------------------------------------------------
/adapter/distilbert-movie-review/README.md:
--------------------------------------------------------------------------------
1 | # Result Summary
2 |
3 |
4 |
5 | ![Result summary](figures/result-summary.png)
--------------------------------------------------------------------------------
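For context, the "adapter layers" used in `2_finetune-using-adapter-layers.ipynb` and `4_finetune-all-layers-and-adapter-layers.ipynb` are small trainable bottleneck modules added to an otherwise (partially) frozen DistilBERT. A minimal PyTorch sketch of such a module follows; it is not taken from the notebooks, and the 768/32 dimensions and module placement are illustrative assumptions:

```python
import torch
import torch.nn as nn


class BottleneckAdapter(nn.Module):
    """Illustrative adapter block: down-project, nonlinearity, up-project, residual."""

    def __init__(self, hidden_dim=768, bottleneck_dim=32):
        super().__init__()
        self.down = nn.Linear(hidden_dim, bottleneck_dim)
        self.up = nn.Linear(bottleneck_dim, hidden_dim)
        self.act = nn.GELU()

    def forward(self, x):
        # The residual connection preserves the pretrained representation;
        # only the small down/up projections are trained.
        return x + self.up(self.act(self.down(x)))


# Quick shape check: adapters operate on the (batch, seq_len, hidden_dim) activations.
adapter = BottleneckAdapter()
hidden_states = torch.randn(8, 512, 768)
print(adapter(hidden_states).shape)  # torch.Size([8, 512, 768])
```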
/adapter/distilbert-movie-review/figures/result-summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/adapter/distilbert-movie-review/figures/result-summary.png
--------------------------------------------------------------------------------
/adapter/distilbert-movie-review/local_dataset_utilities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import tarfile
4 | import time
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from packaging import version
9 | from torch.utils.data import Dataset
10 | from tqdm import tqdm
11 | import urllib
12 |
13 |
14 | def reporthook(count, block_size, total_size):
15 | global start_time
16 | if count == 0:
17 | start_time = time.time()
18 | return
19 | duration = time.time() - start_time
20 | progress_size = int(count * block_size)
21 | speed = progress_size / (1024.0**2 * duration)
22 | percent = count * block_size * 100.0 / total_size
23 |
24 | sys.stdout.write(
25 | f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
26 | f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
27 | )
28 | sys.stdout.flush()
29 |
30 |
31 | def download_dataset():
32 | source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
33 | target = "aclImdb_v1.tar.gz"
34 |
35 | if os.path.exists(target):
36 | os.remove(target)
37 |
38 | if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
39 | urllib.request.urlretrieve(source, target, reporthook)
40 |
41 | if not os.path.isdir("aclImdb"):
42 |
43 | with tarfile.open(target, "r:gz") as tar:
44 | tar.extractall()
45 |
46 |
47 | def load_dataset_into_to_dataframe():
48 | basepath = "aclImdb"
49 |
50 | labels = {"pos": 1, "neg": 0}
51 |
52 | df = pd.DataFrame()
53 |
54 | with tqdm(total=50000) as pbar:
55 | for s in ("test", "train"):
56 | for l in ("pos", "neg"):
57 | path = os.path.join(basepath, s, l)
58 | for file in sorted(os.listdir(path)):
59 | with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
60 | txt = infile.read()
61 |
62 | if version.parse(pd.__version__) >= version.parse("1.3.2"):
63 | x = pd.DataFrame(
64 | [[txt, labels[l]]], columns=["review", "sentiment"]
65 | )
66 | df = pd.concat([df, x], ignore_index=False)
67 |
68 | else:
69 | df = df.append([[txt, labels[l]]], ignore_index=True)
70 | pbar.update()
71 | df.columns = ["text", "label"]
72 |
73 | np.random.seed(0)
74 | df = df.reindex(np.random.permutation(df.index))
75 |
76 | print("Class distribution:")
77 |     print(np.bincount(df["label"].values))
78 |
79 | return df
80 |
81 |
82 | def partition_dataset(df):
83 | df_shuffled = df.sample(frac=1, random_state=1).reset_index()
84 |
85 | df_train = df_shuffled.iloc[:35_000]
86 | df_val = df_shuffled.iloc[35_000:40_000]
87 | df_test = df_shuffled.iloc[40_000:]
88 |
89 | df_train.to_csv("train.csv", index=False, encoding="utf-8")
90 | df_val.to_csv("val.csv", index=False, encoding="utf-8")
91 | df_test.to_csv("test.csv", index=False, encoding="utf-8")
92 |
93 |
94 | class IMDBDataset(Dataset):
95 | def __init__(self, dataset_dict, partition_key="train"):
96 | self.partition = dataset_dict[partition_key]
97 |
98 | def __getitem__(self, index):
99 | return self.partition[index]
100 |
101 | def __len__(self):
102 | return self.partition.num_rows
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/1_feature-extractor.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3c5d72f4",
6 | "metadata": {},
7 | "source": [
8 | "# LLM as Feature Extractor"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "bb9d0299-8fc0-48f0-9b02-4c19214d479a",
14 | "metadata": {},
15 | "source": [
16 | "In this feature-based approach, we are using the embeddings from a pretrained transormer to train a random forest and logistic regression model in scikit-learn:\n",
17 | "\n",
18 | "
"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "id": "6fd9cda8",
25 | "metadata": {
26 | "tags": []
27 | },
28 | "outputs": [],
29 | "source": [
30 | "# pip install transformers datasets"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "id": "df18e3de-577a-43c5-8b9d-868397a6d7da",
37 | "metadata": {
38 | "tags": []
39 | },
40 | "outputs": [],
41 | "source": [
42 | "# conda install sklearn --yes"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "id": "033b75c5",
49 | "metadata": {
50 | "tags": []
51 | },
52 | "outputs": [
53 | {
54 | "name": "stdout",
55 | "output_type": "stream",
56 | "text": [
57 | "torch : 2.0.0\n",
58 | "transformers: 4.27.4\n",
59 | "datasets : 2.11.0\n",
60 | "sklearn : 1.2.2\n",
61 | "\n",
62 | "conda environment: finetuning-blog\n",
63 | "\n"
64 | ]
65 | }
66 | ],
67 | "source": [
68 | "%load_ext watermark\n",
69 | "%watermark --conda -p torch,transformers,datasets,sklearn"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 4,
75 | "id": "602ba8a0",
76 | "metadata": {
77 | "tags": []
78 | },
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "cuda:0\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "import torch\n",
90 | "\n",
91 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
92 | "print(device)"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "id": "4cfd724d",
98 | "metadata": {
99 | "tags": []
100 | },
101 | "source": [
102 | "# 1 Loading the Dataset"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 5,
108 | "id": "e39e2228-5f0b-4fb9-b762-df26c2052b45",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# pip install datasets\n",
113 | "\n",
114 | "import os.path as op\n",
115 | "\n",
116 | "from datasets import load_dataset\n",
117 | "\n",
118 | "import lightning as L\n",
119 | "from lightning.pytorch.loggers import CSVLogger\n",
120 | "from lightning.pytorch.callbacks import ModelCheckpoint\n",
121 | "\n",
122 | "import numpy as np\n",
123 | "import pandas as pd\n",
124 | "import torch\n",
125 | "\n",
126 | "from sklearn.feature_extraction.text import CountVectorizer\n",
127 | "\n",
128 | "from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset\n",
129 | "from local_dataset_utilities import IMDBDataset"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 6,
135 | "id": "fb31ac90-9e3a-41d0-baf1-8e613043924b",
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stderr",
140 | "output_type": "stream",
141 | "text": [
142 | "100%|███████████████████████████████████████████| 50000/50000 [00:25<00:00, 1973.05it/s]\n"
143 | ]
144 | },
145 | {
146 | "name": "stdout",
147 | "output_type": "stream",
148 | "text": [
149 | "Class distribution:\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "download_dataset()\n",
155 | "\n",
156 | "df = load_dataset_into_to_dataframe()\n",
157 | "partition_dataset(df)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 7,
163 | "id": "221f30a1-b433-4304-a18d-8d03abd42b58",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "df_train = pd.read_csv(\"train.csv\")\n",
168 | "df_val = pd.read_csv(\"val.csv\")\n",
169 | "df_test = pd.read_csv(\"test.csv\")"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "id": "846d83b1",
175 | "metadata": {},
176 | "source": [
177 | "# 2 Tokenization and Numericalization"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 8,
183 | "id": "21114d27-2697-4132-9714-b259bd63f5a1",
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | "Downloading and preparing dataset csv/default to /home/sebastian/.cache/huggingface/datasets/csv/default-2417067d5b75d213/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...\n"
191 | ]
192 | },
193 | {
194 | "data": {
195 | "application/vnd.jupyter.widget-view+json": {
196 | "model_id": "0f3dbdca454a4e7d8ebfe80e8e946e7d",
197 | "version_major": 2,
198 | "version_minor": 0
199 | },
200 | "text/plain": [
201 | "Downloading data files: 0%| | 0/3 [00:00, ?it/s]"
202 | ]
203 | },
204 | "metadata": {},
205 | "output_type": "display_data"
206 | },
207 | {
208 | "data": {
209 | "application/vnd.jupyter.widget-view+json": {
210 | "model_id": "1d913db1678e4636849970ff87653992",
211 | "version_major": 2,
212 | "version_minor": 0
213 | },
214 | "text/plain": [
215 | "Extracting data files: 0%| | 0/3 [00:00, ?it/s]"
216 | ]
217 | },
218 | "metadata": {},
219 | "output_type": "display_data"
220 | },
221 | {
222 | "data": {
223 | "application/vnd.jupyter.widget-view+json": {
224 | "model_id": "",
225 | "version_major": 2,
226 | "version_minor": 0
227 | },
228 | "text/plain": [
229 | "Generating train split: 0 examples [00:00, ? examples/s]"
230 | ]
231 | },
232 | "metadata": {},
233 | "output_type": "display_data"
234 | },
235 | {
236 | "data": {
237 | "application/vnd.jupyter.widget-view+json": {
238 | "model_id": "",
239 | "version_major": 2,
240 | "version_minor": 0
241 | },
242 | "text/plain": [
243 | "Generating validation split: 0 examples [00:00, ? examples/s]"
244 | ]
245 | },
246 | "metadata": {},
247 | "output_type": "display_data"
248 | },
249 | {
250 | "data": {
251 | "application/vnd.jupyter.widget-view+json": {
252 | "model_id": "",
253 | "version_major": 2,
254 | "version_minor": 0
255 | },
256 | "text/plain": [
257 | "Generating test split: 0 examples [00:00, ? examples/s]"
258 | ]
259 | },
260 | "metadata": {},
261 | "output_type": "display_data"
262 | },
263 | {
264 | "name": "stdout",
265 | "output_type": "stream",
266 | "text": [
267 | "Dataset csv downloaded and prepared to /home/sebastian/.cache/huggingface/datasets/csv/default-2417067d5b75d213/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.\n"
268 | ]
269 | },
270 | {
271 | "data": {
272 | "application/vnd.jupyter.widget-view+json": {
273 | "model_id": "0afb64f5f1f945248ac5ed71b2bda5d7",
274 | "version_major": 2,
275 | "version_minor": 0
276 | },
277 | "text/plain": [
278 | " 0%| | 0/3 [00:00, ?it/s]"
279 | ]
280 | },
281 | "metadata": {},
282 | "output_type": "display_data"
283 | },
284 | {
285 | "name": "stdout",
286 | "output_type": "stream",
287 | "text": [
288 | "DatasetDict({\n",
289 | " train: Dataset({\n",
290 | " features: ['index', 'text', 'label'],\n",
291 | " num_rows: 35000\n",
292 | " })\n",
293 | " validation: Dataset({\n",
294 | " features: ['index', 'text', 'label'],\n",
295 | " num_rows: 5000\n",
296 | " })\n",
297 | " test: Dataset({\n",
298 | " features: ['index', 'text', 'label'],\n",
299 | " num_rows: 10000\n",
300 | " })\n",
301 | "})\n"
302 | ]
303 | }
304 | ],
305 | "source": [
306 | "imdb_dataset = load_dataset(\n",
307 | " \"csv\",\n",
308 | " data_files={\n",
309 | " \"train\": \"train.csv\",\n",
310 | " \"validation\": \"val.csv\",\n",
311 | " \"test\": \"test.csv\",\n",
312 | " },\n",
313 | ")\n",
314 | "\n",
315 | "print(imdb_dataset)"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 9,
321 | "id": "5ea762ba",
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "name": "stdout",
326 | "output_type": "stream",
327 | "text": [
328 | "Tokenizer input max length: 512\n",
329 | "Tokenizer vocabulary size: 30522\n"
330 | ]
331 | }
332 | ],
333 | "source": [
334 | "from transformers import AutoTokenizer\n",
335 | "\n",
336 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
337 | "print(\"Tokenizer input max length:\", tokenizer.model_max_length)\n",
338 | "print(\"Tokenizer vocabulary size:\", tokenizer.vocab_size)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 10,
344 | "id": "8432c15c",
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "def tokenize_text(batch):\n",
349 | " return tokenizer(batch[\"text\"], truncation=True, padding=True)"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 11,
355 | "id": "0bb392cf",
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "data": {
360 | "application/vnd.jupyter.widget-view+json": {
361 | "model_id": "",
362 | "version_major": 2,
363 | "version_minor": 0
364 | },
365 | "text/plain": [
366 | "Map: 0%| | 0/35000 [00:00, ? examples/s]"
367 | ]
368 | },
369 | "metadata": {},
370 | "output_type": "display_data"
371 | },
372 | {
373 | "data": {
374 | "application/vnd.jupyter.widget-view+json": {
375 | "model_id": "",
376 | "version_major": 2,
377 | "version_minor": 0
378 | },
379 | "text/plain": [
380 | "Map: 0%| | 0/5000 [00:00, ? examples/s]"
381 | ]
382 | },
383 | "metadata": {},
384 | "output_type": "display_data"
385 | },
386 | {
387 | "data": {
388 | "application/vnd.jupyter.widget-view+json": {
389 | "model_id": "",
390 | "version_major": 2,
391 | "version_minor": 0
392 | },
393 | "text/plain": [
394 | "Map: 0%| | 0/10000 [00:00, ? examples/s]"
395 | ]
396 | },
397 | "metadata": {},
398 | "output_type": "display_data"
399 | }
400 | ],
401 | "source": [
402 | "imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 12,
408 | "id": "6d4103c3",
409 | "metadata": {},
410 | "outputs": [],
411 | "source": [
412 | "del imdb_dataset"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "id": "bfeb1553",
418 | "metadata": {},
419 | "source": [
420 | "# 3 Using DistilBERT as a Feature Extractor"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 13,
426 | "id": "9f2c474d",
427 | "metadata": {},
428 | "outputs": [
429 | {
430 | "name": "stderr",
431 | "output_type": "stream",
432 | "text": [
433 | "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']\n",
434 | "- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
435 | "- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
436 | ]
437 | }
438 | ],
439 | "source": [
440 | "from transformers import AutoModel\n",
441 | "\n",
442 | "model = AutoModel.from_pretrained(\"distilbert-base-uncased\")\n",
443 | "model.to(device);"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 14,
449 | "id": "c6686adc",
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "imdb_tokenized.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"])"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 15,
459 | "id": "07122e49",
460 | "metadata": {},
461 | "outputs": [
462 | {
463 | "data": {
464 | "text/plain": [
465 | "torch.Size([3, 512, 768])"
466 | ]
467 | },
468 | "execution_count": 15,
469 | "metadata": {},
470 | "output_type": "execute_result"
471 | }
472 | ],
473 | "source": [
474 | "test_batch = {\"attention_mask\": imdb_tokenized[\"train\"][:3][\"attention_mask\"].to(device),\n",
475 | " \"input_ids\": imdb_tokenized[\"train\"][:3][\"input_ids\"].to(device)}\n",
476 | "\n",
477 | "with torch.inference_mode():\n",
478 | " test_output = model(**test_batch)\n",
479 | " \n",
480 | "test_output.last_hidden_state.shape"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": 16,
486 | "id": "083e61f1",
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "data": {
491 | "text/plain": [
492 | "torch.Size([3, 768])"
493 | ]
494 | },
495 | "execution_count": 16,
496 | "metadata": {},
497 | "output_type": "execute_result"
498 | }
499 | ],
500 | "source": [
501 | "cls_token_output = test_output.last_hidden_state[:, 0]\n",
502 | "cls_token_output.shape"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 17,
508 | "id": "316d0450",
509 | "metadata": {},
510 | "outputs": [],
511 | "source": [
512 | "@torch.inference_mode()\n",
513 | "def get_output_embeddings(batch):\n",
514 | " output = model(\n",
515 | " batch[\"input_ids\"].to(device),\n",
516 | " attention_mask=batch[\"attention_mask\"].to(device)).last_hidden_state[:, 0]\n",
517 | " return {\"features\": output.cpu().numpy()}"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 18,
523 | "id": "2629aaa3",
524 | "metadata": {},
525 | "outputs": [
526 | {
527 | "data": {
528 | "application/vnd.jupyter.widget-view+json": {
529 | "model_id": "",
530 | "version_major": 2,
531 | "version_minor": 0
532 | },
533 | "text/plain": [
534 | "Map: 0%| | 0/35000 [00:00, ? examples/s]"
535 | ]
536 | },
537 | "metadata": {},
538 | "output_type": "display_data"
539 | },
540 | {
541 | "data": {
542 | "application/vnd.jupyter.widget-view+json": {
543 | "model_id": "",
544 | "version_major": 2,
545 | "version_minor": 0
546 | },
547 | "text/plain": [
548 | "Map: 0%| | 0/5000 [00:00, ? examples/s]"
549 | ]
550 | },
551 | "metadata": {},
552 | "output_type": "display_data"
553 | },
554 | {
555 | "data": {
556 | "application/vnd.jupyter.widget-view+json": {
557 | "model_id": "",
558 | "version_major": 2,
559 | "version_minor": 0
560 | },
561 | "text/plain": [
562 | "Map: 0%| | 0/10000 [00:00, ? examples/s]"
563 | ]
564 | },
565 | "metadata": {},
566 | "output_type": "display_data"
567 | }
568 | ],
569 | "source": [
570 | "import time\n",
571 | "start = time.time()\n",
572 | "\n",
573 | "imdb_features = imdb_tokenized.map(get_output_embeddings, batched=True, batch_size=10)"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 19,
579 | "id": "0fe91178",
580 | "metadata": {},
581 | "outputs": [],
582 | "source": [
583 | "X_train = np.array(imdb_features[\"train\"][\"features\"])\n",
584 | "y_train = np.array(imdb_features[\"train\"][\"label\"])\n",
585 | "\n",
586 | "X_val = np.array(imdb_features[\"validation\"][\"features\"])\n",
587 | "y_val = np.array(imdb_features[\"validation\"][\"label\"])\n",
588 | "\n",
589 | "X_test = np.array(imdb_features[\"test\"][\"features\"])\n",
590 | "y_test = np.array(imdb_features[\"test\"][\"label\"])"
591 | ]
592 | },
593 | {
594 | "cell_type": "markdown",
595 | "id": "e76e2e95-e9b3-4a54-b778-0bdcef59f098",
596 | "metadata": {},
597 | "source": [
598 | "# 4 Train Model on Embeddings (Extracted Features)"
599 | ]
600 | },
601 | {
602 | "cell_type": "code",
603 | "execution_count": 20,
604 | "id": "81c31cf9-ec66-41a9-aa54-3e5b6ca33cf6",
605 | "metadata": {},
606 | "outputs": [
607 | {
608 | "name": "stdout",
609 | "output_type": "stream",
610 | "text": [
611 | "Training accuracy 0.8866285714285714\n",
612 | "Validation accuracy 0.883\n",
613 | "test accuracy 0.8795\n",
614 | "Time elapsed 3.28 min\n"
615 | ]
616 | }
617 | ],
618 | "source": [
619 | "from sklearn.linear_model import LogisticRegression\n",
620 | "\n",
621 | "clf = LogisticRegression(max_iter=1000)\n",
622 | "clf.fit(X_train, y_train)\n",
623 | "\n",
624 | "print(\"Training accuracy\", clf.score(X_train, y_train))\n",
625 | "print(\"Validation accuracy\", clf.score(X_val, y_val))\n",
626 | "print(\"test accuracy\", clf.score(X_test, y_test))\n",
627 | "\n",
628 | "end = time.time()\n",
629 | "elapsed = end - start\n",
630 | "print(f\"Time elapsed {elapsed/60:.2f} min\")"
631 | ]
632 | },
633 | {
634 | "cell_type": "code",
635 | "execution_count": 21,
636 | "id": "201a4329-7a91-4501-9c75-4d18f4646fa5",
637 | "metadata": {},
638 | "outputs": [
639 | {
640 | "name": "stdout",
641 | "output_type": "stream",
642 | "text": [
643 | "Training accuracy 1.0\n",
644 | "Validation accuracy 0.8408\n",
645 | "test accuracy 0.8324\n"
646 | ]
647 | }
648 | ],
649 | "source": [
650 | "from sklearn.ensemble import RandomForestClassifier\n",
651 | "\n",
652 | "clf = RandomForestClassifier()\n",
653 | "clf.fit(X_train, y_train)\n",
654 | "\n",
655 | "print(\"Training accuracy\", clf.score(X_train, y_train))\n",
656 | "print(\"Validation accuracy\", clf.score(X_val, y_val))\n",
657 | "print(\"test accuracy\", clf.score(X_test, y_test))"
658 | ]
659 | }
660 | ],
661 | "metadata": {
662 | "kernelspec": {
663 | "display_name": "Python 3 (ipykernel)",
664 | "language": "python",
665 | "name": "python3"
666 | },
667 | "language_info": {
668 | "codemirror_mode": {
669 | "name": "ipython",
670 | "version": 3
671 | },
672 | "file_extension": ".py",
673 | "mimetype": "text/x-python",
674 | "name": "python",
675 | "nbconvert_exporter": "python",
676 | "pygments_lexer": "ipython3",
677 | "version": "3.10.6"
678 | }
679 | },
680 | "nbformat": 4,
681 | "nbformat_minor": 5
682 | }
683 |
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/figures/1_feature-based.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/figures/1_feature-based.png
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/figures/2_finetune-last.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/figures/2_finetune-last.png
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/figures/3_finetune-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/figures/3_finetune-all.png
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/README.md:
--------------------------------------------------------------------------------
1 | # Layerwise experiment
2 |
3 | Run
4 |
5 | ```bash
6 | python layerwise-experiment-run.py
7 | ```
8 |
9 | to produce the `layerwise-experiment-results-clean.txt` file. In each section, the first `Test metric` table reports the training accuracy, the second the validation accuracy, and the third the test accuracy. The `results.txt` file is a more readable, annotated version.
10 |
11 | The results are visualized below (plotting code not included).
12 |
13 | ![Layerwise experiment results](layerwise-results.png)
--------------------------------------------------------------------------------
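The accuracy triples in `layerwise-experiment-results-clean.txt` can also be collected programmatically. Below is a minimal sketch, not part of the repository, assuming the box-drawing table format of the results file that follows:

```python
import re

# Collect every accuracy value from the cleaned results file, in order.
accuracies = []
with open("layerwise-experiment-results-clean.txt", encoding="utf-8") as f:
    for line in f:
        match = re.search(r"accuracy\s*│\s*([0-9.]+)", line)
        if match:
            accuracies.append(float(match.group(1)))

# As described in the README, each experiment section prints three tables:
# training, validation, and test accuracy, in that order.
for i in range(0, len(accuracies) - 2, 3):
    train_acc, val_acc, test_acc = accuracies[i:i + 3]
    print(f"run {i // 3}: train={train_acc:.4f}  val={val_acc:.4f}  test={test_acc:.4f}")
```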
/conventional/distilbert-movie-review/layerwise-experiment/layerwise-experiment-results-clean.txt:
--------------------------------------------------------------------------------
1 | Class distribution:
2 | Downloading and preparing dataset csv/default to /home/sebastian/.cache/huggingface/datasets/csv/default-8c97c4f49e71f1f6/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...
3 | Dataset csv downloaded and prepared to /home/sebastian/.cache/huggingface/datasets/csv/default-8c97c4f49e71f1f6/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.
4 | DatasetDict({
5 | train: Dataset({
6 | features: ['index', 'text', 'label'],
7 | num_rows: 35000
8 | })
9 | validation: Dataset({
10 | features: ['index', 'text', 'label'],
11 | num_rows: 5000
12 | })
13 | test: Dataset({
14 | features: ['index', 'text', 'label'],
15 | num_rows: 10000
16 | })
17 | })
18 | Tokenizer input max length: 512
19 | Tokenizer vocabulary size: 30522
20 | Training: 0it [00:00, ?it/s]
21 | Training: 0%| | 0/2917 [00:00, ?it/s]
22 | [A
23 | [A
24 | [A
25 | Time elapsed 6.99 min
26 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
27 | ┃ Test metric ┃ DataLoader 0 ┃
28 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
29 | │ accuracy │ 0.9666571617126465 │
30 | └───────────────────────────┴───────────────────────────┘
31 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
32 | ┃ Test metric ┃ DataLoader 0 ┃
33 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
34 | │ accuracy │ 0.9301999807357788 │
35 | └───────────────────────────┴───────────────────────────┘
36 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
37 | ┃ Test metric ┃ DataLoader 0 ┃
38 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
39 | │ accuracy │ 0.9254999756813049 │
40 | └───────────────────────────┴───────────────────────────┘
41 | 1 -- Last Layer
42 | Training: 0it [00:00, ?it/s]
43 | Training: 0%| | 0/2917 [00:00, ?it/s]
44 | [A
45 | [A
46 | [A
47 | Time elapsed 2.77 min
48 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
49 | ┃ Test metric ┃ DataLoader 0 ┃
50 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
51 | │ accuracy │ 0.7889142632484436 │
52 | └───────────────────────────┴───────────────────────────┘
53 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
54 | ┃ Test metric ┃ DataLoader 0 ┃
55 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
56 | │ accuracy │ 0.7942000031471252 │
57 | └───────────────────────────┴───────────────────────────┘
58 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
59 | ┃ Test metric ┃ DataLoader 0 ┃
60 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
61 | │ accuracy │ 0.7871000170707703 │
62 | └───────────────────────────┴───────────────────────────┘
63 | 2 -- Last 2 Layers
64 | Training: 0it [00:00, ?it/s]
65 | Training: 0%| | 0/2917 [00:00, ?it/s]
66 | [A
67 | [A
68 | [A
69 | Time elapsed 2.78 min
70 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
71 | ┃ Test metric ┃ DataLoader 0 ┃
72 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
73 | │ accuracy │ 0.868228554725647 │
74 | └───────────────────────────┴───────────────────────────┘
75 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
76 | ┃ Test metric ┃ DataLoader 0 ┃
77 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
78 | │ accuracy │ 0.8712000250816345 │
79 | └───────────────────────────┴───────────────────────────┘
80 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
81 | ┃ Test metric ┃ DataLoader 0 ┃
82 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
83 | │ accuracy │ 0.8644999861717224 │
84 | └───────────────────────────┴───────────────────────────┘
85 | 3 -- Last 2 Layers + Last Transformer Block
86 | Training: 0it [00:00, ?it/s]
87 | Training: 0%| | 0/2917 [00:00, ?it/s]
88 | [A
89 | [A
90 | [A
91 | Time elapsed 3.39 min
92 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
93 | ┃ Test metric ┃ DataLoader 0 ┃
94 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
95 | │ accuracy │ 0.9498000144958496 │
96 | └───────────────────────────┴───────────────────────────┘
97 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
98 | ┃ Test metric ┃ DataLoader 0 ┃
99 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
100 | │ accuracy │ 0.9272000193595886 │
101 | └───────────────────────────┴───────────────────────────┘
102 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
103 | ┃ Test metric ┃ DataLoader 0 ┃
104 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
105 | │ accuracy │ 0.921999990940094 │
106 | └───────────────────────────┴───────────────────────────┘
107 | 4 -- Last 2 Layers + Last 2 Transformer Blocks
108 | Training: 0it [00:00, ?it/s]
109 | Training: 0%| | 0/2917 [00:00, ?it/s]
110 | [A
111 | [A
112 | [A
113 | Time elapsed 4.06 min
114 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
115 | ┃ Test metric ┃ DataLoader 0 ┃
116 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
117 | │ accuracy │ 0.9771142601966858 │
118 | └───────────────────────────┴───────────────────────────┘
119 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
120 | ┃ Test metric ┃ DataLoader 0 ┃
121 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
122 | │ accuracy │ 0.9300000071525574 │
123 | └───────────────────────────┴───────────────────────────┘
124 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
125 | ┃ Test metric ┃ DataLoader 0 ┃
126 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
127 | │ accuracy │ 0.9240000247955322 │
128 | └───────────────────────────┴───────────────────────────┘
129 | 5 -- Last 2 Layers + Last 3 Transformer Blocks
130 | Training: 0it [00:00, ?it/s]
131 | Training: 0%| | 0/2917 [00:00, ?it/s]
132 | [A
133 | [A
134 | [A
135 | Time elapsed 4.63 min
136 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
137 | ┃ Test metric ┃ DataLoader 0 ┃
138 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
139 | │ accuracy │ 0.9864857196807861 │
140 | └───────────────────────────┴───────────────────────────┘
141 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
142 | ┃ Test metric ┃ DataLoader 0 ┃
143 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
144 | │ accuracy │ 0.9333999752998352 │
145 | └───────────────────────────┴───────────────────────────┘
146 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
147 | ┃ Test metric ┃ DataLoader 0 ┃
148 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
149 | │ accuracy │ 0.9265000224113464 │
150 | └───────────────────────────┴───────────────────────────┘
151 | 6 -- Last 2 Layers + Last 4 Transformer Blocks
152 | Training: 0it [00:00, ?it/s]
153 | Training: 0%| | 0/2917 [00:00, ?it/s]
154 | [A
155 | [A
156 | [A
157 | Time elapsed 5.15 min
158 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
159 | ┃ Test metric ┃ DataLoader 0 ┃
160 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
161 | │ accuracy │ 0.9763428568840027 │
162 | └───────────────────────────┴───────────────────────────┘
163 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
164 | ┃ Test metric ┃ DataLoader 0 ┃
165 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
166 | │ accuracy │ 0.9279999732971191 │
167 | └───────────────────────────┴───────────────────────────┘
168 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
169 | ┃ Test metric ┃ DataLoader 0 ┃
170 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
171 | │ accuracy │ 0.9262999892234802 │
172 | └───────────────────────────┴───────────────────────────┘
173 | 7 -- Last 2 Layers + Last 5 Transformer Blocks
174 | Training: 0it [00:00, ?it/s]
175 | Training: 0%| | 0/2917 [00:00, ?it/s]
176 | [A
177 | [A
178 | [A
179 | Time elapsed 6.99 min
180 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
181 | ┃ Test metric ┃ DataLoader 0 ┃
182 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
183 | │ accuracy │ 0.9947142601013184 │
184 | └───────────────────────────┴───────────────────────────┘
185 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
186 | ┃ Test metric ┃ DataLoader 0 ┃
187 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
188 | │ accuracy │ 0.9258000254631042 │
189 | └───────────────────────────┴───────────────────────────┘
190 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
191 | ┃ Test metric ┃ DataLoader 0 ┃
192 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
193 | │ accuracy │ 0.9251999855041504 │
194 | └───────────────────────────┴───────────────────────────┘
195 | 8 -- Last 2 Layers + Last 6 Transformer Blocks
196 | Training: 0it [00:00, ?it/s]
197 | Training: 0%| | 0/2917 [00:00, ?it/s]
198 | [A
199 | [A
200 | [A
201 | Time elapsed 7.01 min
202 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
203 | ┃ Test metric ┃ DataLoader 0 ┃
204 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
205 | │ accuracy │ 0.9925428628921509 │
206 | └───────────────────────────┴───────────────────────────┘
207 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
208 | ┃ Test metric ┃ DataLoader 0 ┃
209 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
210 | │ accuracy │ 0.9277999997138977 │
211 | └───────────────────────────┴───────────────────────────┘
212 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
213 | ┃ Test metric ┃ DataLoader 0 ┃
214 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
215 | │ accuracy │ 0.9262999892234802 │
216 | └───────────────────────────┴───────────────────────────┘
217 |
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/layerwise-experiment-run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Run the layerwise finetuning experiment and clean up its logged output.
3 | import subprocess
4 |
5 | # Run the experiment script and capture everything it prints to stdout.
6 | with open("layerwise-experiment-results.txt", "w+") as output:
7 |     subprocess.call(["python", "layerwise-experiment.py"], stdout=output)
8 |
9 | ########
10 |
11 | # Drop progress-bar and sanity-check lines; keep only the result tables
12 | # and section labels.
13 | skip_prefixes = ("Sanity", "Testing", "Validation", "Epoch")
14 |
15 | out = []
16 | with open("layerwise-experiment-results.txt", "r") as f:
17 |     for line in f:
18 |         if not line.startswith(skip_prefixes) and line.strip():
19 |             out.append(line)
20 |
21 | with open("layerwise-experiment-results-clean.txt", "w") as f:
22 |     for line in out:
23 |         f.write(line)
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/layerwise-experiment.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Finetuning All Layers
5 |
6 | #
7 |
8 | # In[ ]:
9 |
10 |
11 | # pip install transformers
12 |
13 |
14 | # In[ ]:
15 |
16 |
17 | # pip install datasets
18 |
19 |
20 | # In[ ]:
21 |
22 |
23 | # pip install lightning
24 |
25 |
26 | # In[ ]:
27 |
28 |
29 | # get_ipython().run_line_magic('load_ext', 'watermark')
30 | # get_ipython().run_line_magic('watermark', '--conda -p torch,transformers,datasets,lightning')
31 |
32 |
33 | # # 1 Loading the dataset into DataFrames
34 |
35 | # In[ ]:
36 |
37 |
38 | # pip install datasets
39 |
40 | import shutil
41 |
42 | from datasets import load_dataset
43 |
44 | import lightning as L
45 | from lightning.pytorch.loggers import CSVLogger
46 | from lightning.pytorch.callbacks import ModelCheckpoint
47 |
48 | import numpy as np
49 | import pandas as pd
50 | import torch
51 |
52 | from sklearn.feature_extraction.text import CountVectorizer
53 |
54 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
55 | from local_dataset_utilities import IMDBDataset
56 |
57 |
58 | # In[ ]:
59 |
60 |
61 | download_dataset()
62 |
63 | df = load_dataset_into_to_dataframe()
64 | partition_dataset(df)
65 |
66 |
67 | # In[ ]:
68 |
69 |
70 | df_train = pd.read_csv("train.csv")
71 | df_val = pd.read_csv("val.csv")
72 | df_test = pd.read_csv("test.csv")
73 |
74 |
75 | # # 2 Tokenization and Numericalization
76 |
77 | # **Load the dataset via `load_dataset`**
78 |
79 | # In[ ]:
80 |
81 |
82 | imdb_dataset = load_dataset(
83 | "csv",
84 | data_files={
85 | "train": "train.csv",
86 | "validation": "val.csv",
87 | "test": "test.csv",
88 | },
89 | )
90 |
91 | print(imdb_dataset)
92 |
93 |
94 | # **Tokenize the dataset**
95 |
96 | # In[ ]:
97 |
98 |
99 | from transformers import AutoTokenizer
100 |
101 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
102 | print("Tokenizer input max length:", tokenizer.model_max_length)
103 | print("Tokenizer vocabulary size:", tokenizer.vocab_size)
104 |
105 |
106 | # In[ ]:
107 |
108 |
109 | def tokenize_text(batch):
110 | return tokenizer(batch["text"], truncation=True, padding=True)
111 |
112 |
113 | # In[ ]:
114 |
115 |
116 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
117 |
118 |
119 | # In[ ]:
120 |
121 |
122 | del imdb_dataset
123 |
124 |
125 | # In[ ]:
126 |
127 |
128 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
129 |
130 |
131 | # In[ ]:
132 |
133 |
134 | import os
135 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
136 |
137 |
138 | # # 3 Set Up DataLoaders
139 |
140 | # In[ ]:
141 |
142 |
143 | from torch.utils.data import DataLoader, Dataset
144 |
145 |
146 | class IMDBDataset(Dataset):
147 | def __init__(self, dataset_dict, partition_key="train"):
148 | self.partition = dataset_dict[partition_key]
149 |
150 | def __getitem__(self, index):
151 | return self.partition[index]
152 |
153 | def __len__(self):
154 | return self.partition.num_rows
155 |
156 |
157 | # In[ ]:
158 |
159 |
160 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
161 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
162 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
163 |
164 | train_loader = DataLoader(
165 | dataset=train_dataset,
166 | batch_size=12,
167 | shuffle=True,
168 | num_workers=4
169 | )
170 |
171 | val_loader = DataLoader(
172 | dataset=val_dataset,
173 | batch_size=12,
174 | num_workers=4
175 | )
176 |
177 | test_loader = DataLoader(
178 | dataset=test_dataset,
179 | batch_size=12,
180 | num_workers=4
181 | )
182 |
183 |
184 | # # 4 Initializing Modules
185 |
186 | # **Wrap in LightningModule for Training**
187 |
188 | # In[ ]:
189 |
190 |
191 | import lightning as L
192 | import torch
193 | import torchmetrics
194 |
195 |
196 | class CustomLightningModule(L.LightningModule):
197 | def __init__(self, model, learning_rate=5e-5):
198 | super().__init__()
199 |
200 | self.learning_rate = learning_rate
201 | self.model = model
202 |
203 | self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
204 | self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
205 |
206 | def forward(self, input_ids, attention_mask, labels):
207 | return self.model(input_ids, attention_mask=attention_mask, labels=labels)
208 |
209 | def training_step(self, batch, batch_idx):
210 | outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
211 | labels=batch["label"])
212 | self.log("train_loss", outputs["loss"])
213 | return outputs["loss"] # this is passed to the optimizer for training
214 |
215 | def validation_step(self, batch, batch_idx):
216 | outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
217 | labels=batch["label"])
218 | self.log("val_loss", outputs["loss"], prog_bar=True)
219 |
220 | logits = outputs["logits"]
221 | predicted_labels = torch.argmax(logits, 1)
222 | self.val_acc(predicted_labels, batch["label"])
223 | self.log("val_acc", self.val_acc, prog_bar=True)
224 |
225 | def test_step(self, batch, batch_idx):
226 | outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
227 | labels=batch["label"])
228 |
229 | logits = outputs["logits"]
230 | predicted_labels = torch.argmax(logits, 1)
231 | self.test_acc(predicted_labels, batch["label"])
232 | self.log("accuracy", self.test_acc, prog_bar=True)
233 |
234 | def configure_optimizers(self):
235 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
236 | return optimizer
237 |
238 |
239 | # In[ ]:
240 |
241 |
242 | from lightning.pytorch.callbacks import ModelCheckpoint
243 | from lightning.pytorch.loggers import CSVLogger
244 |
245 |
246 | callbacks = [
247 | ModelCheckpoint(
248 | save_top_k=1, mode="max", monitor="val_acc"
249 | ) # save top 1 model
250 | ]
251 | logger = CSVLogger(save_dir="logs/", name="my-model")
252 |
253 |
254 | # # 5 Finetuning
255 |
256 | # ## All layers
257 |
258 | # In[ ]:
259 |
260 |
261 | from transformers import AutoModelForSequenceClassification
262 |
263 | model = AutoModelForSequenceClassification.from_pretrained(
264 | "distilbert-base-uncased", num_labels=2)
265 |
266 | lightning_model = CustomLightningModule(model)
267 |
268 |
269 | # In[ ]:
270 |
271 |
272 | trainer = L.Trainer(
273 | max_epochs=3,
274 | callbacks=callbacks,
275 | accelerator="gpu",
276 | precision="16-mixed",
277 | devices=1,
278 | logger=logger,
279 | log_every_n_steps=100,
280 | )
281 |
282 |
283 | # In[ ]:
284 |
285 |
286 | import time
287 | start = time.time()
288 |
289 | trainer.fit(model=lightning_model,
290 | train_dataloaders=train_loader,
291 | val_dataloaders=val_loader)
292 |
293 | end = time.time()
294 | elapsed = end - start
295 | print(f"Time elapsed {elapsed/60:.2f} min")
296 |
297 |
298 | # In[ ]:
299 |
300 |
301 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
302 |
303 |
304 | # In[ ]:
305 |
306 |
307 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
308 |
309 |
310 | # In[ ]:
311 |
312 |
313 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
314 | shutil.rmtree("logs")
315 | logger = CSVLogger(save_dir="logs/", name="my-model")
316 |
317 |
318 | # ## 1 -- Last Layer
319 |
320 | # In[ ]:
321 |
322 | print("1 -- Last Layer")
323 |
324 | model = AutoModelForSequenceClassification.from_pretrained(
325 | "distilbert-base-uncased", num_labels=2)
326 |
327 | lightning_model = CustomLightningModule(model)
328 |
329 |
330 | # In[ ]:
331 |
332 |
333 | for param in model.parameters():
334 | param.requires_grad = False
335 |
336 | for param in model.classifier.parameters():
337 | param.requires_grad = True
338 |
339 |
340 | # In[ ]:
341 |
342 |
343 | trainer = L.Trainer(
344 | max_epochs=3,
345 | callbacks=callbacks,
346 | accelerator="gpu",
347 | precision="16-mixed",
348 | devices=1,
349 | logger=logger,
350 | log_every_n_steps=100,
351 | )
352 |
353 |
354 | # In[ ]:
355 |
356 |
357 | start = time.time()
358 |
359 | trainer.fit(model=lightning_model,
360 | train_dataloaders=train_loader,
361 | val_dataloaders=val_loader)
362 |
363 | end = time.time()
364 | elapsed = end - start
365 | print(f"Time elapsed {elapsed/60:.2f} min")
366 |
367 |
368 | # In[ ]:
369 |
370 |
371 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
372 |
373 |
374 | # In[ ]:
375 |
376 |
377 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
378 |
379 |
380 | # In[ ]:
381 |
382 |
383 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
384 | shutil.rmtree("logs")
385 | logger = CSVLogger(save_dir="logs/", name="my-model")
386 |
387 |
388 | # ## 2 -- Last 2 Layers
389 |
390 | # In[ ]:
391 |
392 | print("2 -- Last 2 Layers")
393 |
394 | model = AutoModelForSequenceClassification.from_pretrained(
395 | "distilbert-base-uncased", num_labels=2)
396 |
397 | lightning_model = CustomLightningModule(model)
398 |
399 |
400 | # In[ ]:
401 |
402 |
403 | for param in model.parameters():
404 | param.requires_grad = False
405 |
406 | for param in model.pre_classifier.parameters():
407 | param.requires_grad = True
408 |
409 | for param in model.classifier.parameters():
410 | param.requires_grad = True
411 |
412 |
413 | # In[ ]:
414 |
415 |
416 | trainer = L.Trainer(
417 | max_epochs=3,
418 | callbacks=callbacks,
419 | accelerator="gpu",
420 | precision="16-mixed",
421 | devices=1,
422 | logger=logger,
423 | log_every_n_steps=100,
424 | )
425 |
426 |
427 | # In[ ]:
428 |
429 |
430 | start = time.time()
431 |
432 | trainer.fit(model=lightning_model,
433 | train_dataloaders=train_loader,
434 | val_dataloaders=val_loader)
435 |
436 | end = time.time()
437 | elapsed = end - start
438 | print(f"Time elapsed {elapsed/60:.2f} min")
439 |
440 |
441 | # In[ ]:
442 |
443 |
444 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
445 |
446 |
447 | # In[ ]:
448 |
449 |
450 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
451 |
452 |
453 | # In[ ]:
454 |
455 |
456 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
457 | shutil.rmtree("logs")
458 | logger = CSVLogger(save_dir="logs/", name="my-model")
459 |
460 |
461 | # ## 3 -- Last 2 Layers + Last Transformer Block
462 |
463 | # In[ ]:
464 |
465 | print("3 -- Last 2 Layers + Last Transformer Block")
466 |
467 |
468 |
469 | model = AutoModelForSequenceClassification.from_pretrained(
470 | "distilbert-base-uncased", num_labels=2)
471 |
472 | lightning_model = CustomLightningModule(model)
473 |
474 |
475 | # In[ ]:
476 |
477 |
478 | for param in model.parameters():
479 | param.requires_grad = False
480 |
481 | for param in model.pre_classifier.parameters():
482 | param.requires_grad = True
483 |
484 | for param in model.classifier.parameters():
485 | param.requires_grad = True
486 |
487 | for param in model.distilbert.transformer.layer[5].parameters():
488 | param.requires_grad = True
489 |
490 |
491 | # In[ ]:
492 |
493 |
494 | trainer = L.Trainer(
495 | max_epochs=3,
496 | callbacks=callbacks,
497 | accelerator="gpu",
498 | precision="16-mixed",
499 | devices=1,
500 | logger=logger,
501 | log_every_n_steps=100,
502 | )
503 |
504 |
505 | # In[ ]:
506 |
507 |
508 | start = time.time()
509 |
510 | trainer.fit(model=lightning_model,
511 | train_dataloaders=train_loader,
512 | val_dataloaders=val_loader)
513 |
514 | end = time.time()
515 | elapsed = end - start
516 | print(f"Time elapsed {elapsed/60:.2f} min")
517 |
518 |
519 | # In[ ]:
520 |
521 |
522 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
523 |
524 |
525 | # In[ ]:
526 |
527 |
528 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
529 |
530 |
531 | # In[ ]:
532 |
533 |
534 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
535 | shutil.rmtree("logs")
536 | logger = CSVLogger(save_dir="logs/", name="my-model")
537 |
538 |
539 | # ## 4 -- Last 2 Layers + Last 2 Transformer Blocks
540 |
541 | # In[ ]:
542 |
543 | print("4 -- Last 2 Layers + Last 2 Transformer Blocks")
544 |
545 | model = AutoModelForSequenceClassification.from_pretrained(
546 | "distilbert-base-uncased", num_labels=2)
547 |
548 | lightning_model = CustomLightningModule(model)
549 |
550 |
551 | # In[ ]:
552 |
553 |
554 | for param in model.parameters():
555 | param.requires_grad = False
556 |
557 | for param in model.pre_classifier.parameters():
558 | param.requires_grad = True
559 |
560 | for param in model.classifier.parameters():
561 | param.requires_grad = True
562 |
563 | for param in model.distilbert.transformer.layer[5].parameters():
564 | param.requires_grad = True
565 |
566 | for param in model.distilbert.transformer.layer[4].parameters():
567 | param.requires_grad = True
568 |
569 |
570 | # In[ ]:
571 |
572 |
573 | trainer = L.Trainer(
574 | max_epochs=3,
575 | callbacks=callbacks,
576 | accelerator="gpu",
577 | precision="16-mixed",
578 | devices=1,
579 | logger=logger,
580 | log_every_n_steps=100,
581 | )
582 |
583 |
584 | # In[ ]:
585 |
586 |
587 | start = time.time()
588 |
589 | trainer.fit(model=lightning_model,
590 | train_dataloaders=train_loader,
591 | val_dataloaders=val_loader)
592 |
593 | end = time.time()
594 | elapsed = end - start
595 | print(f"Time elapsed {elapsed/60:.2f} min")
596 |
597 |
598 | # In[ ]:
599 |
600 |
601 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
602 |
603 |
604 | # In[ ]:
605 |
606 |
607 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
608 |
609 |
610 | # In[ ]:
611 |
612 |
613 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
614 | shutil.rmtree("logs")
615 | logger = CSVLogger(save_dir="logs/", name="my-model")
616 |
617 |
618 | # ## 5 -- Last 2 Layers + Last 3 Transformer Blocks
619 |
620 | # In[ ]:
621 |
622 | print("5 -- Last 2 Layers + Last 3 Transformer Blocks")
623 |
624 | model = AutoModelForSequenceClassification.from_pretrained(
625 | "distilbert-base-uncased", num_labels=2)
626 |
627 | lightning_model = CustomLightningModule(model)
628 |
629 |
630 | # In[ ]:
631 |
632 |
633 | for param in model.parameters():
634 | param.requires_grad = False
635 |
636 | for param in model.pre_classifier.parameters():
637 | param.requires_grad = True
638 |
639 | for param in model.classifier.parameters():
640 | param.requires_grad = True
641 |
642 | for param in model.distilbert.transformer.layer[5].parameters():
643 | param.requires_grad = True
644 |
645 | for param in model.distilbert.transformer.layer[4].parameters():
646 | param.requires_grad = True
647 |
648 | for param in model.distilbert.transformer.layer[3].parameters():
649 | param.requires_grad = True
650 |
651 |
652 | # In[ ]:
653 |
654 |
655 | trainer = L.Trainer(
656 | max_epochs=3,
657 | callbacks=callbacks,
658 | accelerator="gpu",
659 | precision="16-mixed",
660 | devices=1,
661 | logger=logger,
662 | log_every_n_steps=100,
663 | )
664 |
665 |
666 | # In[ ]:
667 |
668 |
669 | start = time.time()
670 |
671 | trainer.fit(model=lightning_model,
672 | train_dataloaders=train_loader,
673 | val_dataloaders=val_loader)
674 |
675 | end = time.time()
676 | elapsed = end - start
677 | print(f"Time elapsed {elapsed/60:.2f} min")
678 |
679 |
680 | # In[ ]:
681 |
682 |
683 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
684 |
685 |
686 | # In[ ]:
687 |
688 |
689 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
690 |
691 |
692 | # In[ ]:
693 |
694 |
695 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
696 | shutil.rmtree("logs")
697 | logger = CSVLogger(save_dir="logs/", name="my-model")
698 |
699 |
700 | # ## 6 -- Last 2 Layers + Last 4 Transformer Blocks
701 |
702 | # In[ ]:
703 |
704 | print("6 -- Last 2 Layers + Last 4 Transformer Blocks")
705 |
706 | model = AutoModelForSequenceClassification.from_pretrained(
707 |     "distilbert-base-uncased", num_labels=2)
708 |
709 | lightning_model = CustomLightningModule(model)
710 |
711 |
712 | # In[ ]:
713 |
714 |
705 | for param in model.parameters():
706 | param.requires_grad = False
707 |
708 | for param in model.pre_classifier.parameters():
709 | param.requires_grad = True
710 |
711 | for param in model.classifier.parameters():
712 | param.requires_grad = True
713 |
714 | for param in model.distilbert.transformer.layer[5].parameters():
715 | param.requires_grad = True
716 |
717 | for param in model.distilbert.transformer.layer[4].parameters():
718 | param.requires_grad = True
719 |
720 | for param in model.distilbert.transformer.layer[3].parameters():
721 | param.requires_grad = True
722 |
723 | for param in model.distilbert.transformer.layer[2].parameters():
724 | param.requires_grad = True
725 |
726 |
727 | # In[ ]:
728 |
729 |
730 | trainer = L.Trainer(
731 | max_epochs=3,
732 | callbacks=callbacks,
733 | accelerator="gpu",
734 | precision="16-mixed",
735 | devices=1,
736 | logger=logger,
737 | log_every_n_steps=100,
738 | )
739 |
740 |
741 | # In[ ]:
742 |
743 |
744 | start = time.time()
745 |
746 | trainer.fit(model=lightning_model,
747 | train_dataloaders=train_loader,
748 | val_dataloaders=val_loader)
749 |
750 | end = time.time()
751 | elapsed = end - start
752 | print(f"Time elapsed {elapsed/60:.2f} min")
753 |
754 |
755 | # In[ ]:
756 |
757 |
758 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
759 |
760 |
761 | # In[ ]:
762 |
763 |
764 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
765 |
766 |
767 | # In[ ]:
768 |
769 |
770 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
771 | shutil.rmtree("logs")
772 | logger = CSVLogger(save_dir="logs/", name="my-model")
773 |
774 |
775 | # ## 7 -- Last 2 Layers + Last 5 Transformer Blocks
776 |
777 | # In[ ]:
778 |
779 | print("7 -- Last 2 Layers + Last 5 Transformer Blocks")
780 |
781 | model = AutoModelForSequenceClassification.from_pretrained(
782 | "distilbert-base-uncased", num_labels=2)
783 |
784 | lightning_model = CustomLightningModule(model)
785 |
786 |
787 | # In[ ]:
788 |
789 |
790 | for param in model.parameters():
791 |     param.requires_grad = False
792 |
793 | for param in model.pre_classifier.parameters():
794 |     param.requires_grad = True
795 |
796 | for param in model.classifier.parameters():
797 |     param.requires_grad = True
798 |
799 | for param in model.distilbert.transformer.layer[5].parameters():
791 | param.requires_grad = True
792 |
793 | for param in model.distilbert.transformer.layer[4].parameters():
794 | param.requires_grad = True
795 |
796 | for param in model.distilbert.transformer.layer[3].parameters():
797 | param.requires_grad = True
798 |
799 | for param in model.distilbert.transformer.layer[2].parameters():
800 | param.requires_grad = True
801 |
802 | for param in model.distilbert.transformer.layer[1].parameters():
803 | param.requires_grad = True
804 |
805 |
806 | # In[ ]:
807 |
808 |
809 | trainer = L.Trainer(
810 | max_epochs=3,
811 | callbacks=callbacks,
812 | accelerator="gpu",
813 | precision="16-mixed",
814 | devices=1,
815 | logger=logger,
816 | log_every_n_steps=100,
817 | )
818 |
819 |
820 | # In[ ]:
821 |
822 |
823 | start = time.time()
824 |
825 | trainer.fit(model=lightning_model,
826 | train_dataloaders=train_loader,
827 | val_dataloaders=val_loader)
828 |
829 | end = time.time()
830 | elapsed = end - start
831 | print(f"Time elapsed {elapsed/60:.2f} min")
832 |
833 |
834 | # In[ ]:
835 |
836 |
837 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
838 |
839 |
840 | # In[ ]:
841 |
842 |
843 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
844 |
845 |
846 | # In[ ]:
847 |
848 |
849 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
850 | shutil.rmtree("logs")
851 | logger = CSVLogger(save_dir="logs/", name="my-model")
852 |
853 |
854 | # ## 8 -- Last 2 Layers + Last 6 Transformer Blocks
855 |
856 | # In[ ]:
857 |
858 | print("8 -- Last 2 Layers + Last 6 Transformer Blocks")
859 |
860 | model = AutoModelForSequenceClassification.from_pretrained(
861 | "distilbert-base-uncased", num_labels=2)
862 |
863 | lightning_model = CustomLightningModule(model)
864 |
865 |
866 | # In[ ]:
867 |
868 |
869 | for param in model.parameters():
870 |     param.requires_grad = False
871 |
872 | for param in model.pre_classifier.parameters():
873 |     param.requires_grad = True
874 |
875 | for param in model.classifier.parameters():
876 |     param.requires_grad = True
877 |
878 | for param in model.distilbert.transformer.layer[5].parameters():
870 | param.requires_grad = True
871 |
872 | for param in model.distilbert.transformer.layer[4].parameters():
873 | param.requires_grad = True
874 |
875 | for param in model.distilbert.transformer.layer[3].parameters():
876 | param.requires_grad = True
877 |
878 | for param in model.distilbert.transformer.layer[2].parameters():
879 | param.requires_grad = True
880 |
881 | for param in model.distilbert.transformer.layer[1].parameters():
882 | param.requires_grad = True
883 |
884 | for param in model.distilbert.transformer.layer[0].parameters():
885 | param.requires_grad = True
886 |
887 |
888 | # In[ ]:
889 |
890 |
891 | trainer = L.Trainer(
892 | max_epochs=3,
893 | callbacks=callbacks,
894 | accelerator="gpu",
895 | precision="16-mixed",
896 | devices=1,
897 | logger=logger,
898 | log_every_n_steps=100,
899 | )
900 |
901 |
902 | # In[ ]:
903 |
904 |
905 | start = time.time()
906 |
907 | trainer.fit(model=lightning_model,
908 | train_dataloaders=train_loader,
909 | val_dataloaders=val_loader)
910 |
911 | end = time.time()
912 | elapsed = end - start
913 | print(f"Time elapsed {elapsed/60:.2f} min")
914 |
915 |
916 | # In[ ]:
917 |
918 |
919 | trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")
920 |
921 |
922 | # In[ ]:
923 |
924 |
925 | trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")
926 |
927 |
928 | # In[ ]:
929 |
930 |
931 | trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
932 | shutil.rmtree("logs")
933 | logger = CSVLogger(save_dir="logs/", name="my-model")
934 |
935 |
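936 | # Side note: the repetitive per-block unfreezing above could also be expressed with a
937 | # small helper. The function below is only a sketch (it is not used by the experiments
938 | # in this script) and assumes k >= 1:
939 |
940 | def unfreeze_last_k_blocks(model, k):
941 |     # Freeze everything, then unfreeze pre_classifier, classifier,
942 |     # and the last k transformer blocks.
943 |     for param in model.parameters():
944 |         param.requires_grad = False
945 |     unfrozen = [model.pre_classifier, model.classifier, *model.distilbert.transformer.layer[-k:]]
946 |     for module in unfrozen:
947 |         for param in module.parameters():
948 |             param.requires_grad = True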
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/layerwise-results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/layerwise-experiment/layerwise-results.png
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/local_dataset_utilities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import tarfile
4 | import time
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from packaging import version
9 | from torch.utils.data import Dataset
10 | from tqdm import tqdm
11 | import urllib
12 |
13 |
14 | def reporthook(count, block_size, total_size):
15 | global start_time
16 | if count == 0:
17 | start_time = time.time()
18 | return
19 | duration = time.time() - start_time
20 | progress_size = int(count * block_size)
21 | speed = progress_size / (1024.0**2 * duration)
22 | percent = count * block_size * 100.0 / total_size
23 |
24 | sys.stdout.write(
25 | f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
26 | f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
27 | )
28 | sys.stdout.flush()
29 |
30 |
31 | def download_dataset():
32 | source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
33 | target = "aclImdb_v1.tar.gz"
34 |
35 | if os.path.exists(target):
36 | os.remove(target)
37 |
38 | if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
39 | urllib.request.urlretrieve(source, target, reporthook)
40 |
41 | if not os.path.isdir("aclImdb"):
42 |
43 | with tarfile.open(target, "r:gz") as tar:
44 | tar.extractall()
45 |
46 |
47 | def load_dataset_into_to_dataframe():
48 | basepath = "aclImdb"
49 |
50 | labels = {"pos": 1, "neg": 0}
51 |
52 | df = pd.DataFrame()
53 |
54 | with tqdm(total=50000) as pbar:
55 | for s in ("test", "train"):
56 | for l in ("pos", "neg"):
57 | path = os.path.join(basepath, s, l)
58 | for file in sorted(os.listdir(path)):
59 | with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
60 | txt = infile.read()
61 |
62 | if version.parse(pd.__version__) >= version.parse("1.3.2"):
63 | x = pd.DataFrame(
64 | [[txt, labels[l]]], columns=["review", "sentiment"]
65 | )
66 |                         df = pd.concat([df, x], ignore_index=True)
67 |
68 | else:
69 | df = df.append([[txt, labels[l]]], ignore_index=True)
70 | pbar.update()
71 | df.columns = ["text", "label"]
72 |
73 | np.random.seed(0)
74 | df = df.reindex(np.random.permutation(df.index))
75 |
76 | print("Class distribution:")
77 |     print(np.bincount(df["label"].values))
78 |
79 | return df
80 |
81 |
82 | def partition_dataset(df):
83 | df_shuffled = df.sample(frac=1, random_state=1).reset_index()
84 |
85 | df_train = df_shuffled.iloc[:35_000]
86 | df_val = df_shuffled.iloc[35_000:40_000]
87 | df_test = df_shuffled.iloc[40_000:]
88 |
89 | df_train.to_csv("train.csv", index=False, encoding="utf-8")
90 | df_val.to_csv("val.csv", index=False, encoding="utf-8")
91 | df_test.to_csv("test.csv", index=False, encoding="utf-8")
92 |
93 |
94 | class IMDBDataset(Dataset):
95 | def __init__(self, dataset_dict, partition_key="train"):
96 | self.partition = dataset_dict[partition_key]
97 |
98 | def __getitem__(self, index):
99 | return self.partition[index]
100 |
101 | def __len__(self):
102 | return self.partition.num_rows
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/layerwise-experiment/results.txt:
--------------------------------------------------------------------------------
1 | ## All layers
2 |
3 | Time elapsed 6.99 min
4 |
5 | Train: 0.9666571617126465
6 | Val: 0.9301999807357788
7 | Test: 0.9254999756813049
8 |
9 | ## 1 -- Last Layer
10 |
11 | 2.77 min
12 |
13 | Train: 0.7889142632484436
14 | Val: 0.7942000031471252
15 | Test: 0.7871000170707703
16 |
17 | ## 2 -- Last 2 Layers
18 |
19 | 2.78 min
20 |
21 | Train: 0.868228554725647
22 | Val: 0.8712000250816345
23 | Test: 0.8644999861717224
24 |
25 |
26 |
27 | ## 3 -- Last 2 Layers + Last Transformer Block
28 |
29 | 3.39 min
30 |
31 | Train: 0.9498000144958496
32 | Val: 0.9272000193595886
33 | Test: 0.921999990940094
34 |
35 |
36 | ## 4 -- Last 2 Layers + Last 2 Transformer Blocks
37 |
38 | 4.06 min
39 |
40 | Train: 0.9771142601966858
41 | Val: 0.9300000071525574
42 | Test: 0.9240000247955322
43 |
44 | ## 5 -- Last 2 Layers + Last 3 Transformer Blocks
45 |
46 | 4.63 min
47 |
48 | Train: 0.9864857196807861
49 | Val: 0.9333999752998352
50 | Test: 0.9265000224113464
51 |
52 | ## 6 -- Last 2 Layers + Last 4 Transformer Blocks
53 |
54 | 5.15 min
55 |
56 | Train: 0.9763428568840027
57 | Val: 0.9279999732971191
58 | Test: 0.9262999892234802
59 |
60 | ## 7 -- Last 2 Layers + Last 5 Transformer Blocks
61 |
62 | 6.99 min
63 |
64 | Train: 0.9947142601013184
65 | Val: 0.9258000254631042
66 | Test: 0.9251999855041504
67 |
68 | ## 8 -- Last 2 Layers + Last 6 Transformer Blocks
69 |
70 | Train: 0.9925428628921509
71 | Val: 0.9277999997138977
72 | Test: 0.9262999892234802
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/local_dataset_utilities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import tarfile
4 | import time
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from packaging import version
9 | from torch.utils.data import Dataset
10 | from tqdm import tqdm
11 | import urllib
12 |
13 |
14 | def reporthook(count, block_size, total_size):
15 | global start_time
16 | if count == 0:
17 | start_time = time.time()
18 | return
19 | duration = time.time() - start_time
20 | progress_size = int(count * block_size)
21 | speed = progress_size / (1024.0**2 * duration)
22 | percent = count * block_size * 100.0 / total_size
23 |
24 | sys.stdout.write(
25 | f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
26 | f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
27 | )
28 | sys.stdout.flush()
29 |
30 |
31 | def download_dataset():
32 | source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
33 | target = "aclImdb_v1.tar.gz"
34 |
35 | if os.path.exists(target):
36 | os.remove(target)
37 |
38 | if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
39 | urllib.request.urlretrieve(source, target, reporthook)
40 |
41 | if not os.path.isdir("aclImdb"):
42 |
43 | with tarfile.open(target, "r:gz") as tar:
44 | tar.extractall()
45 |
46 |
47 | def load_dataset_into_to_dataframe():
48 | basepath = "aclImdb"
49 |
50 | labels = {"pos": 1, "neg": 0}
51 |
52 | df = pd.DataFrame()
53 |
54 | with tqdm(total=50000) as pbar:
55 | for s in ("test", "train"):
56 | for l in ("pos", "neg"):
57 | path = os.path.join(basepath, s, l)
58 | for file in sorted(os.listdir(path)):
59 | with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
60 | txt = infile.read()
61 |
62 | if version.parse(pd.__version__) >= version.parse("1.3.2"):
63 | x = pd.DataFrame(
64 | [[txt, labels[l]]], columns=["review", "sentiment"]
65 | )
66 |                         df = pd.concat([df, x], ignore_index=True)
67 |
68 | else:
69 | df = df.append([[txt, labels[l]]], ignore_index=True)
70 | pbar.update()
71 | df.columns = ["text", "label"]
72 |
73 | np.random.seed(0)
74 | df = df.reindex(np.random.permutation(df.index))
75 |
76 | print("Class distribution:")
77 |     print(np.bincount(df["label"].values))
78 |
79 | return df
80 |
81 |
82 | def partition_dataset(df):
83 | df_shuffled = df.sample(frac=1, random_state=1).reset_index()
84 |
85 | df_train = df_shuffled.iloc[:35_000]
86 | df_val = df_shuffled.iloc[35_000:40_000]
87 | df_test = df_shuffled.iloc[40_000:]
88 |
89 | df_train.to_csv("train.csv", index=False, encoding="utf-8")
90 | df_val.to_csv("val.csv", index=False, encoding="utf-8")
91 | df_test.to_csv("test.csv", index=False, encoding="utf-8")
92 |
93 |
94 | class IMDBDataset(Dataset):
95 | def __init__(self, dataset_dict, partition_key="train"):
96 | self.partition = dataset_dict[partition_key]
97 |
98 | def __getitem__(self, index):
99 | return self.partition[index]
100 |
101 | def __len__(self):
102 | return self.partition.num_rows
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/README.md:
--------------------------------------------------------------------------------
1 | All results from training on a single A100 GPU.
2 |
3 |
4 |
5 | # Summary
6 |
7 |
8 |
9 | 
10 |
11 | 
12 |
13 | 
14 |
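15 | The time and memory numbers summarized above are taken directly from each script's
16 | output: the training call is wrapped in a `time.time()` timer and peak GPU memory is
17 | reported at the end, roughly as follows (a sketch of the pattern shared by the scripts
18 | in this folder):
19 |
20 | ```python
21 | import time
22 | import torch
23 |
24 | start = time.time()
25 | # train(...)  # finetuning loop, as defined in the scripts
26 | elapsed = time.time() - start
27 | print(f"Time elapsed {elapsed/60:.2f} min")
28 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
29 | ```
30 |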
15 | ---
16 |
17 | # Raw results
18 |
19 |
20 |
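21 | The variants below differ mainly in two settings at the top of each script: the
22 | float32 matmul precision and the precision passed to Fabric. A minimal sketch of
23 | the pattern, adapted from the scripts in this folder (e.g. `bfloat16-mixed-high.py`);
24 | everything else (data, model, optimizer, training loop) is essentially unchanged:
25 |
26 | ```python
27 | import torch
28 | from lightning import Fabric
29 |
30 | torch.set_float32_matmul_precision("high")  # "highest" (default), "high", or "medium"
31 | fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-mixed")  # or "16-mixed", "bf16-true", ...
32 | fabric.launch()
33 | ```
34 |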
21 | # torch.set_float32_matmul_precision("highest")
22 |
23 |
24 | ## Float16-mixed
25 |
26 | ```bash
27 | python float16-mixed.py
28 | ```
29 |
30 | ```
31 | Python implementation: CPython
32 | Python version : 3.9.16
33 |
34 | torch : 2.0.0
35 | lightning : 2.0.2
36 | transformers: 4.28.1
37 | Torch CUDA available? True
38 | ...
39 | Epoch: 0002/0003 | Train acc.: 95.19% | Val acc.: 92.55%
40 | Epoch: 0003/0003 | Batch 0000/2916 | Loss: 0.0083
41 | Epoch: 0003/0003 | Batch 0300/2916 | Loss: 0.1804
42 | Epoch: 0003/0003 | Batch 0600/2916 | Loss: 0.0056
43 | Epoch: 0003/0003 | Batch 0900/2916 | Loss: 0.0197
44 | Epoch: 0003/0003 | Batch 1200/2916 | Loss: 0.0146
45 | Epoch: 0003/0003 | Batch 1500/2916 | Loss: 0.0085
46 | Epoch: 0003/0003 | Batch 1800/2916 | Loss: 0.0166
47 | Epoch: 0003/0003 | Batch 2100/2916 | Loss: 0.0034
48 | Epoch: 0003/0003 | Batch 2400/2916 | Loss: 0.0271
49 | Epoch: 0003/0003 | Batch 2700/2916 | Loss: 0.0537
50 | Epoch: 0003/0003 | Train acc.: 97.39% | Val acc.: 92.21%
51 | Time elapsed 7.25 min
52 | Memory used: 4.31 GB
53 | Test accuracy 92.15%
54 | ```
55 |
56 | ## Bfloat16-mixed
57 |
58 | ```bash
59 | python bfloat16-mixed.py
60 | ```
61 |
62 | ```
63 | Python implementation: CPython
64 | Python version : 3.9.16
65 |
66 | torch : 2.0.0
67 | lightning : 2.0.2
68 | transformers: 4.28.1
69 |
70 | Torch CUDA available? True
71 | GPU supports bfloat16: True
72 | ...
73 | Time elapsed 7.45 min
74 | Memory used: 4.46 GB
75 | Test accuracy 92.61%
76 | ```
77 |
78 | ## Float16-regular
79 |
80 | ```
81 | Epoch: 0003/0003 | Batch 2700/2916 | Loss: nan
82 | Epoch: 0003/0003 | Train acc.: 49.86% | Val acc.: 50.80%
83 | Time elapsed 5.23 min
84 | Memory used: 2.87 GB
85 | Test accuracy 50.08%
86 | ```
87 |
88 | ## Bfloat16-regular
89 |
90 | ```
91 | Train acc.: 96.55% | Val acc.: 92.59%
92 | Time elapsed 5.22 min
93 | Memory used: 2.87 GB
94 | Test accuracy 92.69%
95 | ```
96 |
97 | ## Float32-regular
98 |
99 | ```
100 | Epoch: 0003/0003 | Train acc.: 97.28% | Val acc.: 89.88%
101 | Time elapsed 21.75 min
102 | Memory used: 5.37 GB
103 | Test accuracy 89.92%
104 | ```
105 |
106 | ## Float64-regular
107 |
108 | ```
109 | Time elapsed 24.59 min
110 | Memory used: 10.42 GB
111 | Test accuracy 92.14%
112 | ```
113 |
114 | ---
115 |
116 | # torch.set_float32_matmul_precision("high")
117 |
118 | ## Float32-regular
119 |
120 | ```
121 | Epoch: 0003/0003 | Train acc.: 97.41% | Val acc.: 92.75%
122 | Time elapsed 8.11 min
123 | Memory used: 5.37 GB
124 | Test accuracy 92.50%
125 | ```
126 |
127 | ## Float16-mixed
128 |
129 | ```
130 | ...
131 | Time elapsed 7.10 min
132 | Memory used: 4.31 GB
133 | Test accuracy 92.15%
134 | ```
135 |
136 | ## Bfloat16-mixed
137 |
138 |
139 | ```
140 | Time elapsed 7.43 min
141 | Memory used: 4.46 GB
142 | Test accuracy 92.61%
143 | ```
144 |
145 | ---
146 |
147 | # torch.set_float32_matmul_precision("medium")
148 |
149 | ## Float32-regular
150 |
151 | ```
152 | ...
153 | Epoch: 0003/0003 | Train acc.: 97.41% | Val acc.: 92.75%
154 | Time elapsed 8.14 min
155 | Memory used: 5.37 GB
156 | Test accuracy 92.50%
157 | ```
158 |
159 | ## Float16-mixed
160 |
161 | ```
162 | ...
163 | Time elapsed 7.07 min
164 | Memory used: 4.31 GB
165 | Test accuracy 92.15%
166 |
167 | ```
168 |
169 | ## Bfloat16-mixed
170 |
171 | ```
172 | ...
173 | Epoch: 0003/0003 | Train acc.: 97.41% | Val acc.: 92.97%
174 | Time elapsed 7.44 min
175 | Memory used: 4.46 GB
176 | Test accuracy 92.61%
177 | ```
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/bfloat16-mixed-high.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 | torch.set_float32_matmul_precision("high")
80 | torch.manual_seed(123)
81 |
82 | ##########################
83 | ### 1 Loading the Dataset
84 | ##########################
85 | download_dataset()
86 | df = load_dataset_into_to_dataframe()
87 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
88 | partition_dataset(df)
89 |
90 | imdb_dataset = load_dataset(
91 | "csv",
92 | data_files={
93 | "train": "train.csv",
94 | "validation": "val.csv",
95 | "test": "test.csv",
96 | },
97 | )
98 |
99 | #########################################
100 | ### 2 Tokenization and Numericalization
101 | #########################################
102 |
103 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 |
107 | print("Tokenizing ...", flush=True)
108 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 | del imdb_dataset
110 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 |
113 | #########################################
114 | ### 3 Set Up DataLoaders
115 | #########################################
116 |
117 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 |
121 | train_loader = DataLoader(
122 | dataset=train_dataset,
123 | batch_size=12,
124 | shuffle=True,
125 | num_workers=4,
126 | drop_last=True,
127 | )
128 |
129 | val_loader = DataLoader(
130 | dataset=val_dataset,
131 | batch_size=12,
132 | num_workers=4,
133 | drop_last=True,
134 | )
135 |
136 | test_loader = DataLoader(
137 | dataset=test_dataset,
138 | batch_size=12,
139 | num_workers=2,
140 | drop_last=True,
141 | )
142 |
143 |
144 | #########################################
145 | ### 4 Initializing the Model
146 | #########################################
147 |
148 | fabric = Fabric(accelerator="cuda", devices=[4], precision="bf16-mixed")
149 | fabric.launch()
150 |
151 | model = AutoModelForSequenceClassification.from_pretrained(
152 | "distilbert-base-uncased", num_labels=2)
153 |
154 | # model.to(device)
155 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 |
157 | model, optimizer = fabric.setup(model, optimizer)
158 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 | fabric.barrier()
160 |
161 | #########################################
162 | ### 5 Finetuning
163 | #########################################
164 |
165 | start = time.time()
166 | train(
167 | num_epochs=3,
168 | model=model,
169 | optimizer=optimizer,
170 | train_loader=train_loader,
171 | val_loader=val_loader,
172 | fabric=fabric
173 | )
174 |
175 | end = time.time()
176 | elapsed = end-start
177 | print(f"Time elapsed {elapsed/60:.2f} min")
178 |
179 | with torch.no_grad():
180 | model.eval()
181 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 | for batch in test_loader:
183 |
184 | #for s in ["input_ids", "attention_mask", "label"]:
185 | # batch[s] = batch[s].to(device)
186 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 | predicted_labels = torch.argmax(outputs["logits"], 1)
188 | test_acc.update(predicted_labels, batch["label"])
189 |
190 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/bfloat16-mixed-medium.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 | torch.set_float32_matmul_precision("medium")
80 | torch.manual_seed(123)
81 |
82 | ##########################
83 | ### 1 Loading the Dataset
84 | ##########################
85 | download_dataset()
86 | df = load_dataset_into_to_dataframe()
87 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
88 | partition_dataset(df)
89 |
90 | imdb_dataset = load_dataset(
91 | "csv",
92 | data_files={
93 | "train": "train.csv",
94 | "validation": "val.csv",
95 | "test": "test.csv",
96 | },
97 | )
98 |
99 | #########################################
100 | ### 2 Tokenization and Numericalization
101 | #########################################
102 |
103 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 |
107 | print("Tokenizing ...", flush=True)
108 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 | del imdb_dataset
110 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 |
113 | #########################################
114 | ### 3 Set Up DataLoaders
115 | #########################################
116 |
117 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 |
121 | train_loader = DataLoader(
122 | dataset=train_dataset,
123 | batch_size=12,
124 | shuffle=True,
125 | num_workers=4,
126 | drop_last=True,
127 | )
128 |
129 | val_loader = DataLoader(
130 | dataset=val_dataset,
131 | batch_size=12,
132 | num_workers=4,
133 | drop_last=True,
134 | )
135 |
136 | test_loader = DataLoader(
137 | dataset=test_dataset,
138 | batch_size=12,
139 | num_workers=2,
140 | drop_last=True,
141 | )
142 |
143 |
144 | #########################################
145 | ### 4 Initializing the Model
146 | #########################################
147 |
148 | fabric = Fabric(accelerator="cuda", devices=[5], precision="bf16-mixed")
149 | fabric.launch()
150 |
151 | model = AutoModelForSequenceClassification.from_pretrained(
152 | "distilbert-base-uncased", num_labels=2)
153 |
154 | # model.to(device)
155 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 |
157 | model, optimizer = fabric.setup(model, optimizer)
158 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 | fabric.barrier()
160 |
161 | #########################################
162 | ### 5 Finetuning
163 | #########################################
164 |
165 | start = time.time()
166 | train(
167 | num_epochs=3,
168 | model=model,
169 | optimizer=optimizer,
170 | train_loader=train_loader,
171 | val_loader=val_loader,
172 | fabric=fabric
173 | )
174 |
175 | end = time.time()
176 | elapsed = end-start
177 | print(f"Time elapsed {elapsed/60:.2f} min")
178 |
179 | with torch.no_grad():
180 | model.eval()
181 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 | for batch in test_loader:
183 |
184 | #for s in ["input_ids", "attention_mask", "label"]:
185 | # batch[s] = batch[s].to(device)
186 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 | predicted_labels = torch.argmax(outputs["logits"], 1)
188 | test_acc.update(predicted_labels, batch["label"])
189 |
190 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/bfloat16-mixed.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 | print("GPU supports bfloat16:", torch.cuda.is_bf16_supported())
79 |
80 | torch.manual_seed(123)
81 |
82 | ##########################
83 | ### 1 Loading the Dataset
84 | ##########################
85 | download_dataset()
86 | df = load_dataset_into_to_dataframe()
87 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
88 | partition_dataset(df)
89 |
90 | imdb_dataset = load_dataset(
91 | "csv",
92 | data_files={
93 | "train": "train.csv",
94 | "validation": "val.csv",
95 | "test": "test.csv",
96 | },
97 | )
98 |
99 | #########################################
100 | ### 2 Tokenization and Numericalization
101 | #########################################
102 |
103 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 |
107 | print("Tokenizing ...", flush=True)
108 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 | del imdb_dataset
110 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 |
113 | #########################################
114 | ### 3 Set Up DataLoaders
115 | #########################################
116 |
117 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 |
121 | train_loader = DataLoader(
122 | dataset=train_dataset,
123 | batch_size=12,
124 | shuffle=True,
125 | num_workers=4,
126 | drop_last=True,
127 | )
128 |
129 | val_loader = DataLoader(
130 | dataset=val_dataset,
131 | batch_size=12,
132 | num_workers=4,
133 | drop_last=True,
134 | )
135 |
136 | test_loader = DataLoader(
137 | dataset=test_dataset,
138 | batch_size=12,
139 | num_workers=2,
140 | drop_last=True,
141 | )
142 |
143 |
144 | #########################################
145 | ### 4 Initializing the Model
146 | #########################################
147 |
148 | fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-mixed")
149 | fabric.launch()
150 |
151 | model = AutoModelForSequenceClassification.from_pretrained(
152 | "distilbert-base-uncased", num_labels=2)
153 |
154 | # model.to(device)
155 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 |
157 | model, optimizer = fabric.setup(model, optimizer)
158 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 | fabric.barrier()
160 |
161 | #########################################
162 | ### 5 Finetuning
163 | #########################################
164 |
165 | start = time.time()
166 | train(
167 | num_epochs=3,
168 | model=model,
169 | optimizer=optimizer,
170 | train_loader=train_loader,
171 | val_loader=val_loader,
172 | fabric=fabric
173 | )
174 |
175 | end = time.time()
176 | elapsed = end-start
177 | print(f"Time elapsed {elapsed/60:.2f} min")
178 |
179 | with torch.no_grad():
180 | model.eval()
181 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 | for batch in test_loader:
183 |
184 | #for s in ["input_ids", "attention_mask", "label"]:
185 | # batch[s] = batch[s].to(device)
186 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 | predicted_labels = torch.argmax(outputs["logits"], 1)
188 | test_acc.update(predicted_labels, batch["label"])
189 |
190 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/bfloat16-regular.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 | torch.manual_seed(123)
80 |
81 | ##########################
82 | ### 1 Loading the Dataset
83 | ##########################
84 | download_dataset()
85 | df = load_dataset_into_to_dataframe()
86 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
87 | partition_dataset(df)
88 |
89 | imdb_dataset = load_dataset(
90 | "csv",
91 | data_files={
92 | "train": "train.csv",
93 | "validation": "val.csv",
94 | "test": "test.csv",
95 | },
96 | )
97 |
98 | #########################################
99 | ### 2 Tokenization and Numericalization
100 | #########################################
101 |
102 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 |
106 | print("Tokenizing ...", flush=True)
107 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 | del imdb_dataset
109 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 |
112 | #########################################
113 | ### 3 Set Up DataLoaders
114 | #########################################
115 |
116 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 |
120 | train_loader = DataLoader(
121 | dataset=train_dataset,
122 | batch_size=12,
123 | shuffle=True,
124 | num_workers=4,
125 | drop_last=True,
126 | )
127 |
128 | val_loader = DataLoader(
129 | dataset=val_dataset,
130 | batch_size=12,
131 | num_workers=4,
132 | drop_last=True,
133 | )
134 |
135 | test_loader = DataLoader(
136 | dataset=test_dataset,
137 | batch_size=12,
138 | num_workers=2,
139 | drop_last=True,
140 | )
141 |
142 |
143 | #########################################
144 | ### 4 Initializing the Model
145 | #########################################
146 |
147 | fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-true")
148 | fabric.launch()
149 |
150 | model = AutoModelForSequenceClassification.from_pretrained(
151 | "distilbert-base-uncased", num_labels=2)
152 |
153 | # model.to(device)
154 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 |
156 | model, optimizer = fabric.setup(model, optimizer)
157 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 | fabric.barrier()
159 |
160 | #########################################
161 | ### 5 Finetuning
162 | #########################################
163 |
164 | start = time.time()
165 | train(
166 | num_epochs=3,
167 | model=model,
168 | optimizer=optimizer,
169 | train_loader=train_loader,
170 | val_loader=val_loader,
171 | fabric=fabric
172 | )
173 |
174 | end = time.time()
175 | elapsed = end-start
176 | print(f"Time elapsed {elapsed/60:.2f} min")
177 |
178 | with torch.no_grad():
179 | model.eval()
180 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 | for batch in test_loader:
182 |
183 | #for s in ["input_ids", "attention_mask", "label"]:
184 | # batch[s] = batch[s].to(device)
185 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 | predicted_labels = torch.argmax(outputs["logits"], 1)
187 | test_acc.update(predicted_labels, batch["label"])
188 |
189 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/figures/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/mixed-precision-experiment/figures/1.png
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/figures/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/mixed-precision-experiment/figures/2.png
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/figures/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/conventional/distilbert-movie-review/mixed-precision-experiment/figures/3.png
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float16-mixed-high.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 |     torch.set_float32_matmul_precision("high")  # allow TF32 tensor cores for float32 matmuls
80 | torch.manual_seed(123)
81 |
82 | ##########################
83 | ### 1 Loading the Dataset
84 | ##########################
85 | download_dataset()
86 | df = load_dataset_into_to_dataframe()
87 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
88 | partition_dataset(df)
89 |
90 | imdb_dataset = load_dataset(
91 | "csv",
92 | data_files={
93 | "train": "train.csv",
94 | "validation": "val.csv",
95 | "test": "test.csv",
96 | },
97 | )
98 |
99 | #########################################
100 | ### 2 Tokenization and Numericalization
101 | #########################################
102 |
103 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 |
107 | print("Tokenizing ...", flush=True)
108 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 | del imdb_dataset
110 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 |
113 | #########################################
114 | ### 3 Set Up DataLoaders
115 | #########################################
116 |
117 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 |
121 | train_loader = DataLoader(
122 | dataset=train_dataset,
123 | batch_size=12,
124 | shuffle=True,
125 | num_workers=4,
126 | drop_last=True,
127 | )
128 |
129 | val_loader = DataLoader(
130 | dataset=val_dataset,
131 | batch_size=12,
132 | num_workers=4,
133 | drop_last=True,
134 | )
135 |
136 | test_loader = DataLoader(
137 | dataset=test_dataset,
138 | batch_size=12,
139 | num_workers=2,
140 | drop_last=True,
141 | )
142 |
143 |
144 | #########################################
145 | ### 4 Initializing the Model
146 | #########################################
147 |
148 |     fabric = Fabric(accelerator="cuda", devices=[4], precision="16-mixed")  # float16 automatic mixed precision; devices=[4] pins the run to GPU index 4
149 | fabric.launch()
150 |
151 | model = AutoModelForSequenceClassification.from_pretrained(
152 | "distilbert-base-uncased", num_labels=2)
153 |
154 | # model.to(device)
155 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 |
157 | model, optimizer = fabric.setup(model, optimizer)
158 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 | fabric.barrier()
160 |
161 | #########################################
162 | ### 5 Finetuning
163 | #########################################
164 |
165 | start = time.time()
166 | train(
167 | num_epochs=3,
168 | model=model,
169 | optimizer=optimizer,
170 | train_loader=train_loader,
171 | val_loader=val_loader,
172 | fabric=fabric
173 | )
174 |
175 | end = time.time()
176 | elapsed = end-start
177 | print(f"Time elapsed {elapsed/60:.2f} min")
178 |
179 | with torch.no_grad():
180 | model.eval()
181 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 | for batch in test_loader:
183 |
184 | #for s in ["input_ids", "attention_mask", "label"]:
185 | # batch[s] = batch[s].to(device)
186 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 | predicted_labels = torch.argmax(outputs["logits"], 1)
188 | test_acc.update(predicted_labels, batch["label"])
189 |
190 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float16-mixed-medium.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 |     torch.set_float32_matmul_precision("medium")  # allow bfloat16-internal matmuls for float32 operations
80 | torch.manual_seed(123)
81 |
82 | ##########################
83 | ### 1 Loading the Dataset
84 | ##########################
85 | download_dataset()
86 | df = load_dataset_into_to_dataframe()
87 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
88 | partition_dataset(df)
89 |
90 | imdb_dataset = load_dataset(
91 | "csv",
92 | data_files={
93 | "train": "train.csv",
94 | "validation": "val.csv",
95 | "test": "test.csv",
96 | },
97 | )
98 |
99 | #########################################
100 | ### 2 Tokenization and Numericalization
101 | #########################################
102 |
103 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 |
107 | print("Tokenizing ...", flush=True)
108 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 | del imdb_dataset
110 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 |
113 | #########################################
114 | ### 3 Set Up DataLoaders
115 | #########################################
116 |
117 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 |
121 | train_loader = DataLoader(
122 | dataset=train_dataset,
123 | batch_size=12,
124 | shuffle=True,
125 | num_workers=4,
126 | drop_last=True,
127 | )
128 |
129 | val_loader = DataLoader(
130 | dataset=val_dataset,
131 | batch_size=12,
132 | num_workers=4,
133 | drop_last=True,
134 | )
135 |
136 | test_loader = DataLoader(
137 | dataset=test_dataset,
138 | batch_size=12,
139 | num_workers=2,
140 | drop_last=True,
141 | )
142 |
143 |
144 | #########################################
145 | ### 4 Initializing the Model
146 | #########################################
147 |
148 |     fabric = Fabric(accelerator="cuda", devices=[5], precision="16-mixed")  # float16 automatic mixed precision; devices=[5] pins the run to GPU index 5
149 | fabric.launch()
150 |
151 | model = AutoModelForSequenceClassification.from_pretrained(
152 | "distilbert-base-uncased", num_labels=2)
153 |
154 | # model.to(device)
155 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 |
157 | model, optimizer = fabric.setup(model, optimizer)
158 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 | fabric.barrier()
160 |
161 | #########################################
162 | ### 5 Finetuning
163 | #########################################
164 |
165 | start = time.time()
166 | train(
167 | num_epochs=3,
168 | model=model,
169 | optimizer=optimizer,
170 | train_loader=train_loader,
171 | val_loader=val_loader,
172 | fabric=fabric
173 | )
174 |
175 | end = time.time()
176 | elapsed = end-start
177 | print(f"Time elapsed {elapsed/60:.2f} min")
178 |
179 | with torch.no_grad():
180 | model.eval()
181 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 | for batch in test_loader:
183 |
184 | #for s in ["input_ids", "attention_mask", "label"]:
185 | # batch[s] = batch[s].to(device)
186 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 | predicted_labels = torch.argmax(outputs["logits"], 1)
188 | test_acc.update(predicted_labels, batch["label"])
189 |
190 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float16-mixed.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 | torch.manual_seed(123)
80 |
81 | ##########################
82 | ### 1 Loading the Dataset
83 | ##########################
84 | download_dataset()
85 | df = load_dataset_into_to_dataframe()
86 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
87 | partition_dataset(df)
88 |
89 | imdb_dataset = load_dataset(
90 | "csv",
91 | data_files={
92 | "train": "train.csv",
93 | "validation": "val.csv",
94 | "test": "test.csv",
95 | },
96 | )
97 |
98 | #########################################
99 | ### 2 Tokenization and Numericalization
100 | #########################################
101 |
102 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 |
106 | print("Tokenizing ...", flush=True)
107 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 | del imdb_dataset
109 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 |
112 | #########################################
113 | ### 3 Set Up DataLoaders
114 | #########################################
115 |
116 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 |
120 | train_loader = DataLoader(
121 | dataset=train_dataset,
122 | batch_size=12,
123 | shuffle=True,
124 | num_workers=4,
125 | drop_last=True,
126 | )
127 |
128 | val_loader = DataLoader(
129 | dataset=val_dataset,
130 | batch_size=12,
131 | num_workers=4,
132 | drop_last=True,
133 | )
134 |
135 | test_loader = DataLoader(
136 | dataset=test_dataset,
137 | batch_size=12,
138 | num_workers=2,
139 | drop_last=True,
140 | )
141 |
142 |
143 | #########################################
144 | ### 4 Initializing the Model
145 | #########################################
146 |
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="16-mixed")  # float16 automatic mixed precision on a single GPU
148 | fabric.launch()
149 |
150 | model = AutoModelForSequenceClassification.from_pretrained(
151 | "distilbert-base-uncased", num_labels=2)
152 |
153 | # model.to(device)
154 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 |
156 | model, optimizer = fabric.setup(model, optimizer)
157 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 | fabric.barrier()
159 |
160 | #########################################
161 | ### 5 Finetuning
162 | #########################################
163 |
164 | start = time.time()
165 | train(
166 | num_epochs=3,
167 | model=model,
168 | optimizer=optimizer,
169 | train_loader=train_loader,
170 | val_loader=val_loader,
171 | fabric=fabric
172 | )
173 |
174 | end = time.time()
175 | elapsed = end-start
176 | print(f"Time elapsed {elapsed/60:.2f} min")
177 |
178 | with torch.no_grad():
179 | model.eval()
180 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 | for batch in test_loader:
182 |
183 | #for s in ["input_ids", "attention_mask", "label"]:
184 | # batch[s] = batch[s].to(device)
185 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 | predicted_labels = torch.argmax(outputs["logits"], 1)
187 | test_acc.update(predicted_labels, batch["label"])
188 |
189 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float16-regular.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 | torch.manual_seed(123)
80 |
81 | ##########################
82 | ### 1 Loading the Dataset
83 | ##########################
84 | download_dataset()
85 | df = load_dataset_into_to_dataframe()
86 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
87 | partition_dataset(df)
88 |
89 | imdb_dataset = load_dataset(
90 | "csv",
91 | data_files={
92 | "train": "train.csv",
93 | "validation": "val.csv",
94 | "test": "test.csv",
95 | },
96 | )
97 |
98 | #########################################
99 | ### 2 Tokenization and Numericalization
100 | #########################################
101 |
102 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 |
106 | print("Tokenizing ...", flush=True)
107 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 | del imdb_dataset
109 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 |
112 | #########################################
113 | ### 3 Set Up DataLoaders
114 | #########################################
115 |
116 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 |
120 | train_loader = DataLoader(
121 | dataset=train_dataset,
122 | batch_size=12,
123 | shuffle=True,
124 | num_workers=4,
125 | drop_last=True,
126 | )
127 |
128 | val_loader = DataLoader(
129 | dataset=val_dataset,
130 | batch_size=12,
131 | num_workers=4,
132 | drop_last=True,
133 | )
134 |
135 | test_loader = DataLoader(
136 | dataset=test_dataset,
137 | batch_size=12,
138 | num_workers=2,
139 | drop_last=True,
140 | )
141 |
142 |
143 | #########################################
144 | ### 4 Initializing the Model
145 | #########################################
146 |
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="16-true")  # true (non-mixed) float16: weights and computations in half precision
148 | fabric.launch()
149 |
150 | model = AutoModelForSequenceClassification.from_pretrained(
151 | "distilbert-base-uncased", num_labels=2)
152 |
153 | # model.to(device)
154 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 |
156 | model, optimizer = fabric.setup(model, optimizer)
157 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 | fabric.barrier()
159 |
160 | #########################################
161 | ### 5 Finetuning
162 | #########################################
163 |
164 | start = time.time()
165 | train(
166 | num_epochs=3,
167 | model=model,
168 | optimizer=optimizer,
169 | train_loader=train_loader,
170 | val_loader=val_loader,
171 | fabric=fabric
172 | )
173 |
174 | end = time.time()
175 | elapsed = end-start
176 | print(f"Time elapsed {elapsed/60:.2f} min")
177 |
178 | with torch.no_grad():
179 | model.eval()
180 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 | for batch in test_loader:
182 |
183 | #for s in ["input_ids", "attention_mask", "label"]:
184 | # batch[s] = batch[s].to(device)
185 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 | predicted_labels = torch.argmax(outputs["logits"], 1)
187 | test_acc.update(predicted_labels, batch["label"])
188 |
189 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float32-regular-high.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 |     torch.set_float32_matmul_precision("high")  # allow TF32 tensor cores for float32 matmuls
80 | torch.manual_seed(123)
81 |
82 | ##########################
83 | ### 1 Loading the Dataset
84 | ##########################
85 | download_dataset()
86 | df = load_dataset_into_to_dataframe()
87 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
88 | partition_dataset(df)
89 |
90 | imdb_dataset = load_dataset(
91 | "csv",
92 | data_files={
93 | "train": "train.csv",
94 | "validation": "val.csv",
95 | "test": "test.csv",
96 | },
97 | )
98 |
99 | #########################################
100 | ### 2 Tokenization and Numericalization
101 | #########################################
102 |
103 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 |
107 | print("Tokenizing ...", flush=True)
108 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 | del imdb_dataset
110 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 |
113 | #########################################
114 | ### 3 Set Up DataLoaders
115 | #########################################
116 |
117 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 |
121 | train_loader = DataLoader(
122 | dataset=train_dataset,
123 | batch_size=12,
124 | shuffle=True,
125 | num_workers=4,
126 | drop_last=True,
127 | )
128 |
129 | val_loader = DataLoader(
130 | dataset=val_dataset,
131 | batch_size=12,
132 | num_workers=4,
133 | drop_last=True,
134 | )
135 |
136 | test_loader = DataLoader(
137 | dataset=test_dataset,
138 | batch_size=12,
139 | num_workers=2,
140 | drop_last=True,
141 | )
142 |
143 |
144 | #########################################
145 | ### 4 Initializing the Model
146 | #########################################
147 |
148 |     fabric = Fabric(accelerator="cuda", devices=[7], precision="32-true")  # regular float32 training; devices=[7] pins the run to GPU index 7
149 | fabric.launch()
150 |
151 | model = AutoModelForSequenceClassification.from_pretrained(
152 | "distilbert-base-uncased", num_labels=2)
153 |
154 | # model.to(device)
155 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 |
157 | model, optimizer = fabric.setup(model, optimizer)
158 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 | fabric.barrier()
160 |
161 | #########################################
162 | ### 5 Finetuning
163 | #########################################
164 |
165 | start = time.time()
166 | train(
167 | num_epochs=3,
168 | model=model,
169 | optimizer=optimizer,
170 | train_loader=train_loader,
171 | val_loader=val_loader,
172 | fabric=fabric
173 | )
174 |
175 | end = time.time()
176 | elapsed = end-start
177 | print(f"Time elapsed {elapsed/60:.2f} min")
178 |
179 | with torch.no_grad():
180 | model.eval()
181 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 | for batch in test_loader:
183 |
184 | #for s in ["input_ids", "attention_mask", "label"]:
185 | # batch[s] = batch[s].to(device)
186 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 | predicted_labels = torch.argmax(outputs["logits"], 1)
188 | test_acc.update(predicted_labels, batch["label"])
189 |
190 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float32-regular-medium.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 |     torch.set_float32_matmul_precision("medium")  # allow bfloat16-internal matmuls for float32 operations
80 | torch.manual_seed(123)
81 |
82 | ##########################
83 | ### 1 Loading the Dataset
84 | ##########################
85 | download_dataset()
86 | df = load_dataset_into_to_dataframe()
87 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
88 | partition_dataset(df)
89 |
90 | imdb_dataset = load_dataset(
91 | "csv",
92 | data_files={
93 | "train": "train.csv",
94 | "validation": "val.csv",
95 | "test": "test.csv",
96 | },
97 | )
98 |
99 | #########################################
100 | ### 2 Tokenization and Numericalization
101 | #########################################
102 |
103 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
104 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
105 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
106 |
107 | print("Tokenizing ...", flush=True)
108 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
109 | del imdb_dataset
110 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
111 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
112 |
113 | #########################################
114 | ### 3 Set Up DataLoaders
115 | #########################################
116 |
117 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
118 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
119 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
120 |
121 | train_loader = DataLoader(
122 | dataset=train_dataset,
123 | batch_size=12,
124 | shuffle=True,
125 | num_workers=4,
126 | drop_last=True,
127 | )
128 |
129 | val_loader = DataLoader(
130 | dataset=val_dataset,
131 | batch_size=12,
132 | num_workers=4,
133 | drop_last=True,
134 | )
135 |
136 | test_loader = DataLoader(
137 | dataset=test_dataset,
138 | batch_size=12,
139 | num_workers=2,
140 | drop_last=True,
141 | )
142 |
143 |
144 | #########################################
145 | ### 4 Initializing the Model
146 | #########################################
147 |
148 |     fabric = Fabric(accelerator="cuda", devices=[6], precision="32-true")  # regular float32 training; devices=[6] pins the run to GPU index 6
149 | fabric.launch()
150 |
151 | model = AutoModelForSequenceClassification.from_pretrained(
152 | "distilbert-base-uncased", num_labels=2)
153 |
154 | # model.to(device)
155 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
156 |
157 | model, optimizer = fabric.setup(model, optimizer)
158 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
159 | fabric.barrier()
160 |
161 | #########################################
162 | ### 5 Finetuning
163 | #########################################
164 |
165 | start = time.time()
166 | train(
167 | num_epochs=3,
168 | model=model,
169 | optimizer=optimizer,
170 | train_loader=train_loader,
171 | val_loader=val_loader,
172 | fabric=fabric
173 | )
174 |
175 | end = time.time()
176 | elapsed = end-start
177 | print(f"Time elapsed {elapsed/60:.2f} min")
178 |
179 | with torch.no_grad():
180 | model.eval()
181 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
182 | for batch in test_loader:
183 |
184 | #for s in ["input_ids", "attention_mask", "label"]:
185 | # batch[s] = batch[s].to(device)
186 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
187 | predicted_labels = torch.argmax(outputs["logits"], 1)
188 | test_acc.update(predicted_labels, batch["label"])
189 |
190 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
191 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float32-regular.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 | torch.manual_seed(123)
80 |
81 | ##########################
82 | ### 1 Loading the Dataset
83 | ##########################
84 | download_dataset()
85 | df = load_dataset_into_to_dataframe()
86 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
87 | partition_dataset(df)
88 |
89 | imdb_dataset = load_dataset(
90 | "csv",
91 | data_files={
92 | "train": "train.csv",
93 | "validation": "val.csv",
94 | "test": "test.csv",
95 | },
96 | )
97 |
98 | #########################################
99 | ### 2 Tokenization and Numericalization
100 | #########################################
101 |
102 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 |
106 | print("Tokenizing ...", flush=True)
107 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 | del imdb_dataset
109 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 |
112 | #########################################
113 | ### 3 Set Up DataLoaders
114 | #########################################
115 |
116 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 |
120 | train_loader = DataLoader(
121 | dataset=train_dataset,
122 | batch_size=12,
123 | shuffle=True,
124 | num_workers=4,
125 | drop_last=True,
126 | )
127 |
128 | val_loader = DataLoader(
129 | dataset=val_dataset,
130 | batch_size=12,
131 | num_workers=4,
132 | drop_last=True,
133 | )
134 |
135 | test_loader = DataLoader(
136 | dataset=test_dataset,
137 | batch_size=12,
138 | num_workers=2,
139 | drop_last=True,
140 | )
141 |
142 |
143 | #########################################
144 | ### 4 Initializing the Model
145 | #########################################
146 |
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="32-true")  # regular float32 baseline
148 | fabric.launch()
149 |
150 | model = AutoModelForSequenceClassification.from_pretrained(
151 | "distilbert-base-uncased", num_labels=2)
152 |
153 | # model.to(device)
154 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 |
156 | model, optimizer = fabric.setup(model, optimizer)
157 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 | fabric.barrier()
159 |
160 | #########################################
161 | ### 5 Finetuning
162 | #########################################
163 |
164 | start = time.time()
165 | train(
166 | num_epochs=3,
167 | model=model,
168 | optimizer=optimizer,
169 | train_loader=train_loader,
170 | val_loader=val_loader,
171 | fabric=fabric
172 | )
173 |
174 | end = time.time()
175 | elapsed = end-start
176 | print(f"Time elapsed {elapsed/60:.2f} min")
177 |
178 | with torch.no_grad():
179 | model.eval()
180 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 | for batch in test_loader:
182 |
183 | #for s in ["input_ids", "attention_mask", "label"]:
184 | # batch[s] = batch[s].to(device)
185 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 | predicted_labels = torch.argmax(outputs["logits"], 1)
187 | test_acc.update(predicted_labels, batch["label"])
188 |
189 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/float64-regular.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import os.path as op
4 | import time
5 |
6 | from datasets import load_dataset
7 | from lightning import Fabric
8 | import torch
9 | from torch.utils.data import DataLoader
10 | import torchmetrics
11 | from transformers import AutoTokenizer
12 | from transformers import AutoModelForSequenceClassification
13 | from watermark import watermark
14 |
15 | from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
16 | from local_dataset_utilities import IMDBDataset
17 |
18 |
19 | def tokenize_text(batch):
20 | return tokenizer(batch["text"], truncation=True, padding=True)
21 |
22 |
23 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
24 |
25 | for epoch in range(num_epochs):
26 | train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
27 |
28 | model.train()
29 | for batch_idx, batch in enumerate(train_loader):
30 | model.train()
31 |
32 | # For non-Fabric PyTorch:
33 | #for s in ["input_ids", "attention_mask", "label"]:
34 | # batch[s] = batch[s].to(device)
35 |
36 | ### FORWARD AND BACK PROP
37 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
38 | optimizer.zero_grad()
39 |
40 | # For non-Fabric PyTorch:
41 | #outputs["loss"].backward()
42 | fabric.backward(outputs["loss"])
43 |
44 | ### UPDATE MODEL PARAMETERS
45 | optimizer.step()
46 |
47 | ### LOGGING
48 | if not batch_idx % 300:
49 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
50 |
51 | model.eval()
52 | with torch.no_grad():
53 | predicted_labels = torch.argmax(outputs["logits"], 1)
54 | train_acc.update(predicted_labels, batch["label"])
55 |
56 | ### MORE LOGGING
57 | model.eval()
58 | with torch.no_grad():
59 | val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
60 | for batch in val_loader:
61 |
62 | # For non-Fabric PyTorch:
63 | #for s in ["input_ids", "attention_mask", "label"]:
64 | # batch[s] = batch[s].to(device)
65 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
66 | predicted_labels = torch.argmax(outputs["logits"], 1)
67 | val_acc.update(predicted_labels, batch["label"])
68 |
69 | print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
70 | train_acc.reset(), val_acc.reset()
71 |
72 |
73 | if __name__ == "__main__":
74 |
75 | print(watermark(packages="torch,lightning,transformers", python=True))
76 | print("Torch CUDA available?", torch.cuda.is_available())
77 | device = "cuda" if torch.cuda.is_available() else "cpu"
78 |
79 | torch.manual_seed(123)
80 |
81 | ##########################
82 | ### 1 Loading the Dataset
83 | ##########################
84 | download_dataset()
85 | df = load_dataset_into_to_dataframe()
86 | if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
87 | partition_dataset(df)
88 |
89 | imdb_dataset = load_dataset(
90 | "csv",
91 | data_files={
92 | "train": "train.csv",
93 | "validation": "val.csv",
94 | "test": "test.csv",
95 | },
96 | )
97 |
98 | #########################################
99 | ### 2 Tokenization and Numericalization
100 | #########################################
101 |
102 | tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
103 | print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
104 | print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)
105 |
106 | print("Tokenizing ...", flush=True)
107 | imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
108 | del imdb_dataset
109 | imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
110 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
111 |
112 | #########################################
113 | ### 3 Set Up DataLoaders
114 | #########################################
115 |
116 | train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
117 | val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
118 | test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
119 |
120 | train_loader = DataLoader(
121 | dataset=train_dataset,
122 | batch_size=12,
123 | shuffle=True,
124 | num_workers=4,
125 | drop_last=True,
126 | )
127 |
128 | val_loader = DataLoader(
129 | dataset=val_dataset,
130 | batch_size=12,
131 | num_workers=4,
132 | drop_last=True,
133 | )
134 |
135 | test_loader = DataLoader(
136 | dataset=test_dataset,
137 | batch_size=12,
138 | num_workers=2,
139 | drop_last=True,
140 | )
141 |
142 |
143 | #########################################
144 | ### 4 Initializing the Model
145 | #########################################
146 |
147 |     fabric = Fabric(accelerator="cuda", devices=1, precision="64-true")  # double precision (float64) baseline
148 | fabric.launch()
149 |
150 | model = AutoModelForSequenceClassification.from_pretrained(
151 | "distilbert-base-uncased", num_labels=2)
152 |
153 | # model.to(device)
154 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
155 |
156 | model, optimizer = fabric.setup(model, optimizer)
157 | train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)
158 | fabric.barrier()
159 |
160 | #########################################
161 | ### 5 Finetuning
162 | #########################################
163 |
164 | start = time.time()
165 | train(
166 | num_epochs=3,
167 | model=model,
168 | optimizer=optimizer,
169 | train_loader=train_loader,
170 | val_loader=val_loader,
171 | fabric=fabric
172 | )
173 |
174 | end = time.time()
175 | elapsed = end-start
176 | print(f"Time elapsed {elapsed/60:.2f} min")
177 |
178 | with torch.no_grad():
179 | model.eval()
180 | test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2).to(fabric.device)
181 | for batch in test_loader:
182 |
183 | #for s in ["input_ids", "attention_mask", "label"]:
184 | # batch[s] = batch[s].to(device)
185 | outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
186 | predicted_labels = torch.argmax(outputs["logits"], 1)
187 | test_acc.update(predicted_labels, batch["label"])
188 |
189 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
190 | print(f"Test accuracy {test_acc.compute()*100:.2f}%")
--------------------------------------------------------------------------------
/conventional/distilbert-movie-review/mixed-precision-experiment/local_dataset_utilities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import tarfile
4 | import time
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from packaging import version
9 | from torch.utils.data import Dataset
10 | from tqdm import tqdm
11 | import urllib
12 |
13 |
14 | def reporthook(count, block_size, total_size):
15 | global start_time
16 | if count == 0:
17 | start_time = time.time()
18 | return
19 | duration = time.time() - start_time
20 | progress_size = int(count * block_size)
21 | speed = progress_size / (1024.0**2 * duration)
22 | percent = count * block_size * 100.0 / total_size
23 |
24 | sys.stdout.write(
25 | f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
26 | f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
27 | )
28 | sys.stdout.flush()
29 |
30 |
31 | def download_dataset():
32 | source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
33 | target = "aclImdb_v1.tar.gz"
34 |
35 | if os.path.exists(target):
36 | os.remove(target)
37 |
38 | if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
39 | urllib.request.urlretrieve(source, target, reporthook)
40 |
41 | if not os.path.isdir("aclImdb"):
42 |
43 | with tarfile.open(target, "r:gz") as tar:
44 | tar.extractall()
45 |
46 |
47 | def load_dataset_into_to_dataframe():
48 | basepath = "aclImdb"
49 |
50 | labels = {"pos": 1, "neg": 0}
51 |
52 | df = pd.DataFrame()
53 |
54 | with tqdm(total=50000) as pbar:
55 | for s in ("test", "train"):
56 | for l in ("pos", "neg"):
57 | path = os.path.join(basepath, s, l)
58 | for file in sorted(os.listdir(path)):
59 | with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
60 | txt = infile.read()
61 |
62 | if version.parse(pd.__version__) >= version.parse("1.3.2"):
63 | x = pd.DataFrame(
64 | [[txt, labels[l]]], columns=["review", "sentiment"]
65 | )
66 | df = pd.concat([df, x], ignore_index=False)
67 |
68 | else:
69 | df = df.append([[txt, labels[l]]], ignore_index=True)
70 | pbar.update()
71 | df.columns = ["text", "label"]
72 |
73 | np.random.seed(0)
74 | df = df.reindex(np.random.permutation(df.index))
75 |
76 | print("Class distribution:")
77 |     print(np.bincount(df["label"].values))
78 |
79 | return df
80 |
81 |
82 | def partition_dataset(df):
83 | df_shuffled = df.sample(frac=1, random_state=1).reset_index()
84 |
85 | df_train = df_shuffled.iloc[:35_000]
86 | df_val = df_shuffled.iloc[35_000:40_000]
87 | df_test = df_shuffled.iloc[40_000:]
88 |
89 | df_train.to_csv("train.csv", index=False, encoding="utf-8")
90 | df_val.to_csv("val.csv", index=False, encoding="utf-8")
91 | df_test.to_csv("test.csv", index=False, encoding="utf-8")
92 |
93 |
94 | class IMDBDataset(Dataset):
95 | def __init__(self, dataset_dict, partition_key="train"):
96 | self.partition = dataset_dict[partition_key]
97 |
98 | def __getitem__(self, index):
99 | return self.partition[index]
100 |
101 | def __len__(self):
102 | return self.partition.num_rows
--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/README.md:
--------------------------------------------------------------------------------
1 | # Hyperparameter settings for finetuning Falcon 7B
2 |
3 |
4 |
5 | These settings need to be used in combination with the https://github.com/Lightning-AI/lit-parrot repository.
6 |
7 |
8 |
9 | **Preparing the model and dataset**
10 |
11 | For this benchmark, we will be using the [Lit-Parrot](https://github.com/Lightning-AI/lit-parrot) open-source library, which provides efficient implementations for training and using various LLMs.
12 |
13 | 
14 |
15 | Title: The Lit-Parrot repository (https://github.com/Lightning-AI/lit-parrot)
16 |
17 |
18 |
19 | The first step is to download the model:
20 |
21 | ```
22 | python scripts/download.py --repo_id tiiuae/falcon-7b
23 | ```
24 |
25 | (This requires approximately 20 GB of storage.)
26 |
27 |
28 | Second, we convert the weights into a standardized form:
29 |
30 | ```
31 | python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/tiiuae/falcon-7b
32 | ```
33 |
34 | Third, we have to download the dataset. For this example, we will be using the Alpaca dataset [link], which consists of 52k instruction pairs:
35 |
36 | ```
37 | python scripts/prepare_alpaca.py
38 | ```
39 |
40 | (More on using custom datasets later.)
41 |
42 | **Running the code**
43 |
44 | Now we run the finetuning scripts for the Falcon 7B model, comparing four different methods. For now, we focus on the finetuning results; how these methods work is discussed later in this article.
45 |
46 | Adapter:
47 |
48 | ```
49 | python finetune/adapter.py --checkpoint_dir checkpoints/tiiuae/falcon-7b/
50 | ```
51 |
52 | Adapter v2:
53 |
54 | ```
55 | python finetune/adapter_v2.py --checkpoint_dir checkpoints/tiiuae/falcon-7b/
56 | ```
57 |
58 | LoRA:
59 |
60 | ```
61 | python finetune/lora.py --checkpoint_dir checkpoints/tiiuae/falcon-7b/
62 | ```
63 |
64 | Full finetuning (updating all layers):
65 |
66 | ```
67 | python finetune/full.py --checkpoint_dir checkpoints/tiiuae/falcon-7b/
68 | ```
69 |
70 | Let's take a look at the time it takes to finetune the LLM first:
71 |
72 |
73 | 
74 |
75 | As we can see in the chart above, using a parameter-efficient finetuning method is about 9 times faster than finetuning all layers ("full"). Moreover, finetuning all layers required 6 GPUs due to memory constraints, whereas **the Adapter methods and LoRA could be used on a single GPU**.
76 |
77 | Speaking of GPU memory, the peak memory requirements of each method are plotted below:
78 |
79 |
80 | 
81 |
82 |
83 | Finetuning all layers of Falcon 7B required ~40 GB on each of the 6 GPUs (here, via tensor sharding with DeepSpeed ZeRO stage 3), which amounts to 240 GB in total. In contrast, the parameter-efficient finetuning methods only required ~16 GB of GPU memory, which makes it possible to finetune these models on a single consumer-grade GPU.
84 |
85 | Note that the memory requirements are directly related to the number of parameters each method updates, as reported by the finetuning scripts (see the short sketch after this list):
86 |
87 | - Full finetuning: 7,217,189,760
88 | - Adapter: 1,365,330
89 | - Adapter v2: 3,839,186
90 | - LoRA: 3,506,176
91 |
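92 | For reference, these counts are what the finetuning scripts print at startup via `sum(p.numel() for p in model.parameters() if p.requires_grad)`. Below is a minimal, self-contained sketch of that counting logic; the tiny two-layer model is only a hypothetical stand-in for the frozen Falcon backbone plus its trainable adapter/LoRA parameters:
93 | 
94 | ```
95 | import torch
96 | 
97 | # Toy stand-in: freeze the "backbone" layer, leave the small head trainable.
98 | model = torch.nn.Sequential(torch.nn.Linear(512, 512), torch.nn.Linear(512, 8))
99 | for p in model[0].parameters():
100 |     p.requires_grad = False
101 | 
102 | # Count only the parameters the optimizer will update (same line as in the scripts).
103 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
104 | print(f"Number of trainable parameters: {num_params}")  # 4104 for this toy model
105 | ```
106 | 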
--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/figures/lit-parrot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/lit-benchmarks/falcon-7b/figures/lit-parrot.png
--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/figures/memory-requirements.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/lit-benchmarks/falcon-7b/figures/memory-requirements.png
--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/figures/training-time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-finetuning-scripts/cff61447a8e62224e06a47552881e837cb2cb871/lit-benchmarks/falcon-7b/figures/training-time.png
--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/finetune/adapter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import sys
4 | import time
5 | import warnings
6 | from pathlib import Path
7 | from typing import Optional
8 |
9 | import lightning as L
10 | import numpy as np
11 | import torch
12 | from lightning.fabric.strategies import DeepSpeedStrategy, XLAStrategy
13 |
14 | # support running without installing as a package
15 | wd = Path(__file__).parent.parent.resolve()
16 | sys.path.append(str(wd))
17 |
18 | from generate.base import generate
19 | from lit_parrot.adapter import Parrot, Config, mark_only_adapter_as_trainable, adapter_state_from_state_dict
20 | from lit_parrot.tokenizer import Tokenizer
21 | from lit_parrot.utils import lazy_load, check_valid_checkpoint_dir
22 | from scripts.prepare_alpaca import generate_prompt
23 |
24 | eval_interval = 600
25 | save_interval = 1000
26 | eval_iters = 100
27 | log_interval = 1
28 | devices = 1
29 |
30 | # Hyperparameters
31 | learning_rate = 9e-3
32 | batch_size = 128 / devices
33 | micro_batch_size = 1
34 | gradient_accumulation_steps = batch_size // micro_batch_size
35 | assert gradient_accumulation_steps > 0
36 | epoch_size = 52000 # train dataset size
37 | num_epochs = 1
38 | max_iters = num_epochs * (epoch_size // micro_batch_size) // devices
39 | weight_decay = 0.02
40 | warmup_iters = 2 * (epoch_size // micro_batch_size) // devices # 2 epochs
41 |
42 |
43 |
44 | ds_config = {
45 | "train_micro_batch_size_per_gpu": micro_batch_size,
46 | "gradient_accumulation_steps": gradient_accumulation_steps,
47 | "zero_optimization": {"stage": 2},
48 | }
49 |
50 |
51 | def setup(
52 | data_dir: Path = Path("data/alpaca"),
53 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
54 | out_dir: Path = Path("out/adapter/alpaca"),
55 | precision: Optional[str] = None,
56 | tpu: bool = False,
57 | ):
58 | if precision is None:
59 | precision = "32-true" if tpu else "bf16-true"
60 | strategy = (
61 | "auto"
62 | if devices <= 1
63 | else XLAStrategy(sync_module_states=False) if tpu else DeepSpeedStrategy(config=ds_config)
64 | )
65 | # For multi-host TPU training, the device count for Fabric is limited to the count on a single host.
66 | fabric_devices = "auto" if (tpu and devices > 1) else devices
67 | fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision)
68 | fabric.launch(main, data_dir, checkpoint_dir, out_dir)
69 |
70 |
71 | def main(
72 | fabric: L.Fabric = None,
73 | data_dir: Path = Path("data/alpaca"),
74 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
75 | out_dir: Path = Path("out/adapter/alpaca"),
76 | ):
77 | check_valid_checkpoint_dir(checkpoint_dir)
78 | fabric.seed_everything(1337 + fabric.global_rank)
79 |
80 | if fabric.global_rank == 0:
81 | os.makedirs(out_dir, exist_ok=True)
82 |
83 | train_data, val_data = load_datasets(data_dir=data_dir)
84 |
85 | config = Config.from_name(name=checkpoint_dir.name)
86 | checkpoint_path = checkpoint_dir / "lit_model.pth"
87 | fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
88 | with fabric.init_module():
89 | model = Parrot(config)
90 | with lazy_load(checkpoint_path) as checkpoint:
91 | model.load_state_dict(checkpoint, strict=False)
92 |
93 | mark_only_adapter_as_trainable(model)
94 |
95 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
96 | fabric.print(f"Number of trainable parameters: {num_params}")
97 |
98 | optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
99 | model, optimizer = fabric.setup(model, optimizer)
100 |
101 | train_time = time.time()
102 | train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir)
103 | print(f"Training time: {(time.time()-train_time):.2f}s")
104 |
105 | # Save the final checkpoint at the end of training
106 | save_path = out_dir / "lit_model_adapter_finetuned.pth"
107 | fabric.print(f"Saving adapter weights to {str(save_path)!r}")
108 | save_model_checkpoint(fabric, model, save_path)
109 |
110 |
111 | def train(
112 | fabric: L.Fabric,
113 | model: torch.nn.Module,
114 | optimizer: torch.optim.Optimizer,
115 | train_data: np.ndarray,
116 | val_data: np.ndarray,
117 | checkpoint_dir: Path,
118 | out_dir: Path,
119 | ) -> None:
120 | """The training loop.
121 |
122 | Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
123 | """
124 | step_count = 0
125 |
126 | tokenizer = Tokenizer(checkpoint_dir / "tokenizer.json", checkpoint_dir / "tokenizer_config.json")
127 |
128 | if fabric.device.type == "xla":
129 | import torch_xla.core.xla_model as xm
130 |
131 | xm.mark_step()
132 | for iter_num in range(max_iters):
133 | if step_count <= warmup_iters:
134 | # linear warmup
135 | lr = learning_rate * step_count / warmup_iters
136 | for param_group in optimizer.param_groups:
137 | param_group["lr"] = lr
138 |
139 | t0 = time.time()
140 |
141 | input_ids, targets = get_batch(fabric, train_data)
142 |
143 | with fabric.no_backward_sync(model, enabled=((iter_num + 1) % gradient_accumulation_steps != 0)):
144 | logits = model(input_ids)
145 | loss = loss_fn(logits, targets)
146 | fabric.backward(loss / gradient_accumulation_steps)
147 |
148 | if (iter_num + 1) % gradient_accumulation_steps == 0:
149 | optimizer.step()
150 | if fabric.device.type == "xla":
151 | xm.mark_step()
152 | optimizer.zero_grad()
153 | step_count += 1
154 |
155 | if step_count % eval_interval == 0:
156 | val_loss = validate(fabric, model, val_data, tokenizer)
157 | fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
158 | fabric.barrier()
159 |
160 | if step_count % save_interval == 0:
161 | save_path = out_dir / f"iter-{iter_num:06d}.pth"
162 | fabric.print(f"Saving adapter weights to {str(save_path)!r}")
163 | # TODO: Provide a function/script to merge the adapter weights with pretrained weights
164 | save_model_checkpoint(fabric, model, save_path)
165 | else:
166 | if fabric.device.type == "xla":
167 | xm.mark_step()
168 |
169 | dt = time.time() - t0
170 | if iter_num % log_interval == 0:
171 | fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
172 |
173 |
174 | @torch.no_grad()
175 | def validate(
176 | fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray, tokenizer: Tokenizer
177 | ) -> torch.Tensor:
178 | fabric.print("Validating ...")
179 | model.eval()
180 | losses = torch.zeros(eval_iters)
181 | for k in range(eval_iters):
182 | input_ids, targets = get_batch(fabric, val_data)
183 | logits = model(input_ids)
184 | loss = loss_fn(logits, targets)
185 | losses[k] = loss.item()
186 | val_loss = losses.mean()
187 |
188 | # produce an example:
189 | instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
190 | fabric.print(instruction)
191 | sample = {"instruction": instruction, "input": ""}
192 | prompt = generate_prompt(sample)
193 | encoded = tokenizer.encode(prompt, device=model.device)
194 | output = generate(
195 | model, idx=encoded, max_returned_tokens=len(encoded) + 100, max_seq_length=model.config.block_size, temperature=0.8
196 | )
197 | output = tokenizer.decode(output)
198 | fabric.print(output)
199 |
200 | model.train()
201 | return val_loss.item()
202 |
203 |
204 | def loss_fn(logits, targets):
205 | # shift the targets such that output n predicts token n+1
206 | logits = logits[..., :-1, :].contiguous()
207 | targets = targets[..., 1:].contiguous()
208 | loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
209 | return loss
210 |
211 |
212 | def get_batch(fabric: L.Fabric, data: list):
213 | ix = torch.randint(len(data), (micro_batch_size,))
214 |
215 | input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
216 | labels = [data[i]["labels"].type(torch.int64) for i in ix]
217 |
218 | max_len = max(len(s) for s in input_ids) if fabric.device.type != "xla" else max_seq_length
219 |
220 | def pad_right(x, pad_id):
221 | # pad right based on the longest sequence
222 | n = max_len - len(x)
223 | return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
224 |
225 | x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
226 | y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
227 |
228 | if fabric.device.type in ("mps", "xla"):
229 | x, y = fabric.to_device((x, y))
230 | else:
231 | x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
232 |
233 | return x, y
234 |
235 |
236 | def load_datasets(data_dir: Path):
237 | train_data = torch.load(data_dir / "train.pt")
238 | val_data = torch.load(data_dir / "test.pt")
239 | return train_data, val_data
240 |
241 |
242 | def save_model_checkpoint(fabric, model, file_path: Path):
243 | file_path = Path(file_path)
244 |
245 | if isinstance(fabric.strategy, DeepSpeedStrategy):
246 | from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
247 |
248 | tmp_path = file_path.with_suffix(".tmp")
249 | fabric.save(tmp_path, {"model": model})
250 | fabric.barrier()
251 | if fabric.global_rank == 0:
252 | # Create a consolidated checkpoint with the same name next to the deepspeed checkpoint
253 | # and only keep the adapter weights
254 | state_dict = get_fp32_state_dict_from_zero_checkpoint(tmp_path)
255 | state_dict = adapter_state_from_state_dict(state_dict)
256 | torch.save(state_dict, file_path)
257 | shutil.rmtree(tmp_path)
258 | else:
259 | state_dict = adapter_state_from_state_dict(model.state_dict())
260 | if fabric.global_rank == 0:
261 | torch.save(state_dict, file_path)
262 | fabric.barrier()
263 |
264 |
265 | if __name__ == "__main__":
266 | # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
267 | # torch.backends.cuda.enable_flash_sdp(False)
268 | torch.set_float32_matmul_precision("high")
269 |
270 | from jsonargparse.cli import CLI
271 |
272 | warnings.filterwarnings(
273 | # false positive using deepspeed: https://github.com/Lightning-AI/lightning/pull/17761#discussion_r1219705307
274 | "ignore",
275 | message="Remove `.no_backward_sync()` from your code",
276 | )
277 |
278 |
279 | import datetime
280 | started = datetime.datetime.now()
281 |
282 | CLI(setup)
283 |
284 | now = datetime.datetime.now()
285 | print("started:", started)
286 | print("finished:", now)
287 |
288 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/finetune/adapter_v2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import sys
4 | import time
5 | import warnings
6 | from pathlib import Path
7 | from typing import Optional
8 |
9 | import lightning as L
10 | import numpy as np
11 | import torch
12 | from lightning.fabric.strategies import DeepSpeedStrategy, XLAStrategy
13 |
14 | # support running without installing as a package
15 | wd = Path(__file__).parent.parent.resolve()
16 | sys.path.append(str(wd))
17 |
18 | from generate.base import generate
19 | from lit_parrot.adapter import Parrot, Config
20 | from lit_parrot.adapter_v2 import (
21 | mark_only_adapter_v2_as_trainable,
22 | add_adapter_v2_parameters_to_linear_layers,
23 | adapter_v2_state_from_state_dict,
24 | )
25 | from lit_parrot.tokenizer import Tokenizer
26 | from lit_parrot.utils import lazy_load, check_valid_checkpoint_dir
27 | from scripts.prepare_alpaca import generate_prompt
28 |
29 | eval_interval = 600
30 | save_interval = 1000
31 | eval_iters = 100
32 | log_interval = 1
33 | devices = 1
34 |
35 | # Hyperparameters
36 | learning_rate = 9e-3
37 | batch_size = 128 / devices
38 | micro_batch_size = 1  # kept small so the model fits into GPU memory
39 | gradient_accumulation_iters = batch_size // micro_batch_size
40 | assert gradient_accumulation_iters > 0
41 | epoch_size = 52000 # train dataset size
42 | num_epochs = 1
43 | max_iters = num_epochs * (epoch_size // micro_batch_size) // devices
44 | weight_decay = 0.02
45 | warmup_iters = 2 * (epoch_size // micro_batch_size) // devices # 2 epochs
46 |
47 | ds_config = {
48 | "train_micro_batch_size_per_gpu": micro_batch_size,
49 | "gradient_accumulation_steps": gradient_accumulation_iters,
50 | "zero_optimization": {"stage": 2},
51 | }
52 |
53 |
54 | def setup(
55 | data_dir: Path = Path("data/alpaca"),
56 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
57 | out_dir: Path = Path("out/adapter_v2/alpaca"),
58 | precision: Optional[str] = None,
59 | tpu: bool = False,
60 | ):
61 | if precision is None:
62 | precision = "32-true" if tpu else "bf16-true"
63 | strategy = (
64 | "auto"
65 | if devices <= 1
66 | else XLAStrategy(sync_module_states=False) if tpu else DeepSpeedStrategy(config=ds_config)
67 | )
68 | # For multi-host TPU training, the device count for Fabric is limited to the count on a single host.
69 | fabric_devices = "auto" if (tpu and devices > 1) else devices
70 | fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision)
71 | fabric.launch(main, data_dir, checkpoint_dir, out_dir)
72 |
73 |
74 | def main(
75 | fabric: L.Fabric = None,
76 | data_dir: Path = Path("data/alpaca"),
77 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
78 | out_dir: Path = Path("out/adapter_v2/alpaca"),
79 | ):
80 | check_valid_checkpoint_dir(checkpoint_dir)
81 | fabric.seed_everything(1337 + fabric.global_rank)
82 |
83 | if fabric.global_rank == 0:
84 | os.makedirs(out_dir, exist_ok=True)
85 |
86 | train_data, val_data = load_datasets(data_dir=data_dir)
87 |
88 | config = Config.from_name(name=checkpoint_dir.name)
89 | checkpoint_path = checkpoint_dir / "lit_model.pth"
90 | fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
91 | with fabric.init_module():
92 | model = Parrot(config)
93 | with lazy_load(checkpoint_dir / "lit_model.pth") as checkpoint:
94 | # strict=False because missing keys due to adapter weights not contained in state dict
95 | model.load_state_dict(checkpoint, strict=False)
96 |
97 | add_adapter_v2_parameters_to_linear_layers(model)
98 | mark_only_adapter_v2_as_trainable(model)
99 |
100 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
101 | fabric.print(f"Number of trainable parameters: {num_params}")
102 |
103 | optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
104 | model, optimizer = fabric.setup(model, optimizer)
105 | train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir)
106 |
107 | # Save the final checkpoint at the end of training
108 | save_path = out_dir / "lit_model_adapter_finetuned.pth"
109 | fabric.print(f"Saving adapter weights to {str(save_path)!r}")
110 | save_model_checkpoint(fabric, model, save_path)
111 |
112 |
113 | def train(
114 | fabric: L.Fabric,
115 | model: torch.nn.Module,
116 | optimizer: torch.optim.Optimizer,
117 | train_data: np.ndarray,
118 | val_data: np.ndarray,
119 | checkpoint_dir: Path,
120 | out_dir: Path,
121 | ) -> None:
122 | """The training loop.
123 |
124 | Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
125 | """
126 | step_count = 0
127 |
128 | tokenizer = Tokenizer(checkpoint_dir / "tokenizer.json", checkpoint_dir / "tokenizer_config.json")
129 |
130 | if fabric.device.type == "xla":
131 | import torch_xla.core.xla_model as xm
132 |
133 | xm.mark_step()
134 | for iter_num in range(max_iters):
135 | if step_count <= warmup_iters:
136 | # linear warmup
137 | lr = learning_rate * step_count / warmup_iters
138 | for param_group in optimizer.param_groups:
139 | param_group["lr"] = lr
140 |
141 | t0 = time.time()
142 |
143 | input_ids, targets = get_batch(fabric, train_data)
144 |
145 | with fabric.no_backward_sync(model, enabled=((iter_num + 1) % gradient_accumulation_iters != 0)):
146 | logits = model(input_ids)
147 | loss = loss_fn(logits, targets)
148 | fabric.backward(loss / gradient_accumulation_iters)
149 |
150 | if (iter_num + 1) % gradient_accumulation_iters == 0:
151 | optimizer.step()
152 | if fabric.device.type == "xla":
153 | xm.mark_step()
154 | optimizer.zero_grad()
155 | step_count += 1
156 |
157 | if step_count % eval_interval == 0:
158 | val_loss = validate(fabric, model, val_data, tokenizer)
159 | fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
160 | fabric.barrier()
161 |
162 | if step_count % save_interval == 0:
163 | save_path = out_dir / f"iter-{iter_num:06d}.pth"
164 | fabric.print(f"Saving adapter weights to {str(save_path)!r}")
165 | # TODO: Provide a function/script to merge the adapter weights with pretrained weights
166 | save_model_checkpoint(fabric, model, save_path)
167 | else:
168 | if fabric.device.type == "xla":
169 | xm.mark_step()
170 |
171 | dt = time.time() - t0
172 | if iter_num % log_interval == 0:
173 | fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
174 |
175 |
176 | @torch.no_grad()
177 | def validate(
178 | fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray, tokenizer: Tokenizer
179 | ) -> torch.Tensor:
180 | fabric.print("Validating ...")
181 | model.eval()
182 | losses = torch.zeros(eval_iters)
183 | for k in range(eval_iters):
184 | input_ids, targets = get_batch(fabric, val_data)
185 | logits = model(input_ids)
186 | loss = loss_fn(logits, targets)
187 | losses[k] = loss.item()
188 | val_loss = losses.mean()
189 |
190 | # produce an example:
191 | instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
192 | fabric.print(instruction)
193 | sample = {"instruction": instruction, "input": ""}
194 | prompt = generate_prompt(sample)
195 | encoded = tokenizer.encode(prompt, device=model.device)
196 | output = generate(
197 | model, idx=encoded, max_returned_tokens=len(encoded) + 100, max_seq_length=model.config.block_size, temperature=0.8
198 | )
199 | output = tokenizer.decode(output)
200 | fabric.print(output)
201 |
202 | model.train()
203 | return val_loss.item()
204 |
205 |
206 | def loss_fn(logits, targets):
207 | # shift the targets such that output n predicts token n+1
208 | logits = logits[..., :-1, :].contiguous()
209 | targets = targets[..., 1:].contiguous()
210 | loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
211 | return loss
212 |
213 |
214 | def get_batch(fabric: L.Fabric, data: list):
215 | ix = torch.randint(len(data), (micro_batch_size,))
216 |
217 | input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
218 | labels = [data[i]["labels"].type(torch.int64) for i in ix]
219 |
220 | max_len = max(len(s) for s in input_ids) if fabric.device.type != "xla" else max_seq_length
221 |
222 | def pad_right(x, pad_id):
223 | # pad right based on the longest sequence
224 | n = max_len - len(x)
225 | return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
226 |
227 | x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
228 | y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
229 |
230 | if fabric.device.type in ("mps", "xla"):
231 | x, y = fabric.to_device((x, y))
232 | else:
233 | x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
234 |
235 | return x, y
236 |
237 |
238 | def load_datasets(data_dir: Path):
239 | train_data = torch.load(data_dir / "train.pt")
240 | val_data = torch.load(data_dir / "test.pt")
241 | return train_data, val_data
242 |
243 |
244 | def save_model_checkpoint(fabric, model, file_path: Path):
245 | file_path = Path(file_path)
246 |
247 | if isinstance(fabric.strategy, DeepSpeedStrategy):
248 | from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
249 |
250 | tmp_path = file_path.with_suffix(".tmp")
251 | fabric.save(tmp_path, {"model": model})
252 | fabric.barrier()
253 | if fabric.global_rank == 0:
254 | # Create a consolidated checkpoint with the same name next to the deepspeed checkpoint
255 | # and only keep the adapter weights
256 | state_dict = get_fp32_state_dict_from_zero_checkpoint(tmp_path)
257 | state_dict = adapter_v2_state_from_state_dict(state_dict)
258 | torch.save(state_dict, file_path)
259 | shutil.rmtree(tmp_path)
260 | else:
261 | state_dict = adapter_v2_state_from_state_dict(model.state_dict())
262 | if fabric.global_rank == 0:
263 | torch.save(state_dict, file_path)
264 | fabric.barrier()
265 |
266 |
267 | if __name__ == "__main__":
268 | # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
269 | # torch.backends.cuda.enable_flash_sdp(False)
270 | torch.set_float32_matmul_precision("high")
271 |
272 | from jsonargparse.cli import CLI
273 |
274 | warnings.filterwarnings(
275 | # false positive using deepspeed: https://github.com/Lightning-AI/lightning/pull/17761#discussion_r1219705307
276 | "ignore",
277 | message="Remove `.no_backward_sync()` from your code",
278 | )
279 | import datetime
280 | started = datetime.datetime.now()
281 |
282 | CLI(setup)
283 |
284 | now = datetime.datetime.now()
285 | print("started:", started)
286 | print("finished:", now)
287 |
288 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/finetune/full.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import sys
4 | import time
5 | import warnings
6 | from pathlib import Path
7 | from typing import Literal
8 |
9 | import lightning as L
10 | import numpy as np
11 | import torch
12 | from lightning.fabric.accelerators.mps import MPSAccelerator
13 | from lightning.fabric.strategies import DeepSpeedStrategy
14 |
15 | # support running without installing as a package
16 | wd = Path(__file__).parent.parent.resolve()
17 | sys.path.append(str(wd))
18 |
19 | from generate.base import generate
20 | from lit_parrot.config import Config
21 | from lit_parrot.model import Parrot
22 | from lit_parrot.tokenizer import Tokenizer
23 | from lit_parrot.utils import lazy_load, check_valid_checkpoint_dir
24 | from scripts.prepare_alpaca import generate_prompt
25 |
26 | eval_interval = 600
27 | save_interval = 1000
28 | eval_iters = 100
29 | log_interval = 1
30 | devices = 6
31 |
32 | # Hyperparameters
33 | learning_rate = 9e-3
34 | batch_size = 128 / devices
35 | micro_batch_size = 1
36 | gradient_accumulation_steps = batch_size // micro_batch_size
37 | assert gradient_accumulation_steps > 0
38 | epoch_size = 50000 * 6 # train dataset size
39 | num_epochs = 1
40 | max_iters = num_epochs * (epoch_size // micro_batch_size) // devices
41 | weight_decay = 0.02
42 | max_seq_length = 256 # see scripts/prepare_alpaca.py
43 | warmup_iters = 2 * (epoch_size // micro_batch_size) // devices # 2 epochs
44 |
45 | ds_config = {
46 | "train_micro_batch_size_per_gpu": micro_batch_size,
47 | "gradient_accumulation_steps": gradient_accumulation_steps,
48 | "zero_optimization": {"stage": 3},
49 | }
50 |
51 |
52 | def main(
53 | data_dir: Path = Path("data/alpaca"),
54 | checkpoint_dir: Path = Path("checkpoints/tiiuae/falcon-7b"),
55 | out_dir: Path = Path("out/full/alpaca"),
56 | precision: Literal["bf16-true", "32-true", "bf16-mixed"] = "bf16-true",
57 | ):
58 | check_valid_checkpoint_dir(checkpoint_dir)
59 |
60 | fabric = L.Fabric(
61 | devices=devices, strategy=(DeepSpeedStrategy(config=ds_config) if devices > 1 else "auto"), precision=precision
62 | )
63 | fabric.launch()
64 | fabric.seed_everything(1337 + fabric.global_rank)
65 |
66 | if fabric.global_rank == 0:
67 | os.makedirs(out_dir, exist_ok=True)
68 |
69 | train_data, val_data = load_datasets(data_dir=data_dir)
70 |
71 | config = Config.from_name(name=checkpoint_dir.name, block_size=max_seq_length)
72 | checkpoint_path = checkpoint_dir / "lit_model.pth"
73 | fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
74 |
75 |
76 | #with fabric.init_module():
77 | # model = Parrot(config)
78 |
79 | #with lazy_load(checkpoint_path) as checkpoint:
80 | # model.load_state_dict(checkpoint, strict=False)
81 |
82 | checkpoint = torch.load(checkpoint_path)
83 | with fabric.device:
84 | torch.set_default_tensor_type(torch.HalfTensor)
85 | model = Parrot(config).bfloat16()
86 | torch.set_default_tensor_type(torch.FloatTensor)
87 | model.load_state_dict(checkpoint, strict=False)
88 |
89 |
90 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
91 | fabric.print(f"Number of trainable parameters: {num_params}")
92 |
93 | optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
94 | model, optimizer = fabric.setup(model, optimizer)
95 | train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir)
96 |
97 | # Save the final checkpoint at the end of training
98 | save_path = out_dir / "lit_model_full_finetuned.pth"
99 | fabric.print(f"Saving weights to {str(save_path)!r}")
100 | save_model_checkpoint(fabric, model, save_path)
101 |
102 |
103 | def train(
104 | fabric: L.Fabric,
105 | model: torch.nn.Module,
106 | optimizer: torch.optim.Optimizer,
107 | train_data: np.ndarray,
108 | val_data: np.ndarray,
109 | checkpoint_dir: Path,
110 | out_dir: Path,
111 | ) -> None:
112 | """The training loop.
113 |
114 | Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
115 | """
116 | step_count = 0
117 |
118 | tokenizer = Tokenizer(checkpoint_dir / "tokenizer.json", checkpoint_dir / "tokenizer_config.json")
119 |
120 | for iter_num in range(max_iters):
121 | if step_count <= warmup_iters:
122 | # linear warmup
123 | lr = learning_rate * step_count / warmup_iters
124 | for param_group in optimizer.param_groups:
125 | param_group["lr"] = lr
126 |
127 | t0 = time.time()
128 |
129 | input_ids, targets = get_batch(fabric, train_data)
130 |
131 | with fabric.no_backward_sync(model, enabled=((iter_num + 1) % gradient_accumulation_steps != 0)):
132 | logits = model(input_ids)
133 | loss = loss_fn(logits, targets)
134 | fabric.backward(loss / gradient_accumulation_steps)
135 |
136 | if (iter_num + 1) % gradient_accumulation_steps == 0:
137 | optimizer.step()
138 | optimizer.zero_grad()
139 | step_count += 1
140 |
141 | if step_count % eval_interval == 0:
142 | val_loss = validate(fabric, model, val_data, tokenizer)
143 | fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
144 | fabric.barrier()
145 |
146 | if step_count % save_interval == 0:
147 | save_path = out_dir / f"iter-{iter_num:06d}.pth"
148 | fabric.print(f"Saving weights to {str(save_path)!r}")
149 | save_model_checkpoint(fabric, model, save_path)
150 |
151 | dt = time.time() - t0
152 | if iter_num % log_interval == 0:
153 | fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
154 |
155 |
156 | @torch.no_grad()
157 | def validate(fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray, tokenizer: Tokenizer) -> torch.Tensor:
158 | fabric.print("Validating ...")
159 | model.eval()
160 | losses = torch.zeros(eval_iters)
161 | for k in range(eval_iters):
162 | input_ids, targets = get_batch(fabric, val_data)
163 | logits = model(input_ids)
164 | loss = loss_fn(logits, targets)
165 | losses[k] = loss.item()
166 | val_loss = losses.mean()
167 |
168 | # produce an example:
169 | instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
170 | fabric.print(instruction)
171 | sample = {"instruction": instruction, "input": ""}
172 | prompt = generate_prompt(sample)
173 | encoded = tokenizer.encode(prompt, device=model.device)
174 | output = generate(
175 | model, idx=encoded, max_returned_tokens=len(encoded) + 100, max_seq_length=max_seq_length, temperature=0.8
176 | )
177 | output = tokenizer.decode(output)
178 | fabric.print(output)
179 |
180 | model.train()
181 | return val_loss.item()
182 |
183 |
184 | def loss_fn(logits, targets):
185 | # shift the targets such that output n predicts token n+1
186 | logits = logits[..., :-1, :].contiguous()
187 | targets = targets[..., 1:].contiguous()
188 | loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
189 | return loss
190 |
191 |
192 | def get_batch(fabric: L.Fabric, data: list):
193 | ix = torch.randint(len(data), (micro_batch_size,))
194 |
195 | input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
196 | labels = [data[i]["labels"].type(torch.int64) for i in ix]
197 |
198 | max_len = max(len(s) for s in input_ids)
199 |
200 | def pad_right(x, pad_id):
201 | # pad right based on the longest sequence
202 | n = max_len - len(x)
203 | return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
204 |
205 | x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
206 | y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
207 |
208 | if isinstance(fabric.accelerator, MPSAccelerator):
209 | x, y = fabric.to_device((x, y))
210 | else:
211 | x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
212 |
213 | return x, y
214 |
215 |
216 | def load_datasets(data_dir: Path):
217 | train_data = torch.load(data_dir / "train.pt")
218 | val_data = torch.load(data_dir / "test.pt")
219 | return train_data, val_data
220 |
221 |
222 | def save_model_checkpoint(fabric, model, file_path: Path):
223 | file_path = Path(file_path)
224 |
225 | if isinstance(fabric.strategy, DeepSpeedStrategy):
226 | from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
227 |
228 | tmp_path = file_path.with_suffix(".tmp")
229 | fabric.save(tmp_path, {"model": model})
230 | fabric.barrier()
231 | if fabric.global_rank == 0:
232 | state_dict = get_fp32_state_dict_from_zero_checkpoint(tmp_path)
233 | torch.save(state_dict, file_path)
234 | shutil.rmtree(tmp_path)
235 | else:
236 | if fabric.global_rank == 0:
237 | torch.save(model.state_dict(), file_path)
238 | fabric.barrier()
239 |
240 |
241 | if __name__ == "__main__":
242 | # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
243 | # torch.backends.cuda.enable_flash_sdp(False)
244 | torch.set_float32_matmul_precision("high")
245 |
246 | from jsonargparse.cli import CLI
247 | warnings.filterwarnings(
248 | # false positive using deepspeed: https://github.com/Lightning-AI/lightning/pull/17761#discussion_r1219705307
249 | "ignore", message="Remove `.no_backward_sync()` from your code",
250 | )
251 | import datetime
252 | started = datetime.datetime.now()
253 |
254 | CLI(main)
255 |
256 | now = datetime.datetime.now()
257 | print("started:", started)
258 | print("finished:", now)
259 |
260 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
--------------------------------------------------------------------------------
/lit-benchmarks/falcon-7b/finetune/lora.py:
--------------------------------------------------------------------------------
1 | """
2 | Instruction-tuning with LoRA on the Alpaca dataset.
3 |
4 | Note: If you run into a CUDA error "Expected is_sm80 to be true, but got false", uncomment the line
5 | `torch.backends.cuda.enable_flash_sdp(False)` in the script below (see https://github.com/Lightning-AI/lit-llama/issues/101).
6 | """
7 | import os
8 | import sys
9 | import time
10 | import warnings
11 | from pathlib import Path
12 | from typing import Optional
13 |
14 | import lightning as L
15 | import numpy as np
16 | import torch
17 | from lightning.fabric.strategies import DeepSpeedStrategy, XLAStrategy
18 |
19 | # support running without installing as a package
20 | wd = Path(__file__).parent.parent.resolve()
21 | sys.path.append(str(wd))
22 |
23 | from generate.base import generate
24 | from lit_parrot.lora import mark_only_lora_as_trainable, lora, lora_state_dict
25 | from lit_parrot.model import Parrot, Config
26 | from lit_parrot.tokenizer import Tokenizer
27 | from lit_parrot.utils import lazy_load, check_valid_checkpoint_dir
28 | from scripts.prepare_alpaca import generate_prompt
29 |
30 |
31 | eval_interval = 100
32 | save_interval = 100
33 | eval_iters = 100
34 | log_interval = 1
35 | devices = 1
36 |
37 | # Hyperparameters
38 | learning_rate = 3e-4
39 | batch_size = 128
40 | micro_batch_size = 1
41 | gradient_accumulation_iters = batch_size // micro_batch_size
42 | assert gradient_accumulation_iters > 0
43 | max_iters = 52000 # train dataset size
44 | weight_decay = 0.01
45 | lora_r = 8
46 | lora_alpha = 16
47 | lora_dropout = 0.05
48 | warmup_iters = 100
49 |
50 | ds_config = {
51 | "train_micro_batch_size_per_gpu": micro_batch_size,
52 | "gradient_accumulation_steps": gradient_accumulation_iters,
53 | "zero_optimization": {"stage": 2},
54 | }
55 |
56 |
57 | def setup(
58 | data_dir: Path = Path("data/alpaca"),
59 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
60 | out_dir: Path = Path("out/lora/alpaca"),
61 | precision: Optional[str] = None,
62 | tpu: bool = False,
63 | ):
64 | if precision is None:
65 | precision = "32-true" if tpu else "bf16-true"
66 | strategy = (
67 | "auto"
68 | if devices <= 1
69 | else XLAStrategy(sync_module_states=False) if tpu else DeepSpeedStrategy(config=ds_config)
70 | )
71 | # For multi-host TPU training, the device count for Fabric is limited to the count on a single host.
72 | fabric_devices = "auto" if (tpu and devices > 1) else devices
73 | fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision)
74 | fabric.launch(main, data_dir, checkpoint_dir, out_dir)
75 |
76 |
77 | def main(
78 | fabric: L.Fabric = None,
79 | data_dir: Path = Path("data/alpaca"),
80 | checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
81 | out_dir: Path = Path("out/lora/alpaca"),
82 | ):
83 | check_valid_checkpoint_dir(checkpoint_dir)
84 | fabric.seed_everything(1337 + fabric.global_rank)
85 |
86 | if fabric.global_rank == 0:
87 | os.makedirs(out_dir, exist_ok=True)
88 |
89 | train_data, val_data = load_datasets(data_dir=data_dir)
90 |
91 | config = Config.from_name(name=checkpoint_dir.name)
92 | checkpoint_path = checkpoint_dir / "lit_model.pth"
93 | fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}")
94 | with fabric.init_module(), lora(r=lora_r, alpha=lora_alpha, dropout=lora_dropout, enabled=True):
95 | model = Parrot(config)
96 | with lazy_load(checkpoint_path) as checkpoint:
97 | # strict=False because missing keys due to LoRA weights not contained in state dict
98 | model.load_state_dict(checkpoint, strict=False)
99 |
100 | mark_only_lora_as_trainable(model)
101 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
102 | fabric.print(f"Number of trainable parameters: {num_params}")
103 |
104 | optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
105 | model, optimizer = fabric.setup(model, optimizer)
106 | train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir)
107 |
108 | # Save the final LoRA checkpoint at the end of training
109 | save_path = out_dir / "lit_model_lora_finetuned.pth"
110 | save_lora_checkpoint(fabric, model, path=save_path)
111 |
112 |
113 | def train(
114 | fabric: L.Fabric,
115 | model: torch.nn.Module,
116 | optimizer: torch.optim.Optimizer,
117 | train_data: np.ndarray,
118 | val_data: np.ndarray,
119 | checkpoint_dir: Path,
120 | out_dir: Path,
121 | ) -> None:
122 | """The training loop.
123 |
124 | Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
125 | """
126 | step_count = 0
127 |
128 | tokenizer = Tokenizer(checkpoint_dir / "tokenizer.json", checkpoint_dir / "tokenizer_config.json")
129 |
130 | if fabric.device.type == "xla":
131 | import torch_xla.core.xla_model as xm
132 |
133 | xm.mark_step()
134 | for iter_num in range(max_iters):
135 | if step_count <= warmup_iters:
136 | # linear warmup
137 | lr = learning_rate * step_count / warmup_iters
138 | for param_group in optimizer.param_groups:
139 | param_group["lr"] = lr
140 |
141 | t0 = time.time()
142 |
143 | input_ids, targets = get_batch(fabric, train_data)
144 |
145 | with fabric.no_backward_sync(model, enabled=((iter_num + 1) % gradient_accumulation_iters != 0)):
146 | logits = model(input_ids)
147 | loss = loss_fn(logits, targets)
148 | fabric.backward(loss / gradient_accumulation_iters)
149 |
150 | if (iter_num + 1) % gradient_accumulation_iters == 0:
151 | optimizer.step()
152 | if fabric.device.type == "xla":
153 | xm.mark_step()
154 | optimizer.zero_grad()
155 | step_count += 1
156 |
157 | if step_count % eval_interval == 0:
158 | val_loss = validate(fabric, model, val_data, tokenizer)
159 | fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
160 | fabric.barrier()
161 |
162 | if step_count % save_interval == 0:
163 | # We are only saving the LoRA weights
164 | save_path = out_dir / f"iter-{iter_num:06d}.pth"
165 | save_lora_checkpoint(fabric, model, save_path)
166 | else:
167 | if fabric.device.type == "xla":
168 | xm.mark_step()
169 |
170 | dt = time.time() - t0
171 | if iter_num % log_interval == 0:
172 | fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
173 |
174 |
175 | @torch.no_grad()
176 | def validate(fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray, tokenizer: Tokenizer) -> torch.Tensor:
177 | fabric.print("Validating ...")
178 | model.eval()
179 | losses = torch.zeros(eval_iters)
180 | for k in range(eval_iters):
181 | input_ids, targets = get_batch(fabric, val_data)
182 | logits = model(input_ids)
183 | loss = loss_fn(logits, targets)
184 | losses[k] = loss.item()
185 | val_loss = losses.mean()
186 |
187 | # produce an example:
188 | instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
189 | fabric.print(instruction)
190 | sample = {"instruction": instruction, "input": ""}
191 | prompt = generate_prompt(sample)
192 | encoded = tokenizer.encode(prompt, device=model.device)
193 | output = generate(
194 | model,
195 | idx=encoded,
196 | max_returned_tokens=len(encoded) + 100,
197 | max_seq_length=model.config.block_size,
198 | temperature=0.8,
199 | )
200 | output = tokenizer.decode(output)
201 | fabric.print(output)
202 |
203 | model.train()
204 | return val_loss.item()
205 |
206 |
207 | def loss_fn(logits, targets):
208 | # shift the targets such that output n predicts token n+1
209 | logits = logits[..., :-1, :].contiguous()
210 | targets = targets[..., 1:].contiguous()
211 | loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
212 | return loss
213 |
214 |
215 | def get_batch(fabric: L.Fabric, data: list):
216 | ix = torch.randint(len(data), (micro_batch_size,))
217 |
218 | input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
219 | labels = [data[i]["labels"].type(torch.int64) for i in ix]
220 |
221 | max_len = max(len(s) for s in input_ids) if fabric.device.type != "xla" else max_seq_length
222 |
223 | def pad_right(x, pad_id):
224 | # pad right based on the longest sequence
225 | n = max_len - len(x)
226 | return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
227 |
228 | x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
229 | y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
230 |
231 | if fabric.device.type in ("mps", "xla"):
232 | x, y = fabric.to_device((x, y))
233 | else:
234 | x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
235 | return x, y
236 |
237 |
238 | def load_datasets(data_dir: Path):
239 | train_data = torch.load(data_dir / "train.pt")
240 | val_data = torch.load(data_dir / "test.pt")
241 | return train_data, val_data
242 |
243 |
244 | def save_lora_checkpoint(fabric, model, path):
245 | fabric.print(f"Saving LoRA weights to {str(path)!r}")
246 | checkpoint = lora_state_dict(model)
247 | torch.save(checkpoint, path)
248 |
249 |
250 | if __name__ == "__main__":
251 | # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
252 | # torch.backends.cuda.enable_flash_sdp(False)
253 | torch.set_float32_matmul_precision("high")
254 |
255 | from jsonargparse.cli import CLI
256 |
257 | warnings.filterwarnings(
258 | # false positive using deepspeed: https://github.com/Lightning-AI/lightning/pull/17761#discussion_r1219705307
259 | "ignore",
260 | message="Remove `.no_backward_sync()` from your code",
261 | )
262 |
263 | import datetime
264 | started = datetime.datetime.now()
265 |
266 | CLI(setup)
267 |
268 | now = datetime.datetime.now()
269 | print("started:", started)
270 | print("finished:", now)
271 |
272 | print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
--------------------------------------------------------------------------------