├── .gitignore ├── Converting the State Dict.ipynb ├── LICENSE ├── PROFILING.md ├── README.md ├── benchmarking ├── large_gpu_benchmarking.sh └── small_gpu_benchmarking.sh ├── benchmarks_03_2024.md ├── fsdp_multi_node.sh ├── hf_train.py ├── nbs ├── 00-profile_lora_qlora.ipynb ├── 00-profile_lora_qlora_hqq.ipynb ├── 01-ft_benchmarking.ipynb ├── 02-qlora-memeff-loading.ipynb └── HQQ.ipynb ├── profile.sh ├── profiling_utils.py ├── scripts ├── __init__.py ├── block_expansion.py ├── dora.py └── lora.py ├── table1.sh ├── tests ├── test_block_expansion.py └── test_dora.py ├── train.py ├── train.sh ├── train_hqq_bench.sh └── train_sql.sh /.gitignore: -------------------------------------------------------------------------------- 1 | wandb 2 | output 3 | .ipynb_checkpoints 4 | nbs/profile_snapshots 5 | **/*checkpoint.ipynb 6 | **/*.log 7 | nbs/tmp-* 8 | data/ 9 | sbatch_outputs/ 10 | 11 | # python ignores 12 | __pycache__/ 13 | *.pyc 14 | *.pyo 15 | *.pyd -------------------------------------------------------------------------------- /Converting the State Dict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b657476e-1865-4687-898d-276c69eda4bc", 6 | "metadata": {}, 7 | "source": [ 8 | "# Converting the State Dict\n", 9 | "\n", 10 | "The training script (`train.py`) doesn't support any fancy saving/checkpointing methods, but it does optionally save the model right at the end of training into a safetensors file. In this notebook we'll show how to load in these saved weights for downstream evaluation and usage. This should hopefully become unneeded as frameworks integrate the changes needed to make FSDP+QLoRA work natively." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "a2a3fa90-3d40-45db-9e91-3489fc207a14", 16 | "metadata": {}, 17 | "source": [ 18 | "As an example, let's look at a model trained with the following command (using default settings for LoRA rank etc):\n", 19 | "\n", 20 | "`python train.py --save_model True --train_type qlora --output_dir qlora_output`\n", 21 | "\n", 22 | "We'll load the saved state_dict, and then copy the relevant weights into a PEFT model to save via their TODO method.\n", 23 | "\n", 24 | "Let's start by loading the state dict. If you uncomment the print statement, you'll see that for every linear layer that had a LoRA adapter, we have something like this:\n", 25 | "```\n", 26 | "base_model.model.model.layers.0.mlp.down_proj.base_layer.weight torch.bfloat16 torch.Size([11272192, 1])\n", 27 | "base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight torch.bfloat16 torch.Size([8, 11008])\n", 28 | "base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight torch.bfloat16 torch.Size([4096, 8])\n", 29 | "```\n", 30 | "\n", 31 | "The base weights are flattened and quantized 4-bit values, which we won't need (we'll load the original base model later), and the lora_A and lora_B adapters are the ones we're interested in." 
32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "id": "fb4bb4b5-a250-489c-be56-5db542ac882e", 38 | "metadata": { 39 | "scrolled": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "from safetensors import safe_open\n", 44 | "\n", 45 | "tensors = {}\n", 46 | "with safe_open(\"qlora_output/model_state_dict.safetensors\", framework=\"pt\", device=0) as f:\n", 47 | " for k in f.keys():\n", 48 | " tensors[k] = f.get_tensor(k) # loads the full tensor given a key\n", 49 | " # print(k, tensors[k].dtype, tensors[k].shape) # Uncomment to view" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "f96a4876-c355-4b00-be1e-853de6be9ce1", 55 | "metadata": {}, 56 | "source": [ 57 | "To save memory, we can delete everything but the LoRA layers:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "id": "7a63af21-1cbf-4c70-9841-63b1338ee757", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "for k in tensors:\n", 68 | " if 'lora' not in k: tensors[k] = None" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "5aaa3483-cf79-44bf-9533-3937bd089f6e", 74 | "metadata": {}, 75 | "source": [ 76 | "Next, we load the base model and add a random adapter:" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "id": "879e24cd-eb72-4d23-8583-12cd91ed117f", 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "application/vnd.jupyter.widget-view+json": { 88 | "model_id": "5aeb53e875144f57ad6683becddb7ed0", 89 | "version_major": 2, 90 | "version_minor": 0 91 | }, 92 | "text/plain": [ 93 | "Loading checkpoint shards: 0%| | 0/2 [00:00" 170 | ] 171 | }, 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "new_sd = model.state_dict()\n", 179 | "for k in new_sd:\n", 180 | " if 'lora' in k:\n", 181 | " new_sd[k] = tensors[k]\n", 182 | "\n", 183 | "model.load_state_dict(new_sd)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "dd59ea0b-68e4-457e-9c0f-1804b327794c", 189 | "metadata": {}, 190 | "source": [ 191 | "And now, since we have a regular PEFT model, we can save using the built-in methods:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 5, 197 | "id": "21096cf2-9270-478a-b7f8-9de70827c46c", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "model.save_pretrained(\"lora_adapters\")" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 6, 207 | "id": "f24feebb-4928-4d1b-aa0a-b8f86e623336", 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "README.md adapter_config.json\tadapter_model.safetensors\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "!ls lora_adapters" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 7, 225 | "id": "ad1300e9-4beb-47e3-ba87-3616e8cd819d", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# model.push_to_hub('your_repo_id') # If you want to share your model..." 
230 | ] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3 (ipykernel)", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.12.1" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 5 254 | } 255 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 Answer.AI 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /PROFILING.md: -------------------------------------------------------------------------------- 1 | ## Profiling 2 | 3 | Documentation for how to profile your training runs. 4 | 5 | **Tips** 6 | 7 | - Only record what is necessary as profiling can significantly slow down training process. 8 | - Set a `torch.profile.schedule` when running the profiler (description below), as trace artifacts are exported at the end of each profiling cycle and can be very large (on the order of hundreds of MBs each). 
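The schedule mentioned in the tips above maps directly onto `torch.profiler.schedule`. Below is a minimal, self-contained sketch of the wait/warmup/active/repeat mechanics using a toy model; it is an illustration only, not the repo's actual `profiling_utils.py` integration, and the schedule values simply mirror the script's default 10-step cycle described later in this document.

```python
import torch
from torch.profiler import ProfilerActivity, profile, schedule

# Toy stand-ins for the real training loop (assumption: any small model/optimizer works here).
model = torch.nn.Linear(512, 512).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Wait 7 steps, warm up for 2, record 1; repeat=0 keeps cycling for the whole run.
prof_schedule = schedule(wait=7, warmup=2, active=1, repeat=0)

def on_trace_ready(prof):
    # Called at the end of every profiling cycle; this is when the (potentially large)
    # trace artifacts get written out, hence the advice to record sparingly.
    prof.export_chrome_trace(f"chrome_trace_step{prof.step_num}.json")

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=prof_schedule,
    on_trace_ready=on_trace_ready,
    profile_memory=True,
) as prof:
    for step in range(30):
        x = torch.randn(8, 512, device="cuda")
        loss = model(x).square().mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        prof.step()  # tells the profiler that one training step has finished
```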
9 | 10 | **IMPORTANT** 11 | There are issues with recording stack traces and exporting traces simultaneously (see this [issue](https://github.com/pytorch/pytorch/issues/113564)) depending on `python` version. 12 | 13 | Tested with `python=3.11.9` and `torch=2.3.0`. 14 | 15 | ## Quickstart 16 | 17 | Running the following: 18 | 19 | ``` 20 | python train.py \ 21 | --model_name "meta-llama/Llama-2-7b-hf" \ 22 | --train_type qlora \ 23 | --profile true \ 24 | --export_trace true \ 25 | --export_memory_timeline true \ 26 | --max_steps 10 27 | ``` 28 | 29 | will result in a directory `{model_name}_{train_type}-{local_rank}` with the following artifacts: 30 | 31 | - `{model_name}-{train_type}-chrome-trace.json.gz` - interactive trace that can be viewed using `chrome::tracing`, `perfetto`, or `tensorboard` 32 | - `{model_name}-{train_type}-key_averages.txt` - sorted table of events, e.g.: 33 | 34 | | Name | Self CPU % | Self CPU | CPU total % | CPU total | CPU time avg | Self CUDA | Self CUDA % | CUDA total | CUDA time avg | # of Calls | Source Location | 35 | | --------------------------------------------------------------------------------- | ---------- | -------- | ----------- | --------- | ------------ | --------- | ----------- | ---------- | ------------- | ---------- | ------------------------------------------------------------------------ | 36 | | ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned int*, unsigned int\*, int) | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 88.038ms | 12.14% | 88.038ms | 830.547us | 106 | | 37 | | | | | | | | | | | | | torch/distributed/distributed_c10d.py(2864): all_gather_into_tensor | 38 | | | | | | | | | | | | | torch/distributed/c10d_logger.py(72): wrapper | 39 | | | | | | | | | | | | | torch/distributed/fsdp/\_flat_param.py(1366): \_all_gather_flat_param | 40 | | | | | | | | | | | | | torch/distributed/fsdp/\_flat_param.py(1285): unshard | 41 | | FullyShardedDataParallel.forward | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 59.050ms | 8.14% | 59.050ms | 59.050ms | 1 | | 42 | | | | | | | | | | | | | torch/nn/functional.py(2154): embedding | 43 | | | | | | | | | | | | | torch/nn/modules/sparse.py(162): forward | 44 | | | | | | | | | | | | | torch/nn/modules/module.py(1534): \_call_impl | 45 | | | | | | | | | | | | | nn.Module: Embedding_0 | 46 | 47 | - `{model_name}-{train_type}-memory-timeline.html` - Stacked time series plot of memory use broken down by `Parameter`, `Gradients`, `Activations`, etc. 48 | - `{model_name}-{train_type}-stacks.txt` - Stack trace. See [docs](https://pytorch.org/docs/stable/profiler.html#torch.profiler._KinetoProfile.export_stacks). 49 | 50 | ## Detailed Usage 51 | 52 | `CLI` options in full: 53 | 54 | - `profile` - whether to profile 55 | - `profiling_outputs` - output directory for `torch.profiler` artifacts 56 | - `export_trace` - enables exporting of interactive trace that can be viewed and analyzed using `chrome::tracing` 57 | - `export_memory_timeline` - exports an HTML memory timeline which shows memory use by category (`parameters`, `activations`, `gradients`, etc.) 58 | - `with_stack` - exports stack trace 59 | - `with_shapes` - adds shapes of operators to the trace 60 | - `{wait, warmup, active}_steps, repeat, profiling_frequency` - controls the profiling schedule: 61 | 62 | - `wait_steps` - number of steps for the profiler to wait before starting to profile. Overridden if `repeat=0` (see note below). 
63 | - `warmup_steps` - number of steps for profiler to profile without recording 64 | - `active_steps` - number of steps to record 65 | - `repeat` - number of times to repeat the above cycle of `wait, warmup, active` if `repeat > 0` else cycles forever 66 | - `profiling_frequency` - profiling frequency in steps. Only used if `repeat = 0`, in which case `wait_steps = profiling_frequency - (warmup_steps + active_steps)` such that the effective cycle length = `profiling_frequency`. E.g., if `profiling_frequency=10`, `warmup_steps=2`, `active_steps=1`, then the profiler will wait 8 steps, warmup for 2, record for 1, then repeat. 67 | 68 | **Note**: Simplest to think of 2 ways of scheduling the profiler: 69 | 70 | 1. Set `repeat` to the number of total number of desired profiling cycles. For example if `wait=1`, `warmup=1`, `active=1`, and `repeat=1`, then the profiler will wait for 1 step, warmup for 1, and record for 1 then stop. 71 | 2. Set `repeat` to `0` and `profiling_frequency` to the cycle length. E.g., with `repeat=0`, `profiling_frequency=10`, `warmup=2`, `active=1`, then `wait` will be automatically set to `profiling_frequency - (warmup + active) = 7`. The profiler will then continuously execute the following cycle: wait for 7 steps, warmup for 2, record for 1 for the entire training run. 72 | 73 | See [docs](https://pytorch.org/docs/stable/profiler.html#torch.profiler.schedule) for further details. 74 | 75 | - `max_steps` - maximum number of batches per epoch. E.g., with `num_epochs=1`, stops training after `max_steps` of batches. Note that this is automatically adjusted to accommodate the profiler schedule; for example, if `max_steps < wait_steps + warmup_steps + active_steps`, it will automatically be set to `wait_steps + warmup_steps + active_steps` such that the profiler can run for at least 1 cycle. 76 | 77 | ## Additional Notes 78 | 79 | The default schedule for the profiler is set to continuously execute a 10-step cycle: wait for 7, warmup for 2, record for 1. 80 | 81 | `with_stack` and `with_shapes` are overridden by `export_memory_timeline` since the memory profile requires these options to be `True`. 82 | 83 | ## Examples 84 | 85 | - Record every 5th step, exporting a `chrome` / `tensorboard` trace for each cycle: 86 | 87 | ``` 88 | python train.py \ 89 | --model_name "hf-internal-testing/tiny-random-LlamaForCausalLM" \ 90 | --gradient_accumulation_steps 2 \ 91 | --batch_size 1 \ 92 | --context_length 256 \ 93 | --num_epochs 1 \ 94 | --sharding_strategy full_shard \ 95 | --precision bf16 \ 96 | --train_type qlora \ 97 | --use_gradient_checkpointing false \ 98 | --use_cpu_offload false \ 99 | --log_to stdout \ 100 | --dataset dummy \ 101 | --profile true \ 102 | --export_trace true \ 103 | --export_memory_timeline false \ 104 | --with_stack true \ 105 | --num_epochs 1 \ 106 | --max_steps 20 \ 107 | --repeat 0 \ 108 | --warmup_steps 4 \ 109 | --active_steps 1 \ 110 | --profiling_frequency 5 \ 111 | --profiling_output llama-test 112 | ``` 113 | 114 | The output will be a 4 trace output folders, at iteration 5, 10, ..., each containing a trace with a single training step at that iteration. 115 | 116 | Also in the folder will be exported stacks (which can be visualized using flamegraphs or other stack viewers) and `key_averages`, which is a summary table of operations ordered by `cuda` time. 117 | 118 | Note that we set `max_steps=20` so that the training loop will exit after 20 batches. 
If `max_steps=-1` (the default setting), the profiler will repeat the cycle during the entire training run. 119 | 120 | - Record 5 steps (after 1 warmup step) then stop profiling: 121 | ``` 122 | python train.py \ 123 | --model_name "hf-internal-testing/tiny-random-LlamaForCausalLM" \ 124 | --gradient_accumulation_steps 2 \ 125 | --batch_size 1 \ 126 | --context_length 256 \ 127 | --num_epochs 1 \ 128 | --sharding_strategy full_shard \ 129 | --precision bf16 \ 130 | --train_type qlora \ 131 | --use_gradient_checkpointing false \ 132 | --use_cpu_offload false \ 133 | --log_to stdout \ 134 | --dataset dummy \ 135 | --profile true \ 136 | --export_trace true \ 137 | --export_memory_timeline true \ 138 | --with_stack true \ 139 | --num_epochs 1 \ 140 | --max_steps 20 \ 141 | --warmup_steps 1 \ 142 | --active_steps 5 \ 143 | --repeat 1 \ 144 | --profiling_output llama-test2 145 | ``` 146 | The output will be a single trace at `iteration_6` which contains 5 training steps. 147 | In addition to the `stacks` and `key_averages` artifacts, there will be a `memory_timeline` `html`, which shows a breakdown of memory usage by `parameter`, `gradients`, `activations`, etc. 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fsdp_qlora 2 | 3 | Training LLMs with Quantized LoRA + FSDP. 4 | 5 | Read our [announcement blog post](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html). 6 | 7 | You should treat this script as an alpha/preview release. If you’re not comfortable with testing and debugging models, we’d suggest holding off for a few months while the community more fully tests the approach. 8 | 9 | ## Integrations 10 | 11 | FSDP+QLoRA has been integrated into: 12 | - [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl/pull/1378): experimental support 13 | 14 | ## Installation 15 | 16 | The following steps should work (tested on Cuda 11.7, 11.8 and 12.1): 17 | - Clone https://github.com/AnswerDotAI/fsdp_qlora 18 | - `pip install llama-recipes fastcore "transformers!=4.38.*,!=4.39.*" --extra-index-url https://download.pytorch.org/whl/test/cu118` as an easy way to get most dependencies (replace 118 with your desired Cuda version) 19 | - Install bitsandbytes `pip install bitsandbytes>=0.43.0` 20 | - Run `huggingface-cli login` (to access Llama 2) 21 | - Optional Libraries: 22 | - HQQ quantization: follow the HQQ installation [instructions](https://github.com/mobiusml/hqq?tab=readme-ov-file#installation). Our training script uses `HQQBackend.ATEN_BACKPROP`, so also make sure to build the custom kernels `cd hqq/kernels && python setup_cuda.py install`. 23 | - Weights and Biases logging: `pip install wandb` 24 | - [Pytorch >= 2.2](https://pytorch.org/blog/pytorch2-2/) is recommended to make use of the native flash-attention 2 kernel. 25 | 26 | ## Finetune Llama-2 70B on Dual 24GB GPUs 27 | 28 | Once installed, run `cd fsdp_qlora` and then run the following command to begin finetuning Llama-2 70B on [Alpaca](https://huggingface.co/datasets/yahma/alpaca-cleaned) at a maximum sequence length of 512 tokens. 
29 | 30 | ```bash 31 | python train.py \ 32 | --model_name meta-llama/Llama-2-70b-hf \ 33 | --batch_size 2 \ 34 | --context_length 512 \ 35 | --precision bf16 \ 36 | --train_type qlora \ 37 | --use_gradient_checkpointing true \ 38 | --use_cpu_offload true \ 39 | --dataset alpaca \ 40 | --reentrant_checkpointing true 41 | ``` 42 | 43 | This example command currently uses just over 128GB of CPU RAM. If you only have 128GB available, we recommend making a 10-20GB swap file to accommodate the initial spike in usage. 44 | 45 | ## Training Options 46 | 47 | For quantization we support HQQ and bitsandbytes. We're currently doing benchmarking to help you decide which to use. If you do use bitsandbytes, be sure to pass `--reentrant_checkpointing True` to avoid triggering a bug in bitsandbytes which results in high memory usage (a fix is in progress). 48 | 49 | ### `--train_type full` 50 | 51 | Full params fine-tuning. 52 | 53 | ```bash 54 | export CUDA_VISIBLE_DEVICES=4,5 # optionally set devices 55 | python train.py \ 56 | --world_size 2 \ # optional, on a single machine will be set automatically 57 | --master_port 12356 \ # optional, defaults to 12355 58 | --model_name meta-llama/Llama-2-7b-hf \ 59 | --gradient_accumulation_steps 4 \ 60 | --batch_size 8 \ 61 | --context_length 512 \ 62 | --precision bf16 \ 63 | --train_type full \ 64 | --use_gradient_checkpointing true \ 65 | --use_cpu_offload false \ 66 | --use_activation_cpu_offload false \ 67 | --log_to wandb \ 68 | --dataset alpaca 69 | ``` 70 | 71 | ### `--train_type lora` 72 | 73 | LoRA fine-tuning using HF PEFT library. 74 | 75 | ```diff 76 | - --train_type full \ 77 | + --train_type lora \ 78 | ``` 79 | 80 | ### `--train_type custom_lora` 81 | 82 | LoRA fine-tuning using a custom LoRA module. 83 | 84 | ```diff 85 | - --train_type full \ 86 | + --train_type custom_lora \ 87 | ``` 88 | 89 | ### `--train_type qlora` 90 | 91 | 4-bit quantized LoRA fine-tuning using bitsanbytes Linear4bit layer with NF4 quantization and HF PEFT library. 92 | 93 | ```diff 94 | - --train_type full \ 95 | + --train_type qlora \ 96 | + --reentrant_checkpointing true \ 97 | ``` 98 | 99 | ### `--train_type custom_qlora` 100 | 101 | 4-bit quantized LoRA fine-tuning using bitsanbytes Linear4bit layer with NF4 quantization and a custom LoRA module. 102 | 103 | ```diff 104 | - --train_type full \ 105 | + --train_type custom_qlora \ 106 | + --reentrant_checkpointing true \ 107 | ``` 108 | 109 | ### `--train_type hqq_lora` 110 | 111 | 4-bit quantized LoRA fine-tuning using HQQ library and a custom LoRA module. 112 | 113 | ```diff 114 | - --train_type full \ 115 | + --train_type hqq_lora \ 116 | ``` 117 | 118 | ### `--train_type bnb_dora` 119 | 120 | 4-bit quantized DoRA fine-tuning using bitsanbytes Linear4bit layer with NF4 quantization and a custom DoRA module. 121 | 122 | ```diff 123 | - --train_type full \ 124 | + --train_type bnb_dora \ 125 | ``` 126 | 127 | ### `--train_type hqq_dora` 128 | 129 | 4-bit quantized DoRA fine-tuning using HQQ library and a custom DoRA module. 130 | 131 | ```diff 132 | - --train_type full \ 133 | + --train_type hqq_dora \ 134 | ``` 135 | 136 | ### `--train_type bnb_llama_pro` 137 | 138 | 4-bit quantized Llama-Pro fine-tuning using bitsanbytes Linear4bit layer with NF4 quantization. 
139 | 140 | To create llama-pro weights, run the following command: 141 | 142 | ```bash 143 | python scripts/block_expansion.py \ 144 | --model_name meta-llama/Llama-2-7b-hf \ 145 | --output_dir /path/to/llama_pro_weights_directory \ 146 | --expansion_rate 0.1 147 | ``` 148 | 149 | ```diff 150 | - --train_type full \ 151 | + --train_type bnb_llama_pro \ 152 | + --llama_pro_path /path/to/llama_pro_weights_directory \ 153 | ``` 154 | 155 | ### `--train_type hqq_llama_pro` 156 | 157 | 4-bit quantized Llama-Pro fine-tuning using HQQ library. 158 | 159 | To create llama-pro weights, run the following command: 160 | 161 | ```bash 162 | python scripts/block_expansion.py \ 163 | --model_name meta-llama/Llama-2-7b-hf \ 164 | --output_dir /path/to/llama_pro_weights_directory \ 165 | --expansion_rate 0.1 166 | ``` 167 | 168 | ```diff 169 | - --train_type full \ 170 | + --train_type hqq_llama_pro \ 171 | + --llama_pro_path /path/to/llama_pro_weights_directory \ 172 | ``` 173 | 174 | ## Low Memory Loading 175 | 176 | During quantized LoRA training we use a custom quantization and loading code to avoid loading the entire model into GPU memory before sharding it across GPUs. This is the default behavior of our training script when any of the following training options `"qlora", "custom_qlora", "hqq_lora"` is used. Other training options are already optimized for low memory loading to their best extent. 177 | 178 | We load the weights iteratively, quantize them on the GPU and place them back to CPU or meta device (based on their rank) concurrently a few layers at a time. We do this across all GPUs to initialize the quantization parameters, such as zero and scale, while using `sync_module_states=True` to sync the model parameters and buffers across all GPUs during FSDP initialization. 179 | 180 | ## Mixed Precision Training 181 | 182 | ### `--precision bf16` (pure bfloat16) 183 | 184 | This will cast all the model parameters to `torch.bfloat16` before training and won't use FSDP mixed precision. As a result, sharded and unsharded params will be stored in bf16, forward and backward passes will be done in bf16, and gradient reduction and updates will be done in bf16. 185 | 186 | ### `--precision fp32` (pure float32) 187 | 188 | This will cast all the model parameters to `torch.float32` before training and won't use FSDP mixed precision. As a result, sharded and unsharded params will be stored in fp32, forward and backward passes will be done in fp32, and gradient reduction and updates will be done in fp32. 189 | 190 | 191 | ### `--precision mp_fp16_autocast` (mixed float16 with autocast) 192 | 193 | This will cast all the model parameters to `torch.float32` before training and will use FSDP mixed precision with 194 | 195 | ``` 196 | mp_policy = MixedPrecision(param_dtype=torch.float32, reduce_dtype=torch.float32, buffer_dtype=torch.float32) 197 | ``` 198 | 199 | As a results, sharded and unsharded params will be stored in fp32. It will use `autocast(torch.float16)` for forward and backward passes, and `autocast(torch.float16)` for gradient reduction and updates. 200 | 201 | 202 | ### `--precision mp_bf16_autocast` (mixed bfloat16 with autocast) 203 | 204 | This will cast all the model parameters to `torch.float32` before training and will use FSDP mixed precision with 205 | 206 | ``` 207 | mp_policy = MixedPrecision(param_dtype=torch.float32, reduce_dtype=torch.float32, buffer_dtype=torch.float32) 208 | ``` 209 | 210 | As a results, sharded and unsharded params will be stored in fp32. 
It will use `autocast(torch.bfloat16)` for forward and backward passes, and `autocast(torch.bfloat16)` for gradient reduction and updates.
211 | 
212 | 
213 | ### `--precision mp_bf16_buffers_autocast` (bfloat16 params and float32 buffers with autocast)
214 | 
215 | This will cast all the model parameters to `torch.bfloat16` before training but will keep the buffers in `torch.float32` and will use FSDP mixed precision with
216 | 
217 | ```
218 | mp_policy = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, buffer_dtype=torch.float32)
219 | ```
220 | 
221 | As a result, sharded and unsharded params will be stored in bf16. It will use `autocast(torch.bfloat16)` for forward and backward passes, and `autocast(torch.bfloat16)` for gradient reduction and updates. Buffers and only [eligible operations](https://pytorch.org/docs/stable/amp.html#cuda-ops-that-can-autocast-to-float16) in autocast will be performed in bf16.
222 | 
223 | This option is important for the RoPE layers, which give incorrect results when cast to lower precision, especially with longer context lengths.
224 | 
225 | ## Comparison to an existing trainer
226 | 
227 | ![Screenshot 2024-02-01 083222](https://github.com/AnswerDotAI/fsdp_qlora/assets/6575163/97bb03fb-c2bb-4679-83ff-63a2e202826f)
228 | `hf_train.py` uses TRL's SFTTrainer for a comparison run. To match our script, modify the dataloading code to train on everything (not just completions) and then run `train.py --train_type qlora --dataset guanaco --batch_size 8 --lr_scheduler cosine --log_to wandb --save_model True --output_dir guanaco_7B --gradient_accumulation_steps 2 --lr 2e-4`. The SFTTrainer version has to run with a lower batch size (4 vs 8), so we only do 2 gradient accumulation steps vs 4 in the QLoRA+FSDP version.
229 | 
230 | ## Converting Saved Models
231 | 
232 | If you specify `--save_model True` the adapter layers will be saved as a state dict. To convert to the regular Hugging Face format and upload to the hub, see: **Converting the State Dict.ipynb**
233 | 
234 | If the `"custom_qlora"` or `"hqq_lora"` training options are used, then only the trainable LoRA parameters will be saved. Before inference, you need to load and quantize the base model again, and separately load the saved LoRA parameters.
235 | 
236 | You can alternatively test whether merging the base model weights and trained LoRA weights and then quantizing them performs similarly to keeping the parameters separate as done during training. To make use of `torch.compile` with HQQ, see https://github.com/mobiusml/hqq/issues/18.
237 | 
238 | ## Limitations
239 | 
240 | While QLoRA finetuning works with FSDP, there are some rough edges to be aware of with this alpha release and our example script.
241 | 
242 | First, the current Transformers release of `AutoModel.from_pretrained` cannot be used to load models into quantized weights, as it does not support the new quant_storage or quantization flag. Loading pretrained models requires writing or using custom model loading code. We provide an example of how to load and quantize a QLoRA model for finetuning in our demo script.
243 | 
244 | We are actively working with Hugging Face to resolve this incompatibility in future Transformers and PEFT releases.
245 | 
246 | Second, while FSDP's Mixed Precision works with QLoRA, practitioners need to be careful to set the `MixedPrecision.param_dtype` to match the `Linear4Bit.quant_storage` dtype.
Otherwise, FSDP’s Mixed Precision could cast the quantized weights to a different precision, essentially turning them into random weights. Our example script shows how to avoid this potential pitfall, and we will be happy to assist model training libraries in correctly exposing FSDP’s Mixed Precision options to users when training with QLoRA 247 | 248 | ## Example: Llama 70B 4-A100 40GB Training 249 | 250 | ```bash 251 | # BnB QLoRA 252 | export CUDA_VISIBLE_DEVICES=4,5,6,7 253 | python train.py \ 254 | --world_size 4 \ 255 | --master_port 12356 \ 256 | --model_name meta-llama/Llama-2-70b-hf \ 257 | --gradient_accumulation_steps 4 \ 258 | --batch_size 2 \ 259 | --context_length 512 \ 260 | --precision bf16_buffers_autocast \ 261 | --train_type custom_qlora \ 262 | --use_gradient_checkpointing true \ 263 | --reentrant_checkpointing true 264 | --use_cpu_offload false \ 265 | --log_to stdout \ 266 | --dataset alpaca 267 | 268 | # HQQ QLoRA 269 | export CUDA_VISIBLE_DEVICES=4,5,6,7 270 | python train.py \ 271 | --world_size 4 \ 272 | --master_port 12356 \ 273 | --model_name meta-llama/Llama-2-70b-hf \ 274 | --gradient_accumulation_steps 4 \ 275 | --batch_size 2 \ 276 | --context_length 512 \ 277 | --precision bf16_buffers_autocast \ 278 | --train_type hqq_lora \ 279 | --use_gradient_checkpointing true \ 280 | --use_cpu_offload false \ 281 | --log_to stdout \ 282 | --dataset alpaca 283 | ``` 284 | 285 | **Note:** For large batch size or long context training HQQ LoRA is a bit more memory efficient compared to BnB LoRA with re-entrant checkpointing. So if you are running into OOM issues, try using HQQ LoRA. 286 | 287 | 288 | ## SLURM Training 289 | 290 | See `fsdp_multi_node.sh` for an example training script using multi-node training with SLURM. 291 | 292 | ## Add support for a new model 293 | 294 | First, import the new model's transformer, attention, and MLP layers from Transformers: 295 | 296 | ```python 297 | from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MISTRAL_ATTENTION_CLASSES, MistralMLP 298 | ``` 299 | 300 | Then in the `get_wrapping_policy` function, add the attention, MLP, and transformer layers to the `self_attn_policy_fn`, `mlp_policy_fn`, and `transformer_wrap_policy` wrapping policy methods: 301 | 302 | ```python 303 | def get_wrapping_policy(custom_policy:bool=False): 304 | 305 | def self_attn_policy_fn(module): 306 | return isinstance(module, tuple(*LLAMA_ATTENTION_CLASSES.values(), *MISTRAL_ATTENTION_CLASSES.values())) 307 | 308 | def mlp_policy_fn(module): 309 | return isinstance(module, (LlamaMLP, MistralMLP)) 310 | 311 | transformer_wrap_policy = functools.partial( 312 | transformer_auto_wrap_policy, 313 | transformer_layer_cls=(LlamaDecoderLayer, MistralDecoderLayer), 314 | ) 315 | ``` 316 | 317 | Finally, add gradient checkpointing support by adding the transformer layer to `check_fn`: 318 | 319 | ```python 320 | if args["use_gradient_checkpointing"]: 321 | check_fn = lambda submodule: isinstance(submodule, (LlamaDecoderLayer, MistralDecoderLayer)) 322 | ``` 323 | -------------------------------------------------------------------------------- /benchmarking/large_gpu_benchmarking.sh: -------------------------------------------------------------------------------- 1 | # 4 x A6000 (48GB), 128 CPUs 472GB CPU RAM 2 | python train.py --batch_size 32 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --train_type lora 3 | python train.py --batch_size 16 --model_name codellama/CodeLlama-34b-hf 
--context_length 256 --use_gradient_checkpointing True --train_type qlora 4 | python train.py --batch_size 16 --use_ddp True --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 5 | # CPU offloading is not needed for this setup. 6 | python train.py --batch_size 32 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type lora 7 | # CPU offloading is not needed for this setup. 8 | python train.py --batch_size 32 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type qlora 9 | python train.py --batch_size 4 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type lora 10 | python train.py --batch_size 6 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 11 | python train.py --batch_size 4 --use_ddp True --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 12 | # Ignore now, slow. 13 | python train.py --batch_size 128 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type lora 14 | # Ignore now, slow. 15 | python train.py --batch_size 128 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type qlora 16 | 17 | 18 | python train.py --batch_size 1 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing True --train_type lora 19 | python train.py --batch_size 10 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 20 | python train.py --batch_size 2 --use_ddp True --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 21 | # python train.py --batch_size 4 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type lora 22 | # python train.py --batch_size 128 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type qlora 23 | # python train.py --batch_size 128 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing False --train_type lora 24 | # python train.py --batch_size 128 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 25 | # python train.py --batch_size 128 --use_ddp True --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 26 | # python train.py --batch_size 128 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type lora 27 | # python train.py --batch_size 128 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type qlora -------------------------------------------------------------------------------- /benchmarking/small_gpu_benchmarking.sh: -------------------------------------------------------------------------------- 1 | # Run for 1 fwd-bwd step to find the max bs using a 2xA5000 (24GB each) and 16 CPUs with 88GB RAM machine. 
2 | # https://github.com/AnswerDotAI/fsdp_qlora/blob/299f51a98246d77f5e556fe1a27ab29e107530f0/train.py 3 | # Uses different default params for train.py script to reduce clutter in the commands below. 4 | 5 | # Notes: 6 | # 1) LORA CPU offloading with model sizes larger than 7B fails, most probably due to limited CPU memory. 7 | # QLORA CPU offloading works fine. 8 | # 2) CPU offloading appears exteremely slow. Getting the actual run times will be useful. 9 | # Check PCIe stuff. 10 | 11 | # Fine. 12 | python train.py --batch_size 48 --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing True --train_type lora 13 | # Check why bs is very low in qlora vs lora? Activation overhead? 14 | python train.py --batch_size 24 --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 15 | # How is qlora full shard same as ddp? 16 | python train.py --batch_size 24 --use_ddp True --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 17 | # Extremely slow. 18 | python train.py --batch_size 96 --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type lora 19 | # Same as before -> Check why bs is very low in qlora vs lora? Activation overhead with large bs? 20 | python train.py --batch_size 30 --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type qlora 21 | # Fine. 22 | python train.py --batch_size 4 --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing False --train_type lora 23 | # Now that bs drops (or activations) we can use larger bs than lora. 24 | python train.py --batch_size 6 --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 25 | # Again -> How is qlora full shard bs same as ddp? 26 | python train.py --batch_size 6 --use_ddp True --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 27 | # Again -> Extremely slow also much lower bs than grad ckpt. Probably smart to prefer grad ckpt over cpu offload. 28 | python train.py --batch_size 8 --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type lora 29 | # Interesting now lora and qlora have same bs when grad ckpt disabled with cpu offloading. 30 | python train.py --batch_size 8 --model_name meta-llama/Llama-2-7b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type qlora 31 | 32 | # Fine. 33 | python train.py --batch_size 22 --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing True --train_type lora 34 | # Fine. 35 | python train.py --batch_size 16 --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 36 | # 13B -> ~13GB for model. DDP works fine. 37 | python train.py --batch_size 15 --use_ddp True --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 38 | # FIXME: torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with signal SIGKILl. Needs more CPU memory than 88GB? 39 | # Reducing batch size to 1 doesn't fix it, how come storing 13b params (~26GB) need more than 88GB? 
40 | python train.py --batch_size 1 --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type lora 41 | # Qlora cpu offloading works as opposed to lora, likely due to smaller model size after quantization? 42 | python train.py --batch_size 18 --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type qlora 43 | # Fine. 44 | python train.py --batch_size 2 --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing False --train_type lora 45 | # Fine. 46 | python train.py --batch_size 4 --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 47 | # Fine. 48 | python train.py --batch_size 3 --use_ddp True --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 49 | # FIXME: torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with signal SIGKILl. Needs more CPU memory than 88GB? 50 | # Reducing batch size to 1 doesn't fix it, how come storing 13b params (~26GB) need more than 88GB? 51 | python train.py --batch_size 1 --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type lora 52 | # Qlora cpu offloading works as opposed to lora, likely due to smaller model size after quantization? 53 | python train.py --batch_size 4 --model_name meta-llama/Llama-2-13b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type qlora 54 | 55 | 56 | # # Test low memory 57 | # python train.py --batch_size 1 --model_name meta-llama/Llama-2-7b-hf --context_length 16 --use_gradient_checkpointing True --train_type qlora --low_memory True 58 | # # This works now. Custom QLORA nn.module, no changes needed in bnb. 59 | # python train.py --batch_size 1 --model_name meta-llama/Llama-2-70b-hf --context_length 1 --use_gradient_checkpointing True --train_type qlora --low_memory True 60 | 61 | # This is theoretically not possible: 62 | python train.py --batch_size 128 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --train_type lora 63 | # Fine. 64 | python train.py --batch_size 6 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 65 | # OOM during training. 66 | python train.py --batch_size 1 --use_ddp True --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 67 | # FIXME: torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with signal SIGKILl. Needs more CPU memory than 88GB? 68 | python train.py --batch_size 1 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type lora 69 | # Qlora cpu offloading works as opposed to lora, likely due to smaller model size after quantization? 70 | python train.py --batch_size 10 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type qlora 71 | # Not possible theoretically, requires at least 68gb/2=34gb per gpu. 72 | python train.py --batch_size 1 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type lora 73 | # OOM. There is some extra memory in gpu:0. 
74 | python train.py --batch_size 1 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 75 | # Fine with custom qlora. 76 | python train.py --batch_size 1 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type custom_qlora 77 | # OOM. 78 | python train.py --batch_size 1 --use_ddp True --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 79 | 80 | python train.py --batch_size 1 --use_ddp True --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type custom_qlora 81 | # Better. Symmetrically distributed memory across gpus. 82 | # python train.py --batch_size 1 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --train_type custom_qlora 83 | # OOM on cpu. low memory needs to be fixed for lora model loading? 84 | python train.py --batch_size 1 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type lora 85 | python train.py --batch_size 2 --model_name codellama/CodeLlama-34b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type qlora 86 | 87 | 88 | # Try with 4xA5000 GPUs with 96 CPUs and 500GB RAM. Model fits in mem. but can't train with seqlen=256. 89 | python train.py --batch_size 2 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing True --train_type qlora 90 | # OOM on cpu. low memory needs to be fixed for lora model loading? 91 | python train.py --batch_size 1 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type lora 92 | python train.py --batch_size 4 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing True --use_cpu_offload True --train_type qlora 93 | python train.py --batch_size 1 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing False --train_type qlora 94 | # OOM on cpu. 95 | # python train.py --batch_size 128 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type lora 96 | python train.py --batch_size 1 --model_name meta-llama/Llama-2-70b-hf --context_length 256 --use_gradient_checkpointing False --use_cpu_offload True --train_type qlora -------------------------------------------------------------------------------- /benchmarks_03_2024.md: -------------------------------------------------------------------------------- 1 | # Benchmarking QLoRA+FSDP 2 | 3 | ## Exploring training performance across different hardware configurations 4 | 5 | NB: These benchmarks were done in February and March 2024. The exact performance numbers will quickly go out of date but the general lessons may still be of interest. 6 | 7 | ## Introduction 8 | 9 | We recently announced our first public project, combining [FSDP and QLoRA](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) to enable training of 70B models on consumer GPUs. Our first [follow-on post](https://www.answer.ai/posts/2024-03-14-fsdp-qlora-deep-dive.html) went deep into the technical details involved in getting it working. In this note we’ll examine the performance of this new approach to evaluate when it will make the most difference and how you can get the most out of your hardware. 
10 | 11 | 12 | ## Case Study: A Dual 3090 ‘Basement Rig’ 13 | 14 | Rather than starting with a table of results, let’s look at some illustrative examples on a single setup to get a feel for how different choices might affect the memory usage and speed of training a model. Everything in this section is benchmarked on Johno’s personal machine, which features two 3090s (without NVLink), 128GB CPU RAM and an older motherboard. The 3090s are power limited to 280W each. 15 | 16 | 17 | ### Starting at 7B 18 | 19 | We’ll use the following command as a template, training on dummy data (so we can control the context length) and logging some stats to Weights and Biases for later comparisons: 20 | 21 | ```{.bash .code-overflow-wrap} 22 | python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 512 --train_type qlora --use_gradient_checkpointing True --reentrant_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy --dataset_samples 1024 23 | ``` 24 | 25 | We’re starting out with QLoRA, and by default the script uses FSDP (that is the headline feature after all) to split the model across both GPUs. So, doing some quick napkin math, with a 7 billion parameter model we’d expect 7 billion parameters x 4 bits/parameter / 2 GPUs = ~1.75GB of weights per GPU. 26 | 27 | It’s actually about 3.72GiB (see `reserved_after_model_wrap`). There aren’t exactly 7 billion parameters, we keep some in full precision, there are the LoRA adapter weights, memory reservation overhead… and then once we begin training there are gradients and activations to keep track of too, intermediate values that need to be stored during certain computations, optimizer state for all of the trainable (LoRA) parameters… In total, the command above shows a peak memory usage of 4.98GiB during training. 28 | 29 | Next let’s increase the context length from 512 tokens to 2048 (`--context_length 2048`). There are internal activations for each token in the sequence, so more tokens → more GPU memory used. In this case, the peak memory per GPU goes from 4.98GiB to 5.21GiB. Training also takes longer: 800 seconds vs 550. 30 | 31 | 32 | | Train Type | Context Length | Peak Memory (GiB) | Time (s) | 33 | | :--------: | :------------: | :---------------: | :------: | 34 | | QLoRA | 512 | 4.98 | 1,082 | 35 | | QLoRA | 2048 | 5.21 | 1,564 | 36 | 37 | *Llama-2 7B with a batch size of one* 38 | 39 | What if we weren’t using QLoRA? Keeping the weights in 16-bit precision and doing regular LoRA means we can skip the time spent dequantizing the base weights BUT we need more memory to store the weights (~7GB per GPU) and copying parameters from one GPU to another will be slower (since there is more data to transfer). On my system, the data transfer speed outweighs the gain from avoiding quantization, and the LoRA equivalents run slower than their QLoRA counterparts in this case: 40 | 41 | 42 | | Train Type | Context Length | Peak Memory (GiB) | Time (s) | 43 | | :--------: | :------------: | :---------------: | :------: | 44 | | LoRA | 512 | 10.24 | 2,597 | 45 | | LoRA | 2048 | 10.22 | 3,090 | 46 | 47 | *Llama-2 7B with a batch size of one* 48 | 49 | 50 | NB: While the reported peak reserved memory for both 512 and 2048 context length is roughly the same, the peak allocated memory is 8.28 GiB vs 9.16 GiB, respectively. Which matches our intuition that a smaller context length should use less memory. 
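As an aside, the gap between those two numbers comes from how the CUDA caching allocator works: "reserved" memory is what the allocator has claimed from the driver (and keeps cached), while "allocated" counts memory actually occupied by live tensors. A minimal sketch of how such peaks can be read, using a toy workload rather than the benchmark script itself:

```python
import torch

torch.cuda.reset_peak_memory_stats()

# Toy workload standing in for a training step (assumption: any CUDA work will do here).
x = torch.randn(4, 2048, 4096, device="cuda", requires_grad=True)
(x @ x.transpose(-1, -2)).sum().backward()

gib = 1024 ** 3
print(f"peak allocated: {torch.cuda.max_memory_allocated() / gib:.2f} GiB")  # live tensors
print(f"peak reserved:  {torch.cuda.max_memory_reserved() / gib:.2f} GiB")   # allocator cache, >= allocated
```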
51 | 52 | None of these runs are close to using the 24GB of VRAM I have available, so let’s scale up the batch size to fill that up a little more: 53 | 54 | 55 | | Train Type | Batch Size | Peak Memory (GiB) | Time (s) | 56 | | :--------: | :--------: | :---------------: | :------: | 57 | | QLoRA | 4 | 11.22 | 998 | 58 | | QLoRA | 10 | 20.97 | 936 | 59 | | LoRA | 4 | 16.14 | 1,366 | 60 | | LoRA | 6 | 21.35 | 1,199 | 61 | 62 | *Llama-2 7B with Context Length of 2048* 63 | 64 | Using a larger batch size results in faster training overall. You still have to do the same amount of computation per sample, but running them through in batches lets you save time by transferring the weights back and forth fewer times in total. Notice also that by using less memory for model weights QLoRA enables a larger max batch size, giving it an extra speed advantage over the standard LoRA version. 65 | 66 | Now, we mentioned transferring the weights between GPUs was slow on my machine, with an older motherboard and slow PCIe lanes. Given that, we might reasonably ask if FSDP is even required in this case since we could fit the full model (quantized OR unquantized) in the VRAM of a single GPU. This is a valid point, and we can test it out by specifying `“ddp”` as the sharding strategy[^ddp], which keeps a full copy of the weights on each GPU: 67 | 68 | [^ddp]: This is still using FSDP, but in distributed data parallel mode. Not DistributedDataParallel. 69 | 70 | | Train Type | DDP | Batch Size | Peak Memory (GiB) | Time (s) | 71 | | :--------: | :--: | :--------: | :---------------: | :------: | 72 | | QLoRA | True | 8 | 20.94 | 875 | 73 | | LoRA | True | 4 | 22.04 | 881 | 74 | 75 | *Llama-2 7B with Context Length of 2048* 76 | 77 | In the QLoRA case, we now have the full (quantized) weights on each GPU, using more VRAM than with FSDP. Because we don’t have to transfer the weights between GPUs, only gradients, training finishes a little faster than the FSDP case, even though we use a batch size of 8 vs 10. For LoRA, each GPU has ~14GB of weights and thus much less room for everything else, necessitating a lower batch size of 4 but still finishing much faster than the FSDP version. 78 | 79 | We have our first lesson. If the model is small enough that the weights aren’t dominating your VRAM usage, you may be better off with DDP instead of FSDP. As we move to larger models, the larger batch sizes enabled by sharding the model across multiple GPUs will outweigh the communication overhead. 80 | 81 | 82 | ### What About CPU Offloading? 83 | 84 | Now let’s jump up to a larger model: Yi 34B. Napkin math suggests with QLoRA+FSDP we should expect ~17GB of weights per GPU, leaving enough room on my 24GB cards for a batch size of 1 or 2 at most. But there’s another option: CPU offloading (`--use_cpu_offload true`) stores the weights in CPU RAM instead, loading them into each GPU a layer at a time as needed. This leaves the GPU RAM free for activations, gradients, etc. and allows us to use a batch size of 4 instead.
In this example, the extra communication overhead of CPU offloading is offset by the higher batch size it enables and we end up with a slightly faster training run overall: 85 | 86 | 87 | | Train Type | CPU Offload | Batch Size | Peak Memory (GiB) | Time (s) | 88 | | :--------: | :---------: | :--------: | :---------------: | :------: | 89 | | QLoRA | False | 2 | 23.05 | 5,041 | 90 | | QLoRA | True | 4 | 22.98 | 4,830 | 91 | 92 | *Yi 34B with Context Length of 2048* 93 | 94 | In cases where you have faster interconnect between cards (NVLink, for example), the non-offloading case may win out, but it’s interesting how comparable these are - my assumption was that having the weights on the CPU and copying them over would be *far* slower. On a cloud machine with slower RAM and a wimpy CPU we did see dramatic slowdowns where CPU offloading was many times slower, so YMMV. But the fact that it works reasonably fast on my machine is encouraging, since it does spark the inevitable question: “**can we go bigger?**” 95 | 96 | 97 | ### Llama 70B 98 | 99 | When I first tried loading and training a 70B model the script crashed and my hopes fell. Then I spotted an issue: my 128GB of CPU RAM was completely filling up right at the start of training. I created a 10GB swapfile, which is a part of the disk that is treated like RAM when the regular system RAM gets filled. This allowed the system to get over the initial spike and start training: 100 | 101 | 102 | | Train Type | CPU Offload | Batch Size | Peak Memory (GiB) | Time (s) | 103 | | :--------: | :---------: | :--------: | :---------------: | :------: | 104 | | QLoRA | True | 2 | 14.92 | 11,795 | 105 | 106 | *Llama-2 70B with Context Length of 2048* 107 | 108 | It’s slower than the smaller models (nearly 10x slower than the 7B model, at nearly 50 seconds per batch) but that’s not bad considering that 70 BILLION parameters are copied to the GPUs each step! And with activation offloading (`--use_activation_cpu_offload True`) the total allocated memory is low enough that training on a 16GB GPU could be possible in theory. 109 | 110 | 111 | ## Case Study: A Dual 4090 “Budget Workstation” 112 | 113 | We ran a subset of the tests on a dual 4090 “budget workstation” with 128GB of CPU RAM[^budget]. Like the 3090 case study, the 4090s don’t have NVLink. But both GPUs have full PCIe v4 x16 lanes which should reduce the FSDP transfer overhead. The 4090s peaked at 400 watts per card[^4090-power]. 114 | 115 | [^budget]: The total workstation cost is less than a single A6000 Ada. Hence a budget workstation. 116 | 117 | [^4090-power]: Power usage peaked at 400 watts for the 7B and 34B models, and 375 watts for the 70B model. 118 | 119 | ### Llama-2 7B 120 | 121 | At the 7 billion parameter scale, the maximum performance difference across the LoRA and QLoRA configurations is ~10 percent. 122 | 123 | | Train Type | CPU Offload | DDP | Batch Size | Peak Memory (GiB) | Time (s) | 124 | | :--------: | :---------: | :---: | :--------: | :---------------: | :------: | 125 | | LoRA | False | True | 4 | 22.04 | 437 | 126 | | LoRA | False | False | 6 | 21.35 | 481 | 127 | | LoRA | True | False | 10 | 22.69 | 482 | 128 | | QLoRA | False | True | 8 | 20.94 | 450 | 129 | | QLoRA | False | False | 10 | 20.97 | 466 | 130 | | QLoRA | True | False | 12 | 22.38 | 464 | 131 | 132 | *Llama-2 7B with Context Length of 2048* 133 | 134 | This is encouraging, as there is only a small performance hit when trading off maximum training speed versus maximum tokens per batch.
It also suggests that the slowdown due to using PCIe instead of NVLink is manageable when training large enough models. 135 | 136 | ### Yi 34B 137 | 138 | With full PCIe x16 lanes and FSDP’s overlapping of compute and next-layer transfers, there is almost no difference between QLoRA and QLoRA with CPU Offloading. The larger batch size is ~0.5 percent faster. 139 | 140 | | Train Type | CPU Offload | Batch Size | Peak Memory (GiB) | Time (s) | 141 | | :--------: | :---------: | :--------: | :---------------: | :------: | 142 | | QLoRA | False | 2 | 23.05 | 2,072 | 143 | | QLoRA | True | 4 | 22.98 | 2,061 | 144 | 145 | *Yi 34B with Context Length of 2048* 146 | 147 | ### Llama-2 70B 148 | 149 | Increasing from a 34B model to a 70B model shows near-linear scaling, with a ~6 percent slowdown per sample. 150 | 151 | | Train Type | CPU Offload | Batch Size | Peak Memory (GiB) | Time (s) | 152 | | :--------: | :---------: | :--------: | :---------------: | :------: | 153 | | QLoRA | True | 2 | 14.92 | 4,399 | 154 | 155 | *Llama-2 70B with Context Length of 2048* 156 | 157 | ### Bonus: Mistral 7B 158 | 159 | Mistral 7B v0.2 Base expanded the context window of the base 7B parameter model to 32K tokens. 24GB of memory per GPU isn't quite enough to finetune at the full context length even using QLoRA, but we can manage a respectable 24K tokens. 160 | 161 | | Train Type | CPU Offload | Batch Size | Context Length | Peak Memory (GiB) | Time (s) | 162 | | :--------: | :---------: | :--------: | :------------: | :---------------: | :------: | 163 | | QLoRA | True | 12 | 2,048 | 22.54 | 483 | 164 | | QLoRA | True | 1 | 24,576 | 22.54 | 7,809 | 165 | 166 | *Mistral 7B v0.2 Base* 167 | 168 | While the number of tokens per batch is the same at 24,576, increasing the context length from 2,048 to 24,576 reduces the training speed from 2,200 tokens/second to 1,615 tokens/second. 169 | 170 | ## Case Study: Conclusions 171 | 172 | A priori, we expected the dual 4090s to be significantly faster than our dual 3090 test case, in part due to the increased generational performance but mostly due to the faster data transfer speed from full x16 PCIe lanes. 173 | 174 | Our results confirmed this expectation, highlighting the importance of good multi-GPU interconnect. If you have two 3090s and a non-workstation motherboard, you’ll want NVLink. If you have two 4090s, you’ll want a workstation motherboard that can provide full x16 PCIe lanes to both GPUs. 175 | 176 | These results are exciting if you already own a dual-GPU system, but now let’s take a step back and consider whether this still makes sense given the other hardware configurations available in the cloud. 177 | 178 | ## Recommendations for Different Hardware Configurations 179 | 180 | Let’s consider a number of different hardware configurations and see which gives the best bang-per-buck performance for fine-tuning a 70B model. For each setup we’ve tried to find the fastest possible combination of settings capable of training on context length 2048 with an effective batch size of 32 (or the closest we could get).
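For reference, the effective batch size in the table below is just per-GPU batch size × number of GPUs × gradient accumulation steps (`train.py` exposes the last of these via `--gradient_accumulation_steps`). Here is a minimal, illustrative sketch of one way to pick the accumulation steps for a given setup; the helper is not from the repo:

```{.python .code-overflow-wrap}
def grad_accum_steps(target_effective_bs: int, per_gpu_bs: int, n_gpus: int) -> int:
    """Gradient accumulation steps that get closest to a target effective batch size."""
    return max(1, round(target_effective_bs / (per_gpu_bs * n_gpus)))

print(grad_accum_steps(32, per_gpu_bs=2, n_gpus=2))  # 8 -> effective batch of exactly 32
print(grad_accum_steps(32, per_gpu_bs=3, n_gpus=4))  # 3 -> effective batch of 36, one way to get close to 32
```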
181 | 182 | | Accelerator | GPUs | CPU+Activation Offload | Batch Size | Time (s) | Ballpark Cost | 183 | | :------------: | :--: | :--------------------: | :--------: | :------: | ------------- | 184 | | A5000 24GB | 2 | True | 2 | 9,688 | $2.37 - $4.14 | 185 | | A5000 24GB | 4 | False | 1 | 4,829 | $2.36 - $4.13 | 186 | | A5000 24GB | 8 | False | 1 | 2,613 | $2.55 - $4.47 | 187 | | A6000 Ada 48GB | 2 | False | 2 | 5,867 | $3.72 - $5.22 | 188 | | A6000 Ada 48GB | 4 | False | 3 | 2,904 | $3.68 - $5.16 | 189 | | A100 40GB SXM | 2 | False | 1 | 3,277 | $3.28 - $3.75 | 190 | | A100 40GB SXM | 4 | False | 4 | 1,266 | $2.53 - $2.90 | 191 | | A100 40GB SXM | 8 | False | 4 | 672 | $2.69 - $3.08 | 192 | | H100 80GB SXM | 4 | False | 8 | 667 | $3.48 - $3.53 | 193 | 194 | *Llama-2 70B QLoRA with Context Length of 2048 on Select Accelerators* 195 | 196 | NB: Ballpark Cost is an estimated range of training 1,024 samples at a context length of 2,048. Prices from [Cloud GPUs](https://cloud-gpus.com/) are used. Exact numbers will vary by provider and depend on availability. 197 | 198 | On a machine with four or eight A5000s, CPU offloading was slower despite allowing us to use double the batch size. This is a different outcome from the 2x3090 example on a 34B model, where CPU offloading had a slight edge. The difference likely comes down to the different transfer speeds CPU->GPU and GPU->GPU: copying parameters between GPUs with fast interconnect is faster than transferring them from the CPU RAM to all the GPUs on these machines. 199 | 200 | It’s interesting to compare the time here of ~44 minutes on eight A5000s with the dual 3090 example from earlier. The training is ~4.6X faster, but the machine is ~6X more expensive per hour[^per-hour]. And of course if you already own the 3090s then the longer wait might look like an even better deal. 201 | 202 | [^per-hour]: 4X-10X depending on where you find your 3090s. 203 | 204 | This trend holds for the rest of the examples too. Using a higher number of more powerful GPUs speeds things up as you’d expect, but also costs more, such that the total training cost ends up in the same range across the different setups we tested. 205 | 206 | One final interesting thing we noticed when testing: for the lower-end configurations QLoRA + FSDP was either the fastest option or in some cases the only option, and training speed was bandwidth-bound. Once we moved to the H100 system with fast interconnect and 80GB memory per card, we finally hit the point where compute was the limiting factor. Changing the batch size from 8 to 12 made little difference, as did switching from QLoRA to LoRA - the extra time spent transferring data didn't matter since it was happening while the computation was being done, with the latter being the bottleneck. 207 | 208 | 209 | ## Practical Guide for Optimal Training Speed 210 | 211 | Here is a practical step-by-step guide to finding the optimal FSDP training configuration, which we also followed during the experiments above. We use QLoRA, which already saves a significant amount of memory by reducing the model size via quantization, and a lot more by limiting the trainable parameters (~1-2%) with LoRA. We also use backward prefetching ([BACKWARD_PRE](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.BackwardPrefetch)) by default to overlap computation and communication as much as possible, which also comes with increased memory usage.
You can also experiment with other prefetch options: BACKWARD_POST or None to trade off memory and speed. 212 | 213 | It is recommended to have at least two GPUs for this guide to make sense, as it leverages FSDP sharding strategies. 214 | 215 | Follow the steps below to find the optimal configuration for your own problem and hardware: 216 | 217 | 218 | 1. **Vanilla Start**: 219 | * We start with a batch size of 1, a sequence length of 2048 (problem dependent), and all the memory-saving options disabled. 220 | * This configuration requires the most memory but is potentially the fastest/cheapest one. 221 | * This will use DDP (Distributed Data Parallel). 222 | 223 | 2. **Try [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html#torch-utils-checkpoint)**: 224 | * Next, we can try gradient checkpointing to save memory. 225 | * Gradient checkpointing is a technique that allows the model to avoid storing intermediate activations during the backward pass by recomputing them. 226 | 227 | 3. **Try [SHARD_GRAD_OP](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.ShardingStrategy)**: 228 | * If DDP with gradient checkpointing didn’t work, we can try SHARD_GRAD_OP[^shard-grad] next. 229 | * Shard-grad-op is a technique that allows the model to split the gradients and optimizer states across multiple GPUs. 230 | * This can reduce memory usage on each GPU, but it can also increase communication overhead and training time. 231 | * You can first try without gradient checkpointing and see if it trains without OOM. If not, you can set it to true as well. 232 | 233 | 4. **Try [FULL_SHARD](https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.ShardingStrategy)**: 234 | * If SHARD_GRAD_OP with gradient checkpointing didn’t work, we can try FULL_SHARD[^full-shard] next. 235 | * Full-sharding is a technique that allows the model to split the model parameters, gradients and optimizer states across multiple GPUs. 236 | * This can significantly reduce memory usage on each GPU, but it can also increase communication overhead and training time. 237 | * Similarly, you can first try without gradient checkpointing and see if it trains without OOM. If not, you can set it to true as well. 238 | 239 | 5. **Try CPU Offloading**: 240 | * If FULL_SHARD with gradient checkpointing didn’t work, we can try CPU offloading next. 241 | * FSDP’s CPU Offloading moves model parameters and gradients to the CPU when they are not involved in computation. 242 | * This can reduce memory usage on the GPU, but it can also increase training time due to transfers between GPU and CPU. 243 | * At this point you’ve already tried both full sharding and gradient checkpointing but still faced OOM issues. 244 | 245 | 6. **Try [Activation offloading](https://github.com/pytorch/pytorch/blob/2e02e1efad957b86dbcc5b64748e03acfb8d330c/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py#L173)**: 246 | * Activation offloading is a technique that allows the model to move some activations from the GPU to the CPU, and transfer them back to the GPU when needed. 247 | * This will reduce memory usage on the GPU, increase memory usage on the CPU, and add extra transfers between CPU and GPU. 248 | 249 | [^shard-grad]: If using multi-node training you can use _HYBRID_SHARD_ZERO2 (--sharding_strategy hybrid_shard_grad_op) to apply SHARD_GRAD_OP strategy within a node and replicate it across nodes.
250 | [^full-shard]: If using multi-node training you can use HYBRID_SHARD (--sharding_strategy hybrid_full_shard) to apply FULL_SHARD strategy within a node and replicate it across nodes. 251 | 252 | If you are still facing out-of-memory errors after trying all the steps above then you might need to reduce the sequence length if your task allows, find more GPUs or find GPUs with more memory, and repeat the steps again. 253 | 254 | Once a setup that can train with a batch size of 1 is found, it is recommended to increase the batch size leaving some GPU memory free to avoid memory thrashing. This can help with training speed and avoid out-of-memory errors. 255 | 256 | After finding the optimal configuration you can give the next step command a try with a higher batch size and see if it increases the throughput and reduces the training time. For example, imagine you are able to train using DDP (step 1). You can also try with gradient checkpointing (step 2) with a larger batch size. There is a chance that this might increase the overall throughput compared to not using gradient checkpointing and result in a faster training. 257 | 258 | ## Final Thoughts 259 | 260 | Benchmarking is always complicated: hardware varies between providers, different versions of different libraries introduce hidden optimizations or bottlenecks, and subtle differences can cause dramatic speedups. 261 | 262 | In this post we’ve tried to give recommendations for common use-cases which we hope will be useful in informing further experimentation, especially as FSDP+QLoRA support is added to more frameworks and the community explores this frontier further. We've also shown just how many more options there are for fine-tuning these large models now that we have these techniques at our disposal. 263 | 264 | ## Authors 265 | Jonathan Whitaker 266 | Benjamin Warner 267 | Kerem Turgutlu 268 | 269 | ## Additional References: 270 | 271 | * [https://pytorch.org/docs/stable/fsdp.html](https://pytorch.org/docs/stable/fsdp.html) 272 | * [https://medium.com/pytorch/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff](https://medium.com/pytorch/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff) 273 | -------------------------------------------------------------------------------- /fsdp_multi_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=answerai 3 | #SBATCH --partition=a40x 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 6 | #SBATCH --gpus-per-node=4 7 | #SBATCH --mem=256gb 8 | #SBATCH --cpus-per-gpu=12 9 | #SBATCH --job-name=fsdp-multi-node-test 10 | #SBATCH --output=sbatch_outputs/%x_%j.out 11 | 12 | ##### Number of total processes 13 | echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX " 14 | echo "Nodelist:= " $SLURM_JOB_NODELIST 15 | echo "Number of nodes:= " $SLURM_JOB_NUM_NODES 16 | echo "Ntasks per node:= " $SLURM_NTASKS_PER_NODE 17 | echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX " 18 | 19 | export MASTER_PORT=12340 20 | export WORLD_SIZE=$(($SLURM_JOB_NUM_NODES * $SLURM_GPUS_PER_NODE)) 21 | 22 | ### get the first node name as master address - customized for vgg slurm 23 | ### e.g. 
master(gnodee[2-5],gnoded1) == gnodee2 24 | echo "NODELIST="${SLURM_NODELIST} 25 | master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 26 | export MASTER_ADDR=$master_addr 27 | echo "MASTER_ADDR="$MASTER_ADDR 28 | 29 | echo "Starting python script" 30 | 31 | 32 | # run setup script to init environment 33 | module load cuda/11.8 34 | 35 | SHARED_VOLUME_DIR=/weka/home-$(whoami) 36 | source $SHARED_VOLUME_DIR/py_venvs/fsdp-qlora-py311/bin/activate 37 | 38 | # nccl 39 | export FI_EFA_FORK_SAFE=1 40 | export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn 41 | export FI_EFA_ENABLE_SHM_TRANSFER=0 42 | export OMPI_MCA_mtl_base_verbose=1 43 | export FI_PROVIDER=efa 44 | export NCCL_TREE_THRESHOLD=0 45 | 46 | export NCCL_DEBUG=ERROR 47 | export NCCL_SOCKET_TIMEOUT=600000 # Set the timeout to 10 minutes (60000 milliseconds) 48 | export NCCL_DEBUG_SUBSYS=ALL 49 | export TORCH_DISTRIBUTED_DEBUG=INFO 50 | 51 | export NCCL_IBEXT_DISABLE=1 52 | export NCCL_SOCKET_IFNAME=^docker0,lo 53 | 54 | export OMPI_MCA_mtl_base_verbose=1 55 | export OMPI_MCA_btl="^openib" 56 | echo "Using python from $(which python)" 57 | echo "Using torch from $(python -c 'import torch; print(torch.__file__)')" 58 | echo "Using torch cuda from $(python -c 'import torch; print(torch.version.cuda)')" 59 | echo "Using nccl from $(python -c 'import torch; print(torch.cuda.nccl.version())')" 60 | 61 | # print cuda home 62 | echo "CUDA_HOME=$CUDA_HOME" 63 | 64 | # GLOBAL_BATCH_SIZE=64 65 | MAX_BATCH_SIZE=8 66 | GRAD_ACCUM_STEPS=1 67 | 68 | srun python $SHARED_VOLUME_DIR/git/fsdp_qlora/train.py \ 69 | --world_size=$WORLD_SIZE \ 70 | --master_addr=$MASTER_ADDR \ 71 | --master_port=$MASTER_PORT \ 72 | --model_name meta-llama/Llama-2-7b-hf \ 73 | --dataset dummy \ 74 | --batch_size $MAX_BATCH_SIZE \ 75 | --context_length 512 \ 76 | --gradient_accumulation_steps $GRAD_ACCUM_STEPS \ 77 | --train_type custom_qlora \ 78 | --use_gradient_checkpointing True \ 79 | --use_activation_cpu_offload True \ 80 | --use_cpu_offload False \ 81 | --log_to stdout \ 82 | --verbose true -------------------------------------------------------------------------------- /hf_train.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | import torch, os 3 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer 4 | from peft import LoraConfig 5 | from trl import SFTTrainer 6 | from transformers import TrainingArguments 7 | 8 | local_rank = os.getenv("LOCAL_RANK") 9 | device_string = "cuda:" + str(local_rank) 10 | 11 | # Load the dataset 12 | dataset_name = "timdettmers/openassistant-guanaco" 13 | dataset = load_dataset(dataset_name, split="train") 14 | 15 | 16 | # Load the model + tokenizer 17 | model_name = "meta-llama/Llama-2-7b-hf" 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 19 | tokenizer.pad_token = tokenizer.eos_token 20 | bnb_config = BitsAndBytesConfig( 21 | load_in_4bit=True, 22 | bnb_4bit_quant_type="nf4", 23 | bnb_4bit_compute_dtype=torch.float16, 24 | ) 25 | model = AutoModelForCausalLM.from_pretrained( 26 | model_name, 27 | quantization_config=bnb_config, 28 | trust_remote_code=True, 29 | use_cache = False, 30 | device_map={'':device_string} 31 | ) 32 | 33 | # PEFT config 34 | lora_alpha = 16 35 | lora_dropout = 0.1 36 | lora_r = 64 37 | peft_config = LoraConfig( 38 | lora_alpha=lora_alpha, 39 | lora_dropout=lora_dropout, 40 | r=lora_r, 41 | bias="none", 42 | task_type="CAUSAL_LM", 43 | 
target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"] 44 | ) 45 | 46 | 47 | # Args 48 | max_seq_length = 512 49 | output_dir = "./results" 50 | per_device_train_batch_size = 4 51 | gradient_accumulation_steps = 4 52 | optim = "adamw_hf" 53 | save_steps = 10 54 | logging_steps = 1 55 | learning_rate = 2e-4 56 | max_grad_norm = 0.3 57 | max_steps = 311 # Approx the size of guanaco at bs 8, ga 2, 2 GPUs. 58 | warmup_ratio = 0.1 59 | lr_scheduler_type = "cosine" 60 | training_arguments = TrainingArguments( 61 | output_dir=output_dir, 62 | per_device_train_batch_size=per_device_train_batch_size, 63 | gradient_accumulation_steps=gradient_accumulation_steps, 64 | optim=optim, 65 | save_steps=save_steps, 66 | logging_steps=logging_steps, 67 | learning_rate=learning_rate, 68 | fp16=True, 69 | max_grad_norm=max_grad_norm, 70 | max_steps=max_steps, 71 | warmup_ratio=warmup_ratio, 72 | group_by_length=False, # Otherwise weird loss pattern (see https://github.com/artidoro/qlora/issues/84#issuecomment-1572408347, https://github.com/artidoro/qlora/issues/228, https://wandb.ai/answerdotai/fsdp_qlora/runs/snhj0eyh) 73 | lr_scheduler_type=lr_scheduler_type, 74 | gradient_checkpointing=True, 75 | gradient_checkpointing_kwargs={'use_reentrant':False}, # Needed for DDP 76 | report_to="wandb", 77 | ) 78 | 79 | # Trainer 80 | trainer = SFTTrainer( 81 | model=model, 82 | train_dataset=dataset, 83 | peft_config=peft_config, 84 | dataset_text_field="text", 85 | max_seq_length=max_seq_length, 86 | tokenizer=tokenizer, 87 | args=training_arguments, 88 | ) 89 | 90 | # Not sure if needed but noticed this in https://colab.research.google.com/drive/1t3exfAVLQo4oKIopQT1SKxK4UcYg7rC1#scrollTo=7OyIvEx7b1GT 91 | for name, module in trainer.model.named_modules(): 92 | if "norm" in name: 93 | module = module.to(torch.float32) 94 | 95 | # Train :) 96 | trainer.train() -------------------------------------------------------------------------------- /nbs/00-profile_lora_qlora.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "d30779b0-0df2-445a-829d-fc3b243c462c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import bitsandbytes as bnb\n", 11 | "\n", 12 | "import torch\n", 13 | "import torch.nn as nn\n", 14 | "import torch.nn.functional as F\n", 15 | "from transformers import AutoModelForCausalLM\n", 16 | "from transformers.utils.quantization_config import BitsAndBytesConfig\n", 17 | "from transformers.pytorch_utils import Conv1D\n", 18 | "\n", 19 | "import transformers\n", 20 | "from transformers import LlamaConfig, LlamaForCausalLM\n", 21 | "from transformers.integrations.bitsandbytes import replace_with_bnb_linear\n", 22 | "from transformers.utils.quantization_config import BitsAndBytesConfig\n", 23 | "from transformers.models.llama.modeling_llama import LlamaDecoderLayer\n", 24 | "\n", 25 | "from peft.tuners.lora.config import LoraConfig\n", 26 | "from peft.mapping import get_peft_model\n", 27 | "from peft.utils.peft_types import *\n", 28 | "\n", 29 | "import os\n", 30 | "import gc\n", 31 | "import inspect\n", 32 | "from accelerate.utils import set_seed\n", 33 | "from functools import partial\n", 34 | "from pathlib import Path" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "82ade2d0-6c49-4f79-8a66-70a4edf9f097", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "save_dir = 
Path(\"profile_snapshots/\")\n", 45 | "os.makedirs(save_dir, exist_ok=True)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "b8a001c3-4941-44dc-97b0-dd9f67c5148a", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "transformers.logging.set_verbosity_warning()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "8bda461b-c894-4c8b-8d43-a3023a9570bb", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "def malloc_in_gb():\n", 66 | " return torch.cuda.memory_allocated()/1e9" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "18e63dde-9528-4315-88df-4c7bea0db6ac", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def get_model_size_config(model_size):\n", 77 | " if model_size == \"DEBUG\":\n", 78 | " model_size_config = dict(hidden_size=128,\n", 79 | " num_hidden_layers=2,\n", 80 | " num_attention_heads=2,\n", 81 | " num_key_value_heads=2,\n", 82 | " intermediate_size=256)\n", 83 | " elif model_size == \"60M\":\n", 84 | " model_size_config = dict(hidden_size=512,\n", 85 | " num_hidden_layers=4,\n", 86 | " num_attention_heads=4,\n", 87 | " num_key_value_heads=4,\n", 88 | " intermediate_size=1024)\n", 89 | " elif model_size == \"120M\":\n", 90 | " model_size_config = dict(hidden_size=768,\n", 91 | " num_hidden_layers=12,\n", 92 | " num_attention_heads=12,\n", 93 | " num_key_value_heads=12,\n", 94 | " intermediate_size=1536)\n", 95 | " elif model_size == \"290M\":\n", 96 | " model_size_config = dict(hidden_size=1024,\n", 97 | " num_hidden_layers=12,\n", 98 | " num_attention_heads=16,\n", 99 | " num_key_value_heads=16,\n", 100 | " intermediate_size=4096)\n", 101 | " elif model_size == \"1B\":\n", 102 | " model_size_config = dict(hidden_size=2048,\n", 103 | " num_hidden_layers=24,\n", 104 | " num_attention_heads=16,\n", 105 | " num_key_value_heads=16,\n", 106 | " intermediate_size=4096)\n", 107 | " elif model_size == \"7B\":\n", 108 | " model_size_config = {}\n", 109 | " return model_size_config" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": "8bae5ba6-f4cb-44a7-9191-89bab9e930f5", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "def create_model(model_size=\"1B\"):\n", 120 | " model_size_config = get_model_size_config(model_size)\n", 121 | " # download model weights and config files.\n", 122 | " config = LlamaConfig()\n", 123 | " config.update(model_size_config)\n", 124 | " model = LlamaForCausalLM(config)\n", 125 | " return model" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "a04c9743-43b7-451f-90d9-ff7a3201f4e3", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "def free_memory():\n", 136 | " gc.collect()\n", 137 | " torch.cuda.empty_cache()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "7a3990ca-6cd7-47de-813c-442802520487", 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "Memory allocated: 0.000 GB\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "d994c610-4ae6-4cef-ab29-32439904a4a0", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "Memory allocated: 
0.000 GB\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "# create dummy inputs\n", 174 | "model = create_model(\"DEBUG\")\n", 175 | "vocab_size = model.model.embed_tokens.weight.size(0)\n", 176 | "inputs = [torch.randint(0, vocab_size, (1, sl)) for sl in [512,1024,2048,3072]]\n", 177 | "print(f\"Memory allocated: {malloc_in_gb():.3f} GB\")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "id": "431913f6-e03f-4e4c-b0a4-c8943934e423", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "def profile_model(create_model_func, inference=False, save_filename=\"mem_profile.pickle\"):\n", 188 | "\n", 189 | " \"\"\"\n", 190 | " https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html#demonstrating-speedups\n", 191 | "\n", 192 | " https://pytorch.org/docs/stable/torch_cuda_memory.html\n", 193 | "\n", 194 | " https://medium.com/pytorch/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d\n", 195 | "\n", 196 | " https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html\n", 197 | " \"\"\"\n", 198 | " set_seed(42)\n", 199 | " torch.cuda.memory._record_memory_history()\n", 200 | " for x in inputs:\n", 201 | " print(f\"Input Size:{tuple(x.size())}\")\n", 202 | " start = torch.cuda.Event(enable_timing=True)\n", 203 | " end = torch.cuda.Event(enable_timing=True)\n", 204 | "\n", 205 | " start.record()\n", 206 | " if inference:\n", 207 | " with torch.no_grad():\n", 208 | " model = create_model_func()\n", 209 | " model.to(\"cuda\", torch.bfloat16);\n", 210 | " print(f\"Memory allocated [MODEL]: {malloc_in_gb():.3f} GB\")\n", 211 | " output = model(x.to(\"cuda\"))\n", 212 | " print(f\"Memory allocated [FWD]: {malloc_in_gb():.3f} GB\")\n", 213 | " else:\n", 214 | " model = create_model_func()\n", 215 | " model.to(\"cuda\", torch.bfloat16);\n", 216 | " print(f\"Memory allocated [MODEL): {malloc_in_gb():.3f} GB\")\n", 217 | " output = model(x.to(\"cuda\"))\n", 218 | " print(f\"Memory allocated [FWD]: {malloc_in_gb():.3f} GB\") \n", 219 | " output.logits.mean().backward()\n", 220 | " print(f\"Memory allocated [BWD]: {malloc_in_gb():.3f} GB\")\n", 221 | " end.record()\n", 222 | " torch.cuda.synchronize()\n", 223 | " secs = start.elapsed_time(end) / 1000\n", 224 | " print(f\"Elapsed time: {secs:.3f}\\n\\n\")\n", 225 | " output, model = None, None\n", 226 | " free_memory()\n", 227 | " torch.cuda.memory._dump_snapshot(save_filename)\n", 228 | " print(f\"Memory allocated [finish]: {malloc_in_gb():.3f} GB\")" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "ee22e7e6-4838-4bbf-bf0a-a7e29dc4e96e", 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "Input Size:(1, 512)\n", 242 | "Memory allocated [MODEL]: 0.018 GB\n", 243 | "Memory allocated [FWD]: 0.093 GB\n", 244 | "Elapsed time: 0.562\n", 245 | "\n", 246 | "\n", 247 | "Input Size:(1, 1024)\n", 248 | "Memory allocated [MODEL]: 0.027 GB\n", 249 | "Memory allocated [FWD]: 0.160 GB\n", 250 | "Elapsed time: 0.111\n", 251 | "\n", 252 | "\n", 253 | "Input Size:(1, 2048)\n", 254 | "Memory allocated [MODEL]: 0.027 GB\n", 255 | "Memory allocated [FWD]: 0.291 GB\n", 256 | "Elapsed time: 0.096\n", 257 | "\n", 258 | "\n", 259 | "Input Size:(1, 3072)\n", 260 | "Memory allocated [MODEL]: 0.027 GB\n", 261 | "Memory allocated [FWD]: 0.425 GB\n", 262 | "Elapsed time: 0.104\n", 263 | "\n", 264 | "\n", 265 | "Memory allocated 
[finish]: 0.009 GB\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "# warmup\n", 271 | "profile_model(partial(create_model, \"DEBUG\"), inference=True, save_filename=save_dir/\"debug-inference.pickle\")" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "id": "9247f004-cfcb-4735-9341-f2461cdc473a", 277 | "metadata": {}, 278 | "source": [ 279 | "### Base Model" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "d23e911a-0e15-4113-b129-83d5a214250c", 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "name": "stdout", 290 | "output_type": "stream", 291 | "text": [ 292 | "Input Size:(1, 512)\n", 293 | "Memory allocated [MODEL]: 2.311 GB\n", 294 | "Memory allocated [FWD]: 2.478 GB\n", 295 | "Elapsed time: 12.858\n", 296 | "\n", 297 | "\n", 298 | "Input Size:(1, 1024)\n", 299 | "Memory allocated [MODEL]: 2.311 GB\n", 300 | "Memory allocated [FWD]: 2.645 GB\n", 301 | "Elapsed time: 12.719\n", 302 | "\n", 303 | "\n", 304 | "Input Size:(1, 2048)\n", 305 | "Memory allocated [MODEL]: 2.311 GB\n", 306 | "Memory allocated [FWD]: 2.976 GB\n", 307 | "Elapsed time: 12.735\n", 308 | "\n", 309 | "\n", 310 | "Input Size:(1, 3072)\n", 311 | "Memory allocated [MODEL]: 2.311 GB\n", 312 | "Memory allocated [FWD]: 3.322 GB\n", 313 | "Elapsed time: 12.682\n", 314 | "\n", 315 | "\n", 316 | "Memory allocated [finish]: 0.009 GB\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "profile_model(partial(create_model, \"1B\"), inference=True, save_filename=save_dir/\"base-inference.pickle\")" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "id": "f06773f8-2eba-4a6f-8785-41564a072232", 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "Input Size:(1, 512)\n", 335 | "Memory allocated [MODEL): 2.311 GB\n", 336 | "Memory allocated [FWD]: 3.605 GB\n", 337 | "Memory allocated [BWD]: 4.764 GB\n", 338 | "Elapsed time: 11.823\n", 339 | "\n", 340 | "\n", 341 | "Input Size:(1, 1024)\n", 342 | "Memory allocated [MODEL): 2.320 GB\n", 343 | "Memory allocated [FWD]: 4.907 GB\n", 344 | "Memory allocated [BWD]: 4.930 GB\n", 345 | "Elapsed time: 12.106\n", 346 | "\n", 347 | "\n", 348 | "Input Size:(1, 2048)\n", 349 | "Memory allocated [MODEL): 2.320 GB\n", 350 | "Memory allocated [FWD]: 7.493 GB\n", 351 | "Memory allocated [BWD]: 5.260 GB\n", 352 | "Elapsed time: 12.611\n", 353 | "\n", 354 | "\n", 355 | "Input Size:(1, 3072)\n", 356 | "Memory allocated [MODEL): 2.320 GB\n", 357 | "Memory allocated [FWD]: 10.093 GB\n", 358 | "Memory allocated [BWD]: 5.606 GB\n", 359 | "Elapsed time: 13.033\n", 360 | "\n", 361 | "\n", 362 | "Memory allocated [finish]: 0.017 GB\n" 363 | ] 364 | } 365 | ], 366 | "source": [ 367 | "# (1, 4096) OOMs with a 16GB GPU\n", 368 | "profile_model(partial(create_model, \"1B\"), inference=False, save_filename=save_dir/\"base-training.pickle\")" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "48fb26df-a649-43da-bd14-f095e9913ab4", 374 | "metadata": {}, 375 | "source": [ 376 | "### LoRA" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "id": "4d24d2f1-25bb-432c-be9f-401bd0ff561f", 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "def create_lora_model(model_size=\"1B\", gc_enabled=False):\n", 387 | " model_size_config = get_model_size_config(model_size)\n", 388 | " # download model weights and config files.\n", 389 | " config = LlamaConfig()\n", 390 | " 
config.update(model_size_config)\n", 391 | " model = LlamaForCausalLM(config)\n", 392 | " peft_config = LoraConfig(\n", 393 | " task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n", 394 | " )\n", 395 | " model = get_peft_model(model, peft_config)\n", 396 | " if gc_enabled: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={\"use_reentrant\": False})\n", 397 | " return model" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "id": "8660235c-52b1-4417-815b-72f4cf2e5cb2", 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "name": "stdout", 408 | "output_type": "stream", 409 | "text": [ 410 | "Input Size:(1, 512)\n", 411 | "Memory allocated [MODEL]: 2.323 GB\n", 412 | "Memory allocated [FWD]: 2.489 GB\n", 413 | "Elapsed time: 12.622\n", 414 | "\n", 415 | "\n", 416 | "Input Size:(1, 1024)\n", 417 | "Memory allocated [MODEL]: 2.323 GB\n", 418 | "Memory allocated [FWD]: 2.657 GB\n", 419 | "Elapsed time: 12.293\n", 420 | "\n", 421 | "\n", 422 | "Input Size:(1, 2048)\n", 423 | "Memory allocated [MODEL]: 2.323 GB\n", 424 | "Memory allocated [FWD]: 2.988 GB\n", 425 | "Elapsed time: 12.341\n", 426 | "\n", 427 | "\n", 428 | "Input Size:(1, 3072)\n", 429 | "Memory allocated [MODEL]: 2.323 GB\n", 430 | "Memory allocated [FWD]: 3.334 GB\n", 431 | "Elapsed time: 12.339\n", 432 | "\n", 433 | "\n", 434 | "Memory allocated [finish]: 0.017 GB\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "profile_model(partial(create_lora_model, \"1B\"), inference=True, save_filename=save_dir/\"lora-inference.pickle\")" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "id": "96344a01-3ed4-44b3-891e-d6e6df0ed8e4", 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "name": "stdout", 450 | "output_type": "stream", 451 | "text": [ 452 | "Input Size:(1, 512)\n", 453 | "Memory allocated [MODEL): 2.323 GB\n", 454 | "Memory allocated [FWD]: 3.451 GB\n", 455 | "Memory allocated [BWD]: 2.492 GB\n", 456 | "Elapsed time: 11.359\n", 457 | "\n", 458 | "\n", 459 | "Input Size:(1, 1024)\n", 460 | "Memory allocated [MODEL): 2.323 GB\n", 461 | "Memory allocated [FWD]: 4.580 GB\n", 462 | "Memory allocated [BWD]: 2.660 GB\n", 463 | "Elapsed time: 11.946\n", 464 | "\n", 465 | "\n", 466 | "Input Size:(1, 2048)\n", 467 | "Memory allocated [MODEL): 2.323 GB\n", 468 | "Memory allocated [FWD]: 6.835 GB\n", 469 | "Memory allocated [BWD]: 2.991 GB\n", 470 | "Elapsed time: 12.710\n", 471 | "\n", 472 | "\n", 473 | "Input Size:(1, 3072)\n", 474 | "Memory allocated [MODEL): 2.323 GB\n", 475 | "Memory allocated [FWD]: 9.105 GB\n", 476 | "Memory allocated [BWD]: 3.337 GB\n", 477 | "Elapsed time: 13.298\n", 478 | "\n", 479 | "\n", 480 | "Memory allocated [finish]: 0.017 GB\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "profile_model(partial(create_lora_model, \"1B\"), inference=False, save_filename=save_dir/\"lora-training.pickle\")" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "id": "4b479a83-c80a-4c4a-bf4f-74392671921b", 491 | "metadata": {}, 492 | "source": [ 493 | "### LORA + Gradient Ckpt.\n", 494 | "\n", 495 | "Using default HF grad ckpt strategy which wraps each individual decoder layers." 
496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "id": "ce7f0c9c-480e-45e9-a637-6a4647dfb9c0", 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "name": "stdout", 506 | "output_type": "stream", 507 | "text": [ 508 | "Input Size:(1, 512)\n", 509 | "Memory allocated [MODEL): 2.315 GB\n", 510 | "Memory allocated [FWD]: 2.439 GB\n", 511 | "Memory allocated [BWD]: 2.392 GB\n", 512 | "Elapsed time: 11.923\n", 513 | "\n", 514 | "\n", 515 | "Input Size:(1, 1024)\n", 516 | "Memory allocated [MODEL): 2.323 GB\n", 517 | "Memory allocated [FWD]: 2.573 GB\n", 518 | "Memory allocated [BWD]: 2.458 GB\n", 519 | "Elapsed time: 12.374\n", 520 | "\n", 521 | "\n", 522 | "Input Size:(1, 2048)\n", 523 | "Memory allocated [MODEL): 2.323 GB\n", 524 | "Memory allocated [FWD]: 2.820 GB\n", 525 | "Memory allocated [BWD]: 2.588 GB\n", 526 | "Elapsed time: 12.543\n", 527 | "\n", 528 | "\n", 529 | "Input Size:(1, 3072)\n", 530 | "Memory allocated [MODEL): 2.323 GB\n", 531 | "Memory allocated [FWD]: 3.082 GB\n", 532 | "Memory allocated [BWD]: 2.733 GB\n", 533 | "Elapsed time: 13.120\n", 534 | "\n", 535 | "\n", 536 | "Memory allocated [finish]: 0.017 GB\n" 537 | ] 538 | } 539 | ], 540 | "source": [ 541 | "profile_model(partial(create_lora_model, \"1B\", gc_enabled=True), inference=False, save_filename=save_dir/\"lora-gc-training.pickle\")" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "id": "31e01883-2551-4784-93a3-e9ca651feb36", 547 | "metadata": {}, 548 | "source": [ 549 | "### QLoRA" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "id": "fe1306d6-051a-42cb-a313-21c4849d70aa", 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "def replace_with_bnb_4bit_linear(\n", 560 | " model,\n", 561 | " modules_to_not_convert=None,\n", 562 | " current_key_name=None,\n", 563 | " quantization_config=None,\n", 564 | " has_been_replaced=False,\n", 565 | " quant_storage=torch.uint8, \n", 566 | " keep_trainable=False,\n", 567 | "):\n", 568 | " \"\"\"\n", 569 | " Private method that wraps the recursion for module replacement.\n", 570 | "\n", 571 | " Returns the converted model and a boolean that indicates if the conversion has been successfull or not.\n", 572 | " \"\"\"\n", 573 | " for name, module in model.named_children():\n", 574 | " if current_key_name is None:\n", 575 | " current_key_name = []\n", 576 | " current_key_name.append(name)\n", 577 | "\n", 578 | " if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:\n", 579 | " # Check if the current key is not in the `modules_to_not_convert`\n", 580 | " if not any(key in \".\".join(current_key_name) for key in modules_to_not_convert):\n", 581 | " # with init_empty_weights():\n", 582 | " if isinstance(module, Conv1D):\n", 583 | " in_features, out_features = module.weight.shape\n", 584 | " else:\n", 585 | " in_features = module.in_features\n", 586 | " out_features = module.out_features\n", 587 | "\n", 588 | " model._modules[name] = bnb.nn.Linear4bit(\n", 589 | " in_features,\n", 590 | " out_features,\n", 591 | " module.bias is not None,\n", 592 | " quantization_config.bnb_4bit_compute_dtype,\n", 593 | " compress_statistics=quantization_config.bnb_4bit_use_double_quant,\n", 594 | " quant_type=quantization_config.bnb_4bit_quant_type,\n", 595 | " quant_storage=quant_storage\n", 596 | " )\n", 597 | " has_been_replaced = True\n", 598 | " # Store the module class in case we need to transpose the weight later\n", 599 | " 
model._modules[name].source_cls = type(module)\n", 600 | " # Force requires grad to False to avoid unexpected errors\n", 601 | " if keep_trainable: \n", 602 | " model._modules[name].requires_grad_(True)\n", 603 | " else:\n", 604 | " model._modules[name].requires_grad_(True)\n", 605 | " if len(list(module.children())) > 0:\n", 606 | " _, has_been_replaced = replace_with_bnb_4bit_linear(\n", 607 | " module,\n", 608 | " modules_to_not_convert,\n", 609 | " current_key_name,\n", 610 | " quantization_config,\n", 611 | " has_been_replaced=has_been_replaced,\n", 612 | " )\n", 613 | " # Remove the last key for recursion\n", 614 | " current_key_name.pop(-1)\n", 615 | " return model, has_been_replaced" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "id": "b5f71873-662f-40e8-be83-cc2ef63cd561", 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "def create_qlora_model(model_size=\"1B\", with_lora=True, gc_enabled=False, keep_trainable=False):\n", 626 | " \n", 627 | " model_size_config = get_model_size_config(model_size)\n", 628 | " \n", 629 | " # download model weights and config files.\n", 630 | " config = LlamaConfig()\n", 631 | " config.update(model_size_config)\n", 632 | " model = LlamaForCausalLM(config)\n", 633 | " qconfig = BitsAndBytesConfig(load_in_4bit=True, \n", 634 | " bnb_4bit_quant_type=\"nf4\",\n", 635 | " bnb_4bit_use_double_quant=False,\n", 636 | " bnb_4bit_compute_dtype=torch.bfloat16)\n", 637 | " model, has_been_replaced = replace_with_bnb_4bit_linear(model,\n", 638 | " modules_to_not_convert=[\"lm_head\"], \n", 639 | " quantization_config=qconfig, \n", 640 | " keep_trainable=keep_trainable, \n", 641 | " quant_storage=torch.bfloat16)\n", 642 | " assert has_been_replaced\n", 643 | " if with_lora:\n", 644 | " peft_config = LoraConfig(\n", 645 | " task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n", 646 | " )\n", 647 | " model = get_peft_model(model, peft_config)\n", 648 | " if gc_enabled: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={\"use_reentrant\": False})\n", 649 | " return model" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "id": "56720604-ea7f-40d8-a6da-b4df0b7cecbb", 656 | "metadata": {}, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "Input Size:(1, 512)\n", 663 | "Memory allocated [MODEL]: 0.859 GB\n", 664 | "Memory allocated [FWD]: 1.034 GB\n", 665 | "Elapsed time: 19.783\n", 666 | "\n", 667 | "\n", 668 | "Input Size:(1, 1024)\n", 669 | "Memory allocated [MODEL]: 0.868 GB\n", 670 | "Memory allocated [FWD]: 1.201 GB\n", 671 | "Elapsed time: 17.461\n", 672 | "\n", 673 | "\n", 674 | "Input Size:(1, 2048)\n", 675 | "Memory allocated [MODEL]: 0.868 GB\n", 676 | "Memory allocated [FWD]: 1.532 GB\n", 677 | "Elapsed time: 17.779\n", 678 | "\n", 679 | "\n", 680 | "Input Size:(1, 3072)\n", 681 | "Memory allocated [MODEL]: 0.868 GB\n", 682 | "Memory allocated [FWD]: 1.878 GB\n", 683 | "Elapsed time: 17.819\n", 684 | "\n", 685 | "\n", 686 | "Memory allocated [finish]: 0.009 GB\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "profile_model(partial(create_qlora_model, \"1B\"), inference=True, save_filename=save_dir/\"qlora-inference.pickle\")" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "id": "59def356-5ac1-48cc-89ea-d13cbc71c87c", 698 | "metadata": {}, 699 | "outputs": [ 700 | { 701 | "name": "stdout", 702 | "output_type": 
"stream", 703 | "text": [ 704 | "Input Size:(1, 512)\n", 705 | "Memory allocated [MODEL): 0.868 GB\n", 706 | "Memory allocated [FWD]: 2.195 GB\n", 707 | "Memory allocated [BWD]: 1.295 GB\n", 708 | "Elapsed time: 17.303\n", 709 | "\n", 710 | "\n", 711 | "Input Size:(1, 1024)\n", 712 | "Memory allocated [MODEL): 0.876 GB\n", 713 | "Memory allocated [FWD]: 3.532 GB\n", 714 | "Memory allocated [BWD]: 1.712 GB\n", 715 | "Elapsed time: 17.051\n", 716 | "\n", 717 | "\n", 718 | "Input Size:(1, 2048)\n", 719 | "Memory allocated [MODEL): 0.876 GB\n", 720 | "Memory allocated [FWD]: 6.185 GB\n", 721 | "Memory allocated [BWD]: 2.542 GB\n", 722 | "Elapsed time: 17.963\n", 723 | "\n", 724 | "\n", 725 | "Input Size:(1, 3072)\n", 726 | "Memory allocated [MODEL): 0.876 GB\n", 727 | "Memory allocated [FWD]: 8.853 GB\n", 728 | "Memory allocated [BWD]: 3.387 GB\n", 729 | "Elapsed time: 18.167\n", 730 | "\n", 731 | "\n", 732 | "Memory allocated [finish]: 0.017 GB\n" 733 | ] 734 | } 735 | ], 736 | "source": [ 737 | "profile_model(partial(create_qlora_model, \"1B\"), inference=False, save_filename=save_dir/\"qlora-training.pickle\")" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "id": "cad4724c-2e12-4835-9dc7-631f09ba5b22", 743 | "metadata": {}, 744 | "source": [ 745 | "### QLORA + Gradient Ckpt.\n", 746 | "\n", 747 | "Using default HF grad ckpt strategy which wraps each individual decoder layer." 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "id": "b76f28a8-edc5-4788-8125-69f3b7b63462", 754 | "metadata": {}, 755 | "outputs": [ 756 | { 757 | "name": "stdout", 758 | "output_type": "stream", 759 | "text": [ 760 | "Input Size:(1, 512)\n" 761 | ] 762 | }, 763 | { 764 | "name": "stderr", 765 | "output_type": "stream", 766 | "text": [ 767 | "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" 768 | ] 769 | }, 770 | { 771 | "name": "stdout", 772 | "output_type": "stream", 773 | "text": [ 774 | "Memory allocated [MODEL): 0.876 GB\n", 775 | "Memory allocated [FWD]: 1.250 GB\n", 776 | "Memory allocated [BWD]: 1.194 GB\n", 777 | "Elapsed time: 17.265\n", 778 | "\n", 779 | "\n", 780 | "Input Size:(1, 1024)\n", 781 | "Memory allocated [MODEL): 0.876 GB\n", 782 | "Memory allocated [FWD]: 1.625 GB\n", 783 | "Memory allocated [BWD]: 1.511 GB\n", 784 | "Elapsed time: 16.252\n", 785 | "\n", 786 | "\n", 787 | "Input Size:(1, 2048)\n", 788 | "Memory allocated [MODEL): 0.876 GB\n", 789 | "Memory allocated [FWD]: 2.371 GB\n", 790 | "Memory allocated [BWD]: 2.140 GB\n", 791 | "Elapsed time: 17.468\n", 792 | "\n", 793 | "\n", 794 | "Input Size:(1, 3072)\n", 795 | "Memory allocated [MODEL): 0.876 GB\n", 796 | "Memory allocated [FWD]: 3.133 GB\n", 797 | "Memory allocated [BWD]: 2.783 GB\n", 798 | "Elapsed time: 18.704\n", 799 | "\n", 800 | "\n", 801 | "Memory allocated [finish]: 0.017 GB\n" 802 | ] 803 | } 804 | ], 805 | "source": [ 806 | "profile_model(partial(create_qlora_model, \"1B\", gc_enabled=True), inference=False, save_filename=save_dir/\"qlora-gc-training.pickle\")" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "id": "1b6aa659-54a3-4622-981e-e8c61e71ce0d", 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "name": "stderr", 817 | "output_type": "stream", 818 | "text": [ 819 | "`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`.\n" 820 | ] 821 | }, 822 | { 823 | "name": "stdout", 824 | "output_type": "stream", 825 | "text": [ 826 | "Memory allocated [MODEL): 0.882 GB\n", 827 | "Memory allocated [FWD]: 1.260 GB\n", 828 | "Memory allocated [BWD]: 1.210 GB\n", 829 | "Max MemAlloc: 1.360229376\n", 830 | "Elapsed time: 0.195\n", 831 | "\n", 832 | "\n", 833 | "Memory allocated [MODEL): 0.891 GB\n", 834 | "Memory allocated [FWD]: 1.646 GB\n", 835 | "Memory allocated [BWD]: 1.532 GB\n", 836 | "Max MemAlloc: 1.844102144\n", 837 | "Elapsed time: 0.194\n", 838 | "\n", 839 | "\n", 840 | "Memory allocated [MODEL): 0.891 GB\n", 841 | "Memory allocated [FWD]: 2.397 GB\n", 842 | "Memory allocated [BWD]: 2.174 GB\n", 843 | "Max MemAlloc: 2.791502848\n", 844 | "Elapsed time: 0.231\n", 845 | "\n", 846 | "\n", 847 | "Memory allocated [MODEL): 0.891 GB\n", 848 | "Memory allocated [FWD]: 3.142 GB\n", 849 | "Memory allocated [BWD]: 2.784 GB\n", 850 | "Max MemAlloc: 3.733136384\n", 851 | "Elapsed time: 0.417\n", 852 | "\n", 853 | "\n" 854 | ] 855 | } 856 | ], 857 | "source": [ 858 | "for x in inputs:\n", 859 | " set_seed(42)\n", 860 | " model = create_qlora_model(\"1B\", gc_enabled=True)\n", 861 | " model.to(\"cuda\", torch.bfloat16);\n", 862 | " with torch.no_grad():\n", 863 | " model(inputs[0].to(\"cuda\"))\n", 864 | " \n", 865 | " start = torch.cuda.Event(enable_timing=True)\n", 866 | " end = torch.cuda.Event(enable_timing=True)\n", 867 | " start.record()\n", 868 | " \n", 869 | " torch.cuda.reset_peak_memory_stats()\n", 870 | " print(f\"Memory allocated [MODEL): {malloc_in_gb():.3f} GB\")\n", 871 | " output = model(x.to(\"cuda\"))\n", 872 | " print(f\"Memory allocated [FWD]: {malloc_in_gb():.3f} GB\") \n", 873 | " output.logits.mean().backward()\n", 874 | " print(f\"Memory allocated [BWD]: {malloc_in_gb():.3f} GB\")\n", 875 | " max_memory = torch.cuda.memory.max_memory_allocated()/1e9\n", 876 | " print(f\"Max MemAlloc: {max_memory}\")\n", 877 | " \n", 878 | " end.record()\n", 879 | " torch.cuda.synchronize()\n", 880 | " secs = start.elapsed_time(end) / 1000\n", 881 | " print(f\"Elapsed time: {secs:.3f}\\n\\n\")\n", 882 | "\n", 883 | " output, model = None, None\n", 884 | " free_memory()" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": null, 890 | "id": "7a27f8e9-5cd7-408e-bf89-d50ecf0ff806", 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": null, 898 | "id": "a40a7dc7-e933-42c9-b341-2eed67868fff", 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": null, 906 | "id": "223b8321-d994-4ce7-be6a-ca032d865b42", 907 | "metadata": {}, 908 | "outputs": [], 909 | "source": [] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": null, 914 | "id": "3d60b942-9d0a-46fb-9666-150463323d33", 915 | "metadata": {}, 916 | "outputs": [], 917 | "source": [] 918 | }, 919 | { 920 | "cell_type": "code", 921 | "execution_count": null, 922 | "id": "821da14b-65d8-44ee-875a-e31321e462e7", 923 | "metadata": {}, 924 | "outputs": [], 925 | "source": [] 926 | } 927 | ], 928 | "metadata": { 929 | "kernelspec": { 930 | "display_name": "python3", 931 | "language": "python", 932 | "name": "python3" 933 | } 934 | }, 935 | "nbformat": 4, 936 | "nbformat_minor": 5 937 | } 938 | -------------------------------------------------------------------------------- /nbs/01-ft_benchmarking.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c78ebc14", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import itertools\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from transformers import AutoConfig, AutoModelForCausalLM\n", 14 | "import gspread\n", 15 | "from gspread_dataframe import get_as_dataframe, set_with_dataframe\n", 16 | "import os" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "48d1ee29", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "gc = gspread.oauth()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "7c5219bb", 32 | "metadata": {}, 33 | "source": [ 34 | "### Experiments" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "80c3dba7", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "models = [\n", 45 | " {\"model_name\":\"meta-llama/Llama-2-7b-hf\", \"model_size\":7}, \n", 46 | " {\"model_name\":\"meta-llama/Llama-2-13b-hf\", \"model_size\":13},\n", 47 | " {\"model_name\":\"codellama/CodeLlama-34b-hf\", \"model_size\":34},\n", 48 | " {\"model_name\":\"meta-llama/Llama-2-70b-hf\", \"model_size\":70}\n", 49 | "]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "b1f3d16c", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# for m in models:\n", 60 | "# cfg = AutoConfig.from_pretrained(m['model_name'])\n", 61 | "# m['config'] = cfg" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "6eeb8969", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "seqlen = [{\"seqlen\":256}]" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "9e2f8d2a", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "max_bs = [{\"max_bs\":None}]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "21d25c63", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "bs = [{\"bs\":None}]" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "6defaf4c", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "cpu_offloading = [{\"cpu_offloading\":False}, {\"cpu_offloading\":True}]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "564866c0", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "distrib_type = [{\"distrib_type\":\"FSDP\"}, {\"distrib_type\":\"DDP\"}]" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "a03572dd", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "ft_type = [{\"ft_type\":\"LoRA\"}, {\"ft_type\":\"QLoRA\"}]" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "1fdd62fa", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# RTX 3090 is not available in cloud providers A5000 also has 24GB memory\n", 132 | "gpus = [{\"gpu_model\":\"A5000\", \"num_gpus\":2, \"gpu_mem\":24, \"total_gpu_mem\":48, \"nvlink\":\"False\"},\n", 133 | " {\"gpu_model\":\"A100-40\", \"num_gpus\":8, \"gpu_mem\":40, \"total_gpu_mem\":320, \"nvlink\":\"True\"}]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "f3f1386d", 140 | "metadata": {}, 141 | "outputs": [], 
142 | "source": [ 143 | "wandb = [{\"wandb_link\":None,\n", 144 | " \"memory_peak\":None, \n", 145 | " \"memory_after_model_creation\":None,\n", 146 | " \"memory_after_model_wrap\":None, \n", 147 | " \"memory_before_forward\":None,\n", 148 | " \"memory_after_forward\":None,\n", 149 | " \"memory_before_backward\":None,\n", 150 | " \"memory_after_backward\":None, \n", 151 | " \"time_taken\":None}]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "8bd72646", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "grad_ckpt = [{\"use_gradient_checkpointing\":True}, {\"use_gradient_checkpointing\":False}]" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "8d394429", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "iters = [models, seqlen, max_bs, bs, grad_ckpt, cpu_offloading, distrib_type, ft_type, gpus, wandb]" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "5fab4270", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "experiments = list(itertools.product(*iters))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "5d5f2f01", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "len(experiments)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "d0cad6c0", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "def flatten_list_of_dicts(l):\n", 202 | " final_d = {}\n", 203 | " for d in l: \n", 204 | " for k,v in d.items():\n", 205 | " if k in final_d:\n", 206 | " raise ValueError(f\"Key {k} exists.\")\n", 207 | " final_d[k] = v\n", 208 | " return final_d" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "a910744b", 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "experiments_flat = [flatten_list_of_dicts(exp) for exp in experiments]" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "606c605e", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "df = pd.DataFrame(experiments_flat)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "f4b6c3ca", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# exclude lora ddp\n", 239 | "mask = ~((df['ft_type'] == 'LoRA') & (df['distrib_type'] == 'DDP'))\n", 240 | "# no cpu-offloading with ddp\n", 241 | "mask = np.logical_and(mask, ~((df['cpu_offloading'] == True) & (df['distrib_type'] == 'DDP')))\n", 242 | "\n", 243 | "df = df[mask].reset_index(drop=True)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "fd687cd0", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "df.shape" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "id": "8770de8c", 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "df.head()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "9d625f04", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "# !pip install gspread\n", 274 | "# !pip install gspread-dataframe" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "c0a00615", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "url = 
\"https://docs.google.com/spreadsheets/d/1JSQbnkwtqPgc-_wqI3LTCJI6jWCaWafubK0ontWR2_Y\"\n", 285 | "sheet = gc.open_by_url(url)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "ad54f86b", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# this will overwrite the existing sheet!\n", 296 | "# use other utils from gspread to add data to specific cells.\n", 297 | "worksheet = sheet.get_worksheet_by_id(0)\n", 298 | "set_with_dataframe(worksheet, df)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "id": "8213e2ab", 304 | "metadata": {}, 305 | "source": [ 306 | "### Modify Experiments\n", 307 | "\n", 308 | "Flag experiments based on the theoretical limits excluding the activations.\n", 309 | "\n", 310 | "**Note:** In DDP script cast all model params to bfloat16 except for RoPE layers.\n", 311 | "\n", 312 | "1) DDP requires all params, optimizer states, activations to fit into a single GPU.\n", 313 | "\n", 314 | "2) Compute approx memory requirement per GPU with FSDP full sharing, consider cases with and without CPU offloading." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "id": "91fad2a9", 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "url = \"https://docs.google.com/spreadsheets/d/1JSQbnkwtqPgc-_wqI3LTCJI6jWCaWafubK0ontWR2_Y\"\n", 325 | "sheet = gc.open_by_url(url)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "id": "374f9835", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "worksheet = sheet.get_worksheet_by_id(0)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "id": "0a6dbda3", 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "vals = worksheet.get_all_values()\n", 346 | "df = pd.DataFrame(vals[1:], columns=vals[0])" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "9c10eea9", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "df.shape" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "id": "41ffe007", 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "df.columns" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "id": "35082b38", 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "df.head()" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "id": "4420b5b7", 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# activation memory per layer: https://arxiv.org/pdf/2205.05198.pdf\n", 387 | "bs = 1 \n", 388 | "sl = 256\n", 389 | "h = 4096\n", 390 | "a = 32\n", 391 | "(bs * sl * h * (34 + 5 * (a * sl / h))) / 1e9" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "id": "2a0a8b31", 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "# exclude optimizer states since lora updates a small fraction of weights\n", 402 | "# exclude activations \n", 403 | "oom_ignored = []\n", 404 | "for row in df.itertuples():\n", 405 | " if row.cpu_offloading != 'TRUE':\n", 406 | " approx_mem_req = int(row.model_size) * 2 / (int(row.num_gpus) if row.distrib_type == 'FSDP' else 1)\n", 407 | " oom_ignored.append(approx_mem_req > int(row.total_gpu_mem))\n", 408 | " else:\n", 409 | " oom_ignored.append(False)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | 
"execution_count": null, 415 | "id": "638b97e7", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "df['oom_ignored'] = oom_ignored" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "f65cf179", 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "df['oom_ignored'].mean(), df['oom_ignored'].sum()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "abfc63ed", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "set_with_dataframe(worksheet, df)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "id": "8d03fc85", 445 | "metadata": {}, 446 | "source": [ 447 | "### Create Training Commands" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "id": "850ed294", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "sub_df = df.query(\"oom_ignored == 'FALSE' or not oom_ignored\")" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "id": "19e84219", 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "df.shape, sub_df.shape" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "id": "113fff1e", 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "small_gpu_commands = []\n", 478 | "large_gpu_commands = []\n", 479 | "\n", 480 | "for _, row in sub_df.iterrows():\n", 481 | " cmd_args = [\"python train.py\",\n", 482 | " \"--batch_size 128\", # divide by 2 every retry\n", 483 | " \"--num_epochs 1\",\n", 484 | " \"--dataset alpaca_sample\",\n", 485 | " \"--use_flash_attention\",\n", 486 | " \"--precision bf16_buffers_autocast\",\n", 487 | " \"--log_to wandb\",\n", 488 | " ]\n", 489 | "\n", 490 | " if row.distrib_type == \"DDP\":\n", 491 | " cmd_args.append(\"--use_dpp\")\n", 492 | " elif row.distrib_type == \"FSDP\":\n", 493 | " pass\n", 494 | " else:\n", 495 | " raise ValueError(f\"Unknown distrib_type {distrib_type}\")\n", 496 | "\n", 497 | " cmd_args.append(f\"--model_name {row.model_name}\")\n", 498 | "\n", 499 | " cmd_args.append(f\"--context_length {row.seqlen}\")\n", 500 | " \n", 501 | " if row.use_gradient_checkpointing == \"TRUE\":\n", 502 | " cmd_args.append(\"--use_gradient_checkpointing True\")\n", 503 | " else:\n", 504 | " cmd_args.append(\"--use_gradient_checkpointing False\")\n", 505 | " \n", 506 | " if row.cpu_offloading == \"TRUE\":\n", 507 | " cmd_args.append(\"--use_cpu_offload\")\n", 508 | "\n", 509 | " if row.ft_type == \"LoRA\":\n", 510 | " cmd_args.append(\"--train_type lora\")\n", 511 | " elif row.ft_type == \"QLoRA\":\n", 512 | " cmd_args.append(\"--train_type qlora\")\n", 513 | " else:\n", 514 | " raise ValueError(f\"Unknown ft_type {ft_type}\")\n", 515 | " \n", 516 | " if row.gpu_model == \"A100-40\":\n", 517 | " large_gpu_commands.append(\" \".join(cmd_args))\n", 518 | " elif row.gpu_model == \"A5000\":\n", 519 | " small_gpu_commands.append(\" \".join(cmd_args))\n", 520 | " else:\n", 521 | " ValueError(\"Unknown gpu model.\")" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "id": "a45cc821", 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "os.makedirs(\"../benchmarking\", exist_ok=True)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "id": "abb58d30", 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "with open(\"../benchmarking/small_gpu_benchmarking.sh\", 
\"w\") as f: \n", 542 | " f.write(\"\\n\".join(small_gpu_commands))\n", 543 | "\n", 544 | "with open(\"../benchmarking/large_gpu_benchmarking.sh\", \"w\") as f: \n", 545 | " f.write(\"\\n\".join(large_gpu_commands)) " 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "id": "366931c9", 551 | "metadata": {}, 552 | "source": [ 553 | "### Update Sheet with Results" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "id": "78001191", 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "import wandb" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "id": "10fbd127", 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "api = wandb.Api()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "id": "6ec1cd1b", 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "url = \"https://docs.google.com/spreadsheets/d/1JSQbnkwtqPgc-_wqI3LTCJI6jWCaWafubK0ontWR2_Y\"\n", 584 | "sheet = gc.open_by_url(url)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "id": "c4de6e4e", 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "empty_worksheet = sheet.get_worksheet_by_id(0)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "id": "6b874b88", 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "filled_worksheet = sheet.get_worksheet_by_id(74399953)" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "id": "5effe246", 611 | "metadata": {}, 612 | "outputs": [], 613 | "source": [ 614 | "vals = empty_worksheet.get_all_values()\n", 615 | "df = pd.DataFrame(vals[1:], columns=vals[0])" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "id": "01b34280", 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "df.shape" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "id": "3e0ce5f3", 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "df.columns" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "id": "4d9362e7", 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "wandb_project = \"answerdotai/fsdp-benchmarking\"\n", 646 | "\n", 647 | "wandb_cols = ['memory_peak', 'memory_after_model_creation',\n", 648 | " 'memory_after_model_wrap', 'memory_before_forward',\n", 649 | " 'memory_after_forward', 'memory_after_backward', \n", 650 | " 'time_taken']\n", 651 | "\n", 652 | "empty_logs = pd.Series({c:None for c in wandb_cols})" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "id": "b3f74bbf", 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "wandb_logs = []\n", 663 | "for row in df.itertuples():\n", 664 | " if row.wandb_link == \"\": \n", 665 | " wandb_logs.append(empty_logs)\n", 666 | " else:\n", 667 | " expid = row.wandb_link.split(\"runs/\")[-1].split(\"/\")[0].split(\"?\")[0]\n", 668 | " print(row.wandb_link, expid)\n", 669 | " run = api.run(wandb_project + \"/\" + expid)\n", 670 | " history_df = run.history()\n", 671 | " existing_cols = list(set(history_df.columns).intersection(wandb_cols))\n", 672 | " wandb_logs.append(history_df[existing_cols].fillna(-1e30).max(axis=0))" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "id": "bf02932b", 679 | "metadata": {}, 
680 | "outputs": [], 681 | "source": [ 682 | "wandb_logs_df = pd.concat(wandb_logs, axis=1).T" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "id": "27635e6b", 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "for c in wandb_logs_df.columns:\n", 693 | " if c.startswith(\"memory\"):\n", 694 | " wandb_logs_df[c] = wandb_logs_df[c] / 1e9" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "id": "153973bc", 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [ 704 | "df[wandb_logs_df.columns] = wandb_logs_df" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "id": "d82a0a2a", 711 | "metadata": {}, 712 | "outputs": [], 713 | "source": [ 714 | "df.head()" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "id": "5efa1561", 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "set_with_dataframe(filled_worksheet, df, 1, 1)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "id": "d7fd5775", 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "id": "5e268230", 739 | "metadata": {}, 740 | "outputs": [], 741 | "source": [] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "id": "72b7b240", 747 | "metadata": {}, 748 | "outputs": [], 749 | "source": [] 750 | } 751 | ], 752 | "metadata": { 753 | "kernelspec": { 754 | "display_name": "conda-env-modeling-py", 755 | "language": "python", 756 | "name": "conda-env-modeling-py" 757 | } 758 | }, 759 | "nbformat": 4, 760 | "nbformat_minor": 5 761 | } 762 | -------------------------------------------------------------------------------- /nbs/02-qlora-memeff-loading.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "56c7c28c-651d-40d9-9989-84d5e0acd6c1", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "/home/paperspace/git/bitsandbytes/bitsandbytes/cuda_setup/main.py:109: UserWarning: \n", 14 | "\n", 15 | "================================================================================\n", 16 | "WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n", 17 | "BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n", 18 | "If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n", 19 | "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n", 20 | "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:= 0 105 | ), "profile_freq must be greater than or equal to warmup + active" 106 | logger.info(f"Profiler schedule - steps per cycle: {steps_per_cycle} wait: {wait} warmup: {warmup} active: {active} repeat: {repeat if repeat !=0 else 'inf'}") 107 | 108 | profile_memory = args["export_memory_timeline"] 109 | export_memory_timeline = args["export_memory_timeline"] 110 | with_stack = args["with_stack"] or args["export_memory_timeline"] 111 | with_shapes = args["with_shapes"] or export_memory_timeline 112 | callback = partial(trace_handler, rank=rank, 113 | export_memory_timeline=export_memory_timeline, 114 | output_dir=output_dir, 
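                       # the grouping options below only take effect when the corresponding metadata (stacks / input shapes) is actually recorded by the profiler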
115 | with_stack=with_stack, 116 | group_by_input_shape=with_shapes, 117 | group_by_stack=5 if with_stack else 0) 118 | 119 | with torch.profiler.profile( 120 | activities=[ 121 | torch.profiler.ProfilerActivity.CPU, 122 | torch.profiler.ProfilerActivity.CUDA, 123 | ], 124 | with_stack=with_stack, 125 | profile_memory=profile_memory, 126 | record_shapes=with_shapes, 127 | schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat), 128 | on_trace_ready=callback, 129 | experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True) if with_stack else None, 130 | ) as torch_profiler: 131 | yield torch_profiler 132 | else: 133 | class FakeProfiler: 134 | """ 135 | Fake profiler object when profiling is not enabled. 136 | 137 | """ 138 | def __enter__(self): 139 | return self 140 | def __exit__(self, *args, **kwargs): 141 | pass 142 | 143 | def step(self): 144 | pass 145 | 146 | yield FakeProfiler() -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AnswerDotAI/fsdp_qlora/05ed9f2a60f96a0795cb082bceab70a9b19fd213/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/block_expansion.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from transformers import AutoConfig 4 | import torch 5 | from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME 6 | import safetensors 7 | from safetensors.torch import save_file 8 | import os 9 | from pathlib import Path 10 | 11 | def main(): 12 | # Set up the argument parser 13 | parser = argparse.ArgumentParser(description="Receive deepen model's args") 14 | parser.add_argument("--model_name", default='meta-llama/Llama-2-7b-hf', type=str, help="original model path") 15 | parser.add_argument("--output_dir", default=None, type=str, help="deepened model ckpt save path") 16 | parser.add_argument("--expansion_rate", default=0.1, type=float, help="add new trainable % of layers") 17 | 18 | # Parse the arguments 19 | args = parser.parse_args() 20 | 21 | idx = hub.cached_file(args.model_name, SAFE_WEIGHTS_INDEX_NAME) 22 | files, _ = hub.get_checkpoint_shard_files(args.model_name, idx) 23 | 24 | cfg = AutoConfig.from_pretrained(args.model_name) 25 | num_original_layers = cfg.num_hidden_layers 26 | new_layers = num_original_layers + int(num_original_layers * args.expansion_rate) 27 | 28 | split = int(num_original_layers / (new_layers - num_original_layers)) 29 | 30 | if args.output_dir is None: 31 | output_dir = Path(os.environ['HOME'])/'models'/(args.model_name + f'_blk_exp-{num_original_layers}-{new_layers}') 32 | else: 33 | output_dir = Path(args.output_dir)/(args.model_name + f'_blk_exp-{num_original_layers}-{new_layers}') 34 | os.makedirs(output_dir, exist_ok=True) 35 | 36 | for filename in files: 37 | weights = safetensors.torch.load_file(filename) 38 | expanded_weights = {} 39 | for k,v in iter(weights.items()): 40 | if 'layers' in k: 41 | layer_no = int(k.split('layers.')[1].split('.')[0]) 42 | # shift existing layers 43 | new_layer_no = layer_no + layer_no // split 44 | new_k = k.replace(f'layers.{layer_no}', f'layers.{new_layer_no}') 45 | expanded_weights[new_k] = v 46 | # add new layers 47 | if (layer_no+1) % split == 0: 48 | new_layer_no += 1 49 | new_k = k.replace(f'layers.{layer_no}', f'layers.{new_layer_no}') 50 | if 
'down_proj' in k or 'o_proj' in k: 51 | expanded_weights[new_k] = torch.zeros_like(v) 52 | else: 53 | expanded_weights[new_k] = v.clone() 54 | else: 55 | expanded_weights[k] = v 56 | save_file(expanded_weights, output_dir/Path(filename).name) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() -------------------------------------------------------------------------------- /scripts/dora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import bitsandbytes as bnb 4 | 5 | # Wrapping policy requires modules, base_layer has no grad params, lora_A, lora_B, dora_scale have grad params. 6 | class DORALayer(nn.Module): 7 | "Same as LORA but also returnes weight norm. This will be wrapped as a single FSDP unit" 8 | def __init__(self, in_features, out_features, lora_rank, device, dtype, *args, **kwargs): 9 | super().__init__() 10 | # Init LoRA layers. 11 | std_dev = 1 / torch.sqrt(torch.tensor(lora_rank).float()) 12 | lora_A_param = nn.Parameter(torch.randn(lora_rank, in_features).to(device=device, dtype=dtype)*std_dev) 13 | self.lora_A = nn.Linear(in_features, lora_rank, bias=False, device=device, dtype=dtype) 14 | setattr(self.lora_A, "weight", lora_A_param) 15 | 16 | self.lora_B = nn.Linear(lora_rank, out_features, bias=False, device=device, dtype=dtype) 17 | self.lora_B.weight.data.zero_() 18 | 19 | def forward(self, x, frozen_weight): 20 | output = self.lora_B(self.lora_A(x)) 21 | # print("lora A shape:", self.lora_A.weight.shape) 22 | # print("lora B shape:", self.lora_B.weight.shape) 23 | # DoRA Section 4.3. Detach column norm to avoid backprop through it. 24 | column_norm = (frozen_weight + self.lora_B.weight @ self.lora_A.weight).norm(p=2, dim=1).detach() 25 | # print("column norm shape:", column_norm.shape, column_norm.shape) 26 | return output, column_norm 27 | 28 | class MagnitudeLayer(nn.Module): 29 | "FSDP doesn't work with nn.ParameterDict hence this module: https://github.com/pytorch/pytorch/issues/79605" 30 | def __init__(self, vector_data, device, dtype): 31 | super().__init__() 32 | self.magnitude = nn.Parameter(vector_data.to(device=device, dtype=dtype)) 33 | 34 | def forward(self, x): 35 | return x * self.magnitude.view(1,1,-1) 36 | 37 | class HQQDORA(nn.Module): 38 | def __init__(self, base_layer, lora_rank, *args, **kwargs): 39 | super().__init__() 40 | self.base_layer = base_layer 41 | dtype = getattr(base_layer, "compute_dtype", next(base_layer.parameters()).dtype) 42 | device = next(base_layer.parameters()).device 43 | 44 | # Init trainable magnitude parameter. 45 | self.magnitude_layer = MagnitudeLayer(self.base_layer.dora_scale.clone().to(dtype=dtype), device, dtype) 46 | self.base_layer.dora_scale = None 47 | torch.cuda.empty_cache() 48 | 49 | # Init DORA layers. 50 | self.dora_layer = DORALayer(base_layer.in_features, base_layer.out_features, lora_rank, device, dtype, *args, **kwargs) 51 | 52 | def forward(self, x, *args, **kwargs): 53 | result = self.base_layer(x, *args, **kwargs) 54 | # As per Tim Dettmers, for 4bit, we need to defensively clone here. 55 | # The reason is that in some cases, an error can occur that backprop 56 | # does not work on a manipulated view. This issue may be solved with 57 | # newer PyTorch versions but this would need extensive testing to be 58 | # sure. 
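        # What follows implements the DoRA reparameterization,
        #   y = m * ((W + B A) x / ||W + B A||_col),
        # computed as (base_layer(x) + lora_B(lora_A(x))) / column_norm and then
        # rescaled by the trainable magnitude vector m. The (detached) column norm
        # needs the dequantized frozen base weight, hence the dequantize_aten() call below.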
59 | result = result.clone() 60 | 61 | requires_conversion = not torch.is_autocast_enabled() 62 | if requires_conversion: 63 | expected_dtype = result.dtype 64 | x = x.to(self.dora_layer.lora_A.weight.dtype) 65 | 66 | # m * (W + AB / ||W + AB||) @ X == m * ((W @ X + AB @ X) / ||W + AB||) 67 | output, column_norm = self.dora_layer(x, self.base_layer.dequantize_aten()) 68 | if requires_conversion: 69 | output = output.to(expected_dtype) 70 | 71 | result += output 72 | result = result / column_norm.view(1,1,-1) #unit vector result. 73 | result = self.magnitude_layer(result) #rescaled result. 74 | return result 75 | 76 | class BNBDORA(nn.Module): 77 | def __init__(self, base_layer, lora_rank, *args, **kwargs): 78 | super().__init__() 79 | self.base_layer = base_layer 80 | dtype = getattr(base_layer, "compute_dtype", next(base_layer.parameters()).dtype) 81 | device = next(base_layer.parameters()).device 82 | 83 | # Init trainable magnitude parameter. 84 | self.magnitude_layer = MagnitudeLayer(self.base_layer.dora_scale.clone().to(dtype=dtype), device, dtype) 85 | self.base_layer.dora_scale = None 86 | torch.cuda.empty_cache() 87 | 88 | # Init DORA layers. 89 | self.dora_layer = DORALayer(base_layer.in_features, base_layer.out_features, lora_rank, device, dtype, *args, **kwargs) 90 | 91 | def forward(self, x, *args, **kwargs): 92 | result = self.base_layer(x, *args, **kwargs) 93 | # As per Tim Dettmers, for 4bit, we need to defensively clone here. 94 | # The reason is that in some cases, an error can occur that backprop 95 | # does not work on a manipulated view. This issue may be solved with 96 | # newer PyTorch versions but this would need extensive testing to be 97 | # sure. 98 | result = result.clone() 99 | 100 | requires_conversion = not torch.is_autocast_enabled() 101 | if requires_conversion: 102 | expected_dtype = result.dtype 103 | x = x.to(self.dora_layer.lora_A.weight.dtype) 104 | 105 | # m * (W + AB / ||W + AB||) @ X == m * ((W @ X + AB @ X) / ||W + AB||) 106 | output, column_norm = self.dora_layer(x, bnb.functional.dequantize_4bit(self.base_layer.weight.data, 107 | self.base_layer.weight.quant_state)) 108 | if requires_conversion: 109 | output = output.to(expected_dtype) 110 | 111 | result += output 112 | result = result / column_norm.view(1,1,-1) #unit vector result. 113 | result = self.magnitude_layer(result) #rescaled result. 114 | return result -------------------------------------------------------------------------------- /scripts/lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class LORA(nn.Module): 5 | def __init__(self, base_layer, lora_rank, lora_alpha, lora_dropout): 6 | super().__init__() 7 | self.base_layer = base_layer 8 | dtype = getattr(base_layer, "compute_dtype", next(base_layer.parameters()).dtype) 9 | device = next(base_layer.parameters()).device 10 | lora_A = nn.Linear(base_layer.in_features, lora_rank, bias=False, device=device, dtype=dtype) 11 | lora_B = nn.Linear(lora_rank, base_layer.out_features, bias=False, device=device, dtype=dtype) 12 | lora_B.weight.data.zero_() 13 | 14 | self.lora_AB = nn.Sequential(lora_A, lora_B) 15 | 16 | self.lora_alpha = lora_alpha 17 | self.lora_dropout = nn.Dropout(lora_dropout) 18 | self.scaling = self.lora_alpha / lora_rank 19 | 20 | def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: 21 | 22 | result = self.base_layer(x, *args, **kwargs) 23 | # As per Tim Dettmers, for 4bit, we need to defensively clone here. 
24 | # The reason is that in some cases, an error can occur that backprop 25 | # does not work on a manipulated view. This issue may be solved with 26 | # newer PyTorch versions but this would need extensive testing to be 27 | # sure. 28 | result = result.clone() 29 | 30 | requires_conversion = not torch.is_autocast_enabled() 31 | if requires_conversion: 32 | expected_dtype = result.dtype 33 | x = x.to(next(iter(self.lora_AB)).weight.dtype) 34 | 35 | output = self.lora_AB(self.lora_dropout(x)) 36 | if requires_conversion: 37 | output = output.to(expected_dtype) 38 | output = output * self.scaling 39 | 40 | result += output 41 | 42 | return result -------------------------------------------------------------------------------- /table1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Continue on error 4 | set +e 5 | 6 | # List of commands 7 | commands=( 8 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 512 --num_epochs 1 --train_type qlora --use_gradient_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy" 9 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 1024 --num_epochs 1 --train_type qlora --use_gradient_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy" 10 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 2048 --num_epochs 1 --train_type qlora --use_gradient_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy" 11 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 4096 --num_epochs 1 --train_type qlora --use_gradient_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy" 12 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 512 --num_epochs 1 --train_type lora --use_gradient_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy" 13 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 1024 --num_epochs 1 --train_type lora --use_gradient_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy" 14 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 2048 --num_epochs 1 --train_type lora --use_gradient_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy" 15 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 4096 --num_epochs 1 --train_type lora --use_gradient_checkpointing True --use_cpu_offload False --log_to wandb --dataset dummy" 16 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 1024 --num_epochs 1 --train_type lora --use_gradient_checkpointing False --use_cpu_offload False --log_to wandb --dataset dummy" 17 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 4096 --num_epochs 1 --train_type lora --use_gradient_checkpointing False --use_cpu_offload False --log_to wandb --dataset dummy" 18 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 1024 --num_epochs 1 --train_type qlora --use_gradient_checkpointing False --use_cpu_offload False --log_to wandb --dataset dummy" 19 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 4096 --num_epochs 1 --train_type qlora --use_gradient_checkpointing False --use_cpu_offload False --log_to wandb 
--dataset dummy" 20 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 1024 --num_epochs 1 --train_type lora --use_gradient_checkpointing False --use_cpu_offload True --log_to wandb --dataset dummy" 21 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 1024 --num_epochs 1 --train_type lora --use_gradient_checkpointing True --use_cpu_offload True --log_to wandb --dataset dummy" 22 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 1024 --num_epochs 1 --train_type qlora --use_gradient_checkpointing False --use_cpu_offload True --log_to wandb --dataset dummy" 23 | "python train.py --model_name meta-llama/Llama-2-7b-hf --batch_size 1 --context_length 1024 --num_epochs 1 --train_type qlora --use_gradient_checkpointing True --use_cpu_offload True --log_to wandb --dataset dummy" 24 | ) 25 | 26 | # Execute each command 27 | for cmd in "${commands[@]}"; do 28 | echo "Executing: $cmd" 29 | $cmd 30 | done 31 | 32 | # Optional: stop on error for subsequent commands 33 | set -e -------------------------------------------------------------------------------- /tests/test_block_expansion.py: -------------------------------------------------------------------------------- 1 | import unittest, tempfile 2 | import torch 3 | import torch.nn as nn 4 | import safetensors 5 | from safetensors.torch import save_file 6 | from pathlib import Path 7 | from transformers.utils import hub, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME 8 | from glob import glob 9 | 10 | # python -m unittest tests.test_quantize.TestQuantizer.test_quantizer 11 | class TestBlockExpansion(unittest.TestCase): 12 | 13 | def setUp(self) -> None: 14 | # set seed 15 | self.llama_pro_path = Path("/mnt/vol_b/models/meta-llama/Llama-2-7b-hf_blk_exp-32-35") 16 | self.filenames = glob(str(self.llama_pro_path/"*.safetensors")) 17 | num_original_layers, num_expanded_layers = self.llama_pro_path.name.split("blk_exp-")[1].split("-") 18 | self.num_original_layers, self.num_expanded_layers = int(num_original_layers), int(num_expanded_layers) 19 | self.split = int(self.num_original_layers / (self.num_expanded_layers - self.num_original_layers)) 20 | 21 | 22 | def tearDown(self) -> None: 23 | return super().tearDown() 24 | 25 | def test_expanded_weights(self): 26 | 27 | total_new_layers = self.num_expanded_layers - self.num_original_layers 28 | new_layer_ids = [self.split + (self.split + 1)*n for n in range(total_new_layers)] 29 | 30 | verify_weights = {} 31 | for filename in self.filenames: 32 | weights = safetensors.torch.load_file(str(filename)) 33 | for k,v in iter(weights.items()): 34 | if any(((f"layers.{i}" in k) or (f"layers.{i-1}" in k) for i in new_layer_ids)): 35 | verify_weights[k] = v 36 | 37 | for k,v in verify_weights.items(): 38 | if any(((f"layers.{i}" in k) for i in new_layer_ids)): 39 | if 'down_proj' in k or 'o_proj' in k: 40 | assert torch.equal(v, torch.zeros_like(v)) 41 | else: 42 | lid = int(k.split("layers.")[1].split(".")[0]) 43 | assert torch.equal(verify_weights[k.replace(f"layers.{lid}", f"layers.{lid-1}")], v) 44 | -------------------------------------------------------------------------------- /tests/test_dora.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../scripts/') 3 | import unittest, tempfile 4 | from hqq.core.quantize import HQQLinear, HQQBackend, BaseQuantizeConfig 5 | import torch 6 | import torch.nn as nn 7 | from dora import HQQDORA, 
BNBDORA 8 | 9 | import bitsandbytes 10 | import bitsandbytes as bnb 11 | from bitsandbytes.nn import Linear4bit 12 | 13 | 14 | # python -m unittest tests.test_quantize.TestQuantizer.test_quantizer 15 | # hqq pinned: 72b2b641aadc44a7ded6b243915f90df3b3be385 (to_empty not compatible with FSDP after this commit) 16 | class TestHQQDORA(unittest.TestCase): 17 | 18 | def setUp(self) -> None: 19 | # set seed 20 | torch.manual_seed(42) 21 | quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_zero=True, 22 | quant_scale=True, offload_meta=True, view_as_float=True) 23 | self.base_layer = HQQLinear(nn.Linear(128,256,bias=False), quant_config, compute_dtype=torch.float32) 24 | HQQLinear.set_backend(HQQBackend.ATEN_BACKPROP) 25 | return super().setUp() 26 | 27 | def tearDown(self) -> None: 28 | return super().tearDown() 29 | 30 | def test_hqq_dora(self): 31 | """ 32 | Test: m * (W + AB / ||W + AB||) @ X == m * ((W @ X + AB @ X) / ||W + AB||) 33 | """ 34 | frozen_weight = self.base_layer.dequantize_aten().clone().cuda() 35 | self.base_layer.dora_scale = frozen_weight.norm(p=2,dim=1) 36 | hqq_dora = HQQDORA(self.base_layer, 16) 37 | weight = (frozen_weight + hqq_dora.dora_layer.lora_B.weight @ hqq_dora.dora_layer.lora_A.weight) 38 | norm_adapted = weight / weight.norm(p=2, dim=1).view(-1,1) 39 | calc_weights = norm_adapted * hqq_dora.magnitude_layer.magnitude.view(-1,1) 40 | x = torch.randn(1, 16,128).cuda().to(torch.float32) 41 | closeness = torch.isclose(x @ calc_weights.t(), hqq_dora(x)).float().mean().item() 42 | self.assertTrue(closeness > 0.99) 43 | 44 | class TestBNBDORA(unittest.TestCase): 45 | 46 | def setUp(self) -> None: 47 | # set seed 48 | torch.manual_seed(42) 49 | self.base_layer = Linear4bit(128, 32, bias=False, quant_type="nf4", 50 | compute_dtype=torch.float32, quant_storage=torch.float32) 51 | return super().setUp() 52 | 53 | def tearDown(self) -> None: 54 | return super().tearDown() 55 | 56 | def test_bnb_dora(self): 57 | """ 58 | Test: m * (W + AB / ||W + AB||) @ X == m * ((W @ X + AB @ X) / ||W + AB||) 59 | """ 60 | # quantize and dequantize to avoid numerical mismatch during test. 61 | self.base_layer.cuda() 62 | frozen_weight = bnb.functional.dequantize_4bit(self.base_layer.weight.data, 63 | self.base_layer.weight.quant_state).clone() 64 | self.base_layer.dora_scale = frozen_weight.norm(p=2,dim=1) 65 | bnb_dora = BNBDORA(self.base_layer, 16).cuda() 66 | 67 | weight = (frozen_weight + bnb_dora.dora_layer.lora_B.weight @ bnb_dora.dora_layer.lora_A.weight) 68 | norm_adapted = weight / weight.norm(p=2, dim=1).view(-1,1) 69 | calc_weights = norm_adapted * bnb_dora.magnitude_layer.magnitude.view(-1,1) 70 | x = torch.randn(1, 16,128).cuda().to(torch.float32) 71 | closeness = torch.isclose(x @ calc_weights.t(), bnb_dora(x)).float().mean().item() 72 | self.assertTrue(closeness > 0.99) -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | # Compare LORA and QLORA on Alpaca dataset with same effective batch size ~32, lr sched, and lr. 2 | # Reference for some hyperparams: https://arxiv.org/abs/2305.14314 3 | # LORA (pure bf16) 4 | # https://wandb.ai/answerdotai/fsdp/runs/gb34o6p4?workspace=user-k-answer-ai 5 | # NOTE: Loss curve is flat - 1) use lower lr ? 
2) start immediate annealing get_cosine_one_cycle_scheduler(..., min_lr_fraction=0.0) 6 | python train.py \ 7 | --model_name meta-llama/Llama-2-7b-hf \ 8 | --gradient_accumulation_steps 2 \ 9 | --batch_size 8 \ 10 | --context_length 512 \ 11 | --num_epochs 1 \ 12 | --train_type lora \ 13 | --use_gradient_checkpointing False \ 14 | --use_cpu_offload False \ 15 | --log_to wandb \ 16 | --dataset alpaca \ 17 | --verbose false \ 18 | --save_model true \ 19 | --output_dir ~/models/lora_alpaca 20 | 21 | # QLORA (pure bf16) 22 | python train.py \ 23 | --model_name meta-llama/Llama-2-7b-hf \ 24 | --gradient_accumulation_steps 2 \ 25 | --batch_size 8 \ 26 | --context_length 512 \ 27 | --num_epochs 1 \ 28 | --train_type qlora \ 29 | --use_gradient_checkpointing False \ 30 | --use_cpu_offload False \ 31 | --log_to wandb \ 32 | --dataset alpaca \ 33 | --verbose false \ 34 | --save_model true \ 35 | --output_dir ~/models/qlora_alpaca 36 | 37 | # QLORA (autocast bf16) 38 | python train.py \ 39 | --model_name meta-llama/Llama-2-7b-hf \ 40 | --precision bf16_buffers_autocast \ 41 | --gradient_accumulation_steps 2 \ 42 | --batch_size 8 \ 43 | --context_length 512 \ 44 | --num_epochs 1 \ 45 | --train_type qlora \ 46 | --use_gradient_checkpointing False \ 47 | --use_cpu_offload False \ 48 | --log_to wandb \ 49 | --dataset alpaca \ 50 | --verbose false \ 51 | --save_model true \ 52 | --output_dir ~/models/qlora_alpaca_autocast_buffers_bf16 53 | 54 | # stop instance 55 | # requires: az login --use-device-code 56 | az vm deallocate -g resource-group-us-east -n a100-duo 57 | 58 | export CUDA_VISIBLE_DEVICES=3,4 59 | python train.py \ 60 | --world_size 2 \ 61 | --model_name meta-llama/Llama-2-7b-hf \ 62 | --gradient_accumulation_steps 2 \ 63 | --batch_size 1 \ 64 | --context_length 512 \ 65 | --num_epochs 1 \ 66 | --sharding_strategy full_shard \ 67 | --precision bf16 \ 68 | --train_type hqq_lora \ 69 | --use_gradient_checkpointing true \ 70 | --use_cpu_offload false \ 71 | --log_to stdout \ 72 | --dataset alpaca \ 73 | --verbose true 74 | 75 | export CUDA_VISIBLE_DEVICES=4,5 76 | python train.py \ 77 | --world_size 2 \ 78 | --master_port 12356 \ 79 | --model_name meta-llama/Llama-2-7b-hf \ 80 | --gradient_accumulation_steps 2 \ 81 | --batch_size 1 \ 82 | --context_length 512 \ 83 | --num_epochs 1 \ 84 | --sharding_strategy full_shard \ 85 | --precision bf16 \ 86 | --train_type hqq_lora \ 87 | --use_gradient_checkpointing true \ 88 | --use_cpu_offload false \ 89 | --log_to stdout \ 90 | --dataset dummy \ 91 | --verbose true 92 | 93 | export CUDA_VISIBLE_DEVICES=3,4 94 | python train.py \ 95 | --world_size 3 \ 96 | --model_name meta-llama/Llama-2-70b-hf \ 97 | --gradient_accumulation_steps 2 \ 98 | --batch_size 1 \ 99 | --context_length 4096 \ 100 | --num_epochs 1 \ 101 | --sharding_strategy full_shard \ 102 | --precision bf16 \ 103 | --train_type hqq_dora \ 104 | --use_gradient_checkpointing true \ 105 | --use_cpu_offload false \ 106 | --log_to wandb \ 107 | --dataset dummy \ 108 | --verbose true -------------------------------------------------------------------------------- /train_hqq_bench.sh: -------------------------------------------------------------------------------- 1 | # Full vs QLORA vs HQQ, batch size = 64 2 | 3 | # Full 4 | # max batch size / gpu = 8 (38/40 GB) 5 | # 8 * 2 gpus * 4 grad accum = 64 6 | export CUDA_VISIBLE_DEVICES=4,5 7 | python train.py \ 8 | --world_size 2 \ 9 | --master_port 12356 \ 10 | --model_name meta-llama/Llama-2-7b-hf \ 11 | --gradient_accumulation_steps 4 \ 12 | --batch_size 
8 \ 13 | --context_length 512 \ 14 | --precision bf16 \ 15 | --train_type full \ 16 | --use_gradient_checkpointing true \ 17 | --use_cpu_offload false \ 18 | --log_to wandb \ 19 | --dataset alpaca \ 20 | --verbose true 21 | 22 | # BnB (QLORA) 23 | # max batch size / gpu = 16 (28/40 GB) 24 | # 16 * 2 gpus * 2 grad accum = 64 25 | export CUDA_VISIBLE_DEVICES=4,5 26 | python train.py \ 27 | --world_size 2 \ 28 | --master_port 12356 \ 29 | --model_name meta-llama/Llama-2-7b-hf \ 30 | --gradient_accumulation_steps 2 \ 31 | --batch_size 16 \ 32 | --context_length 512 \ 33 | --precision bf16 \ 34 | --train_type custom_qlora \ 35 | --use_gradient_checkpointing true \ 36 | --use_cpu_offload false \ 37 | --log_to wandb \ 38 | --dataset alpaca \ 39 | --verbose true 40 | 41 | # HQQ (QLORA) 42 | # max batch size / gpu = 32 (28/40 GB) 43 | # 32 * 2 gpus = 64 44 | export CUDA_VISIBLE_DEVICES=4,5 45 | python train.py \ 46 | --world_size 2 \ 47 | --master_port 12356 \ 48 | --model_name meta-llama/Llama-2-7b-hf \ 49 | --gradient_accumulation_steps 1 \ 50 | --batch_size 32 \ 51 | --context_length 512 \ 52 | --precision bf16 \ 53 | --train_type hqq_lora \ 54 | --use_gradient_checkpointing true \ 55 | --use_cpu_offload false \ 56 | --log_to wandb \ 57 | --dataset alpaca \ 58 | --verbose true 59 | 60 | # DORA: max batch size / gpu = 32 (28/40 GB) 61 | # 32 * 2 gpus = 64 62 | export CUDA_VISIBLE_DEVICES=6,7 63 | python train.py \ 64 | --world_size 2 \ 65 | --master_port 12357 \ 66 | --model_name meta-llama/Llama-2-7b-hf \ 67 | --gradient_accumulation_steps 1 \ 68 | --batch_size 32 \ 69 | --context_length 512 \ 70 | --precision bf16 \ 71 | --train_type hqq_dora \ 72 | --use_gradient_checkpointing true \ 73 | --use_cpu_offload false \ 74 | --log_to stdout \ 75 | --dataset alpaca \ 76 | --verbose true 77 | 78 | 79 | # 32 * 2 gpus = 64 80 | export CUDA_VISIBLE_DEVICES=2,6 81 | python train.py \ 82 | --world_size 2 \ 83 | --master_port 12356 \ 84 | --model_name meta-llama/Llama-2-7b-hf \ 85 | --gradient_accumulation_steps 1 \ 86 | --batch_size 32 \ 87 | --context_length 512 \ 88 | --precision bf16 \ 89 | --train_type hqq_lora \ 90 | --use_gradient_checkpointing true \ 91 | --use_cpu_offload false \ 92 | --log_to stdout \ 93 | --dataset dummy \ 94 | --verbose true \ 95 | --save_model true \ 96 | --output_dir /weka/home-keremturgutlu/models/hqq_lora_dummy 97 | 98 | export CUDA_VISIBLE_DEVICES=2,6 99 | python train.py \ 100 | --lr 1e-3 \ 101 | --world_size 2 \ 102 | --master_port 12356 \ 103 | --model_name meta-llama/Llama-2-7b-hf \ 104 | --gradient_accumulation_steps 1 \ 105 | --batch_size 32 \ 106 | --context_length 512 \ 107 | --precision bf16 \ 108 | --train_type custom_qlora \ 109 | --use_gradient_checkpointing true \ 110 | --use_cpu_offload false \ 111 | --log_to stdout \ 112 | --dataset dummy \ 113 | --verbose true \ 114 | --save_model true \ 115 | --output_dir /weka/home-keremturgutlu/models/qlora_dummy 116 | 117 | 118 | # BNB 70B 119 | export CUDA_VISIBLE_DEVICES=4,5,6,7 120 | python train.py \ 121 | --world_size 4 \ 122 | --master_port 12356 \ 123 | --model_name meta-llama/Llama-2-70b-hf \ 124 | --gradient_accumulation_steps 4 \ 125 | --batch_size 2 \ 126 | --context_length 512 \ 127 | --precision bf16_buffers_autocast \ 128 | --train_type custom_qlora \ 129 | --use_gradient_checkpointing true \ 130 | --use_cpu_offload false \ 131 | --log_to stdout \ 132 | --dataset alpaca \ 133 | --verbose true 134 | 135 | # HQQ 70B 136 | export CUDA_VISIBLE_DEVICES=4,5,6,7 137 | python train.py \ 138 | --world_size 4 \ 139 | 
--master_port 12356 \ 140 | --model_name meta-llama/Llama-2-70b-hf \ 141 | --gradient_accumulation_steps 4 \ 142 | --batch_size 2 \ 143 | --context_length 512 \ 144 | --precision bf16_buffers_autocast \ 145 | --train_type hqq_lora \ 146 | --use_gradient_checkpointing true \ 147 | --use_cpu_offload false \ 148 | --log_to stdout \ 149 | --dataset alpaca \ 150 | --verbose true -------------------------------------------------------------------------------- /train_sql.sh: -------------------------------------------------------------------------------- 1 | # run script also show stdout and save to log file 2 | python train.py \ 3 | --context_length 256 \ 4 | --model_name codellama/CodeLlama-34b-hf \ 5 | --train_type qlora \ 6 | --batch_size 4 \ 7 | --gradient_accumulation_steps 4 \ 8 | --dataset sql \ 9 | --save_model True \ 10 | --output_dir sql_model_qlora \ 11 | --apply_gradient_clipping True \ 12 | --project_name fsdp_qlora_sql \ 13 | --precision bf16_buffers_autocast \ 14 | --log_to wandb 2>&1 | tee ~/qlora_sql.log 15 | 16 | python train.py \ 17 | --context_length 256 \ 18 | --model_name codellama/CodeLlama-34b-hf \ 19 | --train_type custom_qlora \ 20 | --batch_size 4 \ 21 | --gradient_accumulation_steps 4 \ 22 | --dataset sql \ 23 | --save_model True \ 24 | --output_dir sql_model_custom_qlora \ 25 | --apply_gradient_clipping True \ 26 | --project_name fsdp_qlora_sql \ 27 | --precision bf16_buffers_autocast \ 28 | --log_to wandb 2>&1 | tee ~/custom_qlora_sql.log --------------------------------------------------------------------------------