├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── flan-t5-tensorboard.png
│   ├── flan-t5.png
│   ├── grpo.png
│   ├── tensorboard-r1.png
│   ├── tensorboard.png
│   └── win_rate_gpt-4-1106-preview.png
├── container
│   ├── Dockerfile
│   └── README.md
├── inference
│   ├── README.md
│   ├── fp8-inference.md
│   ├── idefics.md
│   ├── llama-7b.md
│   ├── speculative.md
│   ├── starcoder_gptq.md
│   ├── starcoder_load.js
│   └── vllm-function-calling.md
└── training
    ├── accelerate-tpu-bert-text-classification.ipynb
    ├── configs
    │   ├── accelerate_configs
    │   │   ├── deepspeed_zero1.yaml
    │   │   ├── deepspeed_zero3.yaml
    │   │   ├── fsdp.yaml
    │   │   └── fsdp_qlora.yaml
    │   ├── ds_falcon_180b_z3.json
    │   ├── ds_falcon_180b_z3_offload.json
    │   ├── ds_flan_t5_z3_config.json
    │   ├── ds_flan_t5_z3_config_bf16.json
    │   ├── ds_flan_t5_z3_offload.json
    │   ├── ds_flan_t5_z3_offload_bf16.json
    │   └── spectrum
    │       └── snr_results_meta-llama-Meta-Llama-3.1-8B_unfrozenparameters_30percent.yaml
    ├── deepseed-falcon-180b-lora-fa.ipynb
    ├── deepseed-flan-t5-summarization.ipynb
    ├── dpo-align-llms-in-2024-with-trl.ipynb
    ├── fine-tune-embedding-model-for-rag.ipynb
    ├── fine-tune-llms-in-2024-with-trl.ipynb
    ├── fine-tune-llms-in-2025.ipynb
    ├── fine-tune-modern-bert-in-2025.ipynb
    ├── fine-tune-multimodal-llms-with-trl.ipynb
    ├── flan-t5-samsum-summarization.ipynb
    ├── fsdp-qlora-distributed-llama3.ipynb
    ├── gemma-lora-example.ipynb
    ├── inference.py
    ├── instruction-tune-llama-2-int4.ipynb
    ├── launch.slurm
    ├── mini-deepseek-r1-aha-grpo.ipynb
    ├── optimize-llama-2-gptq.ipynb
    ├── peft-flan-t5-int8-summarization.ipynb
    ├── preprocessing
    │   └── create_flan_t5_cnn_dataset.py
    ├── pytorch-2-0-bert-text-classification.ipynb
    ├── receipes
    │   ├── dpo-llama-3-1-8b-qlora.yaml
    │   ├── dpo-llama-3-1-8b.yaml
    │   ├── grpo-qwen-2.5-3b-deepseek-r1-countdown.yaml
    │   ├── llama-3-1-8b-qlora.yaml
    │   └── llama-3-1-8b-spectrum.yaml
    ├── rl-with-llms-in-2025-dpo.ipynb
    ├── run_ds_lora.py
    ├── scripts
    │   ├── bloke_gptq.py
    │   ├── dpo
    │   │   ├── create_preference_dataset.py
    │   │   └── run_dpo.py
    │   ├── example.slurm
    │   ├── merge_adapter_weights.py
    │   ├── run_fsdp_qlora.py
    │   ├── run_r1_grpo.py
    │   ├── run_seq2seq_deepspeed.py
    │   ├── run_sft.py
    │   └── test.py
    └── utils
        ├── __init__.py
        ├── falcon_patch.py
        ├── llama_patch.py
        └── peft_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | data/
131 | runs/
132 | flan*/
133 | *dataset*/
134 | *dataset.json
135 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Philipp Schmid
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Getting Started with Deep Learning with PyTorch and Hugging Face
2 |
3 | This repository contains instructions, examples, and tutorials for getting started with Deep Learning using PyTorch and Hugging Face libraries like [transformers](https://huggingface.co/docs/transformers/index) and [datasets](https://huggingface.co/docs/datasets/index).
4 |
5 | ### Training Examples
6 |
7 | * [Fine-tune FLAN-T5 XL/XXL using DeepSpeed & Hugging Face Transformers](./training/deepseed-flan-t5-summarization.ipynb)
8 | * [Fine-tune FLAN-T5 for chat & dialogue summarization](./training/flan-t5-samsum-summarization.ipynb)
9 | * [Fine-tune Falcon 180B with DeepSpeed ZeRO, LoRA & Flash Attention](./training/deepseed-falcon-180b-lora-fa.ipynb)
10 | * [Getting started with Transformers and TPU using PyTorch](./training/accelerate-tpu-bert-text-classification.ipynb)
11 | * [Extended Guide: Instruction-tune Llama 2](./training/instruction-tune-llama-2-int4.ipynb)
12 | * [Quantize open LLMs using optimum and GPTQ](./training/optimize-llama-2-gptq.ipynb)
13 | * [Fine-tune Embedding models for RAG](./training/fine-tune-embedding-model-for-rag.ipynb)
14 | * [Fine-tune LLMs in 2024 with TRL](./training/fine-tune-llms-in-2024-with-trl.ipynb)
15 | * [Fine-tune LLMs in 2025](./training/fine-tune-llms-in-2025.ipynb)
16 | * [Fine-tune Multimodal LLMs with TRL](./training/fine-tune-multimodal-llms-with-trl.ipynb)
17 | * [RLHF in 2024 with DPO & Hugging Face](./training/dpo-align-llms-in-2024-with-trl.ipynb)
18 | * [Fine-tune Gemma with ChatML](./training/gemma-lora-example.ipynb)
19 | * [Efficiently scale distributed training with FSDP & Q-LoRA](./training/fsdp-qlora-distributed-llama3.ipynb)
20 | * [Fine-tune classifier with ModernBERT in 2025](./training/fine-tune-modern-bert-in-2025.ipynb)
21 | * [How to align open LLMs in 2025 with DPO & Hugging Face](https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/rl-with-llms-in-2025-dpo.ipynb)
22 |
23 | ### Inference Examples
24 |
25 | * [Text Generation Inference Examples](./inference/README.md)
26 | * [FP8 Inference Benchmarks](./inference/fp8-inference.md)
27 | * [Idefics Inference](./inference/idefics.md)
28 | * [Llama 2 Inference](./inference/llama-7b.md)
29 | * [Speculative Decoding](./inference/speculative.md)
30 | * [StarCoder GPTQ Inference](./inference/starcoder_gptq.md)
31 |
--------------------------------------------------------------------------------
/assets/flan-t5-tensorboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/flan-t5-tensorboard.png
--------------------------------------------------------------------------------
/assets/flan-t5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/flan-t5.png
--------------------------------------------------------------------------------
/assets/grpo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/grpo.png
--------------------------------------------------------------------------------
/assets/tensorboard-r1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/tensorboard-r1.png
--------------------------------------------------------------------------------
/assets/tensorboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/tensorboard.png
--------------------------------------------------------------------------------
/assets/win_rate_gpt-4-1106-preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/win_rate_gpt-4-1106-preview.png
--------------------------------------------------------------------------------
/container/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:22.12-py3
2 |
3 | # Versions
4 | ARG PYTORCH='2.0.1'
5 | ARG CUDA='cu118' # used in the base container: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html
6 | ARG TRANSFORMERS='4.30.2'
7 | ARG DATASETS='2.13.0'
8 | ARG ACCLERATE='0.20.3'
9 | ARG DEEPSPEED='0.9.5'
10 |
11 | LABEL maintainer="Philipp Schmid"
12 | ARG DEBIAN_FRONTEND=noninteractive
13 | ENV PYTHONUNBUFFERED=1
14 |
15 |
16 | RUN apt-get update \
17 | && apt-get -y upgrade --only-upgrade systemd openssl cryptsetup \
18 | && apt-get install -y \
19 | bzip2 \
20 | curl \
21 | git \
22 | git-lfs \
23 | tar \
24 | gcc \
25 | g++ \
26 | libaio-dev \
27 | # audio
28 | libsndfile1-dev \
29 | ffmpeg \
30 | && apt-get clean autoremove --yes \
31 | && rm -rf /var/lib/{apt,dpkg,cache,log}
32 |
33 | # update pip
34 | RUN python3 -m pip install --no-cache-dir --upgrade pip
35 |
36 | # Remove the preinstalled torch packages and install the pinned PyTorch release
37 | # (PyTorch must be installed before any DeepSpeed C++/CUDA ops are compiled.)
38 | RUN python3 -m pip uninstall -y torch torchvision torchaudio torch-tensorrt \
39 | && python3 -m pip install --no-cache-dir -U torch==${PYTORCH} torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/${CUDA}
40 |
41 | # Install DeepSpeed
42 | RUN python3 -m pip install --no-cache-dir -U deepspeed==${DEEPSPEED}
43 |
44 | # Install Hugging Face Libraries
45 | RUN python3 -m pip install --upgrade --no-cache-dir -U \
46 | transformers[sklearn,sentencepiece,vision]==${TRANSFORMERS} \
47 | datasets==${DATASETS} \
48 | accelerate==${ACCLERATE} \
49 | evaluate \
50 | tensorboard
51 |
--------------------------------------------------------------------------------
/container/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch Deep Learning Containers
2 |
3 | This folder contains Dockerfiles for PyTorch Deep Learning Containers including Hugging Face libraries and/or Deepspeed.
4 |
5 | ## Dockerfiles
6 |
7 | | Container | Versions | URI |
8 | | --------------------------------- | ---------------------------------------------------- | ----------- |
9 | | [Pytorch Deepspeed](./Dockerfile) | torch==2.0.1, transformers==4.30.2, deepspeed==0.9.5 | `philschmi/huggingface-pytorch` |
10 |
11 | ## Getting Started
12 |
13 | ### Build the Docker image
14 |
15 | ```bash
16 | docker build -t philschmi/huggingface-pytorch:2.0.1-transformers4.30.2-deepspeed0.9.5-cuda11.8 -t philschmi/huggingface-pytorch:latest -f Dockerfile .
17 | ```
18 |
19 | ### Run the Docker image
20 |
21 | ```bash
22 | docker run --gpus all -it --rm philschmi/huggingface-pytorch:latest
23 | ```
24 |
25 | ### Pull the Docker image
26 |
27 | ```bash
28 | docker pull philschmi/huggingface-pytorch:latest
29 | ```
30 |
31 | ### Push the Docker image
32 | 
33 | Log in to Docker Hub first:
34 | ```bash
35 | docker login
36 | ```
37 | 
38 | Then push the image:
39 | ```bash
40 | docker push philschmi/huggingface-pytorch:2.0.1-transformers4.30.2-deepspeed0.9.5-cuda11.8
41 | docker push philschmi/huggingface-pytorch:latest
42 | ```
43 |
44 |
45 |
46 | ## Run PyTorch Scripts
47 |
48 | ```bash
49 | docker run --rm -it --init \
50 | --gpus=all \
51 | --ipc=host \
52 | --user="$(id -u):$(id -g)" \
53 | --volume="$PWD:/workspace" \
54 | philschmi/huggingface-pytorch:latest python train.py --foo bar
55 | ```
56 |
57 | * `--gpus=all`: Enables GPU support. If you have multiple GPUs, you can use
58 |   `--gpus '"device=0,1"'` to restrict which ones are visible.
59 | * `--ipc=host`: Required if using multiprocessing, as explained at
60 | https://github.com/pytorch/pytorch#docker-image.
61 | * `--volume="$PWD:/workspace"`: Mounts the current working directory into the container.
62 | The default working directory inside the container is `/workspace`. Optional.
63 | * `--user="$(id -u):$(id -g)"`: Sets the user inside the container to match your
64 | user and group ID. Optional, but is useful for writing files with correct
65 | ownership.
66 |
67 |
68 | ## Deriving your own images
69 |
70 | The recommended way of adding additional dependencies to an image is to create
71 | your own Dockerfile using an image from this project as a base.
72 |
73 | ```dockerfile
74 | FROM philschmi/huggingface-pytorch:2.0.1-transformers4.30.2-deepspeed0.9.5-cuda11.8
75 |
76 | # Install system libraries required by OpenCV.
77 | RUN apt-get update \
78 |   && apt-get install -y libgl1-mesa-glx libgtk2.0-0 libsm6 libxext6 \
79 |   && rm -rf /var/lib/apt/lists/*
80 |
81 | # Install OpenCV from PyPI.
82 | RUN pip install opencv-python==4.5.1.48
83 | ```
--------------------------------------------------------------------------------
/inference/README.md:
--------------------------------------------------------------------------------
1 | # Inference Examples
2 |
3 | ## Text Generation Inference
4 |
5 | Run `HuggingFaceH4/starchat-beta` with TGI locally.
6 |
7 | ```bash
8 | model=HuggingFaceH4/starchat-beta
9 | num_shard=1
10 | quantize=bitsandbytes
11 | max_input_length=1562
12 | max_total_tokens=2048
13 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
14 |
15 | docker run --gpus all -ti -p 8080:80 \
16 | -e MODEL_ID=$model \
17 | -e QUANTIZE=$quantize \
18 | -e NUM_SHARD=$num_shard \
19 | -e MAX_INPUT_LENGTH=$max_input_length \
20 | -e MAX_TOTAL_TOKENS=$max_total_tokens \
21 | -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest
22 | ```
23 |
24 | send test request
25 |
26 | ```bash
27 | curl 127.0.0.1:8080/generate \
28 | -X POST \
29 | -d '{"inputs":"<|system|>\n<|end|>\n<|user|>\nWhat is Deep Learning?<|end|>\n<|assistant|>","parameters":{"temperature":0.2, "top_p": 0.95, "stop" : ["<|end|>"]}}' \
30 | -H 'Content-Type: application/json'
31 | ```
32 |
33 |
34 | ## Text Generation Inference GPTQ
35 |
36 | ### Llama
37 |
38 | Run `TheBloke/Llama-2-13b-Chat-GPTQ` (or one of the commented alternatives below) with TGI locally.
39 |
40 | ```bash
41 | # Model config
42 | # model=TheBloke/Llama-2-7b-Chat-GPTQ
43 | # model=TheBloke/Dolphin-Llama2-7B-GPTQ
44 | model=TheBloke/Llama-2-13b-Chat-GPTQ
45 | num_shard=1
46 | quantize=gptq
47 | max_input_length=1562
48 | max_total_tokens=4096 # 4096
49 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
50 |
51 | docker run --gpus all -ti -p 8080:80 \
52 | -e MODEL_ID=$model \
53 | -e QUANTIZE=$quantize \
54 | -e NUM_SHARD=$num_shard \
55 | -e MAX_INPUT_LENGTH=$max_input_length \
56 | -e MAX_TOTAL_TOKENS=$max_total_tokens \
57 | -e GPTQ_BITS=$gptq_bits \
58 | -e GPTQ_GROUPSIZE=$gptq_groupsize \
59 | -v $volume:/data ghcr.io/huggingface/text-generation-inference:0.9.4
60 | ```
61 |
62 | send test request
63 |
64 | ```bash
65 | curl 127.0.0.1:8080/generate \
66 | -X POST \
67 |   -d '{"inputs":"[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n<</SYS>>\n\nWhat is 10+10? [\/INST]","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \
68 | -H 'Content-Type: application/json'
69 | ```
70 |
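71 | If you prefer Python over `curl`, a minimal client sketch looks like the following (it assumes one of the containers above is listening on `localhost:8080`; the prompt and parameters are illustrative):
72 | 
73 | ```python
74 | import requests
75 | 
76 | # Assumes a TGI container started with one of the commands above.
77 | url = "http://127.0.0.1:8080/generate"
78 | payload = {
79 |     "inputs": "What is Deep Learning?",
80 |     "parameters": {"temperature": 0.2, "top_p": 0.95, "max_new_tokens": 256},
81 | }
82 | 
83 | response = requests.post(url, json=payload, timeout=120)
84 | response.raise_for_status()
85 | print(response.json()["generated_text"])
86 | ```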
--------------------------------------------------------------------------------
/inference/fp8-inference.md:
--------------------------------------------------------------------------------
1 | # Benchmark and compare FP8 and FP16 inference for vLLM
2 |
3 | [vLLM supports FP8](https://docs.vllm.ai/en/latest/quantization/fp8.html) (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. We use `guidellm` to benchmark and compare FP8 and FP16 serving below.
4 |
5 | ## Summary
6 |
7 | **Model Memory Usage**
8 | - FP8: 8.49 GB
9 | - FP16: 14.99 GB
10 | - Memory Savings: ~43%
11 |
12 |
13 | **Performance Highlights**
14 | - Max Requests per Second:
15 | - FP8: 2.54 req/sec at rate 8/128
16 | - FP16: 1.37 req/sec at rate 2
17 | - Improvement: ~85%
18 | - Token Throughput:
19 | - FP8: 587.68 tokens/sec at rate 64
20 | - FP16: 302.94 tokens/sec at rate 2
21 | - Improvement: ~94%
22 |
23 | - Request Latency:
24 | - FP8: 12.26 sec at rate 1
25 | - FP16: 21.87 sec at rate 1
26 | - Improvement: ~44%
27 |
28 | **Results:**
29 | - FP8 consistently outperforms FP16 across all metrics at the same concurrency level.
30 | - FP8 shows the most significant improvement in Request Latency.
31 | - Even at higher concurrency levels, FP8 generally maintains better performance (though direct comparisons at other levels should be made carefully).
32 |
33 |
34 | ## FP8 Inference
35 |
36 | 1. run vLLM with FP8
37 |
38 | ```bash
39 | docker run --gpus all \
40 | -p 8080:8000 \
41 | --ipc=host \
42 | vllm/vllm-openai:latest \
43 | --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic --max_model_len 8192
44 | ```
45 | Note: Loading model weights took 8.4939 GB
46 |
47 |
48 | 2. benchmark with `guidellm`
49 |
50 | ```bash
51 | guidellm \
52 | --target "http://localhost:8080/v1" \
53 | --model "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic" \
54 | --data-type emulated \
55 | --data "prompt_tokens=550,generated_tokens=250" \
56 | --rate-type constant --rate 1 --rate 2 --rate 4 --rate 8 --rate 16 --rate 64 --rate 128 \
57 | --max-seconds 90
58 | ```
59 |
60 | ```bash
61 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓
62 | ┃ Benchmark ┃ Requests per Second ┃ Request Latency ┃ Time to First Token ┃ Inter Token Latency ┃ Output Token Throughput ┃
63 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩
64 | │ asynchronous@1.00 req/sec │ 0.88 req/sec │ 12.26 sec │ 240.18 ms │ 54.61 ms │ 194.47 tokens/sec │
65 | │ asynchronous@2.00 req/sec │ 1.32 req/sec │ 16.43 sec │ 267.88 ms │ 72.65 ms │ 294.87 tokens/sec │
66 | │ asynchronous@4.00 req/sec │ 1.53 req/sec │ 47.30 sec │ 19242.07 ms │ 127.31 ms │ 338.21 tokens/sec │
67 | │ asynchronous@8.00 req/sec │ 2.54 req/sec │ 31.57 sec │ 3144.09 ms │ 124.14 ms │ 582.76 tokens/sec │
68 | │ asynchronous@16.00 req/sec │ 2.26 req/sec │ 58.66 sec │ 29508.54 ms │ 127.98 ms │ 516.97 tokens/sec │
69 | │ asynchronous@64.00 req/sec │ 2.49 req/sec │ 39.48 sec │ 9327.19 ms │ 127.77 ms │ 587.68 tokens/sec │
70 | │ asynchronous@128.00 req/sec │ 2.54 req/sec │ 37.21 sec │ 10749.84 ms │ 118.26 ms │ 569.52 tokens/sec │
71 | └─────────────────────────────┴─────────────────────┴─────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┘
72 | ```
73 |
74 |
75 | ## FP16 Inference
76 |
77 | 1. run vLLM with FP16
78 |
79 | ```bash
80 | docker run --gpus all \
81 | -p 8080:8000 \
82 | --ipc=host \
83 | --env "HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token)" \
84 | vllm/vllm-openai:latest \
85 | --model meta-llama/Meta-Llama-3.1-8B-Instruct --max_model_len 8192
86 | ```
87 |
88 | Note: Loading model weights took 14.99 GB
89 |
90 | 2. benchmark with `guidellm`
91 |
92 | ```bash
93 | guidellm \
94 | --target "http://localhost:8080/v1" \
95 | --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
96 | --data-type emulated \
97 | --data "prompt_tokens=550,generated_tokens=250" \
98 | --rate-type constant --rate 1 --rate 2 --rate 4 --rate 8 --rate 16 --rate 64 --rate 128 \
99 | --max-seconds 90
100 | ```
101 |
102 | ```bash
103 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓
104 | ┃ Benchmark ┃ Requests per Second ┃ Request Latency ┃ Time to First Token ┃ Inter Token Latency ┃ Output Token Throughput ┃
105 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩
106 | │ asynchronous@1.00 req/sec │ 0.76 req/sec │ 21.87 sec │ 314.05 ms │ 94.95 ms │ 172.09 tokens/sec │
107 | │ asynchronous@2.00 req/sec │ 1.37 req/sec │ 23.48 sec │ 820.36 ms │ 102.23 ms │ 302.94 tokens/sec │
108 | │ asynchronous@4.00 req/sec │ 1.02 req/sec │ 45.64 sec │ 19181.45 ms │ 118.46 ms │ 228.36 tokens/sec │
109 | │ asynchronous@8.00 req/sec │ 0.94 req/sec │ 49.13 sec │ 23194.74 ms │ 115.74 ms │ 211.55 tokens/sec │
110 | │ asynchronous@64.00 req/sec │ 0.89 req/sec │ 56.25 sec │ 30167.99 ms │ 115.69 ms │ 199.90 tokens/sec │
111 | │ asynchronous@16.00 req/sec │ 1.25 req/sec │ 56.19 sec │ 31740.33 ms │ 106.55 ms │ 285.55 tokens/sec │
112 | │ asynchronous@128.00 req/sec │ 1.00 req/sec │ 53.18 sec │ 27422.15 ms │ 113.62 ms │ 225.60 tokens/sec │
113 | └─────────────────────────────┴─────────────────────┴─────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┘
114 | ```
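115 | 
116 | The relative numbers in the summary at the top follow directly from these tables; a short Python snippet (values copied from the tables above) reproduces them:
117 | 
118 | ```python
119 | # Values copied from the FP8 and FP16 benchmark tables above.
120 | fp8 = {"memory_gb": 8.49, "req_per_sec": 2.54, "tokens_per_sec": 587.68, "latency_s": 12.26}
121 | fp16 = {"memory_gb": 14.99, "req_per_sec": 1.37, "tokens_per_sec": 302.94, "latency_s": 21.87}
122 | 
123 | print(f"Memory savings:         {1 - fp8['memory_gb'] / fp16['memory_gb']:.0%}")           # ~43%
124 | print(f"Req/sec improvement:    {fp8['req_per_sec'] / fp16['req_per_sec'] - 1:.0%}")       # ~85%
125 | print(f"Throughput improvement: {fp8['tokens_per_sec'] / fp16['tokens_per_sec'] - 1:.0%}") # ~94%
126 | print(f"Latency reduction:      {1 - fp8['latency_s'] / fp16['latency_s']:.0%}")           # ~44%
127 | ```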
--------------------------------------------------------------------------------
/inference/idefics.md:
--------------------------------------------------------------------------------
1 | # Idefics inference
2 |
3 | ```bash
4 | model=HuggingFaceM4/idefics-9b-instruct
5 | num_shard=1
6 | max_input_length=1562
7 | max_total_tokens=2048
8 |
9 | sudo docker run --gpus all -ti -p 8080:80 \
10 | -e MODEL_ID=$model \
11 | -e NUM_SHARD=$num_shard \
12 | -e MAX_INPUT_LENGTH=$max_input_length \
13 | -e MAX_TOTAL_TOKENS=$max_total_tokens \
14 | ghcr.io/huggingface/text-generation-inference:1.1.0
15 | ```
16 |
17 | send test request
18 |
19 | ```bash
20 | curl 127.0.0.1:8080/generate \
21 | -X POST \
22 | -d '{"inputs":"User:Can i charge my iphone with this cable?\n","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \
23 | -H 'Content-Type: application/json'
24 | ```
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/inference/llama-7b.md:
--------------------------------------------------------------------------------
1 | # Llama 2 inference
2 |
3 | ```bash
4 | model=meta-llama/Llama-2-7b-chat-hf
5 | token=hf_xxx # replace with your token, which has access to the repo
6 | num_shard=1
7 | max_input_length=1562
8 | max_total_tokens=2048
9 |
10 | docker run --gpus all -ti -p 8080:80 \
11 | -e MODEL_ID=$model \
12 | -e HUGGING_FACE_HUB_TOKEN=$token \
13 | -e NUM_SHARD=$num_shard \
14 | -e MAX_INPUT_LENGTH=$max_input_length \
15 | -e MAX_TOTAL_TOKENS=$max_total_tokens \
16 | ghcr.io/huggingface/text-generation-inference:latest
17 | ```
18 |
19 | send test request
20 |
21 | ```bash
22 | curl 127.0.0.1:8080/generate \
23 | -X POST \
24 |   -d '{"inputs":"[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant.\n<</SYS>>\n\nWhat is 10+10? [\/INST]","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \
25 | -H 'Content-Type: application/json'
26 | ```
--------------------------------------------------------------------------------
/inference/speculative.md:
--------------------------------------------------------------------------------
1 | # Speculative Decoding
2 |
3 | ## MLP Speculator
4 | ```bash
5 | sudo docker run --gpus all -ti --shm-size 1g --ipc=host --rm -p 8080:80 \
6 | -e MODEL_ID=ibm-fms/llama3-8b-accelerator \
7 | -e NUM_SHARD=4 \
8 | -e MAX_INPUT_TOKENS=1562 \
9 | -e MAX_TOTAL_TOKENS=2048 \
10 | -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \
11 | ghcr.io/huggingface/text-generation-inference:sha-b70ae09
12 | ```
13 |
14 | send test request
15 |
16 | ```bash
17 | curl localhost:8080/v1/chat/completions \
18 | -X POST \
19 | -d '{
20 | "model": "tgi",
21 | "messages": [
22 | {
23 | "role": "system",
24 | "content": "You are a helpful assistant."
25 | },
26 | {
27 | "role": "user",
28 | "content": "What is deep learning?"
29 | }
30 | ],
31 | "stream": false,
32 | "max_tokens": 250
33 | }' \
34 | -H 'Content-Type: application/json'
35 | ```
36 |
37 | ## Medusa Speculator
38 | ```bash
39 | sudo docker run --gpus all -ti --shm-size 1g --ipc=host --rm -p 8080:80 \
40 | -e MODEL_ID=text-generation-inference/Mistral-7B-Instruct-v0.2-medusa \
41 | -e NUM_SHARD=1 \
42 | -e MAX_INPUT_TOKENS=1562 \
43 | -e MAX_TOTAL_TOKENS=2048 \
44 | -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \
45 | ghcr.io/huggingface/text-generation-inference:sha-b70ae09
46 | ```
47 |
48 | send test request
49 |
50 | ```bash
51 | curl localhost:8080/v1/chat/completions \
52 | -X POST \
53 | -d '{
54 | "model": "tgi",
55 | "messages": [
56 | {
57 | "role": "user",
58 | "content": "Write a poem for my three year old"
59 | }
60 | ],
61 | "stream": false,
62 | "max_tokens": 250
63 | }' \
64 | -H 'Content-Type: application/json'
65 | ```
66 |
67 | 
68 | An example TGI log line from the Medusa request above, showing the measured per-token latency:
69 | 
70 | `chat_completions{total_time="2.360607542s" validation_time="256.541µs" queue_time="37.931µs" inference_time="2.36031324s" time_per_token="12.166563ms" seed="Some(5272915472497899851)"}`
71 | ## EAGLE Speculator
72 |
73 | ```bash
74 | huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct --local-dir Meta-Llama-3-8B-Instruct
75 | huggingface-cli download yuhuili/EAGLE-LLaMA3-Instruct-8B --local-dir EAGLE-LLaMA3-Instruct-8B
76 | ```
77 | 
78 | ```python
79 | import json
80 |
81 | import torch
82 | from safetensors.torch import load_file, save_file
83 |
84 | ckpt = torch.load("EAGLE-LLaMA3-Instruct-8B/pytorch_model.bin")
85 | ref_ckpt = load_file("Meta-Llama-3-8B-Instruct/model-00004-of-00004.safetensors")
86 |
87 | ckpt['lm_head.weight'] = ref_ckpt['lm_head.weight']
88 |
89 | save_file(ckpt, "EAGLE-LLaMA3-Instruct-8B/model.safetensors")
90 |
91 | with open("EAGLE-LLaMA3-Instruct-8B/config.json") as rf:
92 | cfg = json.load(rf)
93 |
94 | cfg = {"model_type": "eagle", "model": cfg}
95 |
96 | with open("EAGLE-LLaMA3-Instruct-8B/config.json", "w") as wf:
97 | json.dump(cfg, wf)
98 |
99 | # delete EAGLE-LLaMA3-Instruct-8B/pytorch_model.bin
100 | ```
--------------------------------------------------------------------------------
/inference/starcoder_gptq.md:
--------------------------------------------------------------------------------
1 |
2 | ### StarCoder
3 |
4 |
5 | Run `TheBloke/starcoder-GPTQ` with TGI locally.
6 |
7 | ```bash
8 | # Model config
9 | model=TheBloke/starcoder-GPTQ
10 | num_shard=1
11 | quantize=gptq
12 | max_input_length=1562
13 | max_total_tokens=4096 # 4096
14 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
15 |
16 | docker run --gpus all -ti -p 8080:80 \
17 | -e MODEL_ID=$model \
18 | -e QUANTIZE=$quantize \
19 | -e NUM_SHARD=$num_shard \
20 | -e MAX_INPUT_LENGTH=$max_input_length \
21 | -e MAX_TOTAL_TOKENS=$max_total_tokens \
22 | -v $volume:/data ghcr.io/huggingface/text-generation-inference:sha-e605c2a
23 | ```
24 |
25 | send test request
26 |
27 | ```bash
28 | curl http://127.0.0.1:8080/generate \
29 | -X POST \
30 | -d '{"inputs":"\n def test():\n x=1+1\n assert x ","parameters":{"max_new_tokens":60,"stop":["<|endoftext|>", "\n\n"],"top_p":0.95}}' \
31 | -H 'Content-Type: application/json'
32 | ```
33 |
34 |
35 | load test with `k6`
36 |
37 | ```bash
38 | k6 run starcoder_load.js
39 | ```
40 |
41 | or with docker
42 | ```bash
43 | docker run --net=host -v $(pwd)/starcoder_load.js:/load.js loadimpact/k6:latest run /load.js
44 | ```
45 |
46 |
47 | ### Inference Results
48 |
49 | We used `k6` with the `constant-vus` executor, in which a fixed number of VUs execute as many iterations as possible for a specified amount of time.
50 |
51 |
52 | | VU | GPU | time per token (p95) | queue time (p95) |
53 | | --- | ---- | -------------------- | ---------------- |
54 | | 1 | A10G | 30ms | 1ms |
55 | | 5 | A10G | 65ms | 105ms |
56 | | 10 | A10G | 104ms | 120ms |
57 | | 20 | A10G | 203ms | 5110ms |
58 | | 1 | A100 | 30ms | 1ms |
59 | | 5 | A100 | 59ms | 64ms |
60 | | 10 | A100 | 50ms | 51ms |
61 | | 20 | A100 | 59ms | 49ms |
62 | | 40 | A100 | 73ms | 1000ms |
63 | | 60 | A100 | 59ms | 113ms |
64 | | 80 | A100 | 92ms | 165ms |
65 | | 100 | A100 | 72ms | 1111ms |
66 | | 120 | A100 | 77ms | 1270ms |
67 | | 140 | A100 | _requests start failing_ | _requests start failing_ |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/inference/starcoder_load.js:
--------------------------------------------------------------------------------
1 | import { check } from 'k6';
2 | import http from 'k6/http';
3 | import { Trend, Counter } from 'k6/metrics';
4 |
5 | // Define configurations
6 | const host = __ENV.HOST || 'http://127.0.0.1:8080';
7 |
8 | // Define the metrics
9 | const totalTime = new Trend('total_time', true);
10 | const validationTime = new Trend('validation_time', true);
11 | const queueTime = new Trend('queue_time', true);
12 | const inferenceTime = new Trend('inference_time', true);
13 | const timePerToken = new Trend('time_per_token', true);
14 | const generatedTokens = new Counter('generated_tokens');
15 |
16 | export const options = {
17 | thresholds: {
18 | http_req_failed: ['rate==0'],
19 | },
20 | scenarios: {
21 | load_test: {
22 | executor: 'constant-vus',
23 | duration: '60s',
24 | vus: 140,
25 | },
26 | },
27 | };
28 |
29 | export default function () {
30 | // Create Body
31 | const payload = {
32 | inputs: "\n def test():\n x=1+1\n assert x ",
33 | parameters: {
34 | max_new_tokens: 60,
35 | details: true
36 | },
37 | };
38 |
39 | const headers = { 'Content-Type': 'application/json' };
40 |   const res = http.post(`${host}/generate`, JSON.stringify(payload), {
41 | headers
42 | });
43 |
44 | check(res, {
45 | 'Post status is 200': (r) => res.status === 200,
46 | });
47 |
48 | if (res.status === 200) {
49 | totalTime.add(res.headers["X-Total-Time"]);
50 | validationTime.add(res.headers["X-Validation-Time"]);
51 | queueTime.add(res.headers["X-Queue-Time"]);
52 | inferenceTime.add(res.headers["X-Inference-Time"]);
53 | timePerToken.add(res.headers["X-Time-Per-Token"]);
54 | generatedTokens.add(res.json().details.generated_tokens);
55 | }
56 | }
--------------------------------------------------------------------------------
/inference/vllm-function-calling.md:
--------------------------------------------------------------------------------
1 | # vLLM Function Calling Inference
2 |
3 | This guide demonstrates how to run vLLM with function calling capabilities using Llama models.
4 |
5 | ```bash
6 | docker run --gpus all \
7 | --env "HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token)" \
8 | -p 8000:8000 \
9 | --shm-size=10G \
10 | --ipc=host \
11 | vllm/vllm-openai:latest \
12 | --model meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size 8 --max_model_len 4096 --enable-auto-tool-choice --tool-call-parser llama3_json
13 | ```
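14 | 
15 | Once the server is up, tool calling can be exercised through the OpenAI-compatible API. Below is a minimal client sketch; the `get_weather` tool and its schema are made up purely for illustration, and the endpoint and model name assume the `docker run` command above:
16 | 
17 | ```python
18 | from openai import OpenAI
19 | 
20 | # The vLLM container above serves an OpenAI-compatible API on port 8000.
21 | client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
22 | 
23 | # Hypothetical tool definition, only to demonstrate the request format.
24 | tools = [
25 |     {
26 |         "type": "function",
27 |         "function": {
28 |             "name": "get_weather",
29 |             "description": "Get the current weather for a city",
30 |             "parameters": {
31 |                 "type": "object",
32 |                 "properties": {"city": {"type": "string"}},
33 |                 "required": ["city"],
34 |             },
35 |         },
36 |     }
37 | ]
38 | 
39 | response = client.chat.completions.create(
40 |     model="meta-llama/Llama-3.3-70B-Instruct",
41 |     messages=[{"role": "user", "content": "What is the weather like in Berlin today?"}],
42 |     tools=tools,
43 |     tool_choice="auto",
44 | )
45 | 
46 | # With --enable-auto-tool-choice the model can decide to emit a tool call.
47 | print(response.choices[0].message.tool_calls)
48 | ```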
--------------------------------------------------------------------------------
/training/configs/accelerate_configs/deepspeed_zero1.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 | deepspeed_multinode_launcher: standard
5 | offload_optimizer_device: none
6 | offload_param_device: none
7 | zero_stage: 1
8 | distributed_type: DEEPSPEED
9 | downcast_bf16: 'no'
10 | machine_rank: 0
11 | main_training_function: main
12 | mixed_precision: bf16
13 | num_machines: 1
14 | num_processes: 8
15 | rdzv_backend: static
16 | same_network: true
17 | tpu_env: []
18 | tpu_use_cluster: false
19 | tpu_use_sudo: false
20 | use_cpu: false
21 |
--------------------------------------------------------------------------------
/training/configs/accelerate_configs/deepspeed_zero3.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | deepspeed_config:
4 | deepspeed_multinode_launcher: standard
5 | offload_optimizer_device: none
6 | offload_param_device: none
7 | zero3_init_flag: true
8 | zero3_save_16bit_model: true
9 | zero_stage: 3
10 | distributed_type: DEEPSPEED
11 | downcast_bf16: 'no'
12 | machine_rank: 0
13 | main_training_function: main
14 | mixed_precision: bf16
15 | num_machines: 1
16 | num_processes: 8
17 | rdzv_backend: static
18 | same_network: true
19 | tpu_env: []
20 | tpu_use_cluster: false
21 | tpu_use_sudo: false
22 | use_cpu: false
23 |
--------------------------------------------------------------------------------
/training/configs/accelerate_configs/fsdp.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: FSDP
4 | downcast_bf16: 'no'
5 | enable_cpu_affinity: false
6 | fsdp_config:
7 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
8 | fsdp_backward_prefetch: BACKWARD_PRE
9 | fsdp_cpu_ram_efficient_loading: true
10 | fsdp_forward_prefetch: true
11 | fsdp_offload_params: false
12 | fsdp_sharding_strategy: FULL_SHARD
13 | fsdp_state_dict_type: SHARDED_STATE_DICT
14 | fsdp_sync_module_states: true
15 | fsdp_use_orig_params: true
16 | machine_rank: 0
17 | main_training_function: main
18 | mixed_precision: bf16
19 | num_machines: 1
20 | num_processes: 8
21 | rdzv_backend: static
22 | same_network: true
23 | tpu_env: []
24 | tpu_use_cluster: false
25 | tpu_use_sudo: false
26 | use_cpu: false
27 |
--------------------------------------------------------------------------------
/training/configs/accelerate_configs/fsdp_qlora.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: FSDP
4 | downcast_bf16: 'no'
5 | fsdp_config:
6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
7 | fsdp_backward_prefetch: BACKWARD_PRE
8 | fsdp_cpu_ram_efficient_loading: true
9 | fsdp_forward_prefetch: false
10 | fsdp_offload_params: true
11 | fsdp_sharding_strategy: FULL_SHARD
12 | fsdp_state_dict_type: SHARDED_STATE_DICT
13 | fsdp_sync_module_states: true
14 | fsdp_use_orig_params: false
15 | machine_rank: 0
16 | main_training_function: main
17 | mixed_precision: 'no'
18 | num_machines: 1
19 | num_processes: 2
20 | rdzv_backend: static
21 | same_network: true
22 | tpu_env: []
23 | tpu_use_cluster: false
24 | tpu_use_sudo: false
25 | use_cpu: false
--------------------------------------------------------------------------------
/training/configs/ds_falcon_180b_z3.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupLR",
16 | "params": {
17 | "warmup_min_lr": "auto",
18 | "warmup_max_lr": "auto",
19 | "warmup_num_steps": "auto"
20 | }
21 | },
22 | "zero_optimization": {
23 | "stage": 3,
24 | "overlap_comm": true,
25 | "contiguous_gradients": true,
26 | "sub_group_size": 1e9,
27 | "reduce_bucket_size": "auto",
28 | "stage3_prefetch_bucket_size": "auto",
29 | "stage3_param_persistence_threshold": "auto",
30 | "stage3_max_live_parameters": 1e9,
31 | "stage3_max_reuse_distance": 1e9,
32 | "stage3_gather_16bit_weights_on_model_save": true
33 | },
34 | "gradient_accumulation_steps": "auto",
35 | "gradient_clipping": "auto",
36 | "steps_per_print": 2000,
37 | "train_batch_size": "auto",
38 | "train_micro_batch_size_per_gpu": "auto",
39 | "wall_clock_breakdown": false
40 | }
--------------------------------------------------------------------------------
/training/configs/ds_falcon_180b_z3_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupLR",
16 | "params": {
17 | "warmup_min_lr": "auto",
18 | "warmup_max_lr": "auto",
19 | "warmup_num_steps": "auto"
20 | }
21 | },
22 | "zero_optimization": {
23 | "stage": 3,
24 | "offload_optimizer": {
25 | "device": "cpu",
26 | "pin_memory": true
27 | },
28 | "offload_param": {
29 | "device": "cpu",
30 | "pin_memory": true
31 | },
32 | "overlap_comm": true,
33 | "contiguous_gradients": true,
34 | "sub_group_size": 1e9,
35 | "reduce_bucket_size": "auto",
36 | "stage3_prefetch_bucket_size": "auto",
37 | "stage3_param_persistence_threshold": "auto",
38 | "stage3_max_live_parameters": 1e9,
39 | "stage3_max_reuse_distance": 1e9,
40 | "stage3_gather_16bit_weights_on_model_save": true
41 | },
42 | "gradient_accumulation_steps": "auto",
43 | "gradient_clipping": "auto",
44 | "steps_per_print": 2000,
45 | "train_batch_size": "auto",
46 | "train_micro_batch_size_per_gpu": "auto",
47 | "wall_clock_breakdown": false
48 | }
--------------------------------------------------------------------------------
/training/configs/ds_flan_t5_z3_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "betas": "auto",
15 | "eps": "auto",
16 | "weight_decay": "auto"
17 | }
18 | },
19 | "scheduler": {
20 | "type": "WarmupLR",
21 | "params": {
22 | "warmup_min_lr": "auto",
23 | "warmup_max_lr": "auto",
24 | "warmup_num_steps": "auto"
25 | }
26 | },
27 | "zero_optimization": {
28 | "stage": 3,
29 | "overlap_comm": true,
30 | "contiguous_gradients": true,
31 | "sub_group_size": 1e9,
32 | "reduce_bucket_size": "auto",
33 | "stage3_prefetch_bucket_size": "auto",
34 | "stage3_param_persistence_threshold": "auto",
35 | "stage3_max_live_parameters": 1e9,
36 | "stage3_max_reuse_distance": 1e9,
37 | "stage3_gather_16bit_weights_on_model_save": true
38 | },
39 | "gradient_accumulation_steps": "auto",
40 | "gradient_clipping": "auto",
41 | "steps_per_print": 2000,
42 | "train_batch_size": "auto",
43 | "train_micro_batch_size_per_gpu": "auto",
44 | "wall_clock_breakdown": false
45 | }
--------------------------------------------------------------------------------
/training/configs/ds_flan_t5_z3_config_bf16.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupLR",
16 | "params": {
17 | "warmup_min_lr": "auto",
18 | "warmup_max_lr": "auto",
19 | "warmup_num_steps": "auto"
20 | }
21 | },
22 | "zero_optimization": {
23 | "stage": 3,
24 | "overlap_comm": true,
25 | "contiguous_gradients": true,
26 | "sub_group_size": 1e9,
27 | "reduce_bucket_size": "auto",
28 | "stage3_prefetch_bucket_size": "auto",
29 | "stage3_param_persistence_threshold": "auto",
30 | "stage3_max_live_parameters": 1e9,
31 | "stage3_max_reuse_distance": 1e9,
32 | "stage3_gather_16bit_weights_on_model_save": true
33 | },
34 | "gradient_accumulation_steps": "auto",
35 | "gradient_clipping": "auto",
36 | "steps_per_print": 2000,
37 | "train_batch_size": "auto",
38 | "train_micro_batch_size_per_gpu": "auto",
39 | "wall_clock_breakdown": false
40 | }
--------------------------------------------------------------------------------
/training/configs/ds_flan_t5_z3_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "optimizer": {
11 | "type": "AdamW",
12 | "params": {
13 | "lr": "auto",
14 | "betas": "auto",
15 | "eps": "auto",
16 | "weight_decay": "auto"
17 | }
18 | },
19 | "scheduler": {
20 | "type": "WarmupLR",
21 | "params": {
22 | "warmup_min_lr": "auto",
23 | "warmup_max_lr": "auto",
24 | "warmup_num_steps": "auto"
25 | }
26 | },
27 | "zero_optimization": {
28 | "stage": 3,
29 | "offload_optimizer": {
30 | "device": "cpu",
31 | "pin_memory": true
32 | },
33 | "offload_param": {
34 | "device": "cpu",
35 | "pin_memory": true
36 | },
37 | "overlap_comm": true,
38 | "contiguous_gradients": true,
39 | "sub_group_size": 1e9,
40 | "reduce_bucket_size": "auto",
41 | "stage3_prefetch_bucket_size": "auto",
42 | "stage3_param_persistence_threshold": "auto",
43 | "stage3_max_live_parameters": 1e9,
44 | "stage3_max_reuse_distance": 1e9,
45 | "stage3_gather_16bit_weights_on_model_save": true
46 | },
47 | "gradient_accumulation_steps": "auto",
48 | "gradient_clipping": "auto",
49 | "steps_per_print": 2000,
50 | "train_batch_size": "auto",
51 | "train_micro_batch_size_per_gpu": "auto",
52 | "wall_clock_breakdown": false
53 | }
--------------------------------------------------------------------------------
/training/configs/ds_flan_t5_z3_offload_bf16.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "optimizer": {
6 | "type": "AdamW",
7 | "params": {
8 | "lr": "auto",
9 | "betas": "auto",
10 | "eps": "auto",
11 | "weight_decay": "auto"
12 | }
13 | },
14 | "scheduler": {
15 | "type": "WarmupLR",
16 | "params": {
17 | "warmup_min_lr": "auto",
18 | "warmup_max_lr": "auto",
19 | "warmup_num_steps": "auto"
20 | }
21 | },
22 | "zero_optimization": {
23 | "stage": 3,
24 | "offload_optimizer": {
25 | "device": "cpu",
26 | "pin_memory": true
27 | },
28 | "offload_param": {
29 | "device": "cpu",
30 | "pin_memory": true
31 | },
32 | "overlap_comm": true,
33 | "contiguous_gradients": true,
34 | "sub_group_size": 1e9,
35 | "reduce_bucket_size": "auto",
36 | "stage3_prefetch_bucket_size": "auto",
37 | "stage3_param_persistence_threshold": "auto",
38 | "stage3_max_live_parameters": 1e9,
39 | "stage3_max_reuse_distance": 1e9,
40 | "stage3_gather_16bit_weights_on_model_save": true
41 | },
42 | "gradient_accumulation_steps": "auto",
43 | "gradient_clipping": "auto",
44 | "steps_per_print": 2000,
45 | "train_batch_size": "auto",
46 | "train_micro_batch_size_per_gpu": "auto",
47 | "wall_clock_breakdown": false
48 | }
--------------------------------------------------------------------------------
/training/configs/spectrum/snr_results_meta-llama-Meta-Llama-3.1-8B_unfrozenparameters_30percent.yaml:
--------------------------------------------------------------------------------
1 | unfrozen_parameters:
2 | - ^lm_head.weight$
3 | - ^model.embed_tokens.weight$
4 | # input_layernorm layers
5 | - model.layers.0.input_layernorm
6 | - model.layers.1.input_layernorm
7 | - model.layers.2.input_layernorm
8 | - model.layers.3.input_layernorm
9 | - model.layers.4.input_layernorm
10 | - model.layers.5.input_layernorm
11 | - model.layers.6.input_layernorm
12 | - model.layers.7.input_layernorm
13 | - model.layers.8.input_layernorm
14 | # lm_head layers
15 | # mlp.down_proj layers
16 | - model.layers.1.mlp.down_proj
17 | - model.layers.0.mlp.down_proj
18 | - model.layers.30.mlp.down_proj
19 | - model.layers.2.mlp.down_proj
20 | - model.layers.21.mlp.down_proj
21 | - model.layers.22.mlp.down_proj
22 | - model.layers.29.mlp.down_proj
23 | - model.layers.5.mlp.down_proj
24 | - model.layers.4.mlp.down_proj
25 | # mlp.gate_proj layers
26 | - model.layers.1.mlp.gate_proj
27 | - model.layers.2.mlp.gate_proj
28 | - model.layers.3.mlp.gate_proj
29 | - model.layers.4.mlp.gate_proj
30 | - model.layers.0.mlp.gate_proj
31 | - model.layers.25.mlp.gate_proj
32 | - model.layers.26.mlp.gate_proj
33 | - model.layers.5.mlp.gate_proj
34 | - model.layers.24.mlp.gate_proj
35 | # mlp.up_proj layers
36 | - model.layers.4.mlp.up_proj
37 | - model.layers.3.mlp.up_proj
38 | - model.layers.0.mlp.up_proj
39 | - model.layers.5.mlp.up_proj
40 | - model.layers.7.mlp.up_proj
41 | - model.layers.6.mlp.up_proj
42 | - model.layers.2.mlp.up_proj
43 | - model.layers.1.mlp.up_proj
44 | - model.layers.8.mlp.up_proj
45 | # model.embed_tokens layers
46 | # model.norm layers
47 | # post_attention_layernorm layers
48 | - model.layers.0.post_attention_layernorm
49 | - model.layers.1.post_attention_layernorm
50 | - model.layers.2.post_attention_layernorm
51 | - model.layers.3.post_attention_layernorm
52 | - model.layers.4.post_attention_layernorm
53 | - model.layers.5.post_attention_layernorm
54 | - model.layers.6.post_attention_layernorm
55 | - model.layers.7.post_attention_layernorm
56 | - model.layers.8.post_attention_layernorm
57 | # self_attn.k_proj layers
58 | - model.layers.29.self_attn.k_proj
59 | - model.layers.25.self_attn.k_proj
60 | - model.layers.23.self_attn.k_proj
61 | - model.layers.28.self_attn.k_proj
62 | - model.layers.21.self_attn.k_proj
63 | - model.layers.19.self_attn.k_proj
64 | - model.layers.22.self_attn.k_proj
65 | - model.layers.20.self_attn.k_proj
66 | - model.layers.24.self_attn.k_proj
67 | # self_attn.o_proj layers
68 | - model.layers.14.self_attn.o_proj
69 | - model.layers.7.self_attn.o_proj
70 | - model.layers.5.self_attn.o_proj
71 | - model.layers.11.self_attn.o_proj
72 | - model.layers.6.self_attn.o_proj
73 | - model.layers.24.self_attn.o_proj
74 | - model.layers.9.self_attn.o_proj
75 | - model.layers.13.self_attn.o_proj
76 | - model.layers.10.self_attn.o_proj
77 | # self_attn.q_proj layers
78 | - model.layers.8.self_attn.q_proj
79 | - model.layers.13.self_attn.q_proj
80 | - model.layers.9.self_attn.q_proj
81 | - model.layers.14.self_attn.q_proj
82 | - model.layers.10.self_attn.q_proj
83 | - model.layers.11.self_attn.q_proj
84 | - model.layers.0.self_attn.q_proj
85 | - model.layers.15.self_attn.q_proj
86 | - model.layers.1.self_attn.q_proj
87 | # self_attn.v_proj layers
88 | - model.layers.26.self_attn.v_proj
89 | - model.layers.17.self_attn.v_proj
90 | - model.layers.3.self_attn.v_proj
91 | - model.layers.28.self_attn.v_proj
92 | - model.layers.29.self_attn.v_proj
93 | - model.layers.21.self_attn.v_proj
94 | - model.layers.15.self_attn.v_proj
95 | - model.layers.16.self_attn.v_proj
96 | - model.layers.20.self_attn.v_proj
97 |
--------------------------------------------------------------------------------
/training/deepseed-falcon-180b-lora-fa.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Fine-tune Falcon 180B with DeepSpeed ZeRO, LoRA & Flash Attention\n",
9 | "\n",
10 | "Falcon 180B is the newest member of the Falcon LLM family. It is the biggest open-source model, with 180B parameters, trained on 3.5T tokens and supporting a context window of up to 4K tokens. In this example we will show how to fine-tune Falcon 180B using DeepSpeed, Hugging Face Transformers, and LoRA with Flash Attention on a multi-GPU machine.\n",
11 | "\n",
12 | "In detail you will learn how to:\n",
13 | "1. Setup Development Environment\n",
14 | "2. Load and prepare the dataset\n",
15 | "3. Fine-Tune Falcon 180B using DeepSpeed, Hugging Face Transformers, LoRA with Flash Attention\n",
16 | "\n",
17 | "Before we get into the code, let's take a quick look at the technologies and methods we are going to use:\n",
18 | "\n",
19 | "### What is DeepSpeed ZeRO?\n",
20 | "\n",
21 | "DeepSpeed ZeRO focuses on efficient large-scale training of Transformers. ZeRO, or Zero Redundancy Optimizer, reduces memory footprint by partitioning model states across devices instead of basic data parallelism. This saves significant memory - ZeRO-Infinity can reduce usage 100x vs data parallelism. ZeRO-Offload further reduces memory by offloading parts of model and optimizer to CPU, enabling 10B+ parameter models on 1 GPU. ZeRO [integrates with HuggingFace Transformers through a configuration file](https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/deepspeed).\n",
22 | "\n",
23 | "### What is LoRA?\n",
24 | "\n",
25 | "[LoRA](https://arxiv.org/abs/2106.09685) enables efficient fine-tuning of large language models. It decomposes weight matrices into smaller, trainable update matrices that adapt while keeping original weights frozen. This drastically reduces trainable parameters for faster, lower-memory tuning. LoRA integrates into [Transformers via Hugging Face's PEFT](https://huggingface.co/docs/peft/conceptual_guides/lora). It combines well with methods like DeepSpeed. Key advantages are efficient tuning, portable models, and no inference latency when merging trained weights. LoRA allows adaptively training massive models with limited resources.\n",
26 | "\n",
27 | "### What is Flash Attention?\n",
28 | "\n",
29 | "Flash Attention is an algorithm that speeds up the core attention mechanism in Transformer language models by restructuring computations. It uses techniques like tiling and recomputation to reduce the high memory costs of attention, enabling models to process longer text sequences. Flash Attention 2 optimizes parallelism and work partitioning for 2x speedup over the previous version, reaching 230 TFLOPS/s on A100 GPUs.\n",
30 | "\n",
31 | "\n",
32 | "### Access Falcon 180B \n",
33 | "\n",
34 | "Before we can start training we have to make sure that we accepted the license [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B) to be able to use it. You can accept the license by clicking on the Agree and access repository button on the model page at: \n",
35 | "* [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B)\n",
36 | "\n",
37 | "> The example was created and run on a DGX A100 8-GPU machine with 80GB GPU memory per GPU.\n",
38 | "\n",
39 | "## 1. Setup Development Environment\n",
40 | "\n",
41 | "```bash\nconda create --name hf python=3.10 -c conda-forge\n```\n"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "# install torch with the correct cuda version, check nvcc --version\n",
51 | "!pip install torch --extra-index-url https://download.pytorch.org/whl/cu118 --upgrade\n",
52 | "# install Hugging Face Libraries and additional dependencies\n",
53 | "!pip install \"transformers==4.34.0\" \"datasets==2.14.5\" \"accelerate==0.22.0\" \"evaluate==0.4.0\" \"peft==0.5.0\" tensorboard packaging --upgrade\n",
54 | "# install deepspeed and ninja for jit compilations of kernels\n",
55 | "!pip install \"deepspeed==0.10.3\" ninja --upgrade\n",
56 | "# install additional Flash Attention\n",
57 | "!pip install flash-attn --no-build-isolation --upgrade"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "To access any Falcon 180B assets we need to log in to our Hugging Face account. We can do this by running the following command:"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "!huggingface-cli login --token YOUR_TOKEN"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## 2. Load and prepare the dataset\n",
81 | "\n",
82 | "We will use [dolly](https://huggingface.co/datasets/databricks/databricks-dolly-15k), an open-source dataset of instruction-following records generated by thousands of Databricks employees in several of the behavioral categories outlined in the [InstructGPT paper](https://arxiv.org/abs/2203.02155), including brainstorming, classification, closed QA, generation, information extraction, open QA, and summarization.\n",
83 | "\n",
84 | "```python\n",
85 | "{\n",
86 | " \"instruction\": \"What is world of warcraft\",\n",
87 | " \"context\": \"\",\n",
88 | " \"response\": \"World of warcraft is a massive online multi player role playing game. It was released in 2004 by bizarre entertainment\"\n",
89 | "}\n",
90 | "```\n",
91 | "\n",
92 | "To load the `databricks/databricks-dolly-15k` dataset, we use the `load_dataset()` method from the 🤗 Datasets library."
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "from datasets import load_dataset\n",
102 | "from random import randrange\n",
103 | "\n",
104 | "# Load dataset from the hub\n",
105 | "dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")\n",
106 | "\n",
107 | "print(f\"dataset size: {len(dataset)}\")\n",
108 | "print(dataset[randrange(len(dataset))])\n",
109 | "# dataset size: 15011"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "To instruction-tune our model we need to convert our structured examples into a collection of tasks described via instructions. We define a formatting function (`format_dolly`) that takes a sample and returns a string with our format instructions."
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "def format_dolly(sample):\n",
126 | " instruction = f\"### Instruction\\n{sample['instruction']}\"\n",
127 | " context = f\"### Context\\n{sample['context']}\" if len(sample[\"context\"]) > 0 else None\n",
128 | " response = f\"### Answer\\n{sample['response']}\"\n",
129 | " # join all the parts together\n",
130 | " prompt = \"\\n\\n\".join([i for i in [instruction, context, response] if i is not None])\n",
131 | " return prompt\n"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "Let's test our formatting function on a random example."
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "from random import randrange\n",
148 | "\n",
149 | "print(format_dolly(dataset[randrange(len(dataset))]))"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "In addition to formatting our samples, we also want to pack multiple samples into one sequence for more efficient training."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "from transformers import AutoTokenizer\n",
166 | "\n",
167 | "model_id = \"tiiuae/falcon-180B\" \n",
168 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
169 | "tokenizer.pad_token = tokenizer.eos_token"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "We define some helper functions to pack our samples into sequences of a given length and then tokenize them."
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "from random import randint\n",
186 | "from itertools import chain\n",
187 | "from functools import partial\n",
188 | "\n",
189 | "\n",
190 | "# template dataset to add prompt to each sample\n",
191 | "def template_dataset(sample):\n",
192 | " sample[\"text\"] = f\"{format_dolly(sample)}{tokenizer.eos_token}\"\n",
193 | " return sample\n",
194 | "\n",
195 | "\n",
196 | "# apply prompt template per sample\n",
197 | "dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))\n",
198 | "# print random sample\n",
199 | "print(dataset[randint(0, len(dataset))][\"text\"])\n",
200 | "\n",
201 | "# empty list to save remainder from batches to use in next batch\n",
202 | "remainder = {\"input_ids\": [], \"attention_mask\": [], \"token_type_ids\": []}\n",
203 | "\n",
204 | "def chunk(sample, chunk_length=2048):\n",
205 | " # define global remainder variable to save remainder from batches to use in next batch\n",
206 | " global remainder\n",
207 | " # Concatenate all texts and add remainder from previous batch\n",
208 | " concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}\n",
209 | " concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}\n",
210 | " # get total number of tokens for batch\n",
211 | " batch_total_length = len(concatenated_examples[list(sample.keys())[0]])\n",
212 | "\n",
213 | " # get max number of chunks for batch\n",
214 | " if batch_total_length >= chunk_length:\n",
215 | " batch_chunk_length = (batch_total_length // chunk_length) * chunk_length\n",
216 | "\n",
217 | " # Split by chunks of max_len.\n",
218 | " result = {\n",
219 | " k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]\n",
220 | " for k, t in concatenated_examples.items()\n",
221 | " }\n",
222 | " # add remainder to global variable for next batch\n",
223 | " remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}\n",
224 | " # prepare labels\n",
225 | " result[\"labels\"] = result[\"input_ids\"].copy()\n",
226 | " return result\n",
227 | "\n",
228 | "\n",
229 | "# tokenize and chunk dataset\n",
230 | "lm_dataset = dataset.map(\n",
231 | " lambda sample: tokenizer(sample[\"text\"]), batched=True, remove_columns=list(dataset.features)\n",
232 | ").map(\n",
233 | " partial(chunk, chunk_length=2048),\n",
234 | " batched=True,\n",
235 | ")\n",
236 | "\n",
237 | "# Print total number of samples\n",
238 | "print(f\"Total number of samples: {len(lm_dataset)}\")"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "After we processed the datasets we want to save it to disk to be able to use the processed dataset later during training."
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "lm_dataset.save_to_disk(\"dolly-processed\")"
255 | ]
256 | },
257 | {
258 | "attachments": {},
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "## 3. Fine-Tune Falcon 180B using DeepSpeed, Hugging Face Transformers, LoRA with Flash Attention\n",
263 | "\n",
264 | "DeepSpeed ZeRO is natively integrated into the [Hugging Face Transformers Trainer](https://huggingface.co/docs/transformers/v4.33.1/en/main_classes/deepspeed). The integration enables leveraging ZeRO by simply providing a DeepSpeed config file, and the Trainer takes care of the rest. We created 2 deepspeed configurations for the experiments we ran, including `CPU offloading`: \n",
265 | "\n",
266 | "- [ds_falcon_180b_z3.json](./configs/ds_falcon_180b_z3.json)\n",
267 | "- [ds_falcon_180b_z3_offload.json](./configs/ds_falcon_180b_z3_offload.json)\n",
268 | "\n",
269 | "As mentioned in the beginning, we ran those example using a 8x NVIDIA A100 80GB. This means we can leverage `bf16`, which reduces the memory footprint of the model by almost ~2x, which allows us to train without offloading efficiently. We are going to use the [ds_falcon_180b_z3.json](./configs/ds_falcon_180b_z3.json). If you are irritated by the `auto` values, check the [documentation](https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/deepspeed#configuration).\n",
270 | "\n",
271 | "In addition to the deepspeed configuration we also need a training script, which implements LoRA and patches our model to use flash-attention. We created a [run_ds_lora.py](./run_ds_lora.py) script, which patches the falcon model using the [falcon_patch.py](./utils/falcon_patch.py) utils and implements LoRA using [peft_utils.py](./utils/peft_utils.py). \n",
272 | "\n",
273 | "> When you run make sure that you have the same folder structure and utils/configs available. The easiest way is to clone the whole repository. Go into the `training` directory and start the training.\n",
274 | "\n",
275 | "Once we made sure that we have the right configuration and training script we can start the training using `torchrun`."
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": null,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "!torchrun --nproc_per_node 8 run_ds_lora.py \\\n",
285 | " --model_id tiiuae/falcon-180B \\\n",
286 | " --dataset_path dolly-processed \\\n",
287 | " --output_dir falcon-180b-lora-fa \\\n",
288 | " --num_train_epochs 3 \\\n",
289 | " --per_device_train_batch_size 1 \\\n",
290 | " --learning_rate 4e-3 \\\n",
291 | " --gradient_checkpointing True \\\n",
292 | " --gradient_accumulation_steps 8 \\\n",
293 | " --bf16 True \\\n",
294 | " --tf32 True \\\n",
295 | " --use_flash_attn True \\\n",
296 | " --lr_scheduler_type \"constant_with_warmup\" \\\n",
297 | " --logging_steps 25 \\\n",
298 | " --save_steps 100 \\\n",
299 | " --save_total_limit 3 \\\n",
300 | " --deepspeed configs/ds_falcon_180b_z3.json"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "_Note: Since we are using LoRA we are only saving the \"trained\" adapter weights, to save some storage. If you want to merge the adapters back into the base model and save the merged model you can add `--merge_adapters True` or use the [merge_adapter_weights.py](./scripts/merge_adapter_weights.py) script._\n",
308 | "\n",
309 | "In our example for Falcon 180B, the training time was `153 minutes` or ~2 hours for 3 epochs. For comparison the pretraining cost of Falcon 180B was ~7,000,000 GPU hours, which is 3,500,000 time more than fine-tuning.\n",
310 | "\n",
311 | "## Conclusion \n",
312 | "\n",
313 | "In the blog post you learn how to fine-tune Falcon 180B model using DeepSpeed, Hugging Face Transformers, and LoRA with Flash Attention on a multi-GPU machine. We used: \n",
314 | "\n",
315 | "* DeepSpeed ZeRO for memory optimization, enabling training models with up to trillions of parameters on limited GPU memory. We used stage 3 (ZeRO-Infinity) to optimize memory usage.\n",
316 | "* Hugging Face Transformers and Datasets for easily loading and preparing the text dataset as well as providing an intuitive Trainer API.\n",
317 | "* LoRA, a method to efficiently fine-tune large language models by only updating a small percentage of parameters each iteration. This drastically reduces memory usage and computational costs.\n",
318 | "* Flash Attention - a highly optimized attention implementation that further reduces the memory footprint.\n",
319 | "\n",
320 | "Compining all of those methods allows us to fine-tune LLMs with over 100B+ parameter with limited resources. The example provides a template for efficiently tuning the largest publicly available models."
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": []
327 | }
328 | ],
329 | "metadata": {
330 | "kernelspec": {
331 | "display_name": "pytorch",
332 | "language": "python",
333 | "name": "python3"
334 | },
335 | "language_info": {
336 | "codemirror_mode": {
337 | "name": "ipython",
338 | "version": 3
339 | },
340 | "file_extension": ".py",
341 | "mimetype": "text/x-python",
342 | "name": "python",
343 | "nbconvert_exporter": "python",
344 | "pygments_lexer": "ipython3",
345 | "version": "3.10.12"
346 | },
347 | "orig_nbformat": 4,
348 | "vscode": {
349 | "interpreter": {
350 | "hash": "2d58e898dde0263bc564c6968b04150abacfd33eed9b19aaa8e45c040360e146"
351 | }
352 | }
353 | },
354 | "nbformat": 4,
355 | "nbformat_minor": 2
356 | }
357 |
--------------------------------------------------------------------------------
/training/fine-tune-modern-bert-in-2025.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Fine-tune classifier with ModernBERT in 2025\n",
9 | "\n",
10 | "Large Language Models (LLMs) have become ubiquitous in 2024. However, smaller, specialized models - particularly for classification tasks - remain critical for building efficient and cost-effective AI systems. One key use case is routing user prompts to the most appropriate LLM or selecting optimal few-shot examples, where fast, accurate classification is essential.\n",
11 | "\n",
12 | "This blog post demonstrates how to fine-tune ModernBERT, a new state-of-the-art encoder model, for classifying user prompts to implement an intelligent LLM router. ModernBERT is a refreshed version of BERT models, with 8192 token context length, significantly better downstream performance, and much faster processing speeds.\n",
13 | "\n",
14 | "You will learn how to:\n",
15 | "1. Setup environment and install libraries\n",
16 | "2. Load and prepare the classification dataset \n",
17 | "3. Fine-tune & evaluate ModernBERT with the Hugging Face `Trainer`\n",
18 | "4. Run inference & test model\n",
19 | "\n",
20 | "## Quick intro: ModernBERT\n",
21 | "\n",
22 | "ModernBERT is a modernization of BERT maintaining full backward compatibility while delivering dramatic improvements through architectural innovations like rotary positional embeddings (RoPE), alternating attention patterns, and hardware-optimized design. The model comes in two sizes:\n",
23 | "- ModernBERT Base (139M parameters)\n",
24 | "- ModernBERT Large (395M parameters)\n",
25 | "\n",
26 | "ModernBERT achieves state-of-the-art performance across classification, retrieval and code understanding tasks while being 2-4x faster than previous encoder models. This makes it ideal for high-throughput production applications like LLM routing, where both accuracy and latency are critical.\n",
27 | "\n",
28 | "ModernBERT was trained on 2 trillion tokens of diverse data including web documents, code, and scientific articles - making it much more robust than traditional BERT models trained primarily on Wikipedia. This broader knowledge helps it better understand the nuances of user prompts across different domains.\n",
29 | "\n",
30 | "If you want to learn more about ModernBERT's architecture and training process, check out the official [blog](https://huggingface.co/blog/modernbert). \n",
31 | "\n",
32 | "---\n",
33 | "\n",
34 | "Now let's get started building our LLM router with ModernBERT! 🚀\n",
35 | "\n",
36 | "*Note: This tutorial was created and tested on an NVIDIA L4 GPU with 24GB of VRAM.*\n",
37 | "\n",
38 | "## Setup environment and install libraries\n",
39 | "\n",
40 | "Our first step is to install Hugging Face Libraries and Pyroch, including transformers and datasets. "
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# Install Pytorch & other libraries\n",
50 | "%pip install \"torch==2.4.1\" tensorboard \n",
51 | "%pip install flash-attn \"setuptools<71.0.0\" scikit-learn \n",
52 | "\n",
53 | "# Install Hugging Face libraries\n",
54 | "%pip install --upgrade \\\n",
55 | " \"datasets==3.1.0\" \\\n",
56 | " \"accelerate==1.2.1\" \\\n",
57 | " \"hf-transfer==0.1.8\"\n",
58 | " #\"transformers==4.47.1\" \\\n",
59 | "\n",
60 | "# ModernBERT is not yet available in an official release, so we need to install it from github\n",
61 | "%pip install \"git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1\" --upgrade\n"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "We will use the [Hugging Face Hub](https://huggingface.co/models) as a remote model versioning service. This means we will automatically push our model, logs and information to the Hub during training. You must register on the [Hugging Face](https://huggingface.co/join) for this. After you have an account, we will use the `login` util from the `huggingface_hub` package to log into our account and store our token (access key) on the disk."
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from huggingface_hub import login\n",
78 | "\n",
79 | "login(token=\"\", add_to_git_credential=True) # ADD YOUR TOKEN HERE"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "## 2. Load and prepare the dataset\n",
87 | "\n",
88 | "In our example we want to fine-tune ModernBERT to act as a router for user prompts. Therefore we need a classification dataset consisting of user prompts and their \"difficulty\" score. We are going to use the `DevQuasar/llm_router_dataset-synth` dataset, which is a synthetic dataset of ~15,000 user prompts with a difficulty score of \"large_llm\" (`1`) or \"small_llm\" (`0`). \n",
89 | "\n",
90 | "\n",
91 | "We will use the `load_dataset()` method from the [🤗 Datasets](https://huggingface.co/docs/datasets/index) library to load the `DevQuasar/llm_router_dataset-synth` dataset."
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "from datasets import load_dataset\n",
101 | "\n",
102 | "# Dataset id from huggingface.co/dataset\n",
103 | "dataset_id = \"DevQuasar/llm_router_dataset-synth\"\n",
104 | "\n",
105 | "# Load raw dataset\n",
106 | "raw_dataset = load_dataset(dataset_id)\n",
107 | "\n",
108 | "print(f\"Train dataset size: {len(raw_dataset['train'])}\")\n",
109 | "print(f\"Test dataset size: {len(raw_dataset['test'])}\")"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "Let’s check out an example of the dataset."
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "from random import randrange\n",
126 | "\n",
127 | "random_id = randrange(len(raw_dataset['train']))\n",
128 | "raw_dataset['train'][random_id]\n",
129 | "# {'id': '6225a9cd-5cba-4840-8e21-1f9cf2ded7e6',\n",
130 | "# 'prompt': 'How many legs does a spider have?',\n",
131 | "# 'label': 0}"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "To train our model, we need to convert our text prompts to token IDs. This is done by a Tokenizer, which tokenizes the inputs (including converting the tokens to their corresponding IDs in the pre-trained vocabulary) if you want to learn more about this, out **[chapter 6](https://huggingface.co/course/chapter6/1?fw=pt)** of the [Hugging Face Course](https://huggingface.co/course/chapter1/1)."
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "from transformers import AutoTokenizer\n",
148 | "\n",
149 | "# Model id to load the tokenizer\n",
150 | "model_id = \"answerdotai/ModernBERT-base\"\n",
151 | "\n",
152 | "# Load Tokenizer\n",
153 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
154 | "tokenizer.model_max_length = 512 # set model_max_length to 512 as prompts are not longer than 1024 tokens\n",
155 | "\n",
156 | "# Tokenize helper function\n",
157 | "def tokenize(batch):\n",
158 | " return tokenizer(batch['text'], padding='max_length', truncation=True, return_tensors=\"pt\")\n",
159 | "\n",
160 | "# Tokenize dataset\n",
161 | "raw_dataset = raw_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n",
162 | "tokenized_dataset = raw_dataset.map(tokenize, batched=True,remove_columns=[\"text\"])\n",
163 | "\n",
164 | "print(tokenized_dataset[\"train\"].features.keys())\n",
165 | "# dict_keys(['input_ids', 'token_type_ids', 'attention_mask','lable'])"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "## 3. Fine-tune & evaluate ModernBERT with the Hugging Face `Trainer`\n",
173 | "\n",
174 | "After we have processed our dataset, we can start training our model. We will use the [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) model. The first step is to load our model with `AutoModelForSequenceClassification` class from the [Hugging Face Hub](https://huggingface.co/answerdotai/ModernBERT-base). This will initialize the pre-trained ModernBERT weights with a classification head on top. Here we pass the number of classes (2) from our dataset and the label names to have readable outputs for inference."
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "from transformers import AutoModelForSequenceClassification\n",
184 | "\n",
185 | "# Model id to load the tokenizer\n",
186 | "model_id = \"answerdotai/ModernBERT-base\"\n",
187 | "\n",
188 | "# Prepare model labels - useful for inference\n",
189 | "labels = tokenized_dataset[\"train\"].features[\"labels\"].names\n",
190 | "num_labels = len(labels)\n",
191 | "label2id, id2label = dict(), dict()\n",
192 | "for i, label in enumerate(labels):\n",
193 | " label2id[label] = str(i)\n",
194 | " id2label[str(i)] = label\n",
195 | "\n",
196 | "# Download the model from huggingface.co/models\n",
197 | "model = AutoModelForSequenceClassification.from_pretrained(\n",
198 | " model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,\n",
199 | ")"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "We evaluate our model during training. The `Trainer` supports evaluation during training by providing a `compute_metrics` method. We use the `evaluate` library to calculate the [f1 metric](https://huggingface.co/spaces/evaluate-metric/f1) during training on our test split."
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 5,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "import numpy as np\n",
216 | "from sklearn.metrics import f1_score\n",
217 | "\n",
218 | "# Metric helper method\n",
219 | "def compute_metrics(eval_pred):\n",
220 | " predictions, labels = eval_pred\n",
221 | " predictions = np.argmax(predictions, axis=1)\n",
222 | " score = f1_score(\n",
223 | " labels, predictions, labels=labels, pos_label=1, average=\"weighted\"\n",
224 | " )\n",
225 | " return {\"f1\": float(score) if score == 1 else score}"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "The last step is to define the hyperparameters (`TrainingArguments`) we use for our training. Here we are adding optimizations introduced features for fast training times using `torch_compile` option in the `TrainingArguments`.\n",
233 | "\n",
234 | "We also leverage the [Hugging Face Hub](https://huggingface.co/models) integration of the `Trainer` to push our checkpoints, logs, and metrics during training into a repository."
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "from huggingface_hub import HfFolder\n",
244 | "from transformers import Trainer, TrainingArguments\n",
245 | "\n",
246 | "# Define training args\n",
247 | "training_args = TrainingArguments(\n",
248 | " output_dir= \"modernbert-llm-router\",\n",
249 | " per_device_train_batch_size=32,\n",
250 | " per_device_eval_batch_size=16,\n",
251 | " learning_rate=5e-5,\n",
252 | "\t\tnum_train_epochs=5,\n",
253 | " bf16=True, # bfloat16 training \n",
254 | " optim=\"adamw_torch_fused\", # improved optimizer \n",
255 | " # logging & evaluation strategies\n",
256 | " logging_strategy=\"steps\",\n",
257 | " logging_steps=100,\n",
258 | " eval_strategy=\"epoch\",\n",
259 | " save_strategy=\"epoch\",\n",
260 | " save_total_limit=2,\n",
261 | " load_best_model_at_end=True,\n",
262 | " metric_for_best_model=\"f1\",\n",
263 | " # push to hub parameters\n",
264 | " report_to=\"tensorboard\",\n",
265 | " push_to_hub=True,\n",
266 | " hub_strategy=\"every_save\",\n",
267 | " hub_token=HfFolder.get_token(),\n",
268 | "\n",
269 | ")\n",
270 | "\n",
271 | "# Create a Trainer instance\n",
272 | "trainer = Trainer(\n",
273 | " model=model,\n",
274 | " args=training_args,\n",
275 | " train_dataset=tokenized_dataset[\"train\"],\n",
276 | " eval_dataset=tokenized_dataset[\"test\"],\n",
277 | " compute_metrics=compute_metrics,\n",
278 | ")"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "We can start our training by using the **`train`** method of the `Trainer`."
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "# Start training\n",
295 | "trainer.train()"
296 | ]
297 | },
298 | {
299 | "attachments": {},
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "Fine-tuning `answerdotai/ModernBERT-base` on ~15,000 synthetic prompts for 5 epochs took `321` seconds and our best model achieved a `f1` score of `0.993`. 🚀 I also ran the training with `bert-base-uncased` to compare the training time and performance. The original BERT achieved a `f1` score of `0.99` and took `1048` seconds to train. \n",
304 | "\n",
305 | "*Note: ModernBERT and BERT both almost achieve the same performance. This indicates that the dataset is not challenging and probably could be solved using a logistic regression classifier. I ran the same code on the [banking77](https://huggingface.co/datasets/legacy-datasets/banking77) dataset. A dataset of ~13,000 customer service queries with 77 classes. There the ModernBERT outperformed the original BERT by 3% (f1 score of 0.93 vs 0.90)*\n",
306 | "\n",
307 | "\n",
308 | "Lets save our final best model and tokenizer to the Hugging Face Hub and create a model card."
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "# Save processor and create model card\n",
318 | "tokenizer.save_pretrained(\"modernbert-llm-router\")\n",
319 | "trainer.create_model_card()\n",
320 | "trainer.push_to_hub()"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "## 4. Run Inference & test model\n",
328 | "\n",
329 | "To wrap up this tutorial, we will run inference on a few examples and test our model. We will use the `pipeline` method from the `transformers` library to run inference on our model."
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "from transformers import pipeline\n",
339 | "\n",
340 | "# load model from huggingface.co/models using our repository id\n",
341 | "classifier = pipeline(\"sentiment-analysis\", model=\"modernbert-llm-router\", device=0)\n",
342 | "\n",
343 | "sample = \"How does the structure and function of plasmodesmata affect cell-to-cell communication and signaling in plant tissues, particularly in response to environmental stresses?\"\n",
344 | "\n",
345 | "\n",
346 | "pred = classifier(sample)\n",
347 | "print(pred)\n",
348 | "# [{'label': 'large_llm', 'score': 1.0}]"
349 | ]
350 | },
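    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "To illustrate how such a classifier could be wired into an actual router, here is a minimal sketch; the model repository ids for the small and large LLM are placeholders you would replace with your own deployment targets:\n",
    | "\n",
    | "```python\n",
    | "def route_prompt(prompt: str) -> str:\n",
    | "    # classify the prompt and return the id of the LLM that should handle it\n",
    | "    label = classifier(prompt)[0][\"label\"]  # \"small_llm\" or \"large_llm\"\n",
    | "    if label == \"large_llm\":\n",
    | "        return \"meta-llama/Llama-3.1-70B-Instruct\"  # placeholder for a large model\n",
    | "    return \"meta-llama/Llama-3.1-8B-Instruct\"      # placeholder for a small model\n",
    | "\n",
    | "print(route_prompt(\"How many legs does a spider have?\"))\n",
    | "```"
    | ]
    | },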
351 | {
352 | "attachments": {},
353 | "cell_type": "markdown",
354 | "metadata": {},
355 | "source": [
356 | "## Conclusion\n",
357 | "\n",
358 | "In this tutorial, we learned how to fine-tune ModernBERT for an LLM routing classification task. We demonstrated how to leverage the Hugging Face ecosystem to efficiently train and deploy a specialized classifier that can intelligently route user prompts to the most appropriate LLM model.\n",
359 | "\n",
360 | "Using modern training optimizations like flash attention, fused optimizers and mixed precision, we were able to train our model efficiently. Comparing ModernBERT with the original BERT we reduced training time by approximately 3x (1048s vs 321s) on our dataset and outperformed the original BERT by 3% on a more challenging dataset. But more importantly, ModernBERT was trained on 2 trillion tokens, which are more diverse and up to date than the Wikipedia-based training data of the original BERT.\n",
361 | "\n",
362 | "This example showcases how smaller, specialized models remain valuable in the age of large language models - particularly for high-throughput, latency-sensitive tasks like LLM routing. By using ModernBERT's improved architecture and broader training data, we can build more robust and efficient classification systems."
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {},
368 | "source": []
369 | }
370 | ],
371 | "metadata": {
372 | "kernelspec": {
373 | "display_name": "pytorch",
374 | "language": "python",
375 | "name": "python3"
376 | },
377 | "language_info": {
378 | "codemirror_mode": {
379 | "name": "ipython",
380 | "version": 3
381 | },
382 | "file_extension": ".py",
383 | "mimetype": "text/x-python",
384 | "name": "python",
385 | "nbconvert_exporter": "python",
386 | "pygments_lexer": "ipython3",
387 | "version": "3.11.11"
388 | },
389 | "orig_nbformat": 4,
390 | "vscode": {
391 | "interpreter": {
392 | "hash": "2d58e898dde0263bc564c6968b04150abacfd33eed9b19aaa8e45c040360e146"
393 | }
394 | }
395 | },
396 | "nbformat": 4,
397 | "nbformat_minor": 2
398 | }
399 |
--------------------------------------------------------------------------------
/training/inference.py:
--------------------------------------------------------------------------------
1 | from vllm import LLM, SamplingParams
2 | from datasets import load_dataset
3 | from random import randint
4 |
5 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=512)
6 |
7 | # use revision without "checkpoints-" as vLLM downloads all of them
8 | llm = LLM(model="philschmid/qwen-2.5-3b-r1-countdown", revision="099c0f8cbfc522e7c3a476edfb749f576b164539")
9 |
10 | # Load dataset from Hugging Face Hub
11 | dataset_id = "Jiayi-Pan/Countdown-Tasks-3to4"
12 | dataset = load_dataset(dataset_id, split="train")
13 | sample = dataset[randint(0, len(dataset) - 1)]
14 |
15 | # create conversation
16 | messages = [
17 | {"role": "system", "content": "You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer."},
18 |     {"role": "user", "content": f"Using the numbers {sample['nums']}, create an equation that equals {sample['target']}. You can use basic arithmetic operations (+, -, *, /) one or multiple times but each number can only be used once. Show your work in <think> </think> tags. And return the final equation in <answer> </answer> tags, for example <answer> (1 + 2) / 3 </answer>. Think step by step inside <think> tags."},
19 |     {"role": "assistant", "content": "Let me solve this step by step.\n<think>"}
20 | ]
21 | # generate response
22 | res = llm.generate(llm.get_tokenizer().apply_chat_template(messages, tokenize=False, continue_final_message=True), sampling_params)
23 | res = "" + res[0].outputs[0].text
24 | print(res)
25 |
26 | # We need to use the numbers 37, 15, 4, and 13 with basic arithmetic operations to make 16. Let's try different combinations:
27 | # - 37 - 15 - 4 - 13 = 6 (too low)
28 | # - 37 - 15 + 4 - 13 = 13 (too low)
29 | # - 37 + 15 - 4 - 13 = 35 (too high)
30 | # - 37 - 15 + 4 + 13 = 39 (too high)
31 | # - 15 + 4 + 13 - 37 = -1 (too low)
32 | # - 37 + 15 + 4 - 13 = 43 (too high)
33 | # - 15 + 4 * 13 / 37 = 15 + 52 / 37 (not an integer)
34 | # - 15 * 4 / 37 - 37 = -28.24 (not a whole number)
35 | # - 4 * 13 / 15 - 37 = 41.3333 (not a whole number)
36 | # After all combinations, I got not any integer result as 16.
37 | # </think>
38 | # <answer> 37 - 15 + 4 + 13 </answer>
--------------------------------------------------------------------------------
/training/launch.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --ntasks-per-node=1
3 | #SBATCH --nodes=1
4 | #SBATCH --gres=gpu:4
5 | #SBATCH --qos=high
6 | #SBATCH --partition=hopper-prod # Adjust this for your cluster
7 | #SBATCH --output=/fsx/philipp/logs/%x-%j.out # Adjust this for your cluster
8 | #SBATCH --err=/fsx/philipp/logs/%x-%j.err # Adjust this for your cluster
9 |
10 | set -x -e
11 |
12 | source ~/.bashrc
13 | micromamba activate dpo
14 | echo "START TIME: $(date)"
15 |
16 | CONFIG_FILE=$1
17 |
18 | # Training setup
19 | NUM_NODES=$SLURM_NNODES
20 | GPUS_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
21 | WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
22 | # number of GPUs used for training: world size - 1 (one GPU is kept free for vLLM generation)
23 | NUM_GPUS_FOR_TRAINING=$(($WORLD_SIZE - 1))
24 |
25 |
26 | # so processes know who to talk to
27 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
28 | MASTER_PORT=6000
29 |
30 | export CMD=" \
31 | scripts/run_r1_grpo.py --config $CONFIG_FILE
32 | "
33 |
34 | export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
35 | --config_file configs/accelerate_configs/deepspeed_zero3.yaml \
36 | --num_machines $NUM_NODES \
37 | --num_processes $NUM_GPUS_FOR_TRAINING \
38 | --main_process_ip $MASTER_ADDR \
39 | --main_process_port $MASTER_PORT \
40 | --machine_rank \$SLURM_PROCID \
41 | --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
42 | --max_restarts 1 \
43 | --role \$(hostname -s): \
44 | --tee 3 \
45 | "
46 |
47 | # force crashing on nccl issues like hanging broadcast
48 | export NCCL_ASYNC_ERROR_HANDLING=1
49 | # export NCCL_DEBUG=INFO
50 | # export NCCL_DEBUG_SUBSYS=COLL
51 | # export NCCL_SOCKET_NTHREADS=1
52 | # export NCCL_NSOCKS_PERTHREAD=1
53 | # export CUDA_LAUNCH_BLOCKING=1
54 |
55 | # Specific configuration optimized for the Hugging Face Compute Cluster
56 | # Be ye warned this may not work on other clusters!
57 | module load cuda/12.1
58 |
59 | # srun error handling:
60 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
61 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
62 | SRUN_ARGS=" \
63 | --wait=60 \
64 | --kill-on-bad-exit=1 \
65 | "
66 |
67 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1
68 |
69 | echo "END TIME: $(date)"
--------------------------------------------------------------------------------
/training/optimize-llama-2-gptq.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Quantize open LLMs using optimum and GPTQ\n",
9 | "\n",
10 | "The Hugging Face Optimum team collaborated with AutoGPTQ library to provide a simple API that apply GPTQ quantization on language models. With GPTQ quantization open LLMs to 8, 4, 3 or even 2 bits to run them on smaller Hardware without a big drop of performance. \n",
11 | "\n",
12 | "In the blog, you will learn how to:\n",
13 | "\n",
14 | "1. Setup our development environment\n",
15 | "2. Prepare quantization dataset\n",
16 | "3. Load and Quantize Model\n",
17 | "4. Test performance and inference speed\n",
18 | "5. Bonus: Run Inference with Text Generation Inference\n",
19 | " \n",
20 | "But we before we get started lets take quick look on what GPTQ does. \n",
21 | "\n",
22 | "_Note: This tutorial was created and run on a g5.2xlarge AWS EC2 Instance, including an NVIDIA A10G GPU._\n",
23 | "\n",
24 | "\n",
25 | "## What is GPTQ?\n",
26 | "\n",
27 | "[GPTQ](https://arxiv.org/abs/2210.17323) is a post-training quantziation method to compress LLMs, like GPT. GPTQ compresses GPT models by reducing the number of bits needed to store each weight in the model, from 32 bits down to just 3-4 bits. This means the model takes up much less memory, so it can run on less Hardware, e.g. Single GPU for 13B Llama2 models. GPTQ analyzes each layer of the model separately and approximating the weights in a way that preserves the overall accuracy.\n",
28 | "\n",
29 | "The main benefits are:\n",
30 | "* Quantizes the weights of the model layer-by-layer to 4 bits instead of 16 bits, this reduces the needed memory by 4x.\n",
31 | "* Quantization is done gradually to minimize the accuracy loss from quantization.\n",
32 | "* Achieves same latency as fp16 model, but 4x less memory usage, sometimes faster due to custom kernels, e.g. [Exllama](https://github.com/turboderp/exllama)\n",
33 | "* Quantized weights can be saved to disk for a head of time quantization.\n",
34 | "\n",
35 | "_Note: GPTQ quantization only works for text model for now. Futhermore, the quantization process can take a lot of time. You check on the [Hugging Face Hub](https://huggingface.co/models?search=gptq) if there is not already a GPTQ quantized version of the model you want to use._\n",
36 | "\n",
37 | "--- \n",
38 | "\n",
39 | "## 1. Setup our development environment\n",
40 | "\n",
41 | "Let's start coding, but first, install our dependencies."
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "!pip install \"transformers==4.32.1\" \"optimum==1.12.0\" \"auto-gptq==0.4.2\" \"accelerate==0.22.0\" \"safetensors>=0.3.1\" --upgrade"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "## 2. Prepare quantization dataset\n",
58 | "\n",
59 | "GPTQ is a post-training quantization method, so we need to prepare a dataset to quantize our model. We can either use a dataset from the [Hugging Face Hub](https://huggingface.co/datasets) or use our own dataset. In this blog, we are going to use the [WikiText](https://huggingface.co/datasets/wikitext) dataset from the Hugging Face Hub. The dataset is used to quantize the weights to minimize the performance loss. It is recommended to use a quantization dataset with atleast `128` samples.\n",
60 | "\n",
61 | "_Note: [TheBloke](https://huggingface.co/TheBloke) a very active community member is contributing hundreds of gptq weights to the Hugging Face Hub. He mostly uses wikitext as quantization dataset for general domain models._\n",
62 | "\n",
63 | "If you want to use, e.g. your fine-tuning dataset for quantization you can provide it as a list instead of the \"id\", check out this [example](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb). "
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Dataset id from Hugging Face \n",
73 | "dataset_id = \"wikitext2\""
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## 3. Load and Quantize Model\n",
81 | "\n",
82 | "Optimum integrates GPTQ quantization in the `optimum.qptq` namespace with a `GPTQQuantizer`. The quantizer takes our dataset (id or list), bits, and model_seqlen as input. For more customization check [here](https://github.com/huggingface/optimum/blob/234a427450a7dcc978b227fa627ebcdab1764318/optimum/gptq/quantizer.py#L76).\n"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "from optimum.gptq import GPTQQuantizer\n",
92 | "\n",
93 | "# GPTQ quantizer\n",
94 | "quantizer = GPTQQuantizer(bits=4, dataset=dataset_id, model_seqlen=4096)\n",
95 | "quantizer.quant_method = \"gptq\""
96 | ]
97 | },
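    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "As mentioned above, instead of a dataset id you can also pass your own calibration samples as a list of strings, e.g. prompts from your fine-tuning dataset. A minimal sketch (the example texts are placeholders; at least ~128 representative samples are recommended):\n",
    | "\n",
    | "```python\n",
    | "calibration_samples = [\n",
    | "    \"### Instruction:\\nSummarize the meeting notes below ...\",  # placeholder sample\n",
    | "    \"### Instruction:\\nWrite a short email to my boss ...\",      # placeholder sample\n",
    | "    # ... add more samples from your own dataset\n",
    | "]\n",
    | "quantizer = GPTQQuantizer(bits=4, dataset=calibration_samples, model_seqlen=4096)\n",
    | "```"
    | ]
    | },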
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "After we have created our Quantizer we can load our model using Transformers. In our example we will quantize a [Llama 2 7B](https://huggingface.co/philschmid/llama-2-7b-instruction-generator), which we trained in my other blog post [\"Extended Guide: Instruction-tune Llama 2\"](https://www.philschmid.de/instruction-tune-llama-2). We are going to load our model in `fp16` since GPTQ adopts a mixed int4/fp16 quantization scheme where weights are quantized as int4 while activations remain in float16. "
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "import torch\n",
112 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
113 | "\n",
114 | "# Hugging Face model id\n",
115 | "model_id = \"philschmid/llama-2-7b-instruction-generator\"\n",
116 | "\n",
117 | "tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) # bug with fast tokenizer\n",
118 | "model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16) # we load the model in fp16 on purpose"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "After we loaded our model we are ready to quantize it. \n",
126 | "_Note: Quantization can take process can take a lot of time depending on one's hardware. For this example the quantization on a single A10G GPU for a 7B model took ~minutes._ "
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "import os \n",
136 | "import json\n",
137 | "\n",
138 | "# quantize the model \n",
139 | "quantized_model = quantizer.quantize_model(model, tokenizer)\n",
140 | "\n",
141 | "# save the quantize model to disk\n",
142 | "save_folder = \"quantized_llama\"\n",
143 | "quantized_model.save_pretrained(save_folder, safe_serialization=True)\n",
144 | "\n",
145 | "# load fresh, fast tokenizer and save it to disk\n",
146 | "tokenizer = AutoTokenizer.from_pretrained(model_id).save_pretrained(save_folder)\n",
147 | "\n",
148 | "# save quantize_config.json for TGI \n",
149 | "with open(os.path.join(save_folder, \"quantize_config.json\"), \"w\", encoding=\"utf-8\") as f:\n",
150 | " quantizer.disable_exllama = False\n",
151 | " json.dump(quantizer.to_dict(), f, indent=2)"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "since the model was partially offloaded it set `disable_exllama` to `True` to avoid an error. For inference and production load we want to leverage the exllama kernels. Therefore we need to change the `config.json`"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "with open(os.path.join(save_folder, \"config.json\"), \"r\", encoding=\"utf-8\") as f:\n",
168 | " config = json.load(f)\n",
169 | " config[\"quantization_config\"][\"disable_exllama\"] = False\n",
170 | " with open(os.path.join(save_folder, \"config.json\"), \"w\", encoding=\"utf-8\") as f:\n",
171 | " json.dump(config, f, indent=2)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "## 4. Test performance and inference speed\n",
179 | "\n",
180 | "Since the latest release of transformers we can load any GPTQ quantized model directly using the `AutoModelForCausalLM` class this. You can either load already quantized models from Hugging Face, e.g. [TheBloke/Llama-2-13B-chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ) or models you quantized yourself. Since we want to test here the results of our quantization we are going to load our quantized model from disk and compare it to our non quantize model. \n",
181 | "\n",
182 | "First lets our our non quantized model and test it on a simple prompt."
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "import time \n",
192 | "\n",
193 | "# The prompt is based on the fine-tuning from the model: https://www.philschmid.de/instruction-tune-llama-2#4-test-model-and-run-inference\n",
194 | "prompt = \"\"\"### Instruction:\n",
195 | "Use the Input below to create an instruction, which could have been used to generate the input using an LLM.\n",
196 | "\n",
197 | "### Input:\n",
198 | "Dear [boss name],\n",
199 | "\n",
200 | "I'm writing to request next week, August 1st through August 4th,\n",
201 | "off as paid time off.\n",
202 | "\n",
203 | "I have some personal matters to attend to that week that require\n",
204 | "me to be out of the office. I wanted to give you as much advance\n",
205 | "notice as possible so you can plan accordingly while I am away.\n",
206 | "\n",
207 | "Thank you, [Your name]\n",
208 | "\n",
209 | "### Response:\n",
210 | "\"\"\"\n",
211 | "\n",
212 | "# helper function to generate text and measure latency\n",
213 | "def generate_helper(pipeline,prompt=prompt):\n",
214 | " # warm up\n",
215 | " for i in range(5):\n",
216 | " _ = pipeline(\"Warm up\")\n",
217 | "\n",
218 | " # measure latency in a simple way \n",
219 | " start = time.time()\n",
220 | " out = pipeline(prompt, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.9)\n",
221 | " end = time.time()\n",
222 | " \n",
223 | " generated_text = out[0][\"generated_text\"][len(prompt):]\n",
224 | " \n",
225 | " latency_per_token_in_ms = ((end-start)/len(pipeline.tokenizer(generated_text)[\"input_ids\"]))*1000\n",
226 | " \n",
227 | " # return the generated text and the latency\n",
228 | " return {\"text\": out[0][\"generated_text\"][len(prompt):], \"latency\": f\"{round(latency_per_token_in_ms,2)}ms/token\"}\n"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "We can load the vanilla transformers model and run inference using the `pipeline` class. "
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "import torch\n",
245 | "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
246 | "\n",
247 | "# Hugging Face model id\n",
248 | "model_id = \"philschmid/llama-2-7b-instruction-generator\"\n",
249 | "\n",
250 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
251 | "model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"auto\", torch_dtype=torch.float16) # we load the model in fp16 on purpose\n",
252 | "\n",
253 | "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer)"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "lets create our vanilla base line"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "import torch \n",
270 | "\n",
271 | "vanilla_res = generate_helper(pipe)\n",
272 | "\n",
273 | "print(f\"Latency: {vanilla_res['latency']}\")\n",
274 | "print(f\"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB\")\n",
275 | "print(f\"Generated Instruction: {vanilla_res['text']}\")\n",
276 | "\n",
277 | "# Latency: 37.49ms/token\n",
278 | "# GPU memory: 12.62 GB\n",
279 | "# Generated Instruction: Write a request for PTO letter to my boss"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "# clean up \n",
289 | "del pipe\n",
290 | "del model \n",
291 | "del tokenizer\n",
292 | "torch.cuda.empty_cache()"
293 | ]
294 | },
295 | {
296 | "attachments": {},
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "Since we have now our baseline we can test and validate our GPTQ quantize weights. Therefore we will use the new `gptq` integration into the `AutoModelForCausalLM` class where we can directly load the `gptq` weights. "
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "import torch\n",
310 | "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
311 | "\n",
312 | "# path to gptq weights\n",
313 | "model_id = \"quantized_llama\"\n",
314 | "\n",
315 | "q_tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
316 | "q_model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"auto\", torch_dtype=torch.float16)\n",
317 | "\n",
318 | "qtq_pipe = pipeline(\"text-generation\", model=q_model, tokenizer=q_tokenizer)"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "Now, we can test our quantized model on the same prompt as our baseline."
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "gpq_res = generate_helper(qtq_pipe)\n",
335 | "\n",
336 | "print(f\"Latency: {gpq_res['latency']}\")\n",
337 | "print(f\"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB\")\n",
338 | "print(f\"Generated Instruction: {gpq_res['text']}\")\n",
339 | "\n",
340 | "# Latency: 36.0ms/token\n",
341 | "# GPU memory: 3.83 GB\n",
342 | "# Generated Instruction: Write a letter requesting time off"
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {},
348 | "source": [
349 | "For comparison the vanilla model needed ~12.6GB Memory and the GPTQ model needed ~3.8GB Memory, with equal performance. GPTQ allowed us to save ~4x memory (don't forget pytorch has default kernels). "
350 | ]
351 | },
352 | {
353 | "cell_type": "markdown",
354 | "metadata": {},
355 | "source": [
356 | "## 5. Bonus: Run Inference with Text Generation Inference\n",
357 | "\n",
358 | "Text Generation Inference supports GPTQ model for more efficient deployments. We simply need to provide `gptq` as `QUANTIZE` environment variable when starting our container. "
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "model=\"/home/ubuntu/test-gptq\"\n",
368 | "num_shard=1\n",
369 | "quantize=\"gptq\"\n",
370 | "max_input_length=1562\n",
371 | "max_total_tokens=4096 # 4096\n",
372 | "\n",
373 | "!docker run --gpus all -ti -p 8080:80 \\\n",
374 | " -e MODEL_ID=$model \\\n",
375 | " -e QUANTIZE=$quantize \\\n",
376 | " -e NUM_SHARD=$num_shard \\\n",
377 | " -e MAX_INPUT_LENGTH=$max_input_length \\\n",
378 | " -e MAX_TOTAL_TOKENS=$max_total_tokens \\\n",
379 | " -v $model:$model \\\n",
380 | " ghcr.io/huggingface/text-generation-inference:1.0.3"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {},
386 | "source": [
387 | "We can invoke our container using curl. \n",
388 | "_Note: The first request will be slow. _"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "curl 127.0.0.1:8080/generate \\\n",
398 | " -X POST \\\n",
399 | " -d '{\"inputs\":\"### Instruction:\\nUse the Input below to create an instruction, which could have been used to generate the input using an LLM.\\n\\n### Input:\\nDear [boss name],\\n\\nI am writing to request next week, August 1st through August 4th,\\noff as paid time off.\\n\\nI have some personal matters to attend to that week that require\\nme to be out of the office. I wanted to give you as much advance\\nnotice as possible so you can plan accordingly while I am away.\\n\\nThank you, [Your name]\\n\\n### Response:\",\"parameters\":{\"temperature\":0.2, \"top_p\": 0.95, \"max_new_tokens\": 256}}' \\\n",
400 | " -H 'Content-Type: application/json'"
401 | ]
402 | },
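    | {
    | "cell_type": "markdown",
    | "metadata": {},
    | "source": [
    | "If you prefer Python over curl, the same request can be sent with `requests` (a small sketch using the same endpoint and payload as the curl call above):\n",
    | "\n",
    | "```python\n",
    | "import requests\n",
    | "\n",
    | "payload = {\n",
    | "    \"inputs\": prompt,  # the instruction-generation prompt defined earlier in this notebook\n",
    | "    \"parameters\": {\"temperature\": 0.2, \"top_p\": 0.95, \"max_new_tokens\": 256},\n",
    | "}\n",
    | "response = requests.post(\"http://127.0.0.1:8080/generate\", json=payload)\n",
    | "print(response.json()[\"generated_text\"])\n",
    | "```"
    | ]
    | },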
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 | "With Text Generation inference we are achieving ~`22.942983ms` latency per token, which is 2x faster than transformers. If you plan to deploy your model in production, I would recommend to use Text Generation Inference."
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": []
414 | }
415 | ],
416 | "metadata": {
417 | "kernelspec": {
418 | "display_name": "pytorch",
419 | "language": "python",
420 | "name": "python3"
421 | },
422 | "language_info": {
423 | "codemirror_mode": {
424 | "name": "ipython",
425 | "version": 3
426 | },
427 | "file_extension": ".py",
428 | "mimetype": "text/x-python",
429 | "name": "python",
430 | "nbconvert_exporter": "python",
431 | "pygments_lexer": "ipython3",
432 | "version": "3.9.16"
433 | },
434 | "orig_nbformat": 4,
435 | "vscode": {
436 | "interpreter": {
437 | "hash": "2d58e898dde0263bc564c6968b04150abacfd33eed9b19aaa8e45c040360e146"
438 | }
439 | }
440 | },
441 | "nbformat": 4,
442 | "nbformat_minor": 2
443 | }
444 |
--------------------------------------------------------------------------------
/training/preprocessing/create_flan_t5_cnn_dataset.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from transformers import AutoTokenizer
3 | import numpy as np
4 | import os
5 | from datasets import concatenate_datasets
7 |
8 | # experiment config
9 | model_id = "google/flan-t5-xl"
10 |
11 | # Dataset
12 | dataset_id = "cnn_dailymail"
13 | dataset_config = "3.0.0"
14 | save_dataset_path = "data"
15 | text_column = "article"
16 | summary_column = "highlights"
17 | prompt_start = "Summarize the following news article:\n"
18 | generation_start = "\nSummary:\n"
19 | prompt_template = f"{prompt_start}{{input}}{generation_start}"
20 |
21 | # Load dataset from the hub
22 | dataset = load_dataset(dataset_id, name=dataset_config)
23 | # Load tokenizer of FLAN-t5-base
24 | tokenizer = AutoTokenizer.from_pretrained(model_id)
25 |
26 | print(f"Train dataset size: {len(dataset['train'])}")
27 | print(f"Test dataset size: {len(dataset['test'])}")
28 |
29 | prompt_length = len(tokenizer(prompt_template.format(input=""))["input_ids"])
30 | max_sample_length = tokenizer.model_max_length - prompt_length
31 | print(f"Prompt length: {prompt_length}")
32 | print(f"Max input length: {max_sample_length}")
33 |
34 | # The maximum total input sequence length after tokenization.
35 | # Sequences longer than this will be truncated, sequences shorter will be padded.
36 | tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
37 | lambda x: tokenizer(x[text_column], truncation=True), batched=True, remove_columns=[text_column, summary_column]
38 | )
39 | max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
40 | max_source_length = min(max_source_length, max_sample_length)
41 | print(f"Max source length: {max_source_length}")
42 |
43 | # The maximum total sequence length for target text after tokenization.
44 | # Sequences longer than this will be truncated, sequences shorter will be padded.
45 | tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
46 | lambda x: tokenizer(x[summary_column], truncation=True), batched=True, remove_columns=[text_column, summary_column]
47 | )
48 | target_lengths = [len(x) for x in tokenized_targets["input_ids"]]
49 | # use 95th percentile as max target length
50 | max_target_length = int(np.percentile(target_lengths, 95))
51 | print(f"Max target length: {max_target_length}")
52 |
53 |
54 | def preprocess_function(sample, padding="max_length"):
55 | # created prompted input
56 | inputs = [prompt_template.format(input=item) for item in sample[text_column]]
57 |
58 | # tokenize inputs
59 | model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
60 |
61 | # Tokenize targets with the `text_target` keyword argument
62 | labels = tokenizer(
63 | text_target=sample[summary_column], max_length=max_target_length, padding=padding, truncation=True
64 | )
65 |
66 | # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
67 | # padding in the loss.
68 | if padding == "max_length":
69 | labels["input_ids"] = [
70 | [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
71 | ]
72 |
73 | model_inputs["labels"] = labels["input_ids"]
74 | return model_inputs
75 |
76 |
77 | tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=list(dataset["train"].features))
78 | print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
79 |
80 | tokenized_dataset["train"].save_to_disk(os.path.join(save_dataset_path, "train"))
81 | tokenized_dataset["test"].save_to_disk(os.path.join(save_dataset_path, "eval"))
82 |
--------------------------------------------------------------------------------
/training/receipes/dpo-llama-3-1-8b-qlora.yaml:
--------------------------------------------------------------------------------
1 | # Model arguments
2 | model_name_or_path: philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1
3 | tokenizer_name_or_path: philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1
4 | model_revision: main
5 | torch_dtype: bfloat16
6 | attn_implementation: flash_attention_2
7 | use_liger: false
8 | bf16: true
9 | tf32: true
10 | output_dir: runs/dpo-llama-3-1-8b-math-ep3
11 |
12 | # Dataset arguments
13 | dataset_id_or_path: philschmid/philschmid-llama-3-1-8b-math-orca-spectr-philschmid-DMath-candidates
14 |
15 | # LoRA arguments
16 | use_peft: true
17 | load_in_4bit: true
18 | lora_target_modules: "all-linear"
19 | # important as we need to train the special tokens for the chat template of llama
20 | lora_modules_to_save: ["lm_head", "embed_tokens"] # you might need to change this for qwen or other models
21 | lora_r: 16
22 | lora_alpha: 16
23 |
24 | # Training arguments
25 | beta: 0.1
26 | max_length: 1536
27 | max_prompt_length: 768
28 | loss_type: sigmoid # default loss, alternatives: https://huggingface.co/docs/trl/dpo_trainer#loss-functions
29 | num_train_epochs: 3
30 | per_device_train_batch_size: 1
31 | gradient_accumulation_steps: 8
32 | gradient_checkpointing: true
33 | gradient_checkpointing_kwargs:
34 | use_reentrant: false
35 | learning_rate: 5.0e-6
36 | lr_scheduler_type: constant
37 | warmup_ratio: 0.03
38 |
39 | # Logging arguments
40 | logging_strategy: steps
41 | logging_steps: 5
42 | report_to:
43 | - tensorboard
44 | save_strategy: "epoch"
45 | seed: 42
46 |
47 | # Hugging Face Hub
48 | push_to_hub: true
49 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir
50 | hub_strategy: every_save
--------------------------------------------------------------------------------
/training/receipes/dpo-llama-3-1-8b.yaml:
--------------------------------------------------------------------------------
1 | # Model arguments
2 | model_name_or_path: philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1
3 | tokenizer_name_or_path: philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1
4 | model_revision: main
5 | torch_dtype: bfloat16
6 | attn_implementation: flash_attention_2
7 | use_liger: false
8 | bf16: true
9 | tf32: true
10 | output_dir: runs/dpo-llama-3-1-8b-math
11 |
12 | # Dataset arguments
13 | dataset_id_or_path: philschmid/philschmid-llama-3-1-8b-math-orca-spectr-philschmid-DMath-candidates
14 |
15 | # Training arguments
16 | beta: 0.1
17 | max_length: 1536
18 | max_prompt_length: 768
19 | loss_type: sigmoid # default loss, alternatives: https://huggingface.co/docs/trl/dpo_trainer#loss-functions
20 | num_train_epochs: 3
21 | per_device_train_batch_size: 2
22 | gradient_accumulation_steps: 8
23 | gradient_checkpointing: true
24 | gradient_checkpointing_kwargs:
25 | use_reentrant: false
26 | learning_rate: 5.0e-7
27 | lr_scheduler_type: constant
28 | warmup_ratio: 0.03
29 |
30 | # Logging arguments
31 | logging_strategy: steps
32 | logging_steps: 5
33 | report_to:
34 | - tensorboard
35 | save_strategy: "epoch"
36 | seed: 42
37 |
38 | # Hugging Face Hub
39 | push_to_hub: true
40 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir
41 | hub_strategy: every_save
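42 |
43 | # Example launch (illustrative; full-parameter DPO also loads a separate reference model, so a multi-GPU setup with ZeRO-3 sharding is assumed here):
44 | # accelerate launch --config_file configs/accelerate_configs/deepspeed_zero3.yaml scripts/dpo/run_dpo.py --config receipes/dpo-llama-3-1-8b.yaml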
--------------------------------------------------------------------------------
/training/receipes/grpo-qwen-2.5-3b-deepseek-r1-countdown.yaml:
--------------------------------------------------------------------------------
1 | # Model arguments
2 | model_name_or_path: Qwen/Qwen2.5-3B-Instruct
3 | model_revision: main
4 | torch_dtype: bfloat16
5 | attn_implementation: flash_attention_2
6 | bf16: true
7 | tf32: true
8 | output_dir: runs/qwen-2.5-3b-r1-countdown
9 |
10 | # Dataset arguments
11 | dataset_id_or_path: Jiayi-Pan/Countdown-Tasks-3to4
12 |
13 | # Lora Arguments
14 | # No LoRA is used here
15 |
16 | # Training arguments
17 | max_steps: 450
18 | per_device_train_batch_size: 1
19 | gradient_accumulation_steps: 8
20 | gradient_checkpointing: true
21 | gradient_checkpointing_kwargs:
22 | use_reentrant: false
23 | learning_rate: 5.0e-7 # 1.0e-6 as in the DeepSeek Math paper; 5.0e-7 from https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05
24 | lr_scheduler_type: cosine
25 | warmup_ratio: 0.03
26 | # GRPO specific parameters
27 | beta: 0.001 # 0.04 as in the DeepSeek Math paper; 0.001 from https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05
28 | max_prompt_length: 256
29 | max_completion_length: 1024
30 | num_generations: 8
31 | use_vllm: true
32 | # vllm_device: "cuda:3"
33 | vllm_gpu_memory_utilization: 0.5
34 |
35 | # Logging arguments
36 | logging_strategy: steps
37 | logging_steps: 2
38 | report_to:
39 | - tensorboard
40 | save_strategy: "steps"
41 | save_steps: 25
42 | seed: 42
43 |
44 | # Hugging Face Hub
45 | push_to_hub: true
46 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir
47 | hub_strategy: every_save
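48 |
49 | # Example launch (illustrative; assumes a 4-GPU node where 3 processes train and one GPU stays free for vLLM generation, see vllm_device above):
50 | # accelerate launch --num_processes 3 --config_file configs/accelerate_configs/deepspeed_zero3.yaml scripts/run_r1_grpo.py --config receipes/grpo-qwen-2.5-3b-deepseek-r1-countdown.yaml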
--------------------------------------------------------------------------------
/training/receipes/llama-3-1-8b-qlora.yaml:
--------------------------------------------------------------------------------
1 | # Model arguments
2 | model_name_or_path: Meta-Llama/Meta-Llama-3.1-8B
3 | tokenizer_name_or_path: Meta-Llama/Meta-Llama-3.1-8B-Instruct
4 | model_revision: main
5 | torch_dtype: bfloat16
6 | attn_implementation: flash_attention_2
7 | use_liger: true
8 | bf16: true
9 | tf32: true
10 | output_dir: runs/llama-3-1-8b-math-orca-qlora-10k-ep1
11 |
12 | # Dataset arguments
13 | dataset_id_or_path: train_dataset.json
14 | max_seq_length: 1024
15 | packing: true
16 |
17 | # LoRA arguments
18 | use_peft: true
19 | load_in_4bit: true
20 | lora_target_modules: "all-linear"
21 | # important as we need to train the special tokens for the chat template of llama
22 | lora_modules_to_save: ["lm_head", "embed_tokens"] # you might need to change this for qwen or other models
23 | lora_r: 16
24 | lora_alpha: 16
25 |
26 | # Training arguments
27 | num_train_epochs: 1
28 | per_device_train_batch_size: 8
29 | gradient_accumulation_steps: 2
30 | gradient_checkpointing: true
31 | gradient_checkpointing_kwargs:
32 | use_reentrant: false
33 | learning_rate: 2.0e-4
34 | lr_scheduler_type: constant
35 | warmup_ratio: 0.1
36 |
37 | # Logging arguments
38 | logging_strategy: steps
39 | logging_steps: 5
40 | report_to:
41 | - tensorboard
42 | save_strategy: "epoch"
43 | seed: 42
44 |
45 | # Hugging Face Hub
46 | push_to_hub: true
47 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir
48 | hub_strategy: every_save
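49 |
50 | # Example launch (illustrative; assumes scripts/run_sft.py as the entry point and the bundled FSDP + QLoRA accelerate config):
51 | # accelerate launch --config_file configs/accelerate_configs/fsdp_qlora.yaml scripts/run_sft.py --config receipes/llama-3-1-8b-qlora.yaml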
--------------------------------------------------------------------------------
/training/receipes/llama-3-1-8b-spectrum.yaml:
--------------------------------------------------------------------------------
1 | # Model arguments
2 | model_name_or_path: Meta-Llama/Meta-Llama-3.1-8B
3 | tokenizer_name_or_path: Meta-Llama/Meta-Llama-3.1-8B-Instruct
4 | model_revision: main
5 | torch_dtype: bfloat16
6 | attn_implementation: flash_attention_2
7 | use_liger: true
8 | bf16: true
9 | tf32: true
10 | output_dir: runs/llama-3-1-8b-math-orca-spectrum-10k-ep1
11 |
12 | # Dataset arguments
13 | dataset_id_or_path: train_dataset.json
14 | max_seq_length: 1024
15 | packing: true
16 |
17 | # Spectrum arguments
18 | spectrum_config_path: configs/spectrum/snr_results_meta-llama-Meta-Llama-3.1-8B_unfrozenparameters_30percent.yaml
19 |
20 | # Training arguments
21 | num_train_epochs: 1
22 | per_device_train_batch_size: 8
23 | gradient_accumulation_steps: 2
24 | gradient_checkpointing: true
25 | gradient_checkpointing_kwargs:
26 | use_reentrant: false
27 | learning_rate: 5.0e-5
28 | lr_scheduler_type: cosine
29 | warmup_ratio: 0.1
30 |
31 | # Logging arguments
32 | logging_strategy: steps
33 | logging_steps: 5
34 | report_to:
35 | - tensorboard
36 | save_strategy: "epoch"
37 | seed: 42
38 |
39 | # Hugging Face Hub
40 | push_to_hub: true
41 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir
42 | hub_strategy: every_save
--------------------------------------------------------------------------------
/training/run_ds_lora.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import cast
3 |
4 | import os
5 | import subprocess
6 | from typing import Optional
7 | import torch
8 |
9 | from transformers import HfArgumentParser, TrainingArguments, Trainer
10 | from utils.peft_utils import SaveDeepSpeedPeftModelCallback, create_and_prepare_model
11 | from datasets import load_from_disk
12 |
13 |
14 | # Define and parse arguments.
15 | @dataclass
16 | class ScriptArguments:
17 | """
18 | Additional arguments for training, which are not part of TrainingArguments.
19 | """
20 | model_id: str = field(
21 | metadata={
22 | "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
23 | },
24 | )
25 | dataset_path: Optional[str] = field(
26 | default="timdettmers/openassistant-guanaco",
27 | metadata={"help": "The preference dataset to use."},
28 | )
29 | lora_alpha: Optional[int] = field(default=16)
30 | lora_dropout: Optional[float] = field(default=0.1)
31 | lora_r: Optional[int] = field(default=64)
32 | use_flash_attn: Optional[bool] = field(
33 | default=False,
34 | metadata={"help": "Enables Flash attention for training."},
35 | )
36 | merge_adapters: bool = field(
37 |         metadata={"help": "Whether to merge the LoRA adapter weights into the base model after training."},
38 | default=False,
39 | )
40 |
41 |
42 | def training_function(script_args:ScriptArguments, training_args:TrainingArguments):
43 |
44 | # Load processed dataset from disk
45 | dataset = load_from_disk(script_args.dataset_path)
46 |
47 | # Load and create peft model
48 | model, peft_config, tokenizer = create_and_prepare_model(script_args.model_id,training_args, script_args)
49 | model.config.use_cache = False
50 |
51 |
52 | # Create trainer and add callbacks
53 | trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
54 | trainer.accelerator.print(f"{trainer.model}")
55 | trainer.model.print_trainable_parameters()
56 | trainer.add_callback(SaveDeepSpeedPeftModelCallback(trainer, save_steps=training_args.save_steps))
57 |
58 | # Start training
59 | trainer.train()
60 |
61 | # Save model on main process
62 | trainer.accelerator.wait_for_everyone()
63 | state_dict = trainer.accelerator.get_state_dict(trainer.deepspeed)
64 | unwrapped_model = trainer.accelerator.unwrap_model(trainer.deepspeed)
65 | if trainer.accelerator.is_main_process:
66 | unwrapped_model.save_pretrained(training_args.output_dir, state_dict=state_dict)
67 | trainer.accelerator.wait_for_everyone()
68 |
69 | # TODO: add merge adapters
70 | # Save everything else on main process
71 | if trainer.args.process_index == 0:
72 | if script_args.merge_adapters:
73 | # merge adapter weights with base model and save
74 | # save int 4 model
75 | trainer.model.save_pretrained(training_args.output_dir, safe_serialization=False)
76 | # clear memory
77 | del model
78 | del trainer
79 | torch.cuda.empty_cache()
80 |
81 | from peft import AutoPeftModelForCausalLM
82 |
83 | # load PEFT model in fp16
84 | model = AutoPeftModelForCausalLM.from_pretrained(
85 | training_args.output_dir,
86 | low_cpu_mem_usage=True,
87 | torch_dtype=torch.float16,
88 | )
89 | # Merge LoRA and base model and save
90 | model = model.merge_and_unload()
91 | model.save_pretrained(
92 | training_args.output_dir, safe_serialization=True, max_shard_size="8GB"
93 | )
94 | else:
95 | trainer.model.save_pretrained(
96 | training_args.output_dir, safe_serialization=True
97 | )
98 |
99 | # save tokenizer
100 | tokenizer.save_pretrained(training_args.output_dir)
101 |
102 |
103 | def main():
104 | parser = HfArgumentParser([ScriptArguments,TrainingArguments])
105 | script_args, training_args = parser.parse_args_into_dataclasses()
106 | script_args = cast(ScriptArguments, script_args)
107 | training_args = cast(TrainingArguments, training_args)
108 |
109 | training_function(script_args, training_args)
110 |
111 |
112 | if __name__ == "__main__":
113 | main()
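114 |
115 |
116 | # Example launch (illustrative values only, not from the original script; every flag maps to ScriptArguments/TrainingArguments above
117 | # and the DeepSpeed config ships in configs/):
118 | # torchrun --nproc_per_node 8 run_ds_lora.py \
119 | #   --model_id tiiuae/falcon-180B \
120 | #   --dataset_path dolly-processed \
121 | #   --output_dir falcon-180b-lora-fa \
122 | #   --per_device_train_batch_size 1 \
123 | #   --gradient_checkpointing True \
124 | #   --bf16 True \
125 | #   --use_flash_attn True \
126 | #   --deepspeed configs/ds_falcon_180b_z3.json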
--------------------------------------------------------------------------------
/training/scripts/bloke_gptq.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copied from TheBloke: https://github.com/TheBlokeAI/AIScripts/blob/main/quant_autogptq.py#L59
3 | # python /home/ubuntu/deep-learning-pytorch-huggingface/training/scripts/bloke_gptq.py philschmid/llama-2-7b-instruction-generator gptq_res/ wikitext --seqlen 1024
4 | #
5 |
6 | import time
7 | import os
8 | import logging
9 | import random
10 | from datasets import load_dataset
11 |
12 | class QuantAutoGPTQ:
13 | def __init__(self, model_name_or_path, output_dir, dataset,
14 | num_samples=128, trust_remote_code=False, cache_examples=True,
15 | use_fast=True, use_triton=False, bits=[4], group_size=[128], damp=[0.01],
16 | desc_act=[False], dtype='float16', seqlen=2048, batch_size=1, stop_file=None,
17 | make_folder=False, GPU=0, cuda_alloc_conf=None):
18 |
19 | # Limit visible GPU to the one specified
20 | # We don't currently support multi-GPU, as AutoGPTQ can't use more than one GPU for quant anyway.
21 | #os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU)
22 |
23 | # Allow specifying CUDA allocation config, eg PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
24 | # This can allow for quantising larger models without running out of VRAM
25 | #if cuda_alloc_conf is not None:
26 | # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = cuda_alloc_conf
27 |
28 | self.pretrained_model_dir = model_name_or_path
29 | self.output_dir_base = output_dir
30 | self.dataset = dataset
31 | self.num_samples = num_samples
32 | self.trust_remote_code = trust_remote_code
33 | self.cache_examples = cache_examples
34 | self.use_fast = use_fast
35 | self.use_triton = use_triton
36 |
37 | def check_list(item):
38 | return item if isinstance(item, list) else [item]
39 |
40 | self.bits = check_list(bits)
41 | self.group_size = check_list(group_size)
42 | self.desc_act = check_list(desc_act)
43 | self.damp = check_list(damp)
44 |
45 | self.dtype = dtype
46 | self.seqlen = seqlen
47 | self.batch_size = batch_size
48 | self.stop_file = stop_file
49 | self.make_folder = make_folder
50 |
51 | self.logger = logging.getLogger(__name__)
52 | self.logger.propagate = True
53 |
54 | from transformers import AutoTokenizer
55 | self.logger.info("Loading tokenizer")
56 | self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_dir,
57 | use_fast=self.use_fast,
58 | trust_remote_code=self.trust_remote_code)
59 |
60 | @staticmethod
61 | def append_dataset(tokenized, num_samples, seqlen):
62 | import numpy as np
63 | import torch
64 |
65 | random.seed(0)
66 | np.random.seed(0)
67 | torch.random.manual_seed(0)
68 |
69 | traindataset = []
70 | for _ in range(num_samples):
71 | i = random.randint(0, tokenized.input_ids.shape[1] - seqlen - 1)
72 | j = i + seqlen
73 | inp = tokenized.input_ids[:, i:j]
74 | attention_mask = torch.ones_like(inp)
75 | traindataset.append({'input_ids':inp,'attention_mask': attention_mask})
76 | return traindataset
77 |
78 | #TODO: make a generic method that can load a dataset from HF hub and be told what column(s) to use
79 | def get_math(self):
80 | data = load_dataset('andersonbcdefg/math', split='train')
81 |
82 | extract = data[0:2000]
83 | text = ''
84 | for input, output in zip(extract['message_1'], extract['message_2']):
85 | text += input + ': ' + output + '\n'
86 |
87 | self.logger.info("Tokenising Maths dataset")
88 | tokenized = self.tokenizer(text, return_tensors='pt')
89 |
90 | return self.append_dataset(tokenized, self.num_samples, self.seqlen)
91 | def get_medical(self):
92 | data = load_dataset('medalpaca/medical_meadow_wikidoc', split='train')
93 |
94 | extract = data[0:1000]
95 | text = ''
96 | for input, output in zip(extract['input'], extract['output']):
97 | text += input + ' ' + output + '\n'
98 |
99 | self.logger.info("Tokenising Medical dataset")
100 | tokenized = self.tokenizer(text, return_tensors='pt')
101 |
102 | return self.append_dataset(tokenized, self.num_samples, self.seqlen)
103 |
104 | def get_code(self):
105 | data = load_dataset('nickrosh/Evol-Instruct-Code-80k-v1', split='train')
106 |
107 | extract = data[0:1500]
108 | text = '\n'.join(extract['output'])
109 | self.logger.info("Tokenising Code dataset")
110 | tokenized = self.tokenizer(text, return_tensors='pt')
111 |
112 | return self.append_dataset(tokenized, self.num_samples, self.seqlen)
113 |
114 | def get_german(self):
115 | data = load_dataset('deepset/germanquad', split='train')
116 |
117 | def transform_context(sample):
118 | split_context = sample['context'].split('===')
119 | if len(split_context) >= 3:
120 | trans_context = split_context[2]
121 | else:
122 | trans_context = sample['context']
123 | return {'context': trans_context.strip()}
124 |
125 | subset_data = data.select(range(2000))
126 | transformed_subset = subset_data.map(transform_context)
127 | text = '\n'.join([item['context'] for item in transformed_subset])
128 |
129 | self.logger.info("Tokenising German dataset")
130 | tokenized = self.tokenizer(text, return_tensors='pt')
131 |
132 | return self.append_dataset(tokenized, self.num_samples, self.seqlen)
133 |
134 | def get_french(self):
135 | data = load_dataset('gustavecortal/diverse_french_news', split='train')
136 |
137 | extract = data[0:700]
138 | text = '\n'.join(extract['text'])
139 | self.logger.info("Tokenising French dataset")
140 | tokenized = self.tokenizer(text, return_tensors='pt')
141 |
142 | return self.append_dataset(tokenized, self.num_samples, self.seqlen)
143 |
144 | def get_wikitext2(self):
145 | wikidata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
146 | wikilist = [' \n' if s == '' else s for s in wikidata['text'] ]
147 |
148 | text = ''.join(wikilist)
149 | self.logger.info("Tokenising wikitext2")
150 | tokenized = self.tokenizer(text, return_tensors='pt')
151 |
152 | return self.append_dataset(tokenized, self.num_samples, self.seqlen)
153 |
154 | def get_c4(self):
155 | import numpy as np
156 | import torch
157 | traindata = load_dataset(
158 | 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', use_auth_token=False
159 | )
160 |
161 | trainloader = []
162 | for _ in range(self.num_samples):
163 | while True:
164 | i = random.randint(0, len(traindata) - 1)
165 | trainenc = self.tokenizer(traindata[i]['text'], return_tensors='pt')
166 | if trainenc.input_ids.shape[1] >= self.seqlen:
167 | break
168 | i = random.randint(0, trainenc.input_ids.shape[1] - self.seqlen - 1)
169 | j = i + self.seqlen
170 | inp = trainenc.input_ids[:, i:j]
171 | attention_mask = torch.ones_like(inp)
172 | trainloader.append({'input_ids':inp,'attention_mask': attention_mask})
173 |
174 | return trainloader
175 |
176 | def quantize(self, output_dir, traindataset, bits, group_size, desc_act, damp):
177 | # Hide the super annoying bitsandbytes loading message. We don't even use BnB but I don't know if I can stop it loading entirely.
178 | os.environ['BITSANDBYTES_NOWELCOME'] = '1'
179 |
180 | # We only import Torch and AutoGPTQ when needed, so that earlier set env vars will affect them.
181 | import torch
182 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
183 |
184 | quantize_config = BaseQuantizeConfig(
185 | bits=bits,
186 | group_size=group_size,
187 | desc_act=desc_act,
188 | damp_percent=damp
189 | )
190 |
191 | if self.dtype == 'float16':
192 | torch_dtype = torch.float16
193 | elif self.dtype == 'float32':
194 | torch_dtype = torch.float32
195 | elif self.dtype == 'bfloat16':
196 | torch_dtype = torch.bfloat16
197 | else:
198 | raise ValueError(f"Unsupported dtype: {self.dtype}")
199 |
200 | self.logger.info(f"Loading model from {self.pretrained_model_dir} with trust_remote_code={self.trust_remote_code} and dtype={torch_dtype}")
201 | model = AutoGPTQForCausalLM.from_pretrained(self.pretrained_model_dir, quantize_config=quantize_config,
202 | low_cpu_mem_usage=True, torch_dtype=torch_dtype, trust_remote_code=self.trust_remote_code)
203 |
204 | self.logger.info(f"Starting quantization to {output_dir} with use_triton={self.use_triton}")
205 | start_time = time.time()
206 | model.quantize(traindataset, use_triton=self.use_triton, batch_size=self.batch_size, cache_examples_on_gpu=self.cache_examples)
207 |
208 | self.logger.info(f"Time to quantize model at {output_dir} with use_triton={self.use_triton}: {time.time() - start_time:.2f}")
209 |
210 | self.logger.info(f"Saving quantized model to {output_dir}")
211 | model.save_quantized(output_dir, use_safetensors=True)
212 | self.logger.info("Done.")
213 |
214 | def run_quantization(self):
215 | #TODO: This is messy, should be dynamic
216 | if self.dataset == 'wikitext':
217 | traindataset = self.get_wikitext2()
218 | elif self.dataset == 'code' or self.dataset == 'evol-instruct-code':
219 | traindataset = self.get_code()
220 | elif self.dataset == 'math' or self.dataset == 'maths' or self.dataset == 'camel-ai/math':
221 | traindataset = self.get_math()
222 | elif self.dataset == 'medical' or self.dataset == 'medical_meadow_wikidoc':
223 | traindataset = self.get_medical()
224 | elif self.dataset == 'german' or self.dataset == 'germanquad':
225 | traindataset = self.get_german()
226 | elif self.dataset == 'french' or self.dataset == 'diverse_french_news':
227 | traindataset = self.get_french()
228 | elif self.dataset == 'c4':
229 | traindataset = self.get_c4()
230 | else:
231 | self.logger.error(f"Unsupported dataset: {self.dataset}")
232 | raise ValueError(f"Unsupported dataset: {self.dataset}")
233 |
234 | abort = False
235 | iterations=[]
236 | for bits in self.bits:
237 | for group_size in self.group_size:
238 | for desc_act in self.desc_act:
239 | for damp in self.damp:
240 |                         desc_act = desc_act == 1  # normalize the 0/1 int flag to a bool
241 | iterations.append({"bits": bits, "group_size": group_size, "desc_act": desc_act, "damp": damp})
242 |
243 | num_iters = len(iterations)
244 | if num_iters > 1:
245 |             self.logger.info(f"Starting {num_iters} quantizations.")
246 | count=1
247 | for iteration in iterations:
248 | if abort:
249 | break
250 | if self.stop_file is not None and os.path.exists(self.stop_file):
251 | self.logger.info(f"Stopping as {self.stop_file} exists")
252 | abort = True
253 | break
254 |
255 | bits = iteration['bits']
256 | group_size = iteration['group_size']
257 | desc_act = iteration['desc_act']
258 | damp = iteration['damp']
259 |
260 | try:
261 | if self.make_folder:
262 | output_dir = os.path.join(self.output_dir_base, f"{bits}bits-{group_size}g-desc_act_{desc_act}-damp_{damp}")
263 | else:
264 | output_dir = self.output_dir_base
265 | os.makedirs(output_dir, exist_ok=True)
266 | try:
267 | if num_iters > 1:
268 | self.logger.info(f"Starting quantization {count}/{num_iters}")
269 | self.logger.info(f"Quantising with bits={bits} group_size={group_size} desc_act={desc_act} damp={damp} to {output_dir}")
270 | self.quantize(output_dir, traindataset, bits, group_size, desc_act, damp)
271 | except KeyboardInterrupt:
272 |                     self.logger.error(f"Aborted. Will delete {output_dir}")
273 | os.rmdir(output_dir)
274 | abort = True
275 | except:
276 | raise
277 |
278 | finally:
279 | count += 1
280 |
281 | if __name__ == "__main__":
282 | import argparse
283 | logger = logging.getLogger()
284 | logging.basicConfig(format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
285 | level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
286 |
287 | parser = argparse.ArgumentParser(description='AutoGPTQ quantize')
288 | parser.add_argument('pretrained_model_dir', type=str, help='Repo name')
289 | parser.add_argument('output_dir_base', type=str, help='Output base folder')
290 | parser.add_argument('dataset', type=str, help='Quantisation dataset')
291 | parser.add_argument('--num_samples', type=int, default=128, help='Number of dataset samples')
292 | parser.add_argument('--trust_remote_code', action="store_true", help='Trust remote code')
293 | parser.add_argument('--cache_examples', type=int, default=1, help='Cache examples on GPU')
294 | parser.add_argument('--use_fast', action="store_true", help='Use fast tokenizer')
295 | parser.add_argument('--use_triton', action="store_true", help='Use Triton for quantization')
296 | parser.add_argument('--bits', type=int, nargs='+', default=[4], help='Quantize bit(s)')
297 | parser.add_argument('--group_size', type=int, nargs='+', default=[128], help='Quantize group size(s)')
298 | parser.add_argument('--damp', type=float, nargs='+', default=[0.01], help='Quantize damp_percent(s)')
299 | parser.add_argument('--desc_act', type=int, nargs='+', default=[0], help='Quantize desc_act(s) - 1 = True, 0 = False')
300 | parser.add_argument('--dtype', type=str, choices=['float16', 'float32', 'bfloat16'], default='float16', help='Unquantised model dtype')
301 | parser.add_argument('--seqlen', type=int, default=2048, help='Model sequence length')
302 | parser.add_argument('--batch_size', type=int, default=1, help='Quantize batch size for processing dataset samples')
303 | parser.add_argument('--stop_file', type=str, help='Filename to look for to stop inference, specific to this instance')
304 | parser.add_argument('--make_folders', action="store_true", help='Make folders for each quantization using params in folder name')
305 |
306 | args = parser.parse_args()
307 | quantizer = QuantAutoGPTQ(args.pretrained_model_dir,
308 | args.output_dir_base,
309 | args.dataset,
310 | num_samples=args.num_samples,
311 | trust_remote_code=args.trust_remote_code,
312 | cache_examples=args.cache_examples,
313 | use_fast=args.use_fast,
314 | use_triton=args.use_triton,
315 | bits=args.bits,
316 | group_size=args.group_size,
317 | desc_act=args.desc_act,
318 | damp=args.damp,
319 | dtype=args.dtype,
320 | seqlen=args.seqlen,
321 | batch_size=args.batch_size,
322 | stop_file=args.stop_file,
323 | make_folder=args.make_folders)
324 | quantizer.run_quantization()
--------------------------------------------------------------------------------
/training/scripts/dpo/create_preference_dataset.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | import logging
3 | import os
4 | import time
5 | from typing import cast
6 | import re
7 |
8 | import torch
9 | from datasets import load_dataset
10 | from tqdm.auto import tqdm
11 | from trl import TrlParser
12 | from vllm import LLM, SamplingParams
13 | from datasets import Dataset
14 | from peft import LoraConfig, AutoPeftModelForCausalLM
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 | @dataclass
19 | class CandidateArguments:
20 | generation_model_name_or_path: str = field(
21 | default=None,
22 | metadata={
23 | 'help': 'Huggingface model name or path to model directory, for the model that will be used for generation, defaults to SFT model or previous iteration model.'
24 | },
25 | )
26 | dataset_id: str = field(
27 | default=None,
28 | metadata={
29 | 'help': 'Path to the input dataset, that will be used to generate candidates, defaults to previous iteration output dataset.'
30 | },
31 | )
32 | sample_size: int = field(
33 | default=None,
34 | metadata={
35 | 'help': 'Number of samples to generate, defaults to as many as possible.'
36 | },
37 | )
38 | prompt_column: str = field(
39 | default='question',
40 | metadata={'help': 'Column name in the input dataset that contains the messages.'},
41 | )
42 | answer_column: str = field(
43 | default='answer',
44 | metadata={'help': 'Column name in the input dataset that contains the answer.'},
45 | )
46 | system_prompt: str = field(
47 | default= """Solve the given high school math problem by providing a clear explanation of each step leading to the final solution.
48 |
49 | Provide a detailed breakdown of your calculations, beginning with an explanation of the problem and describing how you derive each formula, value, or conclusion. Use logical steps that build upon one another, to arrive at the final answer in a systematic manner.
50 |
51 | # Steps
52 |
53 | 1. **Understand the Problem**: Restate the given math problem and clearly identify the main question and any important given values.
54 | 2. **Set Up**: Identify the key formulas or concepts that could help solve the problem (e.g., algebraic manipulation, geometry formulas, trigonometric identities).
55 | 3. **Solve Step-by-Step**: Iteratively progress through each step of the math problem, justifying why each consecutive operation brings you closer to the solution.
56 | 4. **Double Check**: If applicable, double check the work for accuracy and sense, and mention potential alternative approaches if any.
57 | 5. **Final Answer**: Provide the numerical or algebraic solution clearly, accompanied by appropriate units if relevant.
58 |
59 | # Notes
60 |
61 | - Always clearly define any variable or term used.
62 | - Wherever applicable, include unit conversions or context to explain why each formula or step has been chosen.
63 | - Assume the level of mathematics is suitable for high school, and avoid overly advanced math techniques unless they are common at that level.
64 | """,
65 | metadata={'help': 'System prompt to use for generation.'},
66 | )
67 | num_solutions: int = field(
68 | default=5,
69 | metadata={'help': 'Number of solutions to generate for each input.'},
70 | )
71 | batch_size: int = field(
72 | default=1,
73 | metadata={'help': 'Batch size for generation.'},
74 | )
75 | max_new_tokens: int = field(
76 | default=2048,
77 | metadata={'help': 'Maximum number of new tokens to generate.'},
78 | )
79 | temperature: float = field(
80 | default=0.7,
81 | metadata={'help': 'Temperature for generation.'},
82 | )
83 | top_p: float = field(
84 | default=1.0,
85 | metadata={'help': 'Top-p for generation.'},
86 | )
87 |
88 | def score_solutions(
89 | candidate_result: str,
90 | ground_truth_result: str,
91 | ) -> bool:
92 | # finds the answer in the candidate result
93 | regex_pattern = r'\b\d+\b'
94 | match = re.findall(regex_pattern, candidate_result)
95 |
96 | if match:
97 | return match[-1] == ground_truth_result
98 | else:
99 | return False
100 |
101 |
102 | def vllm_create_candidates(
103 | dataset: Dataset,
104 | model_name_or_path: str,
105 | num_solutions: int,
106 | max_new_tokens: int,
107 | batch_size: int = 1,
108 | prompt_column: str = 'prompt',
109 | system_prompt: str = None,
110 | answer_column: str = 'answer',
111 | sample_size: int = None,
112 | **kwargs,
113 | ) -> Dataset:
114 |
115 | # Loads the model on all available GPUs with vLLM
116 | llm = LLM(
117 | model=model_name_or_path,
118 | tokenizer=model_name_or_path,
119 | tensor_parallel_size=torch.cuda.device_count(),
120 | max_model_len=4096,
121 | )
122 | # formats the prompt using the system prompt and the prompt column
123 | tokenizer = llm.get_tokenizer()
124 | def format_prompt(s):
125 | messages = [
126 | {"role": "system", "content": system_prompt},
127 | {"role": "user", "content": s[prompt_column]}
128 | ]
129 | return {"prompt": tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True), "messages": messages}
130 |
131 | dataset = dataset.map(format_prompt)
132 | # print the first prompt
133 | print('First prompt:', dataset['prompt'][0])
134 |
135 | # set sampling params
136 | sampling_params = SamplingParams(
137 | max_tokens=max_new_tokens,
138 | n=num_solutions,
139 | temperature=kwargs.get('temperature', 1.0),
140 | top_p=kwargs.get('top_p', 1),
141 | )
142 |
143 | # Iterate over the dataset with batch size to generate candidates and create preference pairs based on the correct answer and ground truth
144 | preference_dataset = []
145 | for i in tqdm(range(0, len(dataset), batch_size), desc=f'Generating solutions: Already generated {len(preference_dataset)} preference pairs'):
146 | batch = dataset[i : i + batch_size]
147 | # Generate `num_solutions` candidates per batch
148 | result = llm.generate(batch['prompt'], sampling_params, use_tqdm=False)
149 | for j in range(0, len(batch['prompt'])):
150 | # iterate each candidate and check if it is correct
151 | preference_pair = {
152 | "system_prompt": system_prompt,
153 | "prompt": batch[prompt_column][j],
154 | "ground_truth": batch[answer_column][j],
155 | }
156 | for cand in result[j].outputs:
157 | # check if the candidate is correct
158 | cand_score = score_solutions(candidate_result=cand.text, ground_truth_result=batch[answer_column][j])
159 | if cand_score and preference_pair.get('chosen',None) is None:
160 | preference_pair['chosen'] = cand.text
161 | elif not cand_score and preference_pair.get('rejected',None) is None:
162 | preference_pair['rejected'] = cand.text
163 |                 # stop once we have both a chosen and a rejected candidate (also prevents overwriting)
164 |                 if preference_pair.get('chosen',None) and preference_pair.get('rejected',None):
165 |                     break
166 |
167 |             # check if the generated candidates led to a complete preference pair
168 | if preference_pair.get('chosen',None) and preference_pair.get('rejected',None):
169 | print(f'Found preference pair, adding to dataset.')
170 | preference_dataset.append(preference_pair)
171 |
172 | print(f'Generated {len(preference_dataset)} preference pairs')
173 | if len(preference_dataset) >= sample_size:
174 | break
175 | return Dataset.from_list(preference_dataset)
176 |
177 |
178 | def main():
179 | parser = TrlParser((CandidateArguments))
180 | script_args = parser.parse_args_and_config()[0]
181 | script_args = cast(CandidateArguments, script_args)
182 |
183 | # load dataset and tokenizer
184 | dataset = load_dataset(script_args.dataset_id, split='train')
185 | print(f'Generating {script_args.num_solutions} solutions for {len(dataset)} prompts...')
186 |
187 | start_time = time.time()
188 | candidates_ds = vllm_create_candidates(
189 | dataset,
190 | model_name_or_path=script_args.generation_model_name_or_path,
191 | num_solutions=script_args.num_solutions,
192 | max_new_tokens=script_args.max_new_tokens,
193 | batch_size=script_args.batch_size,
194 | prompt_column=script_args.prompt_column,
195 | answer_column=script_args.answer_column,
196 | system_prompt=script_args.system_prompt,
197 | temperature=script_args.temperature,
198 | top_p=script_args.top_p,
199 | sample_size=script_args.sample_size if script_args.sample_size is not None else len(dataset),
200 | )
201 | print(f'Generated {len(dataset) * script_args.num_solutions} solutions in {time.time() - start_time:.2f} seconds.')
202 |
203 | save_dataset_id = f"{script_args.generation_model_name_or_path.replace('/', '-')[:40]}-{script_args.dataset_id.replace('/', '-')[:40]}-candidates"
204 | candidates_ds.push_to_hub(save_dataset_id)
205 |
206 | if __name__ == '__main__':
207 | main()
208 |
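209 | # Example usage (illustrative; the model and dataset ids are assumptions, any chat model and QA-style dataset with question/answer columns works):
210 | # python scripts/dpo/create_preference_dataset.py \
211 | #   --generation_model_name_or_path philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1 \
212 | #   --dataset_id philschmid/DMath \
213 | #   --num_solutions 5 \
214 | #   --sample_size 10000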
--------------------------------------------------------------------------------
/training/scripts/dpo/run_dpo.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import torch
4 | from transformers import (
5 | AutoModelForCausalLM,
6 | set_seed,
7 | )
8 | from dataclasses import dataclass
9 | from datetime import datetime
10 | from distutils.util import strtobool
11 | import logging
12 | import os
13 | from typing import Optional
14 |
15 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
16 | import torch
17 | from transformers import (
18 | AutoModelForCausalLM,
19 | AutoTokenizer,
20 | set_seed,
21 | BitsAndBytesConfig,
22 | )
23 | from transformers.trainer_utils import get_last_checkpoint
24 | from transformers.utils import is_liger_kernel_available
25 | from trl import TrlParser, ModelConfig, get_peft_config
26 | from datasets import load_dataset
27 | from trl import (
28 | DPOTrainer,
29 | DPOConfig,
30 | TrlParser,
31 | get_peft_config,
32 | ModelConfig,
33 | )
34 |
35 | from datasets import load_dataset
36 |
37 |
38 | ########################
39 | # Custom dataclasses
40 | ########################
41 | @dataclass
42 | class ScriptArguments:
43 | dataset_id_or_path: str
44 | dataset_splits: str = "train"
45 | tokenizer_name_or_path: str = None
46 |
47 |
48 | ########################
49 | # Setup logging
50 | ########################
51 | logging.basicConfig(level=logging.INFO)
52 | logger = logging.getLogger(__name__)
53 | logger.setLevel(logging.INFO)
54 | handler = logging.StreamHandler()
55 | handler.setFormatter(
56 | logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
57 | )
58 | logger.addHandler(handler)
59 |
60 | ########################
61 | # Helper functions
62 | ########################
63 |
64 |
65 | def get_checkpoint(training_args: DPOConfig):
66 | last_checkpoint = None
67 | if os.path.isdir(training_args.output_dir):
68 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
69 | return last_checkpoint
70 |
71 |
72 | def dpo_function(
73 | model_args: ModelConfig, script_args: ScriptArguments, training_args: DPOConfig
74 | ):
75 | #########################
76 | # Log parameters
77 | #########################
78 | logger.info(f"Model parameters {model_args}")
79 | logger.info(f"Training/evaluation parameters {training_args}")
80 |
81 | ###############
82 | # Load datasets
83 | ###############
84 | if script_args.dataset_id_or_path.endswith(".json"):
85 | train_dataset = load_dataset(
86 | "json", data_files=script_args.dataset_id_or_path, split="train"
87 | )
88 | else:
89 | train_dataset = load_dataset(
90 | script_args.dataset_id_or_path, split=script_args.dataset_splits
91 | )
92 |
93 | logger.info(
94 | f"Loaded dataset with {len(train_dataset)} samples and the following features: {train_dataset.features}"
95 | )
96 |
97 | ################
98 | # Load tokenizer
99 | ################
100 | tokenizer = AutoTokenizer.from_pretrained(
101 | (
102 | script_args.tokenizer_name_or_path
103 | if script_args.tokenizer_name_or_path
104 | else model_args.model_name_or_path
105 | ),
106 | revision=model_args.model_revision,
107 | trust_remote_code=model_args.trust_remote_code,
108 | )
109 | if tokenizer.pad_token is None:
110 | tokenizer.pad_token = tokenizer.eos_token
111 |
112 | #####################
113 | # Prepare and format dataset
114 | #####################
115 | def format_dpo_sample(sample):
116 | prompt = tokenizer.apply_chat_template(
117 | [
118 | {"role": "system", "content": sample["system_prompt"]},
119 | {"role": "user", "content": sample["prompt"]},
120 | ],
121 | tokenize=False,
122 | )
123 | chosen = tokenizer.apply_chat_template(
124 | [{"role": "user", "content": sample["chosen"]}], tokenize=False
125 | )
126 | rejected = tokenizer.apply_chat_template(
127 | [{"role": "user", "content": sample["rejected"]}], tokenize=False
128 | )
129 | return {"prompt": prompt, "chosen": chosen, "rejected": rejected}
130 |
131 | # For DPO/ORPO, the inputs are triples of (prompt, chosen, rejected), where `chosen` and `rejected` are the final turn of a dialogue
132 | train_dataset = train_dataset.map(
133 | format_dpo_sample, remove_columns=train_dataset.column_names
134 | )
135 |
136 | # remove all columns except chosen, rejected
137 | print(f"Columns: {train_dataset.features.keys()}")
138 | train_dataset = train_dataset.select_columns(["prompt", "chosen", "rejected"])
139 |
140 | #######################################
141 | # Load the model and/or reference model
142 | #######################################
143 |
144 | model_kwargs = dict(
145 | revision=model_args.model_revision, # What revision from Huggingface to use, defaults to main
146 |         trust_remote_code=model_args.trust_remote_code, # Whether to trust the remote code; this also allows you to fine-tune custom architectures
147 | attn_implementation=model_args.attn_implementation, # What attention implementation to use, defaults to flash_attention_2
148 | torch_dtype=(
149 | model_args.torch_dtype
150 | if model_args.torch_dtype in ["auto", None]
151 | else getattr(torch, model_args.torch_dtype)
152 | ), # What torch dtype to use, defaults to auto
153 |         use_cache=False if training_args.gradient_checkpointing else True, # Whether to use the KV cache; must be disabled when gradient checkpointing is enabled
154 | low_cpu_mem_usage=(
155 | True
156 | if not strtobool(os.environ.get("ACCELERATE_USE_DEEPSPEED", "false"))
157 | else None
158 | ), # Reduces memory usage on CPU for loading the model
159 | )
160 |
161 | # Check which training method to use and if 4-bit quantization is needed
162 | if model_args.load_in_4bit:
163 | model_kwargs["quantization_config"] = BitsAndBytesConfig(
164 | load_in_4bit=True,
165 | bnb_4bit_use_double_quant=True,
166 | bnb_4bit_quant_type="nf4",
167 | bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
168 | bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
169 | )
170 | if model_args.use_peft:
171 | peft_config = get_peft_config(model_args)
172 | else:
173 | peft_config = None
174 |
175 | # Policy Model
176 | model = AutoModelForCausalLM.from_pretrained(
177 | model_args.model_name_or_path, **model_kwargs
178 | )
179 |     # Check whether we use adapters for the reference model or load a full copy
180 | if peft_config is None:
181 | model_ref = AutoModelForCausalLM.from_pretrained(
182 | model_args.model_name_or_path, **model_kwargs
183 | )
184 | else:
185 | model_ref = None
186 |
187 | #########################
188 | # Instantiate DPO trainer
189 | #########################
190 | trainer = DPOTrainer(
191 | model,
192 | ref_model=model_ref,
193 | args=training_args,
194 | train_dataset=train_dataset,
195 | processing_class=tokenizer,
196 | peft_config=peft_config,
197 | )
198 |
199 | ###############
200 | # Training loop
201 | ###############
202 | # Check for last checkpoint
203 | last_checkpoint = get_checkpoint(training_args)
204 | if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
205 | logger.info(f"Checkpoint detected, resuming training at {last_checkpoint}.")
206 |
207 | # Train the model
208 | logger.info(
209 | f'*** Starting training {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} for {training_args.num_train_epochs} epochs***'
210 | )
211 | train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
212 | # Log and save metrics
213 | metrics = train_result.metrics
214 | metrics["train_samples"] = len(train_dataset)
215 | trainer.log_metrics("train", metrics)
216 | trainer.save_metrics("train", metrics)
217 | trainer.save_state()
218 |
219 | logger.info("*** Training complete ***")
220 |
221 | ##################################
222 | # Save model and create model card
223 | ##################################
224 |
225 | logger.info("*** Save model ***")
226 | if trainer.is_fsdp_enabled and peft_config:
227 | trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
228 | # Restore k,v cache for fast inference
229 | trainer.model.config.use_cache = True
230 | trainer.save_model(training_args.output_dir)
231 | logger.info(f"Model saved to {training_args.output_dir}")
232 | training_args.distributed_state.wait_for_everyone() # wait for all processes to load
233 |
234 | tokenizer.save_pretrained(training_args.output_dir)
235 | logger.info(f"Tokenizer saved to {training_args.output_dir}")
236 |
237 | # Save everything else on main process
238 | if trainer.accelerator.is_main_process:
239 |         trainer.create_model_card({"tags": ["dpo", "tutorial", "philschmid"]})
240 | # push to hub if needed
241 | if training_args.push_to_hub is True:
242 | logger.info("Pushing to hub...")
243 | trainer.push_to_hub()
244 |
245 | logger.info("*** Training complete! ***")
246 |
247 |
248 | def main():
249 | parser = TrlParser((ModelConfig, ScriptArguments, DPOConfig))
250 | model_args, script_args, training_args = parser.parse_args_and_config()
251 |
252 | # Set seed for reproducibility
253 | set_seed(training_args.seed)
254 |
255 | # Run the main training loop
256 | dpo_function(model_args, script_args, training_args)
257 |
258 |
259 | if __name__ == "__main__":
260 | main()
261 |
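262 | # Example usage (illustrative; a single-process QLoRA run, the multi-node launch is shown in scripts/example.slurm):
263 | # python scripts/dpo/run_dpo.py --config receipes/dpo-llama-3-1-8b-qlora.yaml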
--------------------------------------------------------------------------------
/training/scripts/example.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --ntasks-per-node=1
3 | #SBATCH --gres=gpu:8
4 | #SBATCH --qos=high
5 | #SBATCH --partition=hopper-prod # Adjust this for your cluster
6 | #SBATCH --output=/fsx/philipp/logs/%x-%j.out # Adjust this for your cluster
7 | #SBATCH --err=/fsx/philipp/logs/%x-%j.err # Adjust this for your cluster
8 |
9 | set -x -e
10 |
11 | source ~/.bashrc
12 | micromamba activate dpo
13 | echo "START TIME: $(date)"
14 |
15 | CONFIG_FILE=$1
16 |
17 | # Training setup
18 | NUM_NODES=$SLURM_NNODES
19 | GPUS_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
20 | WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
21 |
22 | # so processes know who to talk to
23 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
24 | MASTER_PORT=6000
25 |
26 | export CMD=" \
27 | scripts/dpo/run_dpo.py --config $CONFIG_FILE
28 | "
29 |
30 | export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
31 | --config_file configs/accelerate_configs/deepspeed_zero3.yaml \
32 | --num_machines $NUM_NODES \
33 | --num_processes $WORLD_SIZE \
34 | --main_process_ip $MASTER_ADDR \
35 | --main_process_port $MASTER_PORT \
36 | --machine_rank \$SLURM_PROCID \
37 | --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
38 | --max_restarts 1 \
39 | --role \$(hostname -s): \
40 | --tee 3 \
41 | "
42 |
43 | # force crashing on nccl issues like hanging broadcast
44 | export NCCL_ASYNC_ERROR_HANDLING=1
45 | # export NCCL_DEBUG=INFO
46 | # export NCCL_DEBUG_SUBSYS=COLL
47 | # export NCCL_SOCKET_NTHREADS=1
48 | # export NCCL_NSOCKS_PERTHREAD=1
49 | # export CUDA_LAUNCH_BLOCKING=1
50 |
51 | # Specific configuration optimized for the Hugging Face Compute Cluster
52 | # Be ye warned this may not work on other clusters!
53 | module load cuda/12.1
54 |
55 | # srun error handling:
56 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
57 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
58 | SRUN_ARGS=" \
59 | --wait=60 \
60 | --kill-on-bad-exit=1 \
61 | "
62 |
63 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1
64 |
65 | echo "END TIME: $(date)"
--------------------------------------------------------------------------------
/training/scripts/merge_adapter_weights.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | import tempfile
3 | from typing import Optional
4 | import torch
5 | from peft import AutoPeftModelForCausalLM
6 | from transformers import AutoTokenizer, HfArgumentParser
7 | from huggingface_hub import HfApi
8 |
9 | # Example usage:
10 | # python scripts/merge_adapter_weights.py --peft_model_id falcon-180b-lora-fa --output_dir merged-weights --save_tokenizer True
11 |
12 | def save_model(model_path_or_id, save_dir, save_tokenizer=True):
13 | model = AutoPeftModelForCausalLM.from_pretrained(
14 | model_path_or_id,
15 | low_cpu_mem_usage=True,
16 | torch_dtype=torch.float16,
17 | )
18 | # Merge LoRA and base model and save
19 | model = model.merge_and_unload()
20 | model.save_pretrained(save_dir, safe_serialization=True, max_shard_size="3GB")
21 |
22 | # save tokenizer
23 | if save_tokenizer:
24 | tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)
25 | tokenizer.save_pretrained(save_dir)
26 |
27 |
28 | @dataclass
29 | class ScriptArguments:
30 | peft_model_id: str = field(metadata={"help": "model id or path to model"})
31 | output_dir: Optional[str] = field(default="merged-weights", metadata={"help": "where the merged model should be saved"})
32 | save_tokenizer: Optional[bool] = field(default=True, metadata={"help": "whether to save the tokenizer"})
33 | push_to_hub: Optional[bool] = field(default=False, metadata={"help": "whether to push the model to the hub"})
34 | repository_id: Optional[str] = field(default=None, metadata={"help": "the model name"})
35 |
36 | parser = HfArgumentParser(ScriptArguments)
37 | args = parser.parse_args_into_dataclasses()[0]
38 | api = HfApi()
39 |
40 | if args.push_to_hub:
41 | repo_id = args.repository_id if args.repository_id else args.peft_model_id.split('/')[-1]
42 | with tempfile.TemporaryDirectory() as temp_dir:
43 | save_model(args.peft_model_id, temp_dir, args.save_tokenizer)
44 | api.upload_large_folder(
45 | folder_path=temp_dir,
46 | repo_id=repo_id,
47 | repo_type="model",
48 | )
49 | else:
50 | save_model(args.peft_model_id, args.output_dir, args.save_tokenizer)
--------------------------------------------------------------------------------
/training/scripts/run_fsdp_qlora.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from dataclasses import dataclass, field
3 | import os
4 | import random
5 | import torch
6 | from datasets import load_dataset
7 | from transformers import AutoTokenizer, TrainingArguments
8 | from trl.commands.cli_utils import TrlParser
9 | from transformers import (
10 | AutoModelForCausalLM,
11 | AutoTokenizer,
12 | BitsAndBytesConfig,
13 | set_seed,
14 |
15 | )
16 | from trl import setup_chat_format
17 | from peft import LoraConfig
18 |
19 |
20 | from trl import (
21 | SFTTrainer)
22 |
23 | # Uncomment if you want to use the Llama 3 instruct template, but make sure to add modules_to_save
24 | # LLAMA_3_CHAT_TEMPLATE="{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
25 |
26 | # Anthropic/Vicuna like template without the need for special tokens
27 | LLAMA_3_CHAT_TEMPLATE = (
28 | "{% for message in messages %}"
29 | "{% if message['role'] == 'system' %}"
30 | "{{ message['content'] }}"
31 | "{% elif message['role'] == 'user' %}"
32 | "{{ '\n\nHuman: ' + message['content'] + eos_token }}"
33 | "{% elif message['role'] == 'assistant' %}"
34 | "{{ '\n\nAssistant: ' + message['content'] + eos_token }}"
35 | "{% endif %}"
36 | "{% endfor %}"
37 | "{% if add_generation_prompt %}"
38 | "{{ '\n\nAssistant: ' }}"
39 | "{% endif %}"
40 | )
41 |
42 |
43 | # ACCELERATE_USE_FSDP=1 FSDP_CPU_RAM_EFFICIENT_LOADING=1 torchrun --nproc_per_node=4 ./scripts/run_fsdp_qlora.py --config llama_3_70b_fsdp_qlora.yaml
44 |
45 | @dataclass
46 | class ScriptArguments:
47 | dataset_path: str = field(
48 | default=None,
49 | metadata={
50 | "help": "Path to the dataset"
51 | },
52 | )
53 | model_id: str = field(
54 | default=None, metadata={"help": "Model ID to use for SFT training"}
55 | )
56 | max_seq_length: int = field(
57 | default=512, metadata={"help": "The maximum sequence length for SFT Trainer"}
58 | )
59 |
60 |
61 | def training_function(script_args, training_args):
62 | ################
63 | # Dataset
64 | ################
65 |
66 | train_dataset = load_dataset(
67 | "json",
68 | data_files=os.path.join(script_args.dataset_path, "train_dataset.json"),
69 | split="train",
70 | )
71 | test_dataset = load_dataset(
72 | "json",
73 | data_files=os.path.join(script_args.dataset_path, "test_dataset.json"),
74 | split="train",
75 | )
76 |
77 | ################
78 | # Model & Tokenizer
79 | ################
80 |
81 | # Tokenizer
82 | tokenizer = AutoTokenizer.from_pretrained(script_args.model_id, use_fast=True)
83 | tokenizer.pad_token = tokenizer.eos_token
84 | tokenizer.chat_template = LLAMA_3_CHAT_TEMPLATE
85 |
86 | # template dataset
87 | def template_dataset(examples):
88 | return{"text": tokenizer.apply_chat_template(examples["messages"], tokenize=False)}
89 |
90 | train_dataset = train_dataset.map(template_dataset, remove_columns=["messages"])
91 | test_dataset = test_dataset.map(template_dataset, remove_columns=["messages"])
92 |
93 | # print random sample
94 | with training_args.main_process_first(
95 | desc="Log a few random samples from the processed training set"
96 | ):
97 | for index in random.sample(range(len(train_dataset)), 2):
98 | print(train_dataset[index]["text"])
99 |
100 | # Model
101 | torch_dtype = torch.bfloat16
102 | quant_storage_dtype = torch.bfloat16
103 |
104 | quantization_config = BitsAndBytesConfig(
105 | load_in_4bit=True,
106 | bnb_4bit_use_double_quant=True,
107 | bnb_4bit_quant_type="nf4",
108 | bnb_4bit_compute_dtype=torch_dtype,
109 | bnb_4bit_quant_storage=quant_storage_dtype,
110 | )
111 |
112 | model = AutoModelForCausalLM.from_pretrained(
113 | script_args.model_id,
114 | quantization_config=quantization_config,
115 | attn_implementation="sdpa", # use sdpa, alternatively use "flash_attention_2"
116 | torch_dtype=quant_storage_dtype,
117 | use_cache=False if training_args.gradient_checkpointing else True, # this is needed for gradient checkpointing
118 | )
119 |
120 | if training_args.gradient_checkpointing:
121 | model.gradient_checkpointing_enable()
122 |
123 | ################
124 | # PEFT
125 | ################
126 |
127 | # LoRA config based on QLoRA paper & Sebastian Raschka experiment
128 | peft_config = LoraConfig(
129 | lora_alpha=8,
130 | lora_dropout=0.05,
131 | r=16,
132 | bias="none",
133 | target_modules="all-linear",
134 | task_type="CAUSAL_LM",
135 | # modules_to_save = ["lm_head", "embed_tokens"] # add if you want to use the Llama 3 instruct template
136 | )
137 |
138 | ################
139 | # Training
140 | ################
141 | trainer = SFTTrainer(
142 | model=model,
143 | args=training_args,
144 | train_dataset=train_dataset,
145 | dataset_text_field="text",
146 | eval_dataset=test_dataset,
147 | peft_config=peft_config,
148 | max_seq_length=script_args.max_seq_length,
149 | tokenizer=tokenizer,
150 | packing=True,
151 | dataset_kwargs={
152 | "add_special_tokens": False, # We template with special tokens
153 | "append_concat_token": False, # No need to add additional separator token
154 | },
155 | )
156 | if trainer.accelerator.is_main_process:
157 | trainer.model.print_trainable_parameters()
158 |
159 | ##########################
160 | # Train model
161 | ##########################
162 | checkpoint = None
163 | if training_args.resume_from_checkpoint is not None:
164 | checkpoint = training_args.resume_from_checkpoint
165 | trainer.train(resume_from_checkpoint=checkpoint)
166 |
167 | ##########################
168 | # SAVE MODEL FOR SAGEMAKER
169 | ##########################
170 | if trainer.is_fsdp_enabled:
171 | trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
172 | trainer.save_model()
173 |
174 | if __name__ == "__main__":
175 | parser = TrlParser((ScriptArguments, TrainingArguments))
176 | script_args, training_args = parser.parse_args_and_config()
177 |
178 |     # use the reentrant implementation for gradient checkpointing
179 | if training_args.gradient_checkpointing:
180 | training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}
181 | # set seed
182 | set_seed(training_args.seed)
183 |
184 | # launch training
185 | training_function(script_args, training_args)
186 |
--------------------------------------------------------------------------------
/training/scripts/run_r1_grpo.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from dataclasses import dataclass
4 | from datetime import datetime
5 | import logging
6 | import os
7 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
8 | import random
9 | import re
10 | import torch
11 | from transformers.trainer_utils import get_last_checkpoint
12 | from transformers import AutoTokenizer
13 | from datasets import load_dataset
14 | from trl import GRPOConfig, GRPOTrainer, get_peft_config, ModelConfig, TrlParser
15 |
16 |
17 | ########################
18 | # Custom dataclasses
19 | ########################
20 | @dataclass
21 | class ScriptArguments:
22 | dataset_id_or_path: str = "Jiayi-Pan/Countdown-Tasks-3to4"
23 | dataset_splits: str = "train"
24 | tokenizer_name_or_path: str = None
25 |
26 |
27 | ########################
28 | # Setup logging
29 | ########################
30 | logging.basicConfig(level=logging.INFO)
31 | logger = logging.getLogger(__name__)
32 | logger.setLevel(logging.INFO)
33 | handler = logging.StreamHandler()
34 | handler.setFormatter(
35 | logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
36 | )
37 | logger.addHandler(handler)
38 |
39 | ########################
40 | # Helper functions
41 | ########################
42 |
43 | def format_reward_func(completions, target, **kwargs):
44 | """
45 |     Format: <think>...</think><answer>...</answer>
46 | Args:
47 | completions (list[str]): Generated outputs
48 | target (list[str]): Expected answers
49 |
50 | Returns:
51 | list[float]: Reward scores
52 | """
53 | rewards = []
54 |
55 | for completion, gt in zip(completions, target):
56 |
57 | try:
58 |             # add synthetic <think> as it's already part of the prompt and prefilled for the assistant to more easily match the regex
59 |             completion = "<think>" + completion
60 |             if random.random() < 0.1: # 10% chance to write samples into a file
61 | os.makedirs("completion_samples", exist_ok=True)
62 | log_file = os.path.join("completion_samples", "completion_samples.txt")
63 | with open(log_file, "a") as f:
64 | f.write(f"\n\n==============\n")
65 | f.write(completion)
66 |
67 | # Check if the format is correct
68 |             regex = r"^<think>([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>\n<answer>([\s\S]*?)<\/answer>$"
69 |
70 | match = re.search(regex, completion, re.DOTALL)
71 | # if the format is not correct, reward is 0
72 | if match is None or len(match.groups()) != 2:
73 | rewards.append(0.0)
74 | else:
75 | rewards.append(1.0)
76 | except Exception:
77 | rewards.append(0.0)
78 | return rewards
79 |
80 | def equation_reward_func(completions, target, nums, **kwargs):
81 | """
82 | Evaluates completions based on:
83 |     - Mathematical correctness of the answer
84 |
85 | Args:
86 | completions (list[str]): Generated outputs
87 | target (list[str]): Expected answers
88 | nums (list[str]): Available numbers
89 |
90 | Returns:
91 | list[float]: Reward scores
92 | """
93 | rewards = []
94 | for completion, gt, numbers in zip(completions, target, nums):
95 | try:
96 |             # add synthetic <think> as it's already part of the prompt and prefilled for the assistant to more easily match the regex
97 |             completion = "<think>" + completion
98 | # Check if the format is correct
99 |             match = re.search(r"<answer>(.*?)<\/answer>", completion)
100 | if match is None:
101 | rewards.append(0.0)
102 | continue
103 | # Extract the "answer" part from the completion
104 | equation = match.group(1).strip()
105 | # Extract all numbers from the equation
106 | used_numbers = [int(n) for n in re.findall(r'\d+', equation)]
107 |
108 | # Check if all numbers are used exactly once
109 | if sorted(used_numbers) != sorted(numbers):
110 | rewards.append(0.0)
111 | continue
112 | # Define a regex pattern that only allows numbers, operators, parentheses, and whitespace
113 | allowed_pattern = r'^[\d+\-*/().\s]+$'
114 | if not re.match(allowed_pattern, equation):
115 | rewards.append(0.0)
116 | continue
117 |
118 | # Evaluate the equation with restricted globals and locals
119 | result = eval(equation, {"__builtins__": None}, {})
120 | # Check if the equation is correct and matches the ground truth
121 | if abs(float(result) - float(gt)) < 1e-5:
122 | rewards.append(1.0)
123 | if random.random() < 0.10: # 10% chance to write fully successful samples into a file
124 | os.makedirs("completion_samples", exist_ok=True)
125 | log_file = os.path.join("completion_samples", "success_completion_samples.txt")
126 | with open(log_file, "a") as f:
127 | f.write(f"\n\n==============\n")
128 | f.write(completion)
129 | else:
130 | rewards.append(0.0)
131 | except Exception:
132 | # If evaluation fails, reward is 0
133 | rewards.append(0.0)
134 | return rewards
135 |
136 | def get_checkpoint(training_args: GRPOConfig):
137 | last_checkpoint = None
138 | if os.path.isdir(training_args.output_dir):
139 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
140 | return last_checkpoint
141 |
142 |
143 | def grpo_function(
144 | model_args: ModelConfig, script_args: ScriptArguments, training_args: GRPOConfig
145 | ):
146 | #########################
147 | # Log parameters
148 | #########################
149 | logger.info(f"Model parameters {model_args}")
150 | logger.info(f"Training/evaluation parameters {training_args}")
151 |
152 | ################
153 | # Load tokenizer
154 | ################
155 | tokenizer = AutoTokenizer.from_pretrained(
156 | (
157 | script_args.tokenizer_name_or_path
158 | if script_args.tokenizer_name_or_path
159 | else model_args.model_name_or_path
160 | ),
161 | revision=model_args.model_revision,
162 | trust_remote_code=model_args.trust_remote_code,
163 | )
164 | if tokenizer.pad_token is None:
165 | tokenizer.pad_token = tokenizer.eos_token
166 |
167 | ###############
168 | # Load datasets
169 | ###############
170 | # Load dataset from Hugging Face Hub
171 | dataset = load_dataset(script_args.dataset_id_or_path, split=script_args.dataset_splits)
172 | # select a random subset of 50k samples
173 | dataset = dataset.shuffle(seed=42).select(range(50000))
174 |
175 | #####################
176 | # Prepare and format dataset
177 | #####################
178 |
179 |     # generate the r1 prompt with a prefix for the model to already start with the thinking process
180 |     def generate_r1_prompt(numbers, target):
181 |         r1_prefix = [{
182 |             "role": "system",
183 |             "content": "You are a helpful assistant. You first think about the reasoning process in your mind and then provide the user with the answer."
184 |           },
185 |           {
186 |             "role": "user",
187 |             "content": f"Using the numbers {numbers}, create an equation that equals {target}. You can use basic arithmetic operations (+, -, *, /) one or multiple times, but each number can only be used once. Show your work in <think> </think> tags. And return the final equation in <answer> </answer> tags, for example <answer> (1 + 2) / 3 </answer>. Think step by step inside <think> tags."
188 |           },
189 |           {
190 |             "role": "assistant",
191 |             "content": "Let me solve this step by step.\n<think>"
192 |           }]
193 |         return {"prompt": tokenizer.apply_chat_template(r1_prefix, tokenize=False, continue_final_message=True), "target": target, "nums": numbers}
194 |
195 | # convert our dataset to the r1 prompt
196 | dataset = dataset.map(lambda x: generate_r1_prompt(x["nums"], x["target"]))
197 |
198 | # split the dataset into train and test
199 | train_test_split = dataset.train_test_split(test_size=0.1)
200 |
201 | train_dataset = train_test_split["train"]
202 | test_dataset = train_test_split["test"]
203 |
204 | #########################
205 |     # Instantiate GRPO trainer
206 | #########################
207 |
208 | trainer = GRPOTrainer(
209 | model=model_args.model_name_or_path,
210 | reward_funcs=[format_reward_func, equation_reward_func],
211 | args=training_args,
212 | train_dataset=train_dataset,
213 | eval_dataset=test_dataset,
214 | peft_config=get_peft_config(model_args),
215 | )
216 |
217 |
218 | ###############
219 | # Training loop
220 | ###############
221 | # Check for last checkpoint
222 | last_checkpoint = get_checkpoint(training_args)
223 | if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
224 | logger.info(f"Checkpoint detected, resuming training at {last_checkpoint}.")
225 |
226 | # Train the model
227 | logger.info(
228 | f'*** Starting training {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} for {training_args.num_train_epochs} epochs***'
229 | )
230 | train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
231 | # Log and save metrics
232 | metrics = train_result.metrics
233 | metrics["train_samples"] = len(train_dataset)
234 | trainer.log_metrics("train", metrics)
235 | trainer.save_metrics("train", metrics)
236 | trainer.save_state()
237 |
238 | logger.info("*** Training complete ***")
239 |
240 | ##################################
241 | # Save model and create model card
242 | ##################################
243 |
244 | logger.info("*** Save model ***")
245 | trainer.model.config.use_cache = True
246 | trainer.save_model(training_args.output_dir)
247 | logger.info(f"Model saved to {training_args.output_dir}")
248 |     training_args.distributed_state.wait_for_everyone()  # wait for all processes to finish saving
249 |
250 | tokenizer.save_pretrained(training_args.output_dir)
251 | logger.info(f"Tokenizer saved to {training_args.output_dir}")
252 |
253 | # Save everything else on main process
254 | if trainer.accelerator.is_main_process:
255 | trainer.create_model_card({"tags": ["rl","grpo", "tutorial", "philschmid"]})
256 | # push to hub if needed
257 | if training_args.push_to_hub is True:
258 | logger.info("Pushing to hub...")
259 | trainer.push_to_hub()
260 |
261 | logger.info("*** Training complete! ***")
262 |
263 |
264 | def main():
265 | parser = TrlParser((ModelConfig, ScriptArguments, GRPOConfig))
266 | model_args, script_args, training_args = parser.parse_args_and_config()
267 |
268 | # Run the main training loop
269 | grpo_function(model_args, script_args, training_args)
270 |
271 |
272 | if __name__ == "__main__":
273 | main()
--------------------------------------------------------------------------------
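A quick way to sanity-check the two reward functions above outside of a training run is to import them and score a hand-written completion. This is only a sketch: it assumes run_r1_grpo.py is importable from the current directory (e.g. when run from training/scripts/) and that its dependencies (trl, transformers, datasets) are installed; the expected rewards follow from the format regex and the equation check in the file.

# sketch: exercise the reward functions on a single hand-written completion
from run_r1_grpo import format_reward_func, equation_reward_func

# the prompt already ends with "<think>", so the completion starts mid-thought
completion = (
    "44 + 19 = 63 and 63 + 32 = 95, so the equation works. </think>\n"
    "<answer> 44 + 19 + 32 </answer>"
)
print(format_reward_func([completion], target=["95"]))                         # expected: [1.0]
print(equation_reward_func([completion], target=["95"], nums=[[44, 19, 32]]))  # expected: [1.0]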
/training/scripts/run_seq2seq_deepspeed.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import numpy as np
4 | from transformers import (
5 | AutoModelForSeq2SeqLM,
6 | DataCollatorForSeq2Seq,
7 | AutoTokenizer,
8 | set_seed,
9 | )
10 | from datasets import load_from_disk
11 | import torch
12 | import evaluate
13 | import nltk
14 | import numpy as np
15 |
16 | from huggingface_hub import HfFolder
17 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
18 |
19 | nltk.download("punkt", quiet=True)
20 |
21 | # Metric
22 | metric = evaluate.load("rouge")
23 | # evaluation generation args
24 | gen_kwargs = {
25 | "early_stopping": True,
26 | "length_penalty": 2.0,
27 | "max_new_tokens": 50,
28 | "min_length": 30,
29 | "no_repeat_ngram_size": 3,
30 | "num_beams": 4,
31 | }
32 |
33 |
34 | def postprocess_text(preds, labels):
35 | preds = [pred.strip() for pred in preds]
36 | labels = [label.strip() for label in labels]
37 |
38 | # rougeLSum expects newline after each sentence
39 | preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
40 | labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
41 |
42 | return preds, labels
43 |
44 |
45 | def parse_args():
46 | """Parse the arguments."""
47 | parser = argparse.ArgumentParser()
48 | # add model id and dataset path argument
49 | parser.add_argument("--model_id", type=str, default="google/flan-t5-xl", help="Model id to use for training.")
50 | parser.add_argument("--dataset_path", type=str, default="data", help="Path to the already processed dataset.")
51 | parser.add_argument(
52 | "--repository_id", type=str, default=None, help="Hugging Face Repository id for uploading models"
53 | )
54 | # add training hyperparameters for epochs, batch size, learning rate, and seed
55 | parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for.")
56 | parser.add_argument("--per_device_train_batch_size", type=int, default=8, help="Batch size to use for training.")
57 | parser.add_argument("--per_device_eval_batch_size", type=int, default=8, help="Batch size to use for testing.")
58 | parser.add_argument("--generation_max_length", type=int, default=140, help="Maximum length to use for generation")
59 | parser.add_argument("--generation_num_beams", type=int, default=4, help="Number of beams to use for generation.")
60 | parser.add_argument("--lr", type=float, default=3e-3, help="Learning rate to use for training.")
61 | parser.add_argument("--seed", type=int, default=42, help="Seed to use for training.")
62 | parser.add_argument("--deepspeed", type=str, default=None, help="Path to deepspeed config file.")
63 | parser.add_argument("--gradient_checkpointing", type=bool, default=True, help="Path to deepspeed config file.")
64 | parser.add_argument(
65 | "--bf16",
66 | type=bool,
67 |         default=True if torch.cuda.get_device_capability()[0] >= 8 else False,
68 | help="Whether to use bf16.",
69 | )
70 | parser.add_argument(
71 | "--hf_token",
72 | type=str,
73 | default=HfFolder.get_token(),
74 | help="Token to use for uploading models to Hugging Face Hub.",
75 | )
76 | args = parser.parse_known_args()
77 | return args
78 |
79 |
80 | def training_function(args):
81 | # set seed
82 | set_seed(args.seed)
83 |
84 | # load dataset from disk and tokenizer
85 | train_dataset = load_from_disk(os.path.join(args.dataset_path, "train"))
86 | eval_dataset = load_from_disk(os.path.join(args.dataset_path, "eval"))
87 | tokenizer = AutoTokenizer.from_pretrained(args.model_id)
88 | # load model from the hub
89 | model = AutoModelForSeq2SeqLM.from_pretrained(
90 | args.model_id,
91 | use_cache=False if args.gradient_checkpointing else True, # this is needed for gradient checkpointing
92 | )
93 |
94 | # we want to ignore tokenizer pad token in the loss
95 | label_pad_token_id = -100
96 | # Data collator
97 | data_collator = DataCollatorForSeq2Seq(
98 | tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
99 | )
100 |
101 | # Define compute metrics function
102 | def compute_metrics(eval_preds):
103 | preds, labels = eval_preds
104 | if isinstance(preds, tuple):
105 | preds = preds[0]
106 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
107 | # Replace -100 in the labels as we can't decode them.
108 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
109 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
110 |
111 | # Some simple post-processing
112 | decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
113 |
114 | result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
115 | result = {k: round(v * 100, 4) for k, v in result.items()}
116 | prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
117 | result["gen_len"] = np.mean(prediction_lens)
118 | return result
119 |
120 | # Define training args
121 | # output_dir = args.repository_id if args.repository_id else args.model_id.split("/")[-1]
122 | output_dir = args.model_id.split("/")[-1]
123 | training_args = Seq2SeqTrainingArguments(
124 | output_dir=output_dir,
125 | per_device_train_batch_size=args.per_device_train_batch_size,
126 | per_device_eval_batch_size=args.per_device_eval_batch_size,
127 | predict_with_generate=True,
128 | generation_max_length=args.generation_max_length,
129 | generation_num_beams=args.generation_num_beams,
130 | fp16=False, # T5 overflows with fp16
131 | bf16=args.bf16, # Use BF16 if available
132 | learning_rate=args.lr,
133 | num_train_epochs=args.epochs,
134 | deepspeed=args.deepspeed,
135 | gradient_checkpointing=args.gradient_checkpointing,
136 | # logging & evaluation strategies
137 | logging_dir=f"{output_dir}/logs",
138 | logging_strategy="steps",
139 | logging_steps=500,
140 | evaluation_strategy="epoch",
141 | save_strategy="epoch",
142 | save_total_limit=2,
143 | load_best_model_at_end=True,
144 | # push to hub parameters
145 | report_to="tensorboard",
146 | push_to_hub=True if args.repository_id else False,
147 | hub_strategy="every_save",
148 | hub_model_id=args.repository_id if args.repository_id else None,
149 | hub_token=args.hf_token,
150 | )
151 |
152 | # Create Trainer instance
153 | trainer = Seq2SeqTrainer(
154 | model=model,
155 | args=training_args,
156 | train_dataset=train_dataset,
157 | eval_dataset=eval_dataset,
158 | data_collator=data_collator,
159 | compute_metrics=compute_metrics,
160 | )
161 |
162 | # Start training
163 | trainer.train()
164 |
165 | # Save our tokenizer and create model card
166 | tokenizer.save_pretrained(output_dir)
167 | trainer.create_model_card()
168 | # Push the results to the hub
169 | if args.repository_id:
170 | trainer.push_to_hub()
171 |
172 |
173 | def main():
174 |     args, _ = parse_args()
175 | training_function(args)
176 |
177 |
178 | if __name__ == "__main__":
179 | main()
180 |
--------------------------------------------------------------------------------
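training_function() above expects an already tokenized dataset saved to disk under <dataset_path>/train and <dataset_path>/eval; the repository's training/preprocessing/create_flan_t5_cnn_dataset.py is meant to produce such a dataset. The snippet below is only a sketch of the expected on-disk layout using a dummy dataset (token values and paths are illustrative, not the real preprocessing):

# sketch: create the <dataset_path>/train and <dataset_path>/eval layout the script loads
import os
from datasets import Dataset

dummy = Dataset.from_dict({
    "input_ids": [[21603, 10, 100, 19, 3, 9, 794, 1]],
    "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1]],
    "labels": [[100, 19, 3, 9, 1]],
})
for split in ("train", "eval"):
    dummy.save_to_disk(os.path.join("data", split))

# afterwards (paths assumed relative to training/scripts/):
#   deepspeed run_seq2seq_deepspeed.py --dataset_path data --deepspeed ../configs/ds_flan_t5_z3_config_bf16.json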
/training/scripts/run_sft.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from datetime import datetime
3 | from distutils.util import strtobool
4 | import logging
5 | import os
6 | import re
7 | from typing import Optional
8 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
9 | import torch
10 | from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, BitsAndBytesConfig
11 | from transformers.trainer_utils import get_last_checkpoint
12 | from transformers.utils import is_liger_kernel_available
13 | from trl import SFTTrainer, TrlParser, ModelConfig, SFTConfig, get_peft_config
14 | from datasets import load_dataset
15 | from peft import AutoPeftModelForCausalLM
16 |
17 | if is_liger_kernel_available():
18 | from liger_kernel.transformers import AutoLigerKernelForCausalLM
19 |
20 |
21 |
22 | ########################
23 | # Custom dataclasses
24 | ########################
25 | @dataclass
26 | class ScriptArguments:
27 | dataset_id_or_path: str
28 | dataset_splits: str = "train"
29 | tokenizer_name_or_path: str = None
30 | spectrum_config_path: Optional[str] = None
31 |
32 |
33 | ########################
34 | # Setup logging
35 | ########################
36 | logging.basicConfig(level=logging.INFO)
37 | logger = logging.getLogger(__name__)
38 | logger.setLevel(logging.INFO)
39 | handler = logging.StreamHandler()
40 | handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
41 | logger.addHandler(handler)
42 |
43 | ########################
44 | # Helper functions
45 | ########################
46 |
47 | def get_checkpoint(training_args: SFTConfig):
48 | last_checkpoint = None
49 | if os.path.isdir(training_args.output_dir):
50 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
51 | return last_checkpoint
52 |
53 |
54 | def setup_model_for_spectrum(model, spectrum_config_path):
55 | unfrozen_parameters = []
56 | with open(spectrum_config_path, "r") as fin:
57 | yaml_parameters = fin.read()
58 |
59 | # get the unfrozen parameters from the yaml file
60 | for line in yaml_parameters.splitlines():
61 | if line.startswith("- "):
62 | unfrozen_parameters.append(line.split("- ")[1])
63 |
64 | # freeze all parameters
65 | for param in model.parameters():
66 | param.requires_grad = False
67 | # unfreeze Spectrum parameters
68 | for name, param in model.named_parameters():
69 | if any(re.match(unfrozen_param, name) for unfrozen_param in unfrozen_parameters):
70 | param.requires_grad = True
71 |
72 | # COMMENT IN: for sanity check print the trainable parameters
73 | # for name, param in model.named_parameters():
74 | # if param.requires_grad:
75 | # print(f"Trainable parameter: {name}")
76 |
77 | return model
78 |
79 | ###########################################################################################################
80 |
81 | def train_function(model_args: ModelConfig, script_args: ScriptArguments, training_args: SFTConfig):
82 | """Main training function."""
83 | #########################
84 | # Log parameters
85 | #########################
86 | logger.info(f'Model parameters {model_args}')
87 | logger.info(f'Script parameters {script_args}')
88 | logger.info(f'Training/evaluation parameters {training_args}')
89 |
90 | ###############
91 | # Load datasets
92 | ###############
93 | if script_args.dataset_id_or_path.endswith('.json'):
94 | train_dataset = load_dataset('json', data_files=script_args.dataset_id_or_path, split='train')
95 | else:
96 | train_dataset = load_dataset(script_args.dataset_id_or_path, split=script_args.dataset_splits)
97 |
98 |     train_dataset = train_dataset.select(range(10000))  # limit training to the first 10,000 samples
99 |
100 | logger.info(f'Loaded dataset with {len(train_dataset)} samples and the following features: {train_dataset.features}')
101 |
102 | ################
103 | # Load tokenizer
104 | ################
105 | tokenizer = AutoTokenizer.from_pretrained(
106 | script_args.tokenizer_name_or_path if script_args.tokenizer_name_or_path else model_args.model_name_or_path,
107 | revision=model_args.model_revision,
108 | trust_remote_code=model_args.trust_remote_code,
109 | )
110 | if tokenizer.pad_token is None:
111 | tokenizer.pad_token = tokenizer.eos_token
112 |     # if we use peft, make sure the chat template does not rely on new special tokens, since by default the embedding layers will not be trainable
113 |
114 |
115 | #######################
116 | # Load pretrained model
117 | #######################
118 |
119 | # define model kwargs
120 | model_kwargs = dict(
121 | revision=model_args.model_revision, # What revision from Huggingface to use, defaults to main
122 |         trust_remote_code=model_args.trust_remote_code, # Whether to trust the remote code, this also allows you to fine-tune custom architectures
123 | attn_implementation=model_args.attn_implementation, # What attention implementation to use, defaults to flash_attention_2
124 | torch_dtype=model_args.torch_dtype if model_args.torch_dtype in ['auto', None] else getattr(torch, model_args.torch_dtype), # What torch dtype to use, defaults to auto
125 |         use_cache=False if training_args.gradient_checkpointing else True, # Whether to use the KV cache; must be disabled when gradient checkpointing is enabled
126 | low_cpu_mem_usage=True if not strtobool(os.environ.get("ACCELERATE_USE_DEEPSPEED", "false")) else None, # Reduces memory usage on CPU for loading the model
127 | )
128 |
129 | # Check which training method to use and if 4-bit quantization is needed
130 | if model_args.load_in_4bit:
131 | model_kwargs['quantization_config'] = BitsAndBytesConfig(
132 | load_in_4bit=True,
133 | bnb_4bit_use_double_quant=True,
134 | bnb_4bit_quant_type='nf4',
135 | bnb_4bit_compute_dtype=model_kwargs['torch_dtype'],
136 | bnb_4bit_quant_storage=model_kwargs['torch_dtype'],
137 | )
138 | if model_args.use_peft:
139 | peft_config = get_peft_config(model_args)
140 | else:
141 | peft_config = None
142 |
143 | # load the model with our kwargs
144 | if training_args.use_liger:
145 | model = AutoLigerKernelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
146 | else:
147 | model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
148 | training_args.distributed_state.wait_for_everyone() # wait for all processes to load
149 |
150 |
151 | if script_args.spectrum_config_path:
152 | model = setup_model_for_spectrum(model, script_args.spectrum_config_path)
153 |
154 | ########################
155 | # Initialize the Trainer
156 | ########################
157 | trainer = SFTTrainer(
158 | model=model,
159 | args=training_args,
160 | train_dataset=train_dataset,
161 | tokenizer=tokenizer,
162 | peft_config=peft_config,
163 | )
164 | if trainer.accelerator.is_main_process and peft_config:
165 | trainer.model.print_trainable_parameters()
166 |
167 | ###############
168 | # Training loop
169 | ###############
170 | # Check for last checkpoint
171 | last_checkpoint = get_checkpoint(training_args)
172 | if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
173 | logger.info(f'Checkpoint detected, resuming training at {last_checkpoint}.')
174 |
175 | logger.info(f'*** Starting training {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} for {training_args.num_train_epochs} epochs***')
176 | train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
177 | # log metrics
178 | metrics = train_result.metrics
179 | metrics['train_samples'] = len(train_dataset)
180 | trainer.log_metrics('train', metrics)
181 | trainer.save_metrics('train', metrics)
182 | trainer.save_state()
183 |
184 | ##################################
185 | # Save model and create model card
186 | ##################################
187 |
188 | logger.info('*** Save model ***')
189 | if trainer.is_fsdp_enabled and peft_config:
190 | trainer.accelerator.state.fsdp_plugin.set_state_dict_type('FULL_STATE_DICT')
191 | # Restore k,v cache for fast inference
192 | trainer.model.config.use_cache = True
193 | trainer.save_model(training_args.output_dir)
194 | logger.info(f'Model saved to {training_args.output_dir}')
195 |     training_args.distributed_state.wait_for_everyone()  # wait for all processes to finish saving
196 |
197 | tokenizer.save_pretrained(training_args.output_dir)
198 | logger.info(f'Tokenizer saved to {training_args.output_dir}')
199 |
200 | # Save everything else on main process
201 | if trainer.accelerator.is_main_process:
202 | trainer.create_model_card({'tags': ['sft', 'tutorial', 'philschmid']})
203 | # push to hub if needed
204 | if training_args.push_to_hub is True:
205 | logger.info('Pushing to hub...')
206 | trainer.push_to_hub()
207 |
208 | logger.info('*** Training complete! ***')
209 |
210 |
211 | def main():
212 | parser = TrlParser((ModelConfig, ScriptArguments, SFTConfig))
213 | model_args, script_args, training_args = parser.parse_args_and_config()
214 |
215 | # Set seed for reproducibility
216 | set_seed(training_args.seed)
217 |
218 | # Run the main training loop
219 | train_function(model_args, script_args, training_args)
220 |
221 |
222 | if __name__ == '__main__':
223 | main()
--------------------------------------------------------------------------------
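run_sft.py accepts either a Hugging Face dataset id or a local .json file for dataset_id_or_path. Below is a minimal sketch of a compatible JSON-lines file in the conversational "messages" format (the field names are an assumption about what the SFT recipes expect; note the script selects the first 10,000 rows, so a real dataset needs at least that many samples or an adjusted select call):

# sketch: write a tiny JSON-lines dataset loadable via --dataset_id_or_path train_dataset.json
import json

samples = [
    {"messages": [
        {"role": "user", "content": "What is 2 + 2?"},
        {"role": "assistant", "content": "2 + 2 equals 4."},
    ]},
]
with open("train_dataset.json", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")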
/training/scripts/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from peft import AutoPeftModelForCausalLM
3 | from transformers import AutoTokenizer, pipeline
4 |
5 | peft_model_id = "./llama-8b-hf-no-robot"
6 |
7 | # Load Model with PEFT adapter
8 | model = AutoPeftModelForCausalLM.from_pretrained(
9 | peft_model_id,
10 | device_map="auto",
11 | torch_dtype=torch.float16
12 | )
13 | tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
14 |
15 | from datasets import load_dataset
16 | from random import randint
17 |
18 |
19 | # Load our test dataset
20 | eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
21 | rand_idx = randint(0, len(eval_dataset) - 1)
22 | messages = eval_dataset[rand_idx]["messages"][:2]
23 | print(tokenizer.eos_token)
24 | # stop generation on eos token or <|eot_id|> token
25 | terminators = [
26 | tokenizer.eos_token_id,
27 | tokenizer.convert_tokens_to_ids("<|eot_id|>"),
28 | ]
29 |
30 | # Test on sample
31 | input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
32 |
33 | outputs = model.generate(
34 | input_ids,
35 | max_new_tokens=512,
36 | eos_token_id=terminators,
37 | do_sample=True,
38 | temperature=0.6,
39 | top_p=0.9,
40 | )
41 | response = outputs[0][input_ids.shape[-1]:]
42 | print(f"**Query:**\n{eval_dataset[rand_idx]['messages'][1]['content']}")
43 | print(f"**Original Answer:**\n {eval_dataset[rand_idx]['messages'][2]['content']}")
44 | print(f"**Generated Answer:**\n {tokenizer.decode(response,skip_special_tokens=True)}")
--------------------------------------------------------------------------------
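After spot-checking generations like this, the LoRA adapter can be merged into the base weights for adapter-free deployment. The repository ships training/scripts/merge_adapter_weights.py for that; the lines below are only a sketch of the same idea using peft's merge_and_unload, continuing from the model and tokenizer loaded in test.py (the output path is an assumption):

# sketch: merge the adapter into the base model and save a standalone checkpoint
merged_model = model.merge_and_unload()
merged_model.save_pretrained("llama-8b-hf-no-robot-merged", safe_serialization=True)
tokenizer.save_pretrained("llama-8b-hf-no-robot-merged")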
/training/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/training/utils/__init__.py
--------------------------------------------------------------------------------
/training/utils/falcon_patch.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple
2 |
3 | import torch
4 | import transformers
5 | from peft.tuners.lora import LoraLayer
6 |
7 | try:
8 | from flash_attn import flash_attn_func
9 | except Exception:
10 | raise ModuleNotFoundError(
11 | "Please install FlashAttention first, e.g., with pip install flash-attn --no-build-isolation, Learn more at https://github.com/Dao-AILab/flash-attention#installation-and-features"
12 | )
13 |
14 | try:
15 | from einops import rearrange
16 | except Exception:
17 | raise ModuleNotFoundError("Please install einops first, e.g., with pip install einops")
18 |
19 |
20 | # ADAPTED https://github.com/pacman100/DHS-LLM-Workshop/blob/main/chat_assistant/training/falcon_flash_attn_monkey_patch.py
21 | def forward(
22 | self,
23 | hidden_states: torch.Tensor,
24 | alibi: Optional[torch.Tensor],
25 | attention_mask: torch.Tensor,
26 | layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
27 | head_mask: Optional[torch.Tensor] = None,
28 | use_cache: bool = False,
29 | output_attentions: bool = False,
30 | ):
31 | fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]
32 | num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads
33 | # 3 x [batch_size, seq_length, num_heads, head_dim]
34 | (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
35 |
36 | batch_size, query_length, _, _ = query_layer.shape
37 |
38 | query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, query_length, self.head_dim)
39 | key_layer = key_layer.transpose(1, 2).reshape(
40 | batch_size * num_kv_heads,
41 | query_length,
42 | self.head_dim,
43 | )
44 | value_layer = value_layer.transpose(1, 2).reshape(batch_size * num_kv_heads, query_length, self.head_dim)
45 |
46 | past_kv_length = 0 if layer_past is None else layer_past[0].shape[1]
47 | query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length)
48 |
49 | if layer_past is not None:
50 | past_key, past_value = layer_past
51 | # concatenate along seq_length dimension:
52 | # - key: [batch_size * self.num_heads, kv_length, head_dim]
53 | # - value: [batch_size * self.num_heads, kv_length, head_dim]
54 | key_layer = torch.cat((past_key, key_layer), dim=1)
55 | value_layer = torch.cat((past_value, value_layer), dim=1)
56 |
57 | _, kv_length, _ = key_layer.shape
58 | if use_cache:
59 | present = (key_layer, value_layer)
60 | else:
61 | present = None
62 | attention_mask_float = (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(query_layer.dtype)
63 | query_layer_ = (
64 | query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16)
65 | )
66 | key_layer_ = key_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16)
67 | value_layer_ = value_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16)
68 |
69 | if alibi is not None:
70 | raise ValueError("`alibi` is not supported when `use_flash_attn` is True")
71 |
72 | # below output will have shape (batch_size, seqlen, nheads, headdim)
73 | attn_output = flash_attn_func(query_layer_, key_layer_, value_layer_, causal=True)
74 | attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
75 | output_tensor = self.dense(attn_output)
76 | return output_tensor, present
77 |
78 |
79 | def replace_attn_with_flash_attn():
80 | cuda_major, cuda_minor = torch.cuda.get_device_capability()
81 | if cuda_major < 8:
82 | print(
83 | "Flash attention is only supported on Ampere or Hopper GPU during training due to head dim > 64 backward."
84 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
85 | )
86 | transformers.models.falcon.modeling_falcon.FalconAttention.forward = forward
87 |
88 |
89 | def unplace_flash_attn_with_attn():
90 | import importlib
91 | import transformers
92 |
93 | print("Reloading falcon model, unpatching flash attention")
94 | importlib.reload(transformers.models.falcon.modeling_falcon)
95 |
96 |
97 | # Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338
98 | def upcast_layer_for_flash_attention(model, torch_dtype):
99 |     # norm layers are in fp32 after k-bit training, so we need to
100 |     # convert them back to fp16/bf16 for flash-attn compatibility.
101 | for name, module in model.named_modules():
102 | if isinstance(module, LoraLayer):
103 | module.to(torch_dtype)
104 | if "norm" in name:
105 | module.to(torch_dtype)
106 | if "lm_head" in name or "embed_tokens" in name:
107 | if hasattr(module, "weight"):
108 | module.to(torch_dtype)
109 |
110 | return model
111 |
--------------------------------------------------------------------------------
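The Falcon patch is meant to be applied before the model is instantiated, so that FalconAttention.forward is already replaced when from_pretrained builds the layers. A minimal usage sketch (model id, dtype, and device map are assumptions; it requires an Ampere-or-newer GPU and a transformers version whose FalconAttention still matches this forward signature):

# sketch: apply the Falcon flash-attention patch, then load the model
import torch
from transformers import AutoModelForCausalLM
from utils.falcon_patch import replace_attn_with_flash_attn

replace_attn_with_flash_attn()
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-40b",          # assumed model id
    torch_dtype=torch.bfloat16,
    device_map="auto",
)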
/training/utils/llama_patch.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple
2 |
3 | import torch
4 | from torch import nn
5 | import warnings
6 | import transformers
7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
8 | from peft.tuners.lora import LoraLayer
9 |
10 | try:
11 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
12 | from flash_attn.bert_padding import unpad_input, pad_input
13 | except Exception:
14 | raise ModuleNotFoundError(
15 | "Please install FlashAttention first, e.g., with pip install flash-attn --no-build-isolation, Learn more at https://github.com/Dao-AILab/flash-attention#installation-and-features"
16 | )
17 |
18 | try:
19 | from einops import rearrange
20 | except Exception:
21 | raise ModuleNotFoundError("Please install einops first, e.g., with pip install einops")
22 |
23 |
24 | # ADAPTED from https://github.com/allenai/open-instruct/blob/main/open_instruct/llama_flash_attn_monkey_patch.py
25 | # AND https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
26 | # AND https://github.com/LAION-AI/Open-Assistant/blob/04fa9a24b2a58c8885b8aa6a2eb02b18de6b4961/model/model_training/models/patching_llama.py
27 | # AND Sourabh https://github.com/huggingface/transformers/commit/ee81bf5aee0d65f005d157c013777e3d27d8d6bf
28 | def forward(
29 | self,
30 | hidden_states: torch.Tensor,
31 | attention_mask: Optional[torch.Tensor] = None,
32 | position_ids: Optional[torch.Tensor] = None,
33 | past_key_value: Optional[Tuple[torch.Tensor]] = None,
34 | output_attentions: bool = False,
35 | use_cache: bool = False,
36 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
37 | """Input shape: Batch x Time x Channel
38 |
39 | attention_mask: [bsz, q_len]
40 | """
41 | if output_attentions:
42 | warnings.warn("Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.")
43 |
44 | bsz, q_len, _ = hidden_states.size()
45 |
46 | query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
47 | key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
48 | value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
49 | # [bsz, q_len, nh, hd]
50 | # [bsz, nh, q_len, hd]
51 |
52 | kv_seq_len = key_states.shape[-2]
53 | if past_key_value is not None:
54 | kv_seq_len += past_key_value[0].shape[-2]
55 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
56 | query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
57 |
58 | # Past Key value support
59 | if past_key_value is not None:
60 | # reuse k, v, self_attention
61 | key_states = torch.cat([past_key_value[0], key_states], dim=2)
62 | value_states = torch.cat([past_key_value[1], value_states], dim=2)
63 |
64 | past_key_value = (key_states, value_states) if use_cache else None
65 |
66 | # Flash attention codes from
67 | # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
68 |
69 | # transform the data into the format required by flash attention
70 | qkv = torch.stack([query_states, key_states, value_states], dim=2) # [bsz, nh, 3, q_len, hd]
71 | qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
72 | # We have disabled _prepare_decoder_attention_mask in LlamaModel
73 | # the attention_mask should be the same as the key_padding_mask
74 | key_padding_mask = attention_mask
75 |
76 | if key_padding_mask is None:
77 | qkv = rearrange(qkv, "b s ... -> (b s) ...")
78 | max_s = q_len
79 | cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device)
80 | output = flash_attn_varlen_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True)
81 | output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
82 | else:
83 | nheads = qkv.shape[-2]
84 | x = rearrange(qkv, "b s three h d -> b s (three h d)")
85 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
86 | x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
87 | output_unpad = flash_attn_varlen_qkvpacked_func(
88 | x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
89 | )
90 | output = rearrange(
91 | pad_input(rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len),
92 | "b s (h d) -> b s h d",
93 | h=nheads,
94 | )
95 | return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
96 |
97 |
98 | # Disable the transformation of the attention mask in LlamaModel as the flash attention
99 | # requires the attention mask to be the same as the key_padding_mask
100 | def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
101 | # [bsz, seq_len]
102 | return attention_mask
103 |
104 |
105 | def replace_attn_with_flash_attn():
106 | cuda_major, cuda_minor = torch.cuda.get_device_capability()
107 | if cuda_major < 8:
108 | print(
109 | "Flash attention is only supported on Ampere or Hopper GPU during training due to head dim > 64 backward."
110 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
111 | )
112 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
113 | _prepare_decoder_attention_mask
114 | )
115 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
116 |
117 |
118 | def unplace_flash_attn_with_attn():
119 | import importlib
120 | import transformers
121 |
122 | print("Reloading llama model, unpatching flash attention")
123 | importlib.reload(transformers.models.llama.modeling_llama)
124 |
125 |
126 | # Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338
127 | def upcast_layer_for_flash_attention(model, torch_dtype):
128 | # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
129 | # convert them back to fp16/bf16 for flash-attn compatibility.
130 | for name, module in model.named_modules():
131 | if isinstance(module, LoraLayer):
132 | module.to(torch_dtype)
133 | if "norm" in name:
134 | module.to(torch_dtype)
135 | if "lm_head" in name or "embed_tokens" in name:
136 | if hasattr(module, "weight"):
137 | module.to(torch_dtype)
138 |
139 | return model
140 |
--------------------------------------------------------------------------------
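The Llama patch follows the same pattern, with an extra upcast step after the PEFT wrapping so that flash-attn receives bf16 inputs. The sketch below shows the assumed order of operations for a LoRA setup; the model id and LoRA hyperparameters are placeholders, and it likewise targets the older transformers versions this monkey patch was written for:

# sketch: patch first, load the model, wrap with LoRA, then upcast norm/LoRA layers
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from utils.llama_patch import replace_attn_with_flash_attn, upcast_layer_for_flash_attention

replace_attn_with_flash_attn()
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16)
model = get_peft_model(model, LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, task_type="CAUSAL_LM"))
model = upcast_layer_for_flash_attention(model, torch.bfloat16)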
/training/utils/peft_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
3 | from peft import LoraConfig, get_peft_model
4 | from peft.tuners.lora import LoraLayer
5 | from transformers import (
6 | AutoModelForCausalLM,
7 | AutoTokenizer,
8 | AutoTokenizer,
9 | TrainingArguments,
10 | )
11 | from utils.falcon_patch import replace_attn_with_flash_attn as replace_falcon_attn_with_flash_attn
12 | from utils.llama_patch import replace_attn_with_flash_attn as replace_llama_attn_with_flash_attn
13 |
14 |
15 | class SaveDeepSpeedPeftModelCallback(TrainerCallback):
16 | def __init__(self, trainer, save_steps=500):
17 | self.trainer = trainer
18 | self.save_steps = save_steps
19 |
20 | def on_step_end(
21 | self,
22 | args: TrainingArguments,
23 | state: TrainerState,
24 | control: TrainerControl,
25 | **kwargs,
26 | ):
27 | if (state.global_step + 1) % self.save_steps == 0:
28 | self.trainer.accelerator.wait_for_everyone()
29 | state_dict = self.trainer.accelerator.get_state_dict(self.trainer.deepspeed)
30 | unwrapped_model = self.trainer.accelerator.unwrap_model(self.trainer.deepspeed)
31 | if self.trainer.accelerator.is_main_process:
32 | unwrapped_model.save_pretrained(args.output_dir, state_dict=state_dict)
33 | self.trainer.accelerator.wait_for_everyone()
34 | return control
35 |
36 |
37 | def create_and_prepare_model(model_id: str, training_args: TrainingArguments, script_args):
38 | model = AutoModelForCausalLM.from_pretrained(
39 | model_id,
40 | use_cache=not training_args.gradient_checkpointing,
41 | use_flash_attention_2=script_args.use_flash_attn,
42 | )
43 | print("model loaded")
44 |
45 | # find all linear modules in model for lora
46 | target_modules = find_all_linear_names(model)
47 |
48 | # create lora config
49 | peft_config = LoraConfig(
50 | lora_alpha=script_args.lora_alpha,
51 | lora_dropout=script_args.lora_dropout,
52 | r=script_args.lora_r,
53 | bias="none",
54 | task_type="CAUSAL_LM",
55 | target_modules=target_modules,
56 | )
57 | # enable gradient checkpointing
58 | if training_args.gradient_checkpointing:
59 | model.gradient_checkpointing_enable()
60 |
61 |     # pre-process the model by casting the LoRA layers, layer norms, and embedding/output layers to bfloat16
62 | # Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338
63 | print("pre-processing model for peft")
64 | for name, module in model.named_modules():
65 | if isinstance(module, LoraLayer):
66 | module = module.to(torch.bfloat16)
67 | if "norm" in name:
68 | module = module.to(torch.bfloat16)
69 | if any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]):
70 | if hasattr(module, "weight"):
71 | module = module.to(torch.bfloat16)
72 |
73 | # initialize peft model
74 | print("initializing peft model")
75 | model = get_peft_model(model, peft_config)
76 |
77 | # logger.info parameters
78 | model.print_trainable_parameters()
79 |
80 | # tokenizer
81 | tokenizer = AutoTokenizer.from_pretrained(model_id)
82 | tokenizer.pad_token = tokenizer.eos_token
83 |
84 | return model, peft_config, tokenizer
85 |
86 |
87 | def find_all_linear_names(model):
88 | cls = torch.nn.Linear
89 | lora_module_names = set()
90 | for name, module in model.named_modules():
91 | if isinstance(module, cls):
92 | names = name.split(".")
93 | lora_module_names.add(names[0] if len(names) == 1 else names[-1])
94 |
95 | if "lm_head" in lora_module_names: # needed for 16-bit
96 | lora_module_names.remove("lm_head")
97 | return list(lora_module_names)
98 |
--------------------------------------------------------------------------------
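create_and_prepare_model and SaveDeepSpeedPeftModelCallback are presumably wired together by training/run_ds_lora.py. A condensed sketch of that wiring (the ScriptArgs fields are assumptions matching the attributes the helper reads, and the dataset is left as a placeholder):

# sketch: combine the helpers above for a DeepSpeed LoRA run
from dataclasses import dataclass
from transformers import Trainer, TrainingArguments
from utils.peft_utils import create_and_prepare_model, SaveDeepSpeedPeftModelCallback

@dataclass
class ScriptArgs:                 # assumed fields used by create_and_prepare_model
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    use_flash_attn: bool = False

training_args = TrainingArguments(output_dir="falcon-lora", gradient_checkpointing=True)
model, peft_config, tokenizer = create_and_prepare_model("tiiuae/falcon-7b", training_args, ScriptArgs())

trainer = Trainer(model=model, args=training_args, train_dataset=None)  # plug in a tokenized dataset here
trainer.add_callback(SaveDeepSpeedPeftModelCallback(trainer, save_steps=500))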