├── .gitignore ├── LICENSE ├── README.md ├── assets ├── flan-t5-tensorboard.png ├── flan-t5.png ├── grpo.png ├── tensorboard-r1.png ├── tensorboard.png └── win_rate_gpt-4-1106-preview.png ├── container ├── Dockerfile └── README.md ├── inference ├── README.md ├── fp8-inference.md ├── idefics.md ├── llama-7b.md ├── speculative.md ├── starcoder_gptq.md ├── starcoder_load.js └── vllm-function-calling.md └── training ├── accelerate-tpu-bert-text-classification.ipynb ├── configs ├── accelerate_configs │ ├── deepspeed_zero1.yaml │ ├── deepspeed_zero3.yaml │ ├── fsdp.yaml │ └── fsdp_qlora.yaml ├── ds_falcon_180b_z3.json ├── ds_falcon_180b_z3_offload.json ├── ds_flan_t5_z3_config.json ├── ds_flan_t5_z3_config_bf16.json ├── ds_flan_t5_z3_offload.json ├── ds_flan_t5_z3_offload_bf16.json └── spectrum │ └── snr_results_meta-llama-Meta-Llama-3.1-8B_unfrozenparameters_30percent.yaml ├── deepseed-falcon-180b-lora-fa.ipynb ├── deepseed-flan-t5-summarization.ipynb ├── dpo-align-llms-in-2024-with-trl.ipynb ├── fine-tune-embedding-model-for-rag.ipynb ├── fine-tune-llms-in-2024-with-trl.ipynb ├── fine-tune-llms-in-2025.ipynb ├── fine-tune-modern-bert-in-2025.ipynb ├── fine-tune-multimodal-llms-with-trl.ipynb ├── flan-t5-samsum-summarization.ipynb ├── fsdp-qlora-distributed-llama3.ipynb ├── gemma-lora-example.ipynb ├── inference.py ├── instruction-tune-llama-2-int4.ipynb ├── launch.slurm ├── mini-deepseek-r1-aha-grpo.ipynb ├── optimize-llama-2-gptq.ipynb ├── peft-flan-t5-int8-summarization.ipynb ├── preprocessing └── create_flan_t5_cnn_dataset.py ├── pytorch-2-0-bert-text-classification.ipynb ├── receipes ├── dpo-llama-3-1-8b-qlora.yaml ├── dpo-llama-3-1-8b.yaml ├── grpo-qwen-2.5-3b-deepseek-r1-countdown.yaml ├── llama-3-1-8b-qlora.yaml └── llama-3-1-8b-spectrum.yaml ├── rl-with-llms-in-2025-dpo.ipynb ├── run_ds_lora.py ├── scripts ├── bloke_gptq.py ├── dpo │ ├── create_preference_dataset.py │ └── run_dpo.py ├── example.slurm ├── merge_adapter_weights.py ├── run_fsdp_qlora.py ├── run_r1_grpo.py ├── run_seq2seq_deepspeed.py ├── run_sft.py └── test.py └── utils ├── __init__.py ├── falcon_patch.py ├── llama_patch.py └── peft_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | data/ 131 | runs/ 132 | flan*/ 133 | *dataset*/ 134 | *dataset.json 135 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Philipp Schmid 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Deep Learning with PyTorch and Hugging Face 2 | 3 | This repository contains instructions/examples/tutorials for getting started with Deep Learning using PyTorch and Hugging Face libraries like [transformers](https://huggingface.co/docs/transformers/index), [datasets](https://huggingface.co/docs/datasets/index). 4 | 5 | ### Training Examples 6 | 7 | * [Fine-tune FLAN-T5 XL/XXL using DeepSpeed & Hugging Face Transformers](./training/deepseed-flan-t5-summarization.ipynb) 8 | * [Fine-tune FLAN-T5 for chat & dialogue summarization](./training/flan-t5-samsum-summarization.ipynb) 9 | * [Fine-tune Falcon 180B with DeepSpeed ZeRO, LoRA & Flash Attention](./training/deepseed-falcon-180b-lora-fa.ipynb) 10 | * [Getting started with Transformers and TPU using PyTorch](./training/accelerate-tpu-bert-text-classification.ipynb) 11 | * [Extended Guide: Instruction-tune Llama 2](./training/instruction-tune-llama-2-int4.ipynb) 12 | * [Quantize open LLMs using optimum and GPTQ](./training/optimize-llama-2-gptq.ipynb) 13 | * [Fine-tune Embedding models for RAG](./training/fine-tune-embedding-model-for-rag.ipynb) 14 | * [Fine-tune LLMs in 2024 with TRL](./training/fine-tune-llms-in-2024-with-trl.ipynb) 15 | * [Fine-tune LLMs in 2025](./training/fine-tune-llms-in-2025.ipynb) 16 | * [Fine-tune Multimodal LLMs with TRL](./training/fine-tune-multimodal-llms-with-trl.ipynb) 17 | * [RLHF in 2024 with DPO & Hugging Face](./training/dpo-align-llms-in-2024-with-trl.ipynb) 18 | * [Fine-tune Gemma with ChatML](./training/gemma-lora-example.ipynb) 19 | * [Efficiently scale distributed training with FSDP & Q-LoRA](./training/fsdp-qlora-distributed-llama3.ipynb) 20 | * [Fine-tune classifier with ModernBERT in 2025](./training/fine-tune-modern-bert-in-2025.ipynb) 21 | * [How to align open LLMs in 2025 with DPO & Hugging Face](https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/rl-with-llms-in-2025-dpo.ipynb) 22 | 23 | ### Inference Examples 24 | 25 | * [Text Generation Inference Examples](./inference/README.md) 26 | * [FP8 Inference Benchmarks](./inference/fp8-inference.md) 27 | * [Idefics Inference](./inference/idefics.md) 28 | * [Llama 2 Inference](./inference/llama-7b.md) 29 | * [Speculative Decoding](./inference/speculative.md) 30 | * [StarCoder GPTQ Inference](./inference/starcoder_gptq.md) 31 | -------------------------------------------------------------------------------- /assets/flan-t5-tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/flan-t5-tensorboard.png -------------------------------------------------------------------------------- /assets/flan-t5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/flan-t5.png -------------------------------------------------------------------------------- /assets/grpo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/grpo.png 
-------------------------------------------------------------------------------- /assets/tensorboard-r1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/tensorboard-r1.png -------------------------------------------------------------------------------- /assets/tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/tensorboard.png -------------------------------------------------------------------------------- /assets/win_rate_gpt-4-1106-preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/assets/win_rate_gpt-4-1106-preview.png -------------------------------------------------------------------------------- /container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.12-py3 2 | 3 | # Versions 4 | ARG PYTORCH='2.0.1' 5 | ARG CUDA='cu118' # used in the base container: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html 6 | ARG TRANSFORMERS='4.30.2' 7 | ARG DATASETS='2.13.0' 8 | ARG ACCLERATE='0.20.3' 9 | ARG DEEPSPEED='0.9.5' 10 | 11 | LABEL maintainer="Philipp Schmid" 12 | ARG DEBIAN_FRONTEND=noninteractive 13 | ENV PYTHONUNBUFFERED=1 14 | 15 | 16 | RUN apt-get update \ 17 | && apt-get -y upgrade --only-upgrade systemd openssl cryptsetup \ 18 | && apt-get install -y \ 19 | bzip2 \ 20 | curl \ 21 | git \ 22 | git-lfs \ 23 | tar \ 24 | gcc \ 25 | g++ \ 26 | libaio-dev \ 27 | # audio 28 | libsndfile1-dev \ 29 | ffmpeg \ 30 | && apt-get clean autoremove --yes \ 31 | && rm -rf /var/lib/{apt,dpkg,cache,log} 32 | 33 | # update pip 34 | RUN python3 -m pip install --no-cache-dir --upgrade pip 35 | 36 | # remove old torch and 37 | # Install latest release PyTorch (PyTorch must be installed before any DeepSpeed c++/cuda ops.) 38 | RUN python3 -m pip uninstall -y torch torchvision torchaudio torch-tensorrt \ 39 | && python3 -m pip install --no-cache-dir -U torch==${PYTORCH} torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/${CUDA} 40 | 41 | # Install DeepSpeed 42 | RUN python3 -m pip install --no-cache-dir -U deepspeed==${DEEPSPEED} 43 | 44 | # Install Hugging Face Libraries 45 | RUN python3 -m pip install --upgrade --no-cache-dir -U \ 46 | transformers[sklearn,sentencepiece,vision]==${TRANSFORMERS} \ 47 | datasets==${DATASETS} \ 48 | accelerate==${ACCLERATE} \ 49 | evaluate \ 50 | tensorboard 51 | -------------------------------------------------------------------------------- /container/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Deep Learning Containers 2 | 3 | This folder contains Dockerfiles for PyTorch Deep Learning Containers including Hugging Face libraries and/or Deepspeed. 
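To quickly sanity-check a built or pulled image, you can print the pinned library versions from inside the container (the `philschmi/huggingface-pytorch:latest` tag matches the build and pull commands below):

```bash
# Print the Torch, Transformers and DeepSpeed versions baked into the image
docker run --rm philschmi/huggingface-pytorch:latest \
  python -c "import torch, transformers, deepspeed; print(torch.__version__, transformers.__version__, deepspeed.__version__)"
```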
4 | 5 | ## Dockerfiles 6 | 7 | | Container | Versions | URI | 8 | | --------------------------------- | ---------------------------------------------------- | ----------- | 9 | | [Pytorch Deepspeed](./Dockerfile) | torch==2.0.1, transformers==4.30.2, deepspeed==0.9.5 | `philschmi` | 10 | 11 | ## Getting Started 12 | 13 | ### Build the Docker image 14 | 15 | ```bash 16 | docker build -t philschmi/huggingface-pytorch:2.0.1-transformers4.30.2-deepspeed0.9.5-cuda11.8 -t philschmi/huggingface-pytorch:latest -f Dockerfile . 17 | ``` 18 | 19 | ### Run the Docker image 20 | 21 | ```bash 22 | docker run --gpus all -it --rm philschmi/huggingface-pytorch:latest 23 | ``` 24 | 25 | ### Pull the Docker image 26 | 27 | ```bash 28 | docker pull philschmi/huggingface-pytorch:latest 29 | ``` 30 | 31 | ### Push the Docker image 32 | 33 | ```bash 34 | docker login 35 | ``` 36 | 37 | 38 | then push both tags: 39 | ```bash 40 | docker push philschmi/huggingface-pytorch:2.0.1-transformers4.30.2-deepspeed0.9.5-cuda11.8 41 | docker push philschmi/huggingface-pytorch:latest 42 | ``` 43 | 44 | 45 | 46 | ## Run PyTorch Scripts 47 | 48 | ```bash 49 | docker run --rm -it --init \ 50 | --gpus=all \ 51 | --ipc=host \ 52 | --user="$(id -u):$(id -g)" \ 53 | --volume="$PWD:/workspace" \ 54 | philschmi/huggingface-pytorch:latest python train.py --foo bar 55 | ``` 56 | 57 | * `--gpus=all`: Enables GPU support. If you have multiple GPUs, you can use 58 | `--gpus=0,1,2` to specify which ones to use. 59 | * `--ipc=host`: Required if using multiprocessing, as explained at 60 | https://github.com/pytorch/pytorch#docker-image. 61 | * `--volume="$PWD:/workspace"`: Mounts the current working directory into the container. 62 | The default working directory inside the container is `/workspace`. Optional. 63 | * `--user="$(id -u):$(id -g)"`: Sets the user inside the container to match your 64 | user and group ID. Optional, but useful for writing files with correct 65 | ownership. 66 | 67 | 68 | ## Deriving your own images 69 | 70 | The recommended way of adding additional dependencies to an image is to create 71 | your own Dockerfile using this project as a base. 72 | 73 | ```dockerfile 74 | FROM philschmi/huggingface-pytorch:2.0.1-transformers4.30.2-deepspeed0.9.5-cuda11.8 75 | 76 | # Install system libraries required by OpenCV. 77 | RUN apt-get update \ 78 | && apt-get install -y libgl1-mesa-glx libgtk2.0-0 libsm6 libxext6 \ 79 | && rm -rf /var/lib/apt/lists/* 80 | 81 | # Install OpenCV from PyPI. 82 | RUN pip install opencv-python==4.5.1.48 83 | ``` -------------------------------------------------------------------------------- /inference/README.md: -------------------------------------------------------------------------------- 1 | # Inference Examples 2 | 3 | ## Text Generation Inference 4 | 5 | Run `HuggingFaceH4/starchat-beta` with TGI locally.
6 | 7 | ```bash 8 | model=bigscience/bloom-560m 9 | num_shard=1 10 | quantize=bitsandbytes 11 | max_input_length=1562 12 | max_total_tokens=2048 13 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 14 | 15 | docker run --gpus all -ti -p 8080:80 \ 16 | -e MODEL_ID=$model \ 17 | -e QUANTIZE=$quantize \ 18 | -e NUM_SHARD=$num_shard \ 19 | -e MAX_INPUT_LENGTH=$max_input_length \ 20 | -e MAX_TOTAL_TOKENS=$max_total_tokens \ 21 | -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest 22 | ``` 23 | 24 | send test request 25 | 26 | ```bash 27 | curl 127.0.0.1:8080/generate \ 28 | -X POST \ 29 | -d '{"inputs":"<|system|>\n<|end|>\n<|user|>\nWhat is Deep Learning?<|end|>\n<|assistant|>","parameters":{"temperature":0.2, "top_p": 0.95, "stop" : ["<|end|>"]}}' \ 30 | -H 'Content-Type: application/json' 31 | ``` 32 | 33 | 34 | ## Text Generation Inference GPTQ 35 | 36 | ### Llama 37 | 38 | Run `TheBloke/Llama-2-13b-Chat-GPTQ` with TGI locally. 39 | 40 | ```bash 41 | # Model config 42 | # model=TheBloke/Llama-2-7b-Chat-GPTQ 43 | # model=TheBloke/Dolphin-Llama2-7B-GPTQ 44 | model=TheBloke/Llama-2-13b-Chat-GPTQ 45 | num_shard=1 46 | quantize=gptq 47 | max_input_length=1562 48 | max_total_tokens=4096 # 4096 49 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 50 | 51 | docker run --gpus all -ti -p 8080:80 \ 52 | -e MODEL_ID=$model \ 53 | -e QUANTIZE=$quantize \ 54 | -e NUM_SHARD=$num_shard \ 55 | -e MAX_INPUT_LENGTH=$max_input_length \ 56 | -e MAX_TOTAL_TOKENS=$max_total_tokens \ 57 | -e GPTQ_BITS=$gptq_bits \ 58 | -e GPTQ_GROUPSIZE=$gptq_groupsize \ 59 | -v $volume:/data ghcr.io/huggingface/text-generation-inference:0.9.4 60 | ``` 61 | 62 | send test request 63 | 64 | ```bash 65 | curl 127.0.0.1:8080/generate \ 66 | -X POST \ 67 | -d '{"inputs":"[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n<</SYS>>\n\nWhat is 10+10? [\/INST]","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \ 68 | -H 'Content-Type: application/json' 69 | ``` 70 | -------------------------------------------------------------------------------- /inference/fp8-inference.md: -------------------------------------------------------------------------------- 1 | # Benchmark and compare FP8 and FP16 inference for vLLM 2 | 3 | [vLLM supports FP8](https://docs.vllm.ai/en/latest/quantization/fp8.html) (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8.
We are using `guidellm` 4 | 5 | ## Summary 6 | 7 | **Model Memory Usage** 8 | - FP8: 8.49 GB 9 | - FP16: 14.99 GB 10 | - Memory Savings: ~43% 11 | 12 | 13 | **Performance Highlights** 14 | - Max Requests per Second: 15 | - FP8: 2.54 req/sec at rate 8/128 16 | - FP16: 1.37 req/sec at rate 2 17 | - Improvement: ~85% 18 | - Token Throughput: 19 | - FP8: 587.68 tokens/sec at rate 64 20 | - FP16: 302.94 tokens/sec at rate 2 21 | - Improvement: ~94% 22 | 23 | - Request Latency: 24 | - FP8: 12.26 sec at rate 1 25 | - FP16: 21.87 sec at rate 1 26 | - Improvement: ~44% 27 | 28 | **Results:** 29 | - FP8 consistently outperforms FP16 across all metrics at the same concurrency level. 30 | - FP8 shows the most significant improvement in Request Latency. 31 | - Even at higher concurrency levels, FP8 generally maintains better performance (though direct comparisons at other levels should be made carefully). 32 | 33 | 34 | ## FP8 Inference 35 | 36 | 1. run vLLM with FP8 37 | 38 | ```bash 39 | docker run --gpus all \ 40 | -p 8080:8000 \ 41 | --ipc=host \ 42 | vllm/vllm-openai:latest \ 43 | --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic --max_model_len 8192 44 | ``` 45 | Note: Loading model weights took 8.4939 GB 46 | 47 | 48 | 2. benchmark with `guidellm` 49 | 50 | ```bash 51 | guidellm \ 52 | --target "http://localhost:8080/v1" \ 53 | --model "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic" \ 54 | --data-type emulated \ 55 | --data "prompt_tokens=550,generated_tokens=250" \ 56 | --rate-type constant --rate 1 --rate 2 --rate 4 --rate 8 --rate 16 --rate 64 --rate 128 \ 57 | --max-seconds 90 58 | ``` 59 | 60 | ```bash 61 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓ 62 | ┃ Benchmark ┃ Requests per Second ┃ Request Latency ┃ Time to First Token ┃ Inter Token Latency ┃ Output Token Throughput ┃ 63 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩ 64 | │ asynchronous@1.00 req/sec │ 0.88 req/sec │ 12.26 sec │ 240.18 ms │ 54.61 ms │ 194.47 tokens/sec │ 65 | │ asynchronous@2.00 req/sec │ 1.32 req/sec │ 16.43 sec │ 267.88 ms │ 72.65 ms │ 294.87 tokens/sec │ 66 | │ asynchronous@4.00 req/sec │ 1.53 req/sec │ 47.30 sec │ 19242.07 ms │ 127.31 ms │ 338.21 tokens/sec │ 67 | │ asynchronous@8.00 req/sec │ 2.54 req/sec │ 31.57 sec │ 3144.09 ms │ 124.14 ms │ 582.76 tokens/sec │ 68 | │ asynchronous@16.00 req/sec │ 2.26 req/sec │ 58.66 sec │ 29508.54 ms │ 127.98 ms │ 516.97 tokens/sec │ 69 | │ asynchronous@64.00 req/sec │ 2.49 req/sec │ 39.48 sec │ 9327.19 ms │ 127.77 ms │ 587.68 tokens/sec │ 70 | │ asynchronous@128.00 req/sec │ 2.54 req/sec │ 37.21 sec │ 10749.84 ms │ 118.26 ms │ 569.52 tokens/sec │ 71 | └─────────────────────────────┴─────────────────────┴─────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┘ 72 | ``` 73 | 74 | 75 | ## FP16 Inference 76 | 77 | 1. run vLLM with FP16 78 | 79 | ```bash 80 | docker run --gpus all \ 81 | -p 8080:8000 \ 82 | --ipc=host \ 83 | --env "HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token)" \ 84 | vllm/vllm-openai:latest \ 85 | --model meta-llama/Meta-Llama-3.1-8B-Instruct --max_model_len 8192 86 | ``` 87 | 88 | Note: Loading model weights took 14.99 GB 89 | 90 | 1. 
benchmark with `guidellm` 91 | 92 | ```bash 93 | guidellm \ 94 | --target "http://localhost:8080/v1" \ 95 | --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \ 96 | --data-type emulated \ 97 | --data "prompt_tokens=550,generated_tokens=250" \ 98 | --rate-type constant --rate 1 --rate 2 --rate 4 --rate 8 --rate 16 --rate 64 --rate 128 \ 99 | --max-seconds 90 100 | ``` 101 | 102 | ```bash 103 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓ 104 | ┃ Benchmark ┃ Requests per Second ┃ Request Latency ┃ Time to First Token ┃ Inter Token Latency ┃ Output Token Throughput ┃ 105 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩ 106 | │ asynchronous@1.00 req/sec │ 0.76 req/sec │ 21.87 sec │ 314.05 ms │ 94.95 ms │ 172.09 tokens/sec │ 107 | │ asynchronous@2.00 req/sec │ 1.37 req/sec │ 23.48 sec │ 820.36 ms │ 102.23 ms │ 302.94 tokens/sec │ 108 | │ asynchronous@4.00 req/sec │ 1.02 req/sec │ 45.64 sec │ 19181.45 ms │ 118.46 ms │ 228.36 tokens/sec │ 109 | │ asynchronous@8.00 req/sec │ 0.94 req/sec │ 49.13 sec │ 23194.74 ms │ 115.74 ms │ 211.55 tokens/sec │ 110 | │ asynchronous@64.00 req/sec │ 0.89 req/sec │ 56.25 sec │ 30167.99 ms │ 115.69 ms │ 199.90 tokens/sec │ 111 | │ asynchronous@16.00 req/sec │ 1.25 req/sec │ 56.19 sec │ 31740.33 ms │ 106.55 ms │ 285.55 tokens/sec │ 112 | │ asynchronous@128.00 req/sec │ 1.00 req/sec │ 53.18 sec │ 27422.15 ms │ 113.62 ms │ 225.60 tokens/sec │ 113 | └─────────────────────────────┴─────────────────────┴─────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┘ 114 | ``` -------------------------------------------------------------------------------- /inference/idefics.md: -------------------------------------------------------------------------------- 1 | # Idefics inference 2 | 3 | ```bash 4 | model=HuggingFaceM4/idefics-9b-instruct 5 | num_shard=1 6 | max_input_length=1562 7 | max_total_tokens=2048 8 | 9 | sudo docker run --gpus all -ti -p 8080:80 \ 10 | -e MODEL_ID=$model \ 11 | -e NUM_SHARD=$num_shard \ 12 | -e MAX_INPUT_LENGTH=$max_input_length \ 13 | -e MAX_TOTAL_TOKENS=$max_total_tokens \ 14 | ghcr.io/huggingface/text-generation-inference:1.1.0 15 | ``` 16 | 17 | send test request 18 | 19 | ```bash 20 | curl 127.0.0.1:8080/generate \ 21 | -X POST \ 22 | -d '{"inputs":"User:![](https://m.media-amazon.com/images/I/51M87ywnihL._AC_SX679_.jpg)Can i charge my iphone with this cable?\n","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \ 23 | -H 'Content-Type: application/json' 24 | ``` 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /inference/llama-7b.md: -------------------------------------------------------------------------------- 1 | # Llama 2 inference 2 | 3 | ```bash 4 | model=meta-llama/Llama-2-7b-chat-hf 5 | token=hf_xxx # replace with your token, which access to the repo 6 | num_shard=1 7 | max_input_length=1562 8 | max_total_tokens=2048 9 | 10 | docker run --gpus all -ti -p 8080:80 \ 11 | -e MODEL_ID=$model \ 12 | -e HUGGING_FACE_HUB_TOKEN=$token \ 13 | -e NUM_SHARD=$num_shard \ 14 | -e MAX_INPUT_LENGTH=$max_input_length \ 15 | -e MAX_TOTAL_TOKENS=$max_total_tokens \ 16 | ghcr.io/huggingface/text-generation-inference:latest 17 | ``` 18 | 19 | send test request 20 | 21 | ```bash 22 | curl 127.0.0.1:8080/generate \ 23 | -X POST \ 24 | -d '{"inputs":"[INST] <>\nYou are 
a helpful, respectful and honest assistant.\n<>\n\nWhat is 10+10? [\/INST]","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \ 25 | -H 'Content-Type: application/json' 26 | ``` -------------------------------------------------------------------------------- /inference/speculative.md: -------------------------------------------------------------------------------- 1 | # Speculative Decoding 2 | 3 | ## MLP Speculator 4 | ```bash 5 | sudo docker run --gpus all -ti --shm-size 1g --ipc=host --rm -p 8080:80 \ 6 | -e MODEL_ID=ibm-fms/llama3-8b-accelerator \ 7 | -e NUM_SHARD=4 \ 8 | -e MAX_INPUT_TOKENS=1562 \ 9 | -e MAX_TOTAL_TOKENS=2048 \ 10 | -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \ 11 | ghcr.io/huggingface/text-generation-inference:sha-b70ae09 12 | ``` 13 | 14 | send test request 15 | 16 | ```bash 17 | curl localhost:8080/v1/chat/completions \ 18 | -X POST \ 19 | -d '{ 20 | "model": "tgi", 21 | "messages": [ 22 | { 23 | "role": "system", 24 | "content": "You are a helpful assistant." 25 | }, 26 | { 27 | "role": "user", 28 | "content": "What is deep learning?" 29 | } 30 | ], 31 | "stream": false, 32 | "max_tokens": 250 33 | }' \ 34 | -H 'Content-Type: application/json' 35 | ``` 36 | 37 | ## Medusa Speculator 38 | ```bash 39 | sudo docker run --gpus all -ti --shm-size 1g --ipc=host --rm -p 8080:80 \ 40 | -e MODEL_ID=text-generation-inference/Mistral-7B-Instruct-v0.2-medusa \ 41 | -e NUM_SHARD=1 \ 42 | -e MAX_INPUT_TOKENS=1562 \ 43 | -e MAX_TOTAL_TOKENS=2048 \ 44 | -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \ 45 | ghcr.io/huggingface/text-generation-inference:sha-b70ae09 46 | ``` 47 | 48 | send test request 49 | 50 | ```bash 51 | curl localhost:8080/v1/chat/completions \ 52 | -X POST \ 53 | -d '{ 54 | "model": "tgi", 55 | "messages": [ 56 | { 57 | "role": "user", 58 | "content": "Write a poem for my three year old" 59 | } 60 | ], 61 | "stream": false, 62 | "max_tokens": 250 63 | }' \ 64 | -H 'Content-Type: application/json' 65 | ``` 66 | 67 | 68 | chat_completions{total_time="2.360607542s" validation_time="256.541µs" queue_time="37.931µs" inference_time="2.36031324s" time_per_token="12.166563ms" seed="Some(5272915472497899851)"} 69 | 70 | 71 | ## EAGLE Speculator 72 | 73 | ```bash 74 | huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct --local-dir Meta-Llama-3-8B-Instruct 75 | huggingface-cli download yuhuili/EAGLE-LLaMA3-Instruct-8B --local-dir EAGLE-LLaMA3-Instruct-8B # EAGLE draft weights (repo id assumed, adjust if needed) 76 | ``` 77 | 78 | ```python 79 | import json 80 | 81 | import torch 82 | from safetensors.torch import load_file, save_file 83 | 84 | ckpt = torch.load("EAGLE-LLaMA3-Instruct-8B/pytorch_model.bin")  # EAGLE draft checkpoint 85 | ref_ckpt = load_file("Meta-Llama-3-8B-Instruct/model-00004-of-00004.safetensors")  # base-model shard containing lm_head 86 | 87 | ckpt['lm_head.weight'] = ref_ckpt['lm_head.weight']  # EAGLE checkpoints ship without lm_head, so copy it from the base model 88 | 89 | save_file(ckpt, "EAGLE-LLaMA3-Instruct-8B/model.safetensors") 90 | 91 | with open("EAGLE-LLaMA3-Instruct-8B/config.json") as rf: 92 | cfg = json.load(rf) 93 | 94 | cfg = {"model_type": "eagle", "model": cfg}  # wrap the config so TGI detects the EAGLE speculator 95 | 96 | with open("EAGLE-LLaMA3-Instruct-8B/config.json", "w") as wf: 97 | json.dump(cfg, wf) 98 | 99 | # delete EAGLE-LLaMA3-Instruct-8B/pytorch_model.bin 100 | ``` -------------------------------------------------------------------------------- /inference/starcoder_gptq.md: -------------------------------------------------------------------------------- 1 | 2 | ### StarCoder 3 | 4 | 5 | Run `TheBloke/starcoder-GPTQ` with TGI locally.
6 | 7 | ```bash 8 | # Model config 9 | model=TheBloke/starcoder-GPTQ 10 | num_shard=1 11 | quantize=gptq 12 | max_input_length=1562 13 | max_total_tokens=4096 # 4096 14 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 15 | 16 | docker run --gpus all -ti -p 8080:80 \ 17 | -e MODEL_ID=$model \ 18 | -e QUANTIZE=$quantize \ 19 | -e NUM_SHARD=$num_shard \ 20 | -e MAX_INPUT_LENGTH=$max_input_length \ 21 | -e MAX_TOTAL_TOKENS=$max_total_tokens \ 22 | -v $volume:/data ghcr.io/huggingface/text-generation-inference:sha-e605c2a 23 | ``` 24 | 25 | send test request 26 | 27 | ```bash 28 | curl http://127.0.0.1:8080/generate \ 29 | -X POST \ 30 | -d '{"inputs":"\n def test():\n x=1+1\n assert x ","parameters":{"max_new_tokens":60,"stop":["<|endoftext|>", "\n\n"],"top_p":0.95}}' \ 31 | -H 'Content-Type: application/json' 32 | ``` 33 | 34 | 35 | load test with `k6` 36 | 37 | ```bash 38 | k6 run starcoder_load.js 39 | ``` 40 | 41 | or with docker 42 | ```bash 43 | docker run --net=host -v $(pwd)/starcoder_load.js:/load.js loadimpact/k6:latest run /load.js 44 | ``` 45 | 46 | 47 | ### Inference Results 48 | 49 | We used `k6` with `constant-vus` executor, a fixed number of VUs execute as many iterations as possible for a specified amount of time. 50 | 51 | 52 | | VU | GPU | time per token (p95) | queue time (p95) | 53 | | --- | ---- | -------------------- | ---------------- | 54 | | 1 | A10G | 30ms | 1ms | 55 | | 5 | A10G | 65ms | 105ms | 56 | | 10 | A10G | 104ms | 120ms | 57 | | 20 | A10G | 203ms | 5110ms | 58 | | 1 | A100 | 30ms | 1ms | 59 | | 5 | A100 | 59ms | 64ms | 60 | | 10 | A100 | 50ms | 51ms | 61 | | 20 | A100 | 59ms | 49ms | 62 | | 40 | A100 | 73ms | 1000ms | 63 | | 60 | A100 | 59ms | 113ms | 64 | | 80 | A100 | 92ms | 165ms | 65 | | 100 | A100 | 72ms | 1111ms | 66 | | 120 | A100 | 77ms | 1270ms | 67 | | 140 | A100 | _request start failing_ | _request start failing_ | 68 | 69 | 70 | -------------------------------------------------------------------------------- /inference/starcoder_load.js: -------------------------------------------------------------------------------- 1 | import { check } from 'k6'; 2 | import http from 'k6/http'; 3 | import { Trend, Counter } from 'k6/metrics'; 4 | 5 | // Define configurations 6 | const host = __ENV.HOST || 'http://127.0.0.1:8080'; 7 | 8 | // Define the metrics 9 | const totalTime = new Trend('total_time', true); 10 | const validationTime = new Trend('validation_time', true); 11 | const queueTime = new Trend('queue_time', true); 12 | const inferenceTime = new Trend('inference_time', true); 13 | const timePerToken = new Trend('time_per_token', true); 14 | const generatedTokens = new Counter('generated_tokens'); 15 | 16 | export const options = { 17 | thresholds: { 18 | http_req_failed: ['rate==0'], 19 | }, 20 | scenarios: { 21 | load_test: { 22 | executor: 'constant-vus', 23 | duration: '60s', 24 | vus: 140, 25 | }, 26 | }, 27 | }; 28 | 29 | export default function () { 30 | // Create Body 31 | const payload = { 32 | inputs: "\n def test():\n x=1+1\n assert x ", 33 | parameters: { 34 | max_new_tokens: 60, 35 | details: true 36 | }, 37 | }; 38 | 39 | const headers = { 'Content-Type': 'application/json' }; 40 | const res = http.post("http://127.0.0.1:8080/generate", JSON.stringify(payload), { 41 | headers 42 | }); 43 | 44 | check(res, { 45 | 'Post status is 200': (r) => res.status === 200, 46 | }); 47 | 48 | if (res.status === 200) { 49 | totalTime.add(res.headers["X-Total-Time"]); 50 | 
validationTime.add(res.headers["X-Validation-Time"]); 51 | queueTime.add(res.headers["X-Queue-Time"]); 52 | inferenceTime.add(res.headers["X-Inference-Time"]); 53 | timePerToken.add(res.headers["X-Time-Per-Token"]); 54 | generatedTokens.add(res.json().details.generated_tokens); 55 | } 56 | } -------------------------------------------------------------------------------- /inference/vllm-function-calling.md: -------------------------------------------------------------------------------- 1 | # vLLM Function Calling Inference 2 | 3 | This guide demonstrates how to run vLLM with function calling capabilities using Llama models. 4 | 5 | ``` 6 | docker run --gpus all \ 7 | --env "HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token)" \ 8 | -p 8000:8000 \ 9 | --shm-size=10G \ 10 | --ipc=host \ 11 | vllm/vllm-openai:latest \ 12 | --model meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size 8 --max_model_len 4096 --enable-auto-tool-choice --tool-call-parser llama3_json 13 | ``` -------------------------------------------------------------------------------- /training/configs/accelerate_configs/deepspeed_zero1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero_stage: 1 8 | distributed_type: DEEPSPEED 9 | downcast_bf16: 'no' 10 | machine_rank: 0 11 | main_training_function: main 12 | mixed_precision: bf16 13 | num_machines: 1 14 | num_processes: 8 15 | rdzv_backend: static 16 | same_network: true 17 | tpu_env: [] 18 | tpu_use_cluster: false 19 | tpu_use_sudo: false 20 | use_cpu: false 21 | -------------------------------------------------------------------------------- /training/configs/accelerate_configs/deepspeed_zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /training/configs/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 8 | fsdp_backward_prefetch: BACKWARD_PRE 9 | fsdp_cpu_ram_efficient_loading: true 10 | fsdp_forward_prefetch: true 11 | fsdp_offload_params: false 12 | fsdp_sharding_strategy: FULL_SHARD 13 | fsdp_state_dict_type: SHARDED_STATE_DICT 14 | fsdp_sync_module_states: true 15 | fsdp_use_orig_params: true 16 | machine_rank: 0 17 | main_training_function: main 18 | mixed_precision: bf16 19 | num_machines: 1 20 | num_processes: 8 21 | rdzv_backend: static 22 | same_network: true 23 | tpu_env: [] 24 | tpu_use_cluster: false 25 | tpu_use_sudo: false 26 | use_cpu: false 27 | 
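# Example launch (illustrative): pass this config to accelerate from the training/ directory, e.g.
#   accelerate launch --config_file configs/accelerate_configs/fsdp.yaml scripts/run_sft.py --config receipes/llama-3-1-8b-spectrum.yaml
# (The script and its --config recipe argument are examples; check scripts/run_sft.py for the exact CLI.)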
-------------------------------------------------------------------------------- /training/configs/accelerate_configs/fsdp_qlora.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: true 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: true 11 | fsdp_sharding_strategy: FULL_SHARD 12 | fsdp_state_dict_type: SHARDED_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: false 15 | machine_rank: 0 16 | main_training_function: main 17 | mixed_precision: 'no' 18 | num_machines: 1 19 | num_processes: 2 20 | rdzv_backend: static 21 | same_network: true 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false -------------------------------------------------------------------------------- /training/configs/ds_falcon_180b_z3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupLR", 16 | "params": { 17 | "warmup_min_lr": "auto", 18 | "warmup_max_lr": "auto", 19 | "warmup_num_steps": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | "gradient_accumulation_steps": "auto", 35 | "gradient_clipping": "auto", 36 | "steps_per_print": 2000, 37 | "train_batch_size": "auto", 38 | "train_micro_batch_size_per_gpu": "auto", 39 | "wall_clock_breakdown": false 40 | } -------------------------------------------------------------------------------- /training/configs/ds_falcon_180b_z3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupLR", 16 | "params": { 17 | "warmup_min_lr": "auto", 18 | "warmup_max_lr": "auto", 19 | "warmup_num_steps": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "stage3_gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "steps_per_print": 2000, 45 | "train_batch_size": "auto", 46 | "train_micro_batch_size_per_gpu": "auto", 47 | "wall_clock_breakdown": false 48 | 
} -------------------------------------------------------------------------------- /training/configs/ds_flan_t5_z3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 3, 29 | "overlap_comm": true, 30 | "contiguous_gradients": true, 31 | "sub_group_size": 1e9, 32 | "reduce_bucket_size": "auto", 33 | "stage3_prefetch_bucket_size": "auto", 34 | "stage3_param_persistence_threshold": "auto", 35 | "stage3_max_live_parameters": 1e9, 36 | "stage3_max_reuse_distance": 1e9, 37 | "stage3_gather_16bit_weights_on_model_save": true 38 | }, 39 | "gradient_accumulation_steps": "auto", 40 | "gradient_clipping": "auto", 41 | "steps_per_print": 2000, 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "wall_clock_breakdown": false 45 | } -------------------------------------------------------------------------------- /training/configs/ds_flan_t5_z3_config_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupLR", 16 | "params": { 17 | "warmup_min_lr": "auto", 18 | "warmup_max_lr": "auto", 19 | "warmup_num_steps": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | "gradient_accumulation_steps": "auto", 35 | "gradient_clipping": "auto", 36 | "steps_per_print": 2000, 37 | "train_batch_size": "auto", 38 | "train_micro_batch_size_per_gpu": "auto", 39 | "wall_clock_breakdown": false 40 | } -------------------------------------------------------------------------------- /training/configs/ds_flan_t5_z3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 3, 29 | "offload_optimizer": { 30 | "device": "cpu", 31 | "pin_memory": true 32 | }, 33 | "offload_param": { 34 | "device": "cpu", 35 | "pin_memory": true 36 | }, 37 | "overlap_comm": true, 38 | 
"contiguous_gradients": true, 39 | "sub_group_size": 1e9, 40 | "reduce_bucket_size": "auto", 41 | "stage3_prefetch_bucket_size": "auto", 42 | "stage3_param_persistence_threshold": "auto", 43 | "stage3_max_live_parameters": 1e9, 44 | "stage3_max_reuse_distance": 1e9, 45 | "stage3_gather_16bit_weights_on_model_save": true 46 | }, 47 | "gradient_accumulation_steps": "auto", 48 | "gradient_clipping": "auto", 49 | "steps_per_print": 2000, 50 | "train_batch_size": "auto", 51 | "train_micro_batch_size_per_gpu": "auto", 52 | "wall_clock_breakdown": false 53 | } -------------------------------------------------------------------------------- /training/configs/ds_flan_t5_z3_offload_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupLR", 16 | "params": { 17 | "warmup_min_lr": "auto", 18 | "warmup_max_lr": "auto", 19 | "warmup_num_steps": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "stage3_gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "steps_per_print": 2000, 45 | "train_batch_size": "auto", 46 | "train_micro_batch_size_per_gpu": "auto", 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /training/configs/spectrum/snr_results_meta-llama-Meta-Llama-3.1-8B_unfrozenparameters_30percent.yaml: -------------------------------------------------------------------------------- 1 | unfrozen_parameters: 2 | - ^lm_head.weight$ 3 | - ^model.embed_tokens.weight$ 4 | # input_layernorm layers 5 | - model.layers.0.input_layernorm 6 | - model.layers.1.input_layernorm 7 | - model.layers.2.input_layernorm 8 | - model.layers.3.input_layernorm 9 | - model.layers.4.input_layernorm 10 | - model.layers.5.input_layernorm 11 | - model.layers.6.input_layernorm 12 | - model.layers.7.input_layernorm 13 | - model.layers.8.input_layernorm 14 | # lm_head layers 15 | # mlp.down_proj layers 16 | - model.layers.1.mlp.down_proj 17 | - model.layers.0.mlp.down_proj 18 | - model.layers.30.mlp.down_proj 19 | - model.layers.2.mlp.down_proj 20 | - model.layers.21.mlp.down_proj 21 | - model.layers.22.mlp.down_proj 22 | - model.layers.29.mlp.down_proj 23 | - model.layers.5.mlp.down_proj 24 | - model.layers.4.mlp.down_proj 25 | # mlp.gate_proj layers 26 | - model.layers.1.mlp.gate_proj 27 | - model.layers.2.mlp.gate_proj 28 | - model.layers.3.mlp.gate_proj 29 | - model.layers.4.mlp.gate_proj 30 | - model.layers.0.mlp.gate_proj 31 | - model.layers.25.mlp.gate_proj 32 | - model.layers.26.mlp.gate_proj 33 | - model.layers.5.mlp.gate_proj 34 | - model.layers.24.mlp.gate_proj 35 | # mlp.up_proj layers 36 | - model.layers.4.mlp.up_proj 37 | - model.layers.3.mlp.up_proj 38 | - model.layers.0.mlp.up_proj 39 | - model.layers.5.mlp.up_proj 
40 | - model.layers.7.mlp.up_proj 41 | - model.layers.6.mlp.up_proj 42 | - model.layers.2.mlp.up_proj 43 | - model.layers.1.mlp.up_proj 44 | - model.layers.8.mlp.up_proj 45 | # model.embed_tokens layers 46 | # model.norm layers 47 | # post_attention_layernorm layers 48 | - model.layers.0.post_attention_layernorm 49 | - model.layers.1.post_attention_layernorm 50 | - model.layers.2.post_attention_layernorm 51 | - model.layers.3.post_attention_layernorm 52 | - model.layers.4.post_attention_layernorm 53 | - model.layers.5.post_attention_layernorm 54 | - model.layers.6.post_attention_layernorm 55 | - model.layers.7.post_attention_layernorm 56 | - model.layers.8.post_attention_layernorm 57 | # self_attn.k_proj layers 58 | - model.layers.29.self_attn.k_proj 59 | - model.layers.25.self_attn.k_proj 60 | - model.layers.23.self_attn.k_proj 61 | - model.layers.28.self_attn.k_proj 62 | - model.layers.21.self_attn.k_proj 63 | - model.layers.19.self_attn.k_proj 64 | - model.layers.22.self_attn.k_proj 65 | - model.layers.20.self_attn.k_proj 66 | - model.layers.24.self_attn.k_proj 67 | # self_attn.o_proj layers 68 | - model.layers.14.self_attn.o_proj 69 | - model.layers.7.self_attn.o_proj 70 | - model.layers.5.self_attn.o_proj 71 | - model.layers.11.self_attn.o_proj 72 | - model.layers.6.self_attn.o_proj 73 | - model.layers.24.self_attn.o_proj 74 | - model.layers.9.self_attn.o_proj 75 | - model.layers.13.self_attn.o_proj 76 | - model.layers.10.self_attn.o_proj 77 | # self_attn.q_proj layers 78 | - model.layers.8.self_attn.q_proj 79 | - model.layers.13.self_attn.q_proj 80 | - model.layers.9.self_attn.q_proj 81 | - model.layers.14.self_attn.q_proj 82 | - model.layers.10.self_attn.q_proj 83 | - model.layers.11.self_attn.q_proj 84 | - model.layers.0.self_attn.q_proj 85 | - model.layers.15.self_attn.q_proj 86 | - model.layers.1.self_attn.q_proj 87 | # self_attn.v_proj layers 88 | - model.layers.26.self_attn.v_proj 89 | - model.layers.17.self_attn.v_proj 90 | - model.layers.3.self_attn.v_proj 91 | - model.layers.28.self_attn.v_proj 92 | - model.layers.29.self_attn.v_proj 93 | - model.layers.21.self_attn.v_proj 94 | - model.layers.15.self_attn.v_proj 95 | - model.layers.16.self_attn.v_proj 96 | - model.layers.20.self_attn.v_proj 97 | -------------------------------------------------------------------------------- /training/deepseed-falcon-180b-lora-fa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Fine-tune Falcon 180B with DeepSpeed ZeRO, LoRA & Flash Attention\n", 9 | "\n", 10 | "Falcon 180B is the newest version of Falcon LLM family. It is the biggest open source model with 180B parameter and trained on more data - 3.5T tokens with context length window upto 4K tokens. In this example we will show how to fine-tune Falcon 180B using DeepSpeed, Hugging Face Transformers, LoRA with Flash Attention on a multi-GPU machine.\n", 11 | "\n", 12 | "In detail you will learn how to:\n", 13 | "1. Setup Development Environment\n", 14 | "2. Load and prepare the dataset\n", 15 | "3. Fine-Tune Falcon 180B using DeepSpeed, Hugging Face Transformers, LoRA with Flash Attention\n", 16 | "\n", 17 | "Before we get into the code lets take a quick look on the technologies and methods we are going to use: \n", 18 | "\n", 19 | "### What is DeepSpeed ZeRO?\n", 20 | "\n", 21 | "DeepSpeed ZeRO focuses on efficient large-scale training of Transformers. 
ZeRO, or Zero Redundancy Optimizer, reduces memory footprint by partitioning model states across devices instead of basic data parallelism. This saves significant memory - ZeRO-Infinity can reduce usage 100x vs data parallelism. ZeRO-Offload further reduces memory by offloading parts of model and optimizer to CPU, enabling 10B+ parameter models on 1 GPU. ZeRO [integrates with HuggingFace Transformers through a configuration file](https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/deepspeed).\n", 22 | "\n", 23 | "### What is LoRA?\n", 24 | "\n", 25 | "[LoRA](https://arxiv.org/abs/2106.09685) enables efficient fine-tuning of large language models. It decomposes weight matrices into smaller, trainable update matrices that adapt while keeping original weights frozen. This drastically reduces trainable parameters for faster, lower-memory tuning. LoRA integrates into [Transformers via Hugging Face's PEFT](https://huggingface.co/docs/peft/conceptual_guides/lora). It combines well with methods like DeepSpeed. Key advantages are efficient tuning, portable models, and no inference latency when merging trained weights. LoRA allows adaptively training massive models with limited resources.\n", 26 | "\n", 27 | "### What is Flash Attention?\n", 28 | "\n", 29 | "Flash Attention is an algorithm that speeds up the core attention mechanism in Transformer language models by restructuring computations. It uses techniques like tiling and recomputation to reduce the high memory costs of attention, enabling models to process longer text sequences. Flash Attention 2 optimizes parallelism and work partitioning for 2x speedup over the previous version, reaching 230 TFLOPS/s on A100 GPUs.\n", 30 | "\n", 31 | "\n", 32 | "### Access Falcon 180B \n", 33 | "\n", 34 | "Before we can start training we have to make sure that we accepted the license [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B) to be able to use it. You can accept the license by clicking on the Agree and access repository button on the model page at: \n", 35 | "* [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B)\n", 36 | "\n", 37 | "> The example was created and run a DGX A100 8-GPU machine with 80GB GPU memory per GPU.\n", 38 | "\n", 39 | "## 1. Setup Development Environment\n", 40 | "\n", 41 | "conda create --name hf python=3.10 -c conda-forge\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# install torch with the correct cuda version, check nvcc --version\n", 51 | "!pip install torch --extra-index-url https://download.pytorch.org/whl/cu118 --upgrade\n", 52 | "# install Hugging Face Libraries and additional dependencies\n", 53 | "!pip install \"transformers==4.34.0\" \"datasets==2.14.5\" \"accelerate==0.22.0\" \"evaluate==0.4.0\" \"peft==0.5.0\" tensorboard packaging --upgrade\n", 54 | "# install deepspeed and ninja for jit compilations of kernels\n", 55 | "!pip install \"deepspeed==0.10.3\" ninja --upgrade\n", 56 | "# install additional Flash Attention\n", 57 | "!pip install flash-attn --no-build-isolation --upgrade" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "To access any Falcon 180B asset we need to login into our hugging face account. 
We can do this by running the following command:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "!huggingface-cli login --token YOUR_TOKEN" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## 2. Load and prepare the dataset\n", 81 | "\n", 82 | "we will use the [dolly](https://huggingface.co/datasets/databricks/databricks-dolly-15k) an open source dataset of instruction-following records generated by thousands of Databricks employees in several of the behavioral categories outlined in the [InstructGPT paper](https://arxiv.org/abs/2203.02155), including brainstorming, classification, closed QA, generation, information extraction, open QA, and summarization.\n", 83 | "\n", 84 | "```python\n", 85 | "{\n", 86 | " \"instruction\": \"What is world of warcraft\",\n", 87 | " \"context\": \"\",\n", 88 | " \"response\": \"World of warcraft is a massive online multi player role playing game. It was released in 2004 by bizarre entertainment\"\n", 89 | "}\n", 90 | "```\n", 91 | "\n", 92 | "To load the `samsum` dataset, we use the `load_dataset()` method from the 🤗 Datasets library." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from datasets import load_dataset\n", 102 | "from random import randrange\n", 103 | "\n", 104 | "# Load dataset from the hub\n", 105 | "dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")\n", 106 | "\n", 107 | "print(f\"dataset size: {len(dataset)}\")\n", 108 | "print(dataset[randrange(len(dataset))])\n", 109 | "# dataset size: 15011" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "To instruct tune our model we need to convert our structured examples into a collection of tasks described via instructions. We define a `formatting_function` that takes a sample and returns a string with our format instruction." 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "def format_dolly(sample):\n", 126 | " instruction = f\"### Instruction\\n{sample['instruction']}\"\n", 127 | " context = f\"### Context\\n{sample['context']}\" if len(sample[\"context\"]) > 0 else None\n", 128 | " response = f\"### Answer\\n{sample['response']}\"\n", 129 | " # join all the parts together\n", 130 | " prompt = \"\\n\\n\".join([i for i in [instruction, context, response] if i is not None])\n", 131 | " return prompt\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "lets test our formatting function on a random example." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "from random import randrange\n", 148 | "\n", 149 | "print(format_dolly(dataset[randrange(len(dataset))]))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "In addition, to formatting our samples we also want to pack multiple samples to one sequence to have a more efficient training." 
157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "from transformers import AutoTokenizer\n", 166 | "\n", 167 | "model_id = \"tiiuae/falcon-180B\" \n", 168 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 169 | "tokenizer.pad_token = tokenizer.eos_token" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "We define some helper functions to pack our samples into sequences of a given length and then tokenize them." 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "from random import randint\n", 186 | "from itertools import chain\n", 187 | "from functools import partial\n", 188 | "\n", 189 | "\n", 190 | "# template dataset to add prompt to each sample\n", 191 | "def template_dataset(sample):\n", 192 | " sample[\"text\"] = f\"{format_dolly(sample)}{tokenizer.eos_token}\"\n", 193 | " return sample\n", 194 | "\n", 195 | "\n", 196 | "# apply prompt template per sample\n", 197 | "dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))\n", 198 | "# print random sample\n", 199 | "print(dataset[randint(0, len(dataset))][\"text\"])\n", 200 | "\n", 201 | "# empty list to save remainder from batches to use in next batch\n", 202 | "remainder = {\"input_ids\": [], \"attention_mask\": [], \"token_type_ids\": []}\n", 203 | "\n", 204 | "def chunk(sample, chunk_length=2048):\n", 205 | " # define global remainder variable to save remainder from batches to use in next batch\n", 206 | " global remainder\n", 207 | " # Concatenate all texts and add remainder from previous batch\n", 208 | " concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}\n", 209 | " concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}\n", 210 | " # get total number of tokens for batch\n", 211 | " batch_total_length = len(concatenated_examples[list(sample.keys())[0]])\n", 212 | "\n", 213 | " # get max number of chunks for batch\n", 214 | " if batch_total_length >= chunk_length:\n", 215 | " batch_chunk_length = (batch_total_length // chunk_length) * chunk_length\n", 216 | "\n", 217 | " # Split by chunks of max_len.\n", 218 | " result = {\n", 219 | " k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]\n", 220 | " for k, t in concatenated_examples.items()\n", 221 | " }\n", 222 | " # add remainder to global variable for next batch\n", 223 | " remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}\n", 224 | " # prepare labels\n", 225 | " result[\"labels\"] = result[\"input_ids\"].copy()\n", 226 | " return result\n", 227 | "\n", 228 | "\n", 229 | "# tokenize and chunk dataset\n", 230 | "lm_dataset = dataset.map(\n", 231 | " lambda sample: tokenizer(sample[\"text\"]), batched=True, remove_columns=list(dataset.features)\n", 232 | ").map(\n", 233 | " partial(chunk, chunk_length=2048),\n", 234 | " batched=True,\n", 235 | ")\n", 236 | "\n", 237 | "# Print total number of samples\n", 238 | "print(f\"Total number of samples: {len(lm_dataset)}\")" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "After we processed the datasets we want to save it to disk to be able to use the processed dataset later during training." 
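Saving with `save_to_disk` (next cell) lets the training job reload exactly the same packed dataset. As a small reference sketch, the reload side mirrors what `run_ds_lora.py` does with `load_from_disk`:

```python
from datasets import load_from_disk

# Reload the packed dataset inside the training script (same path as used for saving)
lm_dataset = load_from_disk("dolly-processed")
print(f"Loaded {len(lm_dataset)} packed samples")
```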
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "lm_dataset.save_to_disk(\"dolly-processed\")" 255 | ] 256 | }, 257 | { 258 | "attachments": {}, 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "## 3. Fine-Tune Falcon 180B using DeepSpeed, Hugging Face Transformers, LoRA with Flash Attention\n", 263 | "\n", 264 | "DeepSpeed ZeRO is natively integrated into the [Hugging Face Transformers Trainer](https://huggingface.co/docs/transformers/v4.33.1/en/main_classes/deepspeed). The integration enables leveraging ZeRO by simply providing a DeepSpeed config file, and the Trainer takes care of the rest. We created 2 deepspeed configurations for the experiments we ran, including `CPU offloading`: \n", 265 | "\n", 266 | "- [ds_falcon_180b_z3.json](./configs/ds_falcon_180b_z3.json)\n", 267 | "- [ds_falcon_180b_z3_offload.json](./configs/ds_falcon_180b_z3_offload.json)\n", 268 | "\n", 269 | "As mentioned in the beginning, we ran those example using a 8x NVIDIA A100 80GB. This means we can leverage `bf16`, which reduces the memory footprint of the model by almost ~2x, which allows us to train without offloading efficiently. We are going to use the [ds_falcon_180b_z3.json](./configs/ds_falcon_180b_z3.json). If you are irritated by the `auto` values, check the [documentation](https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/deepspeed#configuration).\n", 270 | "\n", 271 | "In addition to the deepspeed configuration we also need a training script, which implements LoRA and patches our model to use flash-attention. We created a [run_ds_lora.py](./run_ds_lora.py) script, which patches the falcon model using the [falcon_patch.py](./utils/falcon_patch.py) utils and implements LoRA using [peft_utils.py](./utils/peft_utils.py). \n", 272 | "\n", 273 | "> When you run make sure that you have the same folder structure and utils/configs available. The easiest way is to clone the whole repository. Go into the `training` directory and start the training.\n", 274 | "\n", 275 | "Once we made sure that we have the right configuration and training script we can start the training using `torchrun`." 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "!torchrun --nproc_per_node 8 run_ds_lora.py \\\n", 285 | " --model_id tiiuae/falcon-180B \\\n", 286 | " --dataset_path dolly-processed \\\n", 287 | " --output_dir falcon-180b-lora-fa \\\n", 288 | " --num_train_epochs 3 \\\n", 289 | " --per_device_train_batch_size 1 \\\n", 290 | " --learning_rate 4e-3 \\\n", 291 | " --gradient_checkpointing True \\\n", 292 | " --gradient_accumulation_steps 8 \\\n", 293 | " --bf16 True \\\n", 294 | " --tf32 True \\\n", 295 | " --use_flash_attn True \\\n", 296 | " --lr_scheduler_type \"constant_with_warmup\" \\\n", 297 | " --logging_steps 25 \\\n", 298 | " --save_steps 100 \\\n", 299 | " --save_total_limit 3 \\\n", 300 | " --deepspeed configs/ds_falcon_180b_z3.json" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "_Note: Since we are using LoRA we are only saving the \"trained\" adapter weights, to save some storage. 
If you want to merge the adapters back into the base model and save the merged model, you can add `--merge_adapters True` or use the [merge_adapter_weights.py](./scripts/merge_adapter_weights.py) script._\n", 308 | "\n", 309 | "In our example for Falcon 180B, the training time was `153 minutes` or ~2 hours for 3 epochs. For comparison, the pretraining cost of Falcon 180B was ~7,000,000 GPU hours, roughly 3,500,000 times more than the ~2 hours of fine-tuning.\n", 310 | "\n", 311 | "## Conclusion \n", 312 | "\n", 313 | "In this blog post, you learned how to fine-tune the Falcon 180B model using DeepSpeed, Hugging Face Transformers, and LoRA with Flash Attention on a multi-GPU machine. We used: \n", 314 | "\n", 315 | "* DeepSpeed ZeRO for memory optimization, enabling training models with up to trillions of parameters on limited GPU memory. We used stage 3 (ZeRO-Infinity) to optimize memory usage.\n", 316 | "* Hugging Face Transformers and Datasets for easily loading and preparing the text dataset as well as providing an intuitive Trainer API.\n", 317 | "* LoRA, a method to efficiently fine-tune large language models by only updating a small percentage of parameters each iteration. This drastically reduces memory usage and computational costs.\n", 318 | "* Flash Attention - a highly optimized attention implementation that further reduces the memory footprint.\n", 319 | "\n", 320 | "Combining all of these methods allows us to fine-tune LLMs with 100B+ parameters with limited resources. The example provides a template for efficiently tuning the largest publicly available models." 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "pytorch", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.10.12" 346 | }, 347 | "orig_nbformat": 4, 348 | "vscode": { 349 | "interpreter": { 350 | "hash": "2d58e898dde0263bc564c6968b04150abacfd33eed9b19aaa8e45c040360e146" 351 | } 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 2 356 | } 357 | -------------------------------------------------------------------------------- /training/fine-tune-modern-bert-in-2025.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Fine-tune classifier with ModernBERT in 2025\n", 9 | "\n", 10 | "Large Language Models (LLMs) have become ubiquitous in 2024. However, smaller, specialized models - particularly for classification tasks - remain critical for building efficient and cost-effective AI systems. One key use case is routing user prompts to the most appropriate LLM or selecting optimal few-shot examples, where fast, accurate classification is essential.\n", 12 | "\n", 12 | "This blog post demonstrates how to fine-tune ModernBERT, a new state-of-the-art encoder model, for classifying user prompts to implement an intelligent LLM router. ModernBERT is a refreshed version of BERT models, with 8192 token context length, significantly better downstream performance, and much faster processing speeds.\n", 13 | "\n", 14 | "You will learn how to:\n", 15 | "1. 
Setup environment and install libraries\n", 16 | "2. Load and prepare the classification dataset \n", 17 | "3. Fine-tune & evaluate ModernBERT with the Hugging Face `Trainer`\n", 18 | "4. Run inference & test model\n", 19 | "\n", 20 | "## Quick intro: ModernBERT\n", 21 | "\n", 22 | "ModernBERT is a modernization of BERT maintaining full backward compatibility while delivering dramatic improvements through architectural innovations like rotary positional embeddings (RoPE), alternating attention patterns, and hardware-optimized design. The model comes in two sizes:\n", 23 | "- ModernBERT Base (139M parameters)\n", 24 | "- ModernBERT Large (395M parameters)\n", 25 | "\n", 26 | "ModernBERT achieves state-of-the-art performance across classification, retrieval and code understanding tasks while being 2-4x faster than previous encoder models. This makes it ideal for high-throughput production applications like LLM routing, where both accuracy and latency are critical.\n", 27 | "\n", 28 | "ModernBERT was trained on 2 trillion tokens of diverse data including web documents, code, and scientific articles - making it much more robust than traditional BERT models trained primarily on Wikipedia. This broader knowledge helps it better understand the nuances of user prompts across different domains.\n", 29 | "\n", 30 | "If you want to learn more about ModernBERT's architecture and training process, check out the official [blog](https://huggingface.co/blog/modernbert). \n", 31 | "\n", 32 | "---\n", 33 | "\n", 34 | "Now let's get started building our LLM router with ModernBERT! 🚀\n", 35 | "\n", 36 | "*Note: This tutorial was created and tested on an NVIDIA L4 GPU with 24GB of VRAM.*\n", 37 | "\n", 38 | "## Setup environment and install libraries\n", 39 | "\n", 40 | "Our first step is to install Hugging Face Libraries and Pyroch, including transformers and datasets. " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Install Pytorch & other libraries\n", 50 | "%pip install \"torch==2.4.1\" tensorboard \n", 51 | "%pip install flash-attn \"setuptools<71.0.0\" scikit-learn \n", 52 | "\n", 53 | "# Install Hugging Face libraries\n", 54 | "%pip install --upgrade \\\n", 55 | " \"datasets==3.1.0\" \\\n", 56 | " \"accelerate==1.2.1\" \\\n", 57 | " \"hf-transfer==0.1.8\"\n", 58 | " #\"transformers==4.47.1\" \\\n", 59 | "\n", 60 | "# ModernBERT is not yet available in an official release, so we need to install it from github\n", 61 | "%pip install \"git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1\" --upgrade\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "We will use the [Hugging Face Hub](https://huggingface.co/models) as a remote model versioning service. This means we will automatically push our model, logs and information to the Hub during training. You must register on the [Hugging Face](https://huggingface.co/join) for this. After you have an account, we will use the `login` util from the `huggingface_hub` package to log into our account and store our token (access key) on the disk." 
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from huggingface_hub import login\n", 78 | "\n", 79 | "login(token=\"\", add_to_git_credential=True) # ADD YOUR TOKEN HERE" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## 2. Load and prepare the dataset\n", 87 | "\n", 88 | "In our example we want to fine-tune ModernBERT to act as a router for user prompts. Therefore we need a classification dataset consisting of user prompts and their \"difficulty\" score. We are going to use the `DevQuasar/llm_router_dataset-synth` dataset, which is a synthetic dataset of ~15,000 user prompts with a difficulty score of \"large_llm\" (`1`) or \"small_llm\" (`0`). \n", 89 | "\n", 90 | "\n", 91 | "We will use the `load_dataset()` method from the [🤗 Datasets](https://huggingface.co/docs/datasets/index) library to load the `DevQuasar/llm_router_dataset-synth` dataset." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from datasets import load_dataset\n", 101 | "\n", 102 | "# Dataset id from huggingface.co/dataset\n", 103 | "dataset_id = \"DevQuasar/llm_router_dataset-synth\"\n", 104 | "\n", 105 | "# Load raw dataset\n", 106 | "raw_dataset = load_dataset(dataset_id)\n", 107 | "\n", 108 | "print(f\"Train dataset size: {len(raw_dataset['train'])}\")\n", 109 | "print(f\"Test dataset size: {len(raw_dataset['test'])}\")" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Let’s check out an example of the dataset." 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "from random import randrange\n", 126 | "\n", 127 | "random_id = randrange(len(raw_dataset['train']))\n", 128 | "raw_dataset['train'][random_id]\n", 129 | "# {'id': '6225a9cd-5cba-4840-8e21-1f9cf2ded7e6',\n", 130 | "# 'prompt': 'How many legs does a spider have?',\n", 131 | "# 'label': 0}" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "To train our model, we need to convert our text prompts to token IDs. This is done by a Tokenizer, which tokenizes the inputs (including converting the tokens to their corresponding IDs in the pre-trained vocabulary) if you want to learn more about this, out **[chapter 6](https://huggingface.co/course/chapter6/1?fw=pt)** of the [Hugging Face Course](https://huggingface.co/course/chapter1/1)." 
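As a quick illustration of what the tokenizer returns for a single prompt (the exact IDs depend on the ModernBERT vocabulary, so the values below are placeholders):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
encoded = tokenizer("How many legs does a spider have?")

print(encoded["input_ids"])       # token ids for the word pieces, wrapped in special tokens
print(encoded["attention_mask"])  # 1 for every real (non-padding) token
```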
139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "from transformers import AutoTokenizer\n", 148 | "\n", 149 | "# Model id to load the tokenizer\n", 150 | "model_id = \"answerdotai/ModernBERT-base\"\n", 151 | "\n", 152 | "# Load Tokenizer\n", 153 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 154 | "tokenizer.model_max_length = 512 # set model_max_length to 512 as prompts are not longer than 1024 tokens\n", 155 | "\n", 156 | "# Tokenize helper function\n", 157 | "def tokenize(batch):\n", 158 | " return tokenizer(batch['text'], padding='max_length', truncation=True, return_tensors=\"pt\")\n", 159 | "\n", 160 | "# Tokenize dataset\n", 161 | "raw_dataset = raw_dataset.rename_column(\"label\", \"labels\") # to match Trainer\n", 162 | "tokenized_dataset = raw_dataset.map(tokenize, batched=True,remove_columns=[\"text\"])\n", 163 | "\n", 164 | "print(tokenized_dataset[\"train\"].features.keys())\n", 165 | "# dict_keys(['input_ids', 'token_type_ids', 'attention_mask','lable'])" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "## 3. Fine-tune & evaluate ModernBERT with the Hugging Face `Trainer`\n", 173 | "\n", 174 | "After we have processed our dataset, we can start training our model. We will use the [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) model. The first step is to load our model with `AutoModelForSequenceClassification` class from the [Hugging Face Hub](https://huggingface.co/answerdotai/ModernBERT-base). This will initialize the pre-trained ModernBERT weights with a classification head on top. Here we pass the number of classes (2) from our dataset and the label names to have readable outputs for inference." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "from transformers import AutoModelForSequenceClassification\n", 184 | "\n", 185 | "# Model id to load the tokenizer\n", 186 | "model_id = \"answerdotai/ModernBERT-base\"\n", 187 | "\n", 188 | "# Prepare model labels - useful for inference\n", 189 | "labels = tokenized_dataset[\"train\"].features[\"labels\"].names\n", 190 | "num_labels = len(labels)\n", 191 | "label2id, id2label = dict(), dict()\n", 192 | "for i, label in enumerate(labels):\n", 193 | " label2id[label] = str(i)\n", 194 | " id2label[str(i)] = label\n", 195 | "\n", 196 | "# Download the model from huggingface.co/models\n", 197 | "model = AutoModelForSequenceClassification.from_pretrained(\n", 198 | " model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,\n", 199 | ")" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "We evaluate our model during training. The `Trainer` supports evaluation during training by providing a `compute_metrics` method. We use the `evaluate` library to calculate the [f1 metric](https://huggingface.co/spaces/evaluate-metric/f1) during training on our test split." 
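The cell below computes the F1 score with scikit-learn directly; for reference, an equivalent `compute_metrics` built on the `evaluate` library could look like this (a sketch, not part of the original notebook):

```python
import numpy as np
import evaluate

# Load the f1 metric from the evaluate library
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # returns a dict like {"f1": 0.99}
    return f1_metric.compute(predictions=predictions, references=labels, average="weighted")
```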
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 5, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "import numpy as np\n", 216 | "from sklearn.metrics import f1_score\n", 217 | "\n", 218 | "# Metric helper method\n", 219 | "def compute_metrics(eval_pred):\n", 220 | " predictions, labels = eval_pred\n", 221 | " predictions = np.argmax(predictions, axis=1)\n", 222 | " score = f1_score(\n", 223 | " labels, predictions, labels=labels, pos_label=1, average=\"weighted\"\n", 224 | " )\n", 225 | " return {\"f1\": float(score) if score == 1 else score}" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "The last step is to define the hyperparameters (`TrainingArguments`) we use for our training. Here we are adding optimizations introduced features for fast training times using `torch_compile` option in the `TrainingArguments`.\n", 233 | "\n", 234 | "We also leverage the [Hugging Face Hub](https://huggingface.co/models) integration of the `Trainer` to push our checkpoints, logs, and metrics during training into a repository." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "from huggingface_hub import HfFolder\n", 244 | "from transformers import Trainer, TrainingArguments\n", 245 | "\n", 246 | "# Define training args\n", 247 | "training_args = TrainingArguments(\n", 248 | " output_dir= \"modernbert-llm-router\",\n", 249 | " per_device_train_batch_size=32,\n", 250 | " per_device_eval_batch_size=16,\n", 251 | " learning_rate=5e-5,\n", 252 | "\t\tnum_train_epochs=5,\n", 253 | " bf16=True, # bfloat16 training \n", 254 | " optim=\"adamw_torch_fused\", # improved optimizer \n", 255 | " # logging & evaluation strategies\n", 256 | " logging_strategy=\"steps\",\n", 257 | " logging_steps=100,\n", 258 | " eval_strategy=\"epoch\",\n", 259 | " save_strategy=\"epoch\",\n", 260 | " save_total_limit=2,\n", 261 | " load_best_model_at_end=True,\n", 262 | " metric_for_best_model=\"f1\",\n", 263 | " # push to hub parameters\n", 264 | " report_to=\"tensorboard\",\n", 265 | " push_to_hub=True,\n", 266 | " hub_strategy=\"every_save\",\n", 267 | " hub_token=HfFolder.get_token(),\n", 268 | "\n", 269 | ")\n", 270 | "\n", 271 | "# Create a Trainer instance\n", 272 | "trainer = Trainer(\n", 273 | " model=model,\n", 274 | " args=training_args,\n", 275 | " train_dataset=tokenized_dataset[\"train\"],\n", 276 | " eval_dataset=tokenized_dataset[\"test\"],\n", 277 | " compute_metrics=compute_metrics,\n", 278 | ")" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "We can start our training by using the **`train`** method of the `Trainer`." 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "# Start training\n", 295 | "trainer.train()" 296 | ] 297 | }, 298 | { 299 | "attachments": {}, 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "Fine-tuning `answerdotai/ModernBERT-base` on ~15,000 synthetic prompts for 5 epochs took `321` seconds and our best model achieved a `f1` score of `0.993`. 🚀 I also ran the training with `bert-base-uncased` to compare the training time and performance. The original BERT achieved a `f1` score of `0.99` and took `1048` seconds to train. \n", 304 | "\n", 305 | "*Note: ModernBERT and BERT both almost achieve the same performance. 
This indicates that the dataset is not challenging and probably could be solved using a logistic regression classifier. I ran the same code on the [banking77](https://huggingface.co/datasets/legacy-datasets/banking77) dataset. A dataset of ~13,000 customer service queries with 77 classes. There the ModernBERT outperformed the original BERT by 3% (f1 score of 0.93 vs 0.90)*\n", 306 | "\n", 307 | "\n", 308 | "Lets save our final best model and tokenizer to the Hugging Face Hub and create a model card." 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "# Save processor and create model card\n", 318 | "tokenizer.save_pretrained(\"modernbert-llm-router\")\n", 319 | "trainer.create_model_card()\n", 320 | "trainer.push_to_hub()" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "## 4. Run Inference & test model\n", 328 | "\n", 329 | "To wrap up this tutorial, we will run inference on a few examples and test our model. We will use the `pipeline` method from the `transformers` library to run inference on our model." 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "from transformers import pipeline\n", 339 | "\n", 340 | "# load model from huggingface.co/models using our repository id\n", 341 | "classifier = pipeline(\"sentiment-analysis\", model=\"modernbert-llm-router\", device=0)\n", 342 | "\n", 343 | "sample = \"How does the structure and function of plasmodesmata affect cell-to-cell communication and signaling in plant tissues, particularly in response to environmental stresses?\"\n", 344 | "\n", 345 | "\n", 346 | "pred = classifier(sample)\n", 347 | "print(pred)\n", 348 | "# [{'label': 'large_llm', 'score': 1.0}]" 349 | ] 350 | }, 351 | { 352 | "attachments": {}, 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "## Conclusion\n", 357 | "\n", 358 | "In this tutorial, we learned how to fine-tune ModernBERT for an LLM routing classification task. We demonstrated how to leverage the Hugging Face ecosystem to efficiently train and deploy a specialized classifier that can intelligently route user prompts to the most appropriate LLM model.\n", 359 | "\n", 360 | "Using modern training optimizations like flash attention, fused optimizers and mixed precision, we were able to train our model efficiently. Comparing ModernBERT with the original BERT we reduced training time by approximately 3x (1048s vs 321s) on our dataset and outperformed the original BERT by 3% on a more challenging dataset. But more importantly, ModernBERT was trained on 2 trillion tokens, which are more diverse and up to date than the Wikipedia-based training data of the original BERT.\n", 361 | "\n", 362 | "This example showcases how smaller, specialized models remain valuable in the age of large language models - particularly for high-throughput, latency-sensitive tasks like LLM routing. By using ModernBERT's improved architecture and broader training data, we can build more robust and efficient classification systems." 
363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [] 369 | } 370 | ], 371 | "metadata": { 372 | "kernelspec": { 373 | "display_name": "pytorch", 374 | "language": "python", 375 | "name": "python3" 376 | }, 377 | "language_info": { 378 | "codemirror_mode": { 379 | "name": "ipython", 380 | "version": 3 381 | }, 382 | "file_extension": ".py", 383 | "mimetype": "text/x-python", 384 | "name": "python", 385 | "nbconvert_exporter": "python", 386 | "pygments_lexer": "ipython3", 387 | "version": "3.11.11" 388 | }, 389 | "orig_nbformat": 4, 390 | "vscode": { 391 | "interpreter": { 392 | "hash": "2d58e898dde0263bc564c6968b04150abacfd33eed9b19aaa8e45c040360e146" 393 | } 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 2 398 | } 399 | -------------------------------------------------------------------------------- /training/inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | from datasets import load_dataset 3 | from random import randint 4 | 5 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=512) 6 | 7 | # use revision without "checkpoints-" as vLLM downloads all of them 8 | llm = LLM(model="philschmid/qwen-2.5-3b-r1-countdown", revision="099c0f8cbfc522e7c3a476edfb749f576b164539") 9 | 10 | # Load dataset from Hugging Face Hub 11 | dataset_id = "Jiayi-Pan/Countdown-Tasks-3to4" 12 | dataset = load_dataset(dataset_id, split="train") 13 | sample = dataset[randint(0, len(dataset))] 14 | 15 | # create conversation 16 | messages = [ 17 | {"role": "system", "content": "You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer."}, 18 | {"role": "user", "content": f"Using the numbers {sample['nums']}, create an equation that equals {sample['target']}. You can use basic arithmetic operations (+, -, *, /) one or multiple times but each number can only be used once. Show your work in tags. And return the final equation in tags, for example (1 + 2) / 3 . Think step by step inside tags."}, 19 | {"role": "assistant", "content": "Let me solve this step by step.\n"} 20 | ] 21 | # generate response 22 | res = llm.generate(llm.get_tokenizer().apply_chat_template(messages, tokenize=False, continue_final_message=True), sampling_params) 23 | res = "" + res[0].outputs[0].text 24 | print(res) 25 | 26 | # We need to use the numbers 37, 15, 4, and 13 with basic arithmetic operations to make 16. Let's try different combinations: 27 | # - 37 - 15 - 4 - 13 = 6 (too low) 28 | # - 37 - 15 + 4 - 13 = 13 (too low) 29 | # - 37 + 15 - 4 - 13 = 35 (too high) 30 | # - 37 - 15 + 4 + 13 = 39 (too high) 31 | # - 15 + 4 + 13 - 37 = -1 (too low) 32 | # - 37 + 15 + 4 - 13 = 43 (too high) 33 | # - 15 + 4 * 13 / 37 = 15 + 52 / 37 (not an integer) 34 | # - 15 * 4 / 37 - 37 = -28.24 (not a whole number) 35 | # - 4 * 13 / 15 - 37 = 41.3333 (not a whole number) 36 | # After all combinations, I got not any integer result as 16. 
37 | # 38 | # 37 - 15 + 4 + 13 -------------------------------------------------------------------------------- /training/launch.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks-per-node=1 3 | #SBATCH --nodes=1 4 | #SBATCH --gres=gpu:4 5 | #SBATCH --qos=high 6 | #SBATCH --partition=hopper-prod # Adjust this for your cluster 7 | #SBATCH --output=/fsx/philipp/logs/%x-%j.out # Adjust this for your cluster 8 | #SBATCH --err=/fsx/philipp/logs/%x-%j.err # Adjust this for your cluster 9 | 10 | set -x -e 11 | 12 | source ~/.bashrc 13 | micromamba activate dpo 14 | echo "START TIME: $(date)" 15 | 16 | CONFIG_FILE=$1 17 | 18 | # Training setup 19 | NUM_NODES=$SLURM_NNODES 20 | GPUS_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) 21 | WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) 22 | # get number of gpus for training which world size - 1 23 | NUM_GPUS_FOR_TRAINING=$(($WORLD_SIZE - 1)) 24 | 25 | 26 | # so processes know who to talk to 27 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 28 | MASTER_PORT=6000 29 | 30 | export CMD=" \ 31 | scripts/run_r1_grpo.py --config $CONFIG_FILE 32 | " 33 | 34 | export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ 35 | --config_file configs/accelerate_configs/deepspeed_zero3.yaml \ 36 | --num_machines $NUM_NODES \ 37 | --num_processes $NUM_GPUS_FOR_TRAINING \ 38 | --main_process_ip $MASTER_ADDR \ 39 | --main_process_port $MASTER_PORT \ 40 | --machine_rank \$SLURM_PROCID \ 41 | --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \ 42 | --max_restarts 1 \ 43 | --role \$(hostname -s): \ 44 | --tee 3 \ 45 | " 46 | 47 | # force crashing on nccl issues like hanging broadcast 48 | export NCCL_ASYNC_ERROR_HANDLING=1 49 | # export NCCL_DEBUG=INFO 50 | # export NCCL_DEBUG_SUBSYS=COLL 51 | # export NCCL_SOCKET_NTHREADS=1 52 | # export NCCL_NSOCKS_PERTHREAD=1 53 | # export CUDA_LAUNCH_BLOCKING=1 54 | 55 | # Specific configuration optimized for the Hugging Face Compute Cluster 56 | # Be ye warned this may not work on other clusters! 57 | module load cuda/12.1 58 | 59 | # srun error handling: 60 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks 61 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code 62 | SRUN_ARGS=" \ 63 | --wait=60 \ 64 | --kill-on-bad-exit=1 \ 65 | " 66 | 67 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1 68 | 69 | echo "END TIME: $(date)" -------------------------------------------------------------------------------- /training/optimize-llama-2-gptq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Quantize open LLMs using optimum and GPTQ\n", 9 | "\n", 10 | "The Hugging Face Optimum team collaborated with AutoGPTQ library to provide a simple API that apply GPTQ quantization on language models. With GPTQ quantization open LLMs to 8, 4, 3 or even 2 bits to run them on smaller Hardware without a big drop of performance. \n", 11 | "\n", 12 | "In the blog, you will learn how to:\n", 13 | "\n", 14 | "1. Setup our development environment\n", 15 | "2. Prepare quantization dataset\n", 16 | "3. Load and Quantize Model\n", 17 | "4. 
Test performance and inference speed\n", 18 | "5. Bonus: Run Inference with Text Generation Inference\n", 19 | " \n", 20 | "But we before we get started lets take quick look on what GPTQ does. \n", 21 | "\n", 22 | "_Note: This tutorial was created and run on a g5.2xlarge AWS EC2 Instance, including an NVIDIA A10G GPU._\n", 23 | "\n", 24 | "\n", 25 | "## What is GPTQ?\n", 26 | "\n", 27 | "[GPTQ](https://arxiv.org/abs/2210.17323) is a post-training quantziation method to compress LLMs, like GPT. GPTQ compresses GPT models by reducing the number of bits needed to store each weight in the model, from 32 bits down to just 3-4 bits. This means the model takes up much less memory, so it can run on less Hardware, e.g. Single GPU for 13B Llama2 models. GPTQ analyzes each layer of the model separately and approximating the weights in a way that preserves the overall accuracy.\n", 28 | "\n", 29 | "The main benefits are:\n", 30 | "* Quantizes the weights of the model layer-by-layer to 4 bits instead of 16 bits, this reduces the needed memory by 4x.\n", 31 | "* Quantization is done gradually to minimize the accuracy loss from quantization.\n", 32 | "* Achieves same latency as fp16 model, but 4x less memory usage, sometimes faster due to custom kernels, e.g. [Exllama](https://github.com/turboderp/exllama)\n", 33 | "* Quantized weights can be saved to disk for a head of time quantization.\n", 34 | "\n", 35 | "_Note: GPTQ quantization only works for text model for now. Futhermore, the quantization process can take a lot of time. You check on the [Hugging Face Hub](https://huggingface.co/models?search=gptq) if there is not already a GPTQ quantized version of the model you want to use._\n", 36 | "\n", 37 | "--- \n", 38 | "\n", 39 | "## 1. Setup our development environment\n", 40 | "\n", 41 | "Let's start coding, but first, install our dependencies." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "!pip install \"transformers==4.32.1\" \"optimum==1.12.0\" \"auto-gptq==0.4.2\" \"accelerate==0.22.0\" \"safetensors>=0.3.1\" --upgrade" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## 2. Prepare quantization dataset\n", 58 | "\n", 59 | "GPTQ is a post-training quantization method, so we need to prepare a dataset to quantize our model. We can either use a dataset from the [Hugging Face Hub](https://huggingface.co/datasets) or use our own dataset. In this blog, we are going to use the [WikiText](https://huggingface.co/datasets/wikitext) dataset from the Hugging Face Hub. The dataset is used to quantize the weights to minimize the performance loss. It is recommended to use a quantization dataset with atleast `128` samples.\n", 60 | "\n", 61 | "_Note: [TheBloke](https://huggingface.co/TheBloke) a very active community member is contributing hundreds of gptq weights to the Hugging Face Hub. He mostly uses wikitext as quantization dataset for general domain models._\n", 62 | "\n", 63 | "If you want to use, e.g. your fine-tuning dataset for quantization you can provide it as a list instead of the \"id\", check out this [example](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb). 
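A minimal sketch of what that could look like is below; the calibration texts are hypothetical placeholders, and the `GPTQQuantizer` call mirrors the one used later in this notebook.

```python
from optimum.gptq import GPTQQuantizer

# Hypothetical calibration samples, e.g. taken from your own fine-tuning dataset.
# Ideally provide 128+ representative texts.
calibration_samples = [
    "### Instruction:\nUse the Input below to create an instruction ...",
    "### Instruction:\nSummarize the following support ticket ...",
]

# Pass the list directly instead of a dataset id like "wikitext2"
quantizer = GPTQQuantizer(bits=4, dataset=calibration_samples, model_seqlen=4096)
```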
" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# Dataset id from Hugging Face \n", 73 | "dataset_id = \"wikitext2\"" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## 3. Load and Quantize Model\n", 81 | "\n", 82 | "Optimum integrates GPTQ quantization in the `optimum.qptq` namespace with a `GPTQQuantizer`. The quantizer takes our dataset (id or list), bits, and model_seqlen as input. For more customization check [here](https://github.com/huggingface/optimum/blob/234a427450a7dcc978b227fa627ebcdab1764318/optimum/gptq/quantizer.py#L76).\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from optimum.gptq import GPTQQuantizer\n", 92 | "\n", 93 | "# GPTQ quantizer\n", 94 | "quantizer = GPTQQuantizer(bits=4, dataset=dataset_id, model_seqlen=4096)\n", 95 | "quantizer.quant_method = \"gptq\"" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "After we have created our Quantizer we can load our model using Transformers. In our example we will quantize a [Llama 2 7B](https://huggingface.co/philschmid/llama-2-7b-instruction-generator), which we trained in my other blog post [\"Extended Guide: Instruction-tune Llama 2\"](https://www.philschmid.de/instruction-tune-llama-2). We are going to load our model in `fp16` since GPTQ adopts a mixed int4/fp16 quantization scheme where weights are quantized as int4 while activations remain in float16. " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "import torch\n", 112 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 113 | "\n", 114 | "# Hugging Face model id\n", 115 | "model_id = \"philschmid/llama-2-7b-instruction-generator\"\n", 116 | "\n", 117 | "tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) # bug with fast tokenizer\n", 118 | "model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16) # we load the model in fp16 on purpose" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "After we loaded our model we are ready to quantize it. \n", 126 | "_Note: Quantization can take process can take a lot of time depending on one's hardware. 
For this example the quantization on a single A10G GPU for a 7B model took ~minutes._ " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "import os \n", 136 | "import json\n", 137 | "\n", 138 | "# quantize the model \n", 139 | "quantized_model = quantizer.quantize_model(model, tokenizer)\n", 140 | "\n", 141 | "# save the quantize model to disk\n", 142 | "save_folder = \"quantized_llama\"\n", 143 | "quantized_model.save_pretrained(save_folder, safe_serialization=True)\n", 144 | "\n", 145 | "# load fresh, fast tokenizer and save it to disk\n", 146 | "tokenizer = AutoTokenizer.from_pretrained(model_id).save_pretrained(save_folder)\n", 147 | "\n", 148 | "# save quantize_config.json for TGI \n", 149 | "with open(os.path.join(save_folder, \"quantize_config.json\"), \"w\", encoding=\"utf-8\") as f:\n", 150 | " quantizer.disable_exllama = False\n", 151 | " json.dump(quantizer.to_dict(), f, indent=2)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "since the model was partially offloaded it set `disable_exllama` to `True` to avoid an error. For inference and production load we want to leverage the exllama kernels. Therefore we need to change the `config.json`" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "with open(os.path.join(save_folder, \"config.json\"), \"r\", encoding=\"utf-8\") as f:\n", 168 | " config = json.load(f)\n", 169 | " config[\"quantization_config\"][\"disable_exllama\"] = False\n", 170 | " with open(os.path.join(save_folder, \"config.json\"), \"w\", encoding=\"utf-8\") as f:\n", 171 | " json.dump(config, f, indent=2)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## 4. Test performance and inference speed\n", 179 | "\n", 180 | "Since the latest release of transformers we can load any GPTQ quantized model directly using the `AutoModelForCausalLM` class this. You can either load already quantized models from Hugging Face, e.g. [TheBloke/Llama-2-13B-chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ) or models you quantized yourself. Since we want to test here the results of our quantization we are going to load our quantized model from disk and compare it to our non quantize model. \n", 181 | "\n", 182 | "First lets our our non quantized model and test it on a simple prompt." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "import time \n", 192 | "\n", 193 | "# The prompt is based on the fine-tuning from the model: https://www.philschmid.de/instruction-tune-llama-2#4-test-model-and-run-inference\n", 194 | "prompt = \"\"\"### Instruction:\n", 195 | "Use the Input below to create an instruction, which could have been used to generate the input using an LLM.\n", 196 | "\n", 197 | "### Input:\n", 198 | "Dear [boss name],\n", 199 | "\n", 200 | "I'm writing to request next week, August 1st through August 4th,\n", 201 | "off as paid time off.\n", 202 | "\n", 203 | "I have some personal matters to attend to that week that require\n", 204 | "me to be out of the office. 
I wanted to give you as much advance\n", 205 | "notice as possible so you can plan accordingly while I am away.\n", 206 | "\n", 207 | "Thank you, [Your name]\n", 208 | "\n", 209 | "### Response:\n", 210 | "\"\"\"\n", 211 | "\n", 212 | "# helper function to generate text and measure latency\n", 213 | "def generate_helper(pipeline,prompt=prompt):\n", 214 | " # warm up\n", 215 | " for i in range(5):\n", 216 | " _ = pipeline(\"Warm up\")\n", 217 | "\n", 218 | " # measure latency in a simple way \n", 219 | " start = time.time()\n", 220 | " out = pipeline(prompt, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.9)\n", 221 | " end = time.time()\n", 222 | " \n", 223 | " generated_text = out[0][\"generated_text\"][len(prompt):]\n", 224 | " \n", 225 | " latency_per_token_in_ms = ((end-start)/len(pipeline.tokenizer(generated_text)[\"input_ids\"]))*1000\n", 226 | " \n", 227 | " # return the generated text and the latency\n", 228 | " return {\"text\": out[0][\"generated_text\"][len(prompt):], \"latency\": f\"{round(latency_per_token_in_ms,2)}ms/token\"}\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "We can load the vanilla transformers model and run inference using the `pipeline` class. " 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "import torch\n", 245 | "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n", 246 | "\n", 247 | "# Hugging Face model id\n", 248 | "model_id = \"philschmid/llama-2-7b-instruction-generator\"\n", 249 | "\n", 250 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 251 | "model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"auto\", torch_dtype=torch.float16) # we load the model in fp16 on purpose\n", 252 | "\n", 253 | "pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "lets create our vanilla base line" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "import torch \n", 270 | "\n", 271 | "vanilla_res = generate_helper(pipe)\n", 272 | "\n", 273 | "print(f\"Latency: {vanilla_res['latency']}\")\n", 274 | "print(f\"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB\")\n", 275 | "print(f\"Generated Instruction: {vanilla_res['text']}\")\n", 276 | "\n", 277 | "# Latency: 37.49ms/token\n", 278 | "# GPU memory: 12.62 GB\n", 279 | "# Generated Instruction: Write a request for PTO letter to my boss" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "# clean up \n", 289 | "del pipe\n", 290 | "del model \n", 291 | "del tokenizer\n", 292 | "torch.cuda.empty_cache()" 293 | ] 294 | }, 295 | { 296 | "attachments": {}, 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Since we have now our baseline we can test and validate our GPTQ quantize weights. Therefore we will use the new `gptq` integration into the `AutoModelForCausalLM` class where we can directly load the `gptq` weights. 
" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "import torch\n", 310 | "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n", 311 | "\n", 312 | "# path to gptq weights\n", 313 | "model_id = \"quantized_llama\"\n", 314 | "\n", 315 | "q_tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 316 | "q_model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"auto\", torch_dtype=torch.float16)\n", 317 | "\n", 318 | "qtq_pipe = pipeline(\"text-generation\", model=q_model, tokenizer=q_tokenizer)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "Now, we can test our quantized model on the same prompt as our baseline." 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "gpq_res = generate_helper(qtq_pipe)\n", 335 | "\n", 336 | "print(f\"Latency: {gpq_res['latency']}\")\n", 337 | "print(f\"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB\")\n", 338 | "print(f\"Generated Instruction: {gpq_res['text']}\")\n", 339 | "\n", 340 | "# Latency: 36.0ms/token\n", 341 | "# GPU memory: 3.83 GB\n", 342 | "# Generated Instruction: Write a letter requesting time off" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "For comparison the vanilla model needed ~12.6GB Memory and the GPTQ model needed ~3.8GB Memory, with equal performance. GPTQ allowed us to save ~4x memory (don't forget pytorch has default kernels). " 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "## 5. Bonus: Run Inference with Text Generation Inference\n", 357 | "\n", 358 | "Text Generation Inference supports GPTQ model for more efficient deployments. We simply need to provide `gptq` as `QUANTIZE` environment variable when starting our container. " 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "model=\"/home/ubuntu/test-gptq\"\n", 368 | "num_shard=1\n", 369 | "quantize=\"gptq\"\n", 370 | "max_input_length=1562\n", 371 | "max_total_tokens=4096 # 4096\n", 372 | "\n", 373 | "!docker run --gpus all -ti -p 8080:80 \\\n", 374 | " -e MODEL_ID=$model \\\n", 375 | " -e QUANTIZE=$quantize \\\n", 376 | " -e NUM_SHARD=$num_shard \\\n", 377 | " -e MAX_INPUT_LENGTH=$max_input_length \\\n", 378 | " -e MAX_TOTAL_TOKENS=$max_total_tokens \\\n", 379 | " -v $model:$model \\\n", 380 | " ghcr.io/huggingface/text-generation-inference:1.0.3" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "We can invoke our container using curl. \n", 388 | "_Note: The first request will be slow. _" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "curl 127.0.0.1:8080/generate \\\n", 398 | " -X POST \\\n", 399 | " -d '{\"inputs\":\"### Instruction:\\nUse the Input below to create an instruction, which could have been used to generate the input using an LLM.\\n\\n### Input:\\nDear [boss name],\\n\\nI am writing to request next week, August 1st through August 4th,\\noff as paid time off.\\n\\nI have some personal matters to attend to that week that require\\nme to be out of the office. 
I wanted to give you as much advance\\nnotice as possible so you can plan accordingly while I am away.\\n\\nThank you, [Your name]\\n\\n### Response:\",\"parameters\":{\"temperature\":0.2, \"top_p\": 0.95, \"max_new_tokens\": 256}}' \\\n", 400 | " -H 'Content-Type: application/json'" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "With Text Generation inference we are achieving ~`22.942983ms` latency per token, which is 2x faster than transformers. If you plan to deploy your model in production, I would recommend to use Text Generation Inference." 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [] 414 | } 415 | ], 416 | "metadata": { 417 | "kernelspec": { 418 | "display_name": "pytorch", 419 | "language": "python", 420 | "name": "python3" 421 | }, 422 | "language_info": { 423 | "codemirror_mode": { 424 | "name": "ipython", 425 | "version": 3 426 | }, 427 | "file_extension": ".py", 428 | "mimetype": "text/x-python", 429 | "name": "python", 430 | "nbconvert_exporter": "python", 431 | "pygments_lexer": "ipython3", 432 | "version": "3.9.16" 433 | }, 434 | "orig_nbformat": 4, 435 | "vscode": { 436 | "interpreter": { 437 | "hash": "2d58e898dde0263bc564c6968b04150abacfd33eed9b19aaa8e45c040360e146" 438 | } 439 | } 440 | }, 441 | "nbformat": 4, 442 | "nbformat_minor": 2 443 | } 444 | -------------------------------------------------------------------------------- /training/preprocessing/create_flan_t5_cnn_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoTokenizer 3 | import numpy as np 4 | import os 5 | from datasets import concatenate_datasets 6 | import numpy as np 7 | 8 | # experiment config 9 | model_id = "google/flan-t5-xl" 10 | 11 | # Dataset 12 | dataset_id = "cnn_dailymail" 13 | dataset_config = "3.0.0" 14 | save_dataset_path = "data" 15 | text_column = "article" 16 | summary_column = "highlights" 17 | prompt_start = "Summarize the following news article:\n" 18 | generation_start = "\nSummary:\n" 19 | prompt_template = f"{prompt_start}{{input}}{generation_start}" 20 | 21 | # Load dataset from the hub 22 | dataset = load_dataset(dataset_id, name=dataset_config) 23 | # Load tokenizer of FLAN-t5-base 24 | tokenizer = AutoTokenizer.from_pretrained(model_id) 25 | 26 | print(f"Train dataset size: {len(dataset['train'])}") 27 | print(f"Test dataset size: {len(dataset['test'])}") 28 | 29 | prompt_lenght = len(tokenizer(prompt_template.format(input=""))["input_ids"]) 30 | max_sample_length = tokenizer.model_max_length - prompt_lenght 31 | print(f"Prompt lenght: {prompt_lenght}") 32 | print(f"Max input lenght: {max_sample_length}") 33 | 34 | # The maximum total input sequence length after tokenization. 35 | # Sequences longer than this will be truncated, sequences shorter will be padded. 36 | tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map( 37 | lambda x: tokenizer(x[text_column], truncation=True), batched=True, remove_columns=[text_column, summary_column] 38 | ) 39 | max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]]) 40 | max_source_length = min(max_source_length, max_sample_length) 41 | print(f"Max source length: {max_source_length}") 42 | 43 | # The maximum total sequence length for target text after tokenization. 44 | # Sequences longer than this will be truncated, sequences shorter will be padded." 
45 | tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map( 46 | lambda x: tokenizer(x[summary_column], truncation=True), batched=True, remove_columns=[text_column, summary_column] 47 | ) 48 | target_lenghts = [len(x) for x in tokenized_targets["input_ids"]] 49 | # use 95th percentile as max target length 50 | max_target_length = int(np.percentile(target_lenghts, 95)) 51 | print(f"Max target length: {max_target_length}") 52 | 53 | 54 | def preprocess_function(sample, padding="max_length"): 55 | # created prompted input 56 | inputs = [prompt_template.format(input=item) for item in sample[text_column]] 57 | 58 | # tokenize inputs 59 | model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True) 60 | 61 | # Tokenize targets with the `text_target` keyword argument 62 | labels = tokenizer( 63 | text_target=sample[summary_column], max_length=max_target_length, padding=padding, truncation=True 64 | ) 65 | 66 | # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore 67 | # padding in the loss. 68 | if padding == "max_length": 69 | labels["input_ids"] = [ 70 | [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] 71 | ] 72 | 73 | model_inputs["labels"] = labels["input_ids"] 74 | return model_inputs 75 | 76 | 77 | tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=list(dataset["train"].features)) 78 | print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}") 79 | 80 | tokenized_dataset["train"].save_to_disk(os.path.join(save_dataset_path, "train")) 81 | tokenized_dataset["test"].save_to_disk(os.path.join(save_dataset_path, "eval")) 82 | -------------------------------------------------------------------------------- /training/receipes/dpo-llama-3-1-8b-qlora.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1 3 | tokenizer_name_or_path: philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | use_liger: false 8 | bf16: true 9 | tf32: true 10 | output_dir: runs/dpo-llama-3-1-8b-math-ep3 11 | 12 | # Dataset arguments 13 | dataset_id_or_path: philschmid/philschmid-llama-3-1-8b-math-orca-spectr-philschmid-DMath-candidates 14 | 15 | # LoRA arguments 16 | use_peft: true 17 | load_in_4bit: true 18 | lora_target_modules: "all-linear" 19 | # important as we need to train the special tokens for the chat template of llama 20 | lora_modules_to_save: ["lm_head", "embed_tokens"] # you might need to change this for qwen or other models 21 | lora_r: 16 22 | lora_alpha: 16 23 | 24 | # Training arguments 25 | beta: 0.1 26 | max_length: 1536 27 | max_prompt_length: 768 28 | loss_type: sigmoid # default loss, alternatives: https://huggingface.co/docs/trl/dpo_trainer#loss-functions 29 | num_train_epochs: 3 30 | per_device_train_batch_size: 1 31 | gradient_accumulation_steps: 8 32 | gradient_checkpointing: true 33 | gradient_checkpointing_kwargs: 34 | use_reentrant: false 35 | learning_rate: 5.0e-6 36 | lr_scheduler_type: constant 37 | warmup_ratio: 0.03 38 | 39 | # Logging arguments 40 | logging_strategy: steps 41 | logging_steps: 5 42 | report_to: 43 | - tensorboard 44 | save_strategy: "epoch" 45 | seed: 42 46 | 47 | # Hugging Face Hub 48 | push_to_hub: true 49 | # hub_model_id: 
llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir 50 | hub_strategy: every_save -------------------------------------------------------------------------------- /training/receipes/dpo-llama-3-1-8b.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1 3 | tokenizer_name_or_path: philschmid/llama-3-1-8b-math-orca-spectrum-10k-ep1 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | use_liger: false 8 | bf16: true 9 | tf32: true 10 | output_dir: runs/dpo-llama-3-1-8b-math 11 | 12 | # Dataset arguments 13 | dataset_id_or_path: philschmid/philschmid-llama-3-1-8b-math-orca-spectr-philschmid-DMath-candidates 14 | 15 | # Training arguments 16 | beta: 0.1 17 | max_length: 1536 18 | max_prompt_length: 768 19 | loss_type: sigmoid # default loss, alternatives: https://huggingface.co/docs/trl/dpo_trainer#loss-functions 20 | num_train_epochs: 3 21 | per_device_train_batch_size: 2 22 | gradient_accumulation_steps: 8 23 | gradient_checkpointing: true 24 | gradient_checkpointing_kwargs: 25 | use_reentrant: false 26 | learning_rate: 5.0e-7 27 | lr_scheduler_type: constant 28 | warmup_ratio: 0.03 29 | 30 | # Logging arguments 31 | logging_strategy: steps 32 | logging_steps: 5 33 | report_to: 34 | - tensorboard 35 | save_strategy: "epoch" 36 | seed: 42 37 | 38 | # Hugging Face Hub 39 | push_to_hub: true 40 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir 41 | hub_strategy: every_save -------------------------------------------------------------------------------- /training/receipes/grpo-qwen-2.5-3b-deepseek-r1-countdown.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Qwen/Qwen2.5-3B-Instruct 3 | model_revision: main 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | bf16: true 7 | tf32: true 8 | output_dir: runs/qwen-2.5-3b-r1-countdown 9 | 10 | # Dataset arguments 11 | dataset_id_or_path: Jiayi-Pan/Countdown-Tasks-3to4 12 | 13 | # Lora Arguments 14 | # No LoRA is used here 15 | 16 | # Training arguments 17 | max_steps: 450 18 | per_device_train_batch_size: 1 19 | gradient_accumulation_steps: 8 20 | gradient_checkpointing: true 21 | gradient_checkpointing_kwargs: 22 | use_reentrant: false 23 | learning_rate: 5.0e-7 # 1.0e-6 as in the deepseek math paper 5-e7 from https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05 24 | lr_scheduler_type: cosine 25 | warmup_ratio: 0.03 26 | # GRPO specific parameters 27 | beta: 0.001 # 0.04 as in the deepseek math paper 0.001 from https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05 28 | max_prompt_length: 256 29 | max_completion_length: 1024 30 | num_generations: 8 31 | use_vllm: true 32 | # vllm_device: "cuda:3" 33 | vllm_gpu_memory_utilization: 0.5 34 | 35 | # Logging arguments 36 | logging_strategy: steps 37 | logging_steps: 2 38 | report_to: 39 | - tensorboard 40 | save_strategy: "steps" 41 | save_steps: 25 42 | seed: 42 43 | 44 | # Hugging Face Hub 45 | push_to_hub: true 46 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir 47 | hub_strategy: every_save -------------------------------------------------------------------------------- 
/training/receipes/llama-3-1-8b-qlora.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Meta-Llama/Meta-Llama-3.1-8B 3 | tokenizer_name_or_path: Meta-Llama/Meta-Llama-3.1-8B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | use_liger: true 8 | bf16: true 9 | tf32: true 10 | output_dir: runs/llama-3-1-8b-math-orca-qlora-10k-ep1 11 | 12 | # Dataset arguments 13 | dataset_id_or_path: train_dataset.json 14 | max_seq_length: 1024 15 | packing: true 16 | 17 | # LoRA arguments 18 | use_peft: true 19 | load_in_4bit: true 20 | lora_target_modules: "all-linear" 21 | # important as we need to train the special tokens for the chat template of llama 22 | lora_modules_to_save: ["lm_head", "embed_tokens"] # you might need to change this for qwen or other models 23 | lora_r: 16 24 | lora_alpha: 16 25 | 26 | # Training arguments 27 | num_train_epochs: 1 28 | per_device_train_batch_size: 8 29 | gradient_accumulation_steps: 2 30 | gradient_checkpointing: true 31 | gradient_checkpointing_kwargs: 32 | use_reentrant: false 33 | learning_rate: 2.0e-4 34 | lr_scheduler_type: constant 35 | warmup_ratio: 0.1 36 | 37 | # Logging arguments 38 | logging_strategy: steps 39 | logging_steps: 5 40 | report_to: 41 | - tensorboard 42 | save_strategy: "epoch" 43 | seed: 42 44 | 45 | # Hugging Face Hub 46 | push_to_hub: true 47 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir 48 | hub_strategy: every_save -------------------------------------------------------------------------------- /training/receipes/llama-3-1-8b-spectrum.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: Meta-Llama/Meta-Llama-3.1-8B 3 | tokenizer_name_or_path: Meta-Llama/Meta-Llama-3.1-8B-Instruct 4 | model_revision: main 5 | torch_dtype: bfloat16 6 | attn_implementation: flash_attention_2 7 | use_liger: true 8 | bf16: true 9 | tf32: true 10 | output_dir: runs/llama-3-1-8b-math-orca-spectrum-10k-ep1 11 | 12 | # Dataset arguments 13 | dataset_id_or_path: train_dataset.json 14 | max_seq_length: 1024 15 | packing: true 16 | 17 | # Spectrum arguments 18 | spectrum_config_path: configs/spectrum/snr_results_meta-llama-Meta-Llama-3.1-8B_unfrozenparameters_30percent.yaml 19 | 20 | # Training arguments 21 | num_train_epochs: 1 22 | per_device_train_batch_size: 8 23 | gradient_accumulation_steps: 2 24 | gradient_checkpointing: true 25 | gradient_checkpointing_kwargs: 26 | use_reentrant: false 27 | learning_rate: 5.0e-5 28 | lr_scheduler_type: cosine 29 | warmup_ratio: 0.1 30 | 31 | # Logging arguments 32 | logging_strategy: steps 33 | logging_steps: 5 34 | report_to: 35 | - tensorboard 36 | save_strategy: "epoch" 37 | seed: 42 38 | 39 | # Hugging Face Hub 40 | push_to_hub: true 41 | # hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir 42 | hub_strategy: every_save -------------------------------------------------------------------------------- /training/run_ds_lora.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import cast 3 | 4 | import os 5 | import subprocess 6 | from typing import Optional 7 | import torch 8 | 9 | from transformers import HfArgumentParser, TrainingArguments, Trainer 10 | from utils.peft_utils import SaveDeepSpeedPeftModelCallback, 
create_and_prepare_model 11 | from datasets import load_from_disk 12 | 13 | 14 | # Define and parse arguments. 15 | @dataclass 16 | class ScriptArguments: 17 | """ 18 | Additional arguments for training, which are not part of TrainingArguments. 19 | """ 20 | model_id: str = field( 21 | metadata={ 22 | "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc." 23 | }, 24 | ) 25 | dataset_path: Optional[str] = field( 26 | default="timdettmers/openassistant-guanaco", 27 | metadata={"help": "The preference dataset to use."}, 28 | ) 29 | lora_alpha: Optional[int] = field(default=16) 30 | lora_dropout: Optional[float] = field(default=0.1) 31 | lora_r: Optional[int] = field(default=64) 32 | use_flash_attn: Optional[bool] = field( 33 | default=False, 34 | metadata={"help": "Enables Flash attention for training."}, 35 | ) 36 | merge_adapters: bool = field( 37 | metadata={"help": "Wether to merge weights for LoRA."}, 38 | default=False, 39 | ) 40 | 41 | 42 | def training_function(script_args:ScriptArguments, training_args:TrainingArguments): 43 | 44 | # Load processed dataset from disk 45 | dataset = load_from_disk(script_args.dataset_path) 46 | 47 | # Load and create peft model 48 | model, peft_config, tokenizer = create_and_prepare_model(script_args.model_id,training_args, script_args) 49 | model.config.use_cache = False 50 | 51 | 52 | # Create trainer and add callbacks 53 | trainer = Trainer(model=model, args=training_args, train_dataset=dataset) 54 | trainer.accelerator.print(f"{trainer.model}") 55 | trainer.model.print_trainable_parameters() 56 | trainer.add_callback(SaveDeepSpeedPeftModelCallback(trainer, save_steps=training_args.save_steps)) 57 | 58 | # Start training 59 | trainer.train() 60 | 61 | # Save model on main process 62 | trainer.accelerator.wait_for_everyone() 63 | state_dict = trainer.accelerator.get_state_dict(trainer.deepspeed) 64 | unwrapped_model = trainer.accelerator.unwrap_model(trainer.deepspeed) 65 | if trainer.accelerator.is_main_process: 66 | unwrapped_model.save_pretrained(training_args.output_dir, state_dict=state_dict) 67 | trainer.accelerator.wait_for_everyone() 68 | 69 | # TODO: add merge adapters 70 | # Save everything else on main process 71 | if trainer.args.process_index == 0: 72 | if script_args.merge_adapters: 73 | # merge adapter weights with base model and save 74 | # save int 4 model 75 | trainer.model.save_pretrained(training_args.output_dir, safe_serialization=False) 76 | # clear memory 77 | del model 78 | del trainer 79 | torch.cuda.empty_cache() 80 | 81 | from peft import AutoPeftModelForCausalLM 82 | 83 | # load PEFT model in fp16 84 | model = AutoPeftModelForCausalLM.from_pretrained( 85 | training_args.output_dir, 86 | low_cpu_mem_usage=True, 87 | torch_dtype=torch.float16, 88 | ) 89 | # Merge LoRA and base model and save 90 | model = model.merge_and_unload() 91 | model.save_pretrained( 92 | training_args.output_dir, safe_serialization=True, max_shard_size="8GB" 93 | ) 94 | else: 95 | trainer.model.save_pretrained( 96 | training_args.output_dir, safe_serialization=True 97 | ) 98 | 99 | # save tokenizer 100 | tokenizer.save_pretrained(training_args.output_dir) 101 | 102 | 103 | def main(): 104 | parser = HfArgumentParser([ScriptArguments,TrainingArguments]) 105 | script_args, training_args = parser.parse_args_into_dataclasses() 106 | script_args = cast(ScriptArguments, script_args) 107 | training_args = cast(TrainingArguments, training_args) 108 | 109 | training_function(script_args, training_args) 110 | 111 | 
112 | if __name__ == "__main__": 113 | main() -------------------------------------------------------------------------------- /training/scripts/bloke_gptq.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copied from TheBloke: https://github.com/TheBlokeAI/AIScripts/blob/main/quant_autogptq.py#L59 3 | # python /home/ubuntu/deep-learning-pytorch-huggingface/training/scripts/bloke_gptq.py philschmid/llama-2-7b-instruction-generator gptq_res/ wikitext --seqlen 1024 4 | # 5 | 6 | import time 7 | import os 8 | import logging 9 | import random 10 | from datasets import load_dataset 11 | 12 | class QuantAutoGPTQ: 13 | def __init__(self, model_name_or_path, output_dir, dataset, 14 | num_samples=128, trust_remote_code=False, cache_examples=True, 15 | use_fast=True, use_triton=False, bits=[4], group_size=[128], damp=[0.01], 16 | desc_act=[False], dtype='float16', seqlen=2048, batch_size=1, stop_file=None, 17 | make_folder=False, GPU=0, cuda_alloc_conf=None): 18 | 19 | # Limit visible GPU to the one specified 20 | # We don't currently support multi-GPU, as AutoGPTQ can't use more than one GPU for quant anyway. 21 | #os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU) 22 | 23 | # Allow specifying CUDA allocation config, eg PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32 24 | # This can allow for quantising larger models without running out of VRAM 25 | #if cuda_alloc_conf is not None: 26 | # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = cuda_alloc_conf 27 | 28 | self.pretrained_model_dir = model_name_or_path 29 | self.output_dir_base = output_dir 30 | self.dataset = dataset 31 | self.num_samples = num_samples 32 | self.trust_remote_code = trust_remote_code 33 | self.cache_examples = cache_examples 34 | self.use_fast = use_fast 35 | self.use_triton = use_triton 36 | 37 | def check_list(item): 38 | return item if isinstance(item, list) else [item] 39 | 40 | self.bits = check_list(bits) 41 | self.group_size = check_list(group_size) 42 | self.desc_act = check_list(desc_act) 43 | self.damp = check_list(damp) 44 | 45 | self.dtype = dtype 46 | self.seqlen = seqlen 47 | self.batch_size = batch_size 48 | self.stop_file = stop_file 49 | self.make_folder = make_folder 50 | 51 | self.logger = logging.getLogger(__name__) 52 | self.logger.propagate = True 53 | 54 | from transformers import AutoTokenizer 55 | self.logger.info("Loading tokenizer") 56 | self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_dir, 57 | use_fast=self.use_fast, 58 | trust_remote_code=self.trust_remote_code) 59 | 60 | @staticmethod 61 | def append_dataset(tokenized, num_samples, seqlen): 62 | import numpy as np 63 | import torch 64 | 65 | random.seed(0) 66 | np.random.seed(0) 67 | torch.random.manual_seed(0) 68 | 69 | traindataset = [] 70 | for _ in range(num_samples): 71 | i = random.randint(0, tokenized.input_ids.shape[1] - seqlen - 1) 72 | j = i + seqlen 73 | inp = tokenized.input_ids[:, i:j] 74 | attention_mask = torch.ones_like(inp) 75 | traindataset.append({'input_ids':inp,'attention_mask': attention_mask}) 76 | return traindataset 77 | 78 | #TODO: make a generic method that can load a dataset from HF hub and be told what column(s) to use 79 | def get_math(self): 80 | data = load_dataset('andersonbcdefg/math', split='train') 81 | 82 | extract = data[0:2000] 83 | text = '' 84 | for input, output in zip(extract['message_1'], extract['message_2']): 85 | text += input + ': ' + output + '\n' 86 | 87 | self.logger.info("Tokenising Maths dataset") 88 | tokenized = self.tokenizer(text, 
return_tensors='pt') 89 | 90 | return self.append_dataset(tokenized, self.num_samples, self.seqlen) 91 | def get_medical(self): 92 | data = load_dataset('medalpaca/medical_meadow_wikidoc', split='train') 93 | 94 | extract = data[0:1000] 95 | text = '' 96 | for input, output in zip(extract['input'], extract['output']): 97 | text += input + ' ' + output + '\n' 98 | 99 | self.logger.info("Tokenising Medical dataset") 100 | tokenized = self.tokenizer(text, return_tensors='pt') 101 | 102 | return self.append_dataset(tokenized, self.num_samples, self.seqlen) 103 | 104 | def get_code(self): 105 | data = load_dataset('nickrosh/Evol-Instruct-Code-80k-v1', split='train') 106 | 107 | extract = data[0:1500] 108 | text = '\n'.join(extract['output']) 109 | self.logger.info("Tokenising Code dataset") 110 | tokenized = self.tokenizer(text, return_tensors='pt') 111 | 112 | return self.append_dataset(tokenized, self.num_samples, self.seqlen) 113 | 114 | def get_german(self): 115 | data = load_dataset('deepset/germanquad', split='train') 116 | 117 | def transform_context(sample): 118 | split_context = sample['context'].split('===') 119 | if len(split_context) >= 3: 120 | trans_context = split_context[2] 121 | else: 122 | trans_context = sample['context'] 123 | return {'context': trans_context.strip()} 124 | 125 | subset_data = data.select(range(2000)) 126 | transformed_subset = subset_data.map(transform_context) 127 | text = '\n'.join([item['context'] for item in transformed_subset]) 128 | 129 | self.logger.info("Tokenising German dataset") 130 | tokenized = self.tokenizer(text, return_tensors='pt') 131 | 132 | return self.append_dataset(tokenized, self.num_samples, self.seqlen) 133 | 134 | def get_french(self): 135 | data = load_dataset('gustavecortal/diverse_french_news', split='train') 136 | 137 | extract = data[0:700] 138 | text = '\n'.join(extract['text']) 139 | self.logger.info("Tokenising French dataset") 140 | tokenized = self.tokenizer(text, return_tensors='pt') 141 | 142 | return self.append_dataset(tokenized, self.num_samples, self.seqlen) 143 | 144 | def get_wikitext2(self): 145 | wikidata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') 146 | wikilist = [' \n' if s == '' else s for s in wikidata['text'] ] 147 | 148 | text = ''.join(wikilist) 149 | self.logger.info("Tokenising wikitext2") 150 | tokenized = self.tokenizer(text, return_tensors='pt') 151 | 152 | return self.append_dataset(tokenized, self.num_samples, self.seqlen) 153 | 154 | def get_c4(self): 155 | import numpy as np 156 | import torch 157 | traindata = load_dataset( 158 | 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', use_auth_token=False 159 | ) 160 | 161 | trainloader = [] 162 | for _ in range(self.num_samples): 163 | while True: 164 | i = random.randint(0, len(traindata) - 1) 165 | trainenc = self.tokenizer(traindata[i]['text'], return_tensors='pt') 166 | if trainenc.input_ids.shape[1] >= self.seqlen: 167 | break 168 | i = random.randint(0, trainenc.input_ids.shape[1] - self.seqlen - 1) 169 | j = i + self.seqlen 170 | inp = trainenc.input_ids[:, i:j] 171 | attention_mask = torch.ones_like(inp) 172 | trainloader.append({'input_ids':inp,'attention_mask': attention_mask}) 173 | 174 | return trainloader 175 | 176 | def quantize(self, output_dir, traindataset, bits, group_size, desc_act, damp): 177 | # Hide the super annoying bitsandbytes loading message. We don't even use BnB but I don't know if I can stop it loading entirely. 
178 | os.environ['BITSANDBYTES_NOWELCOME'] = '1' 179 | 180 | # We only import Torch and AutoGPTQ when needed, so that earlier set env vars will affect them. 181 | import torch 182 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig 183 | 184 | quantize_config = BaseQuantizeConfig( 185 | bits=bits, 186 | group_size=group_size, 187 | desc_act=desc_act, 188 | damp_percent=damp 189 | ) 190 | 191 | if self.dtype == 'float16': 192 | torch_dtype = torch.float16 193 | elif self.dtype == 'float32': 194 | torch_dtype = torch.float32 195 | elif self.dtype == 'bfloat16': 196 | torch_dtype = torch.bfloat16 197 | else: 198 | raise ValueError(f"Unsupported dtype: {self.dtype}") 199 | 200 | self.logger.info(f"Loading model from {self.pretrained_model_dir} with trust_remote_code={self.trust_remote_code} and dtype={torch_dtype}") 201 | model = AutoGPTQForCausalLM.from_pretrained(self.pretrained_model_dir, quantize_config=quantize_config, 202 | low_cpu_mem_usage=True, torch_dtype=torch_dtype, trust_remote_code=self.trust_remote_code) 203 | 204 | self.logger.info(f"Starting quantization to {output_dir} with use_triton={self.use_triton}") 205 | start_time = time.time() 206 | model.quantize(traindataset, use_triton=self.use_triton, batch_size=self.batch_size, cache_examples_on_gpu=self.cache_examples) 207 | 208 | self.logger.info(f"Time to quantize model at {output_dir} with use_triton={self.use_triton}: {time.time() - start_time:.2f}") 209 | 210 | self.logger.info(f"Saving quantized model to {output_dir}") 211 | model.save_quantized(output_dir, use_safetensors=True) 212 | self.logger.info("Done.") 213 | 214 | def run_quantization(self): 215 | #TODO: This is messy, should be dynamic 216 | if self.dataset == 'wikitext': 217 | traindataset = self.get_wikitext2() 218 | elif self.dataset == 'code' or self.dataset == 'evol-instruct-code': 219 | traindataset = self.get_code() 220 | elif self.dataset == 'math' or self.dataset == 'maths' or self.dataset == 'camel-ai/math': 221 | traindataset = self.get_math() 222 | elif self.dataset == 'medical' or self.dataset == 'medical_meadow_wikidoc': 223 | traindataset = self.get_medical() 224 | elif self.dataset == 'german' or self.dataset == 'germanquad': 225 | traindataset = self.get_german() 226 | elif self.dataset == 'french' or self.dataset == 'diverse_french_news': 227 | traindataset = self.get_french() 228 | elif self.dataset == 'c4': 229 | traindataset = self.get_c4() 230 | else: 231 | self.logger.error(f"Unsupported dataset: {self.dataset}") 232 | raise ValueError(f"Unsupported dataset: {self.dataset}") 233 | 234 | abort = False 235 | iterations=[] 236 | for bits in self.bits: 237 | for group_size in self.group_size: 238 | for desc_act in self.desc_act: 239 | for damp in self.damp: 240 | desc_act = desc_act == 1 and True or False 241 | iterations.append({"bits": bits, "group_size": group_size, "desc_act": desc_act, "damp": damp}) 242 | 243 | num_iters = len(iterations) 244 | if num_iters > 1: 245 | logger.info(f"Starting {num_iters} quantizations.") 246 | count=1 247 | for iteration in iterations: 248 | if abort: 249 | break 250 | if self.stop_file is not None and os.path.exists(self.stop_file): 251 | self.logger.info(f"Stopping as {self.stop_file} exists") 252 | abort = True 253 | break 254 | 255 | bits = iteration['bits'] 256 | group_size = iteration['group_size'] 257 | desc_act = iteration['desc_act'] 258 | damp = iteration['damp'] 259 | 260 | try: 261 | if self.make_folder: 262 | output_dir = os.path.join(self.output_dir_base, 
f"{bits}bits-{group_size}g-desc_act_{desc_act}-damp_{damp}") 263 | else: 264 | output_dir = self.output_dir_base 265 | os.makedirs(output_dir, exist_ok=True) 266 | try: 267 | if num_iters > 1: 268 | self.logger.info(f"Starting quantization {count}/{num_iters}") 269 | self.logger.info(f"Quantising with bits={bits} group_size={group_size} desc_act={desc_act} damp={damp} to {output_dir}") 270 | self.quantize(output_dir, traindataset, bits, group_size, desc_act, damp) 271 | except KeyboardInterrupt: 272 | logger.error(f"Aborted. Will delete {output_dir}") 273 | os.rmdir(output_dir) 274 | abort = True 275 | except: 276 | raise 277 | 278 | finally: 279 | count += 1 280 | 281 | if __name__ == "__main__": 282 | import argparse 283 | logger = logging.getLogger() 284 | logging.basicConfig(format="%(asctime)s %(levelname)s [%(name)s] %(message)s", 285 | level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") 286 | 287 | parser = argparse.ArgumentParser(description='AutoGPTQ quantize') 288 | parser.add_argument('pretrained_model_dir', type=str, help='Repo name') 289 | parser.add_argument('output_dir_base', type=str, help='Output base folder') 290 | parser.add_argument('dataset', type=str, help='Quantisation dataset') 291 | parser.add_argument('--num_samples', type=int, default=128, help='Number of dataset samples') 292 | parser.add_argument('--trust_remote_code', action="store_true", help='Trust remote code') 293 | parser.add_argument('--cache_examples', type=int, default=1, help='Cache examples on GPU') 294 | parser.add_argument('--use_fast', action="store_true", help='Use fast tokenizer') 295 | parser.add_argument('--use_triton', action="store_true", help='Use Triton for quantization') 296 | parser.add_argument('--bits', type=int, nargs='+', default=[4], help='Quantize bit(s)') 297 | parser.add_argument('--group_size', type=int, nargs='+', default=[128], help='Quantize group size(s)') 298 | parser.add_argument('--damp', type=float, nargs='+', default=[0.01], help='Quantize damp_percent(s)') 299 | parser.add_argument('--desc_act', type=int, nargs='+', default=[0], help='Quantize desc_act(s) - 1 = True, 0 = False') 300 | parser.add_argument('--dtype', type=str, choices=['float16', 'float32', 'bfloat16'], default='float16', help='Unquantised model dtype') 301 | parser.add_argument('--seqlen', type=int, default=2048, help='Model sequence length') 302 | parser.add_argument('--batch_size', type=int, default=1, help='Quantize batch size for processing dataset samples') 303 | parser.add_argument('--stop_file', type=str, help='Filename to look for to stop inference, specific to this instance') 304 | parser.add_argument('--make_folders', action="store_true", help='Make folders for each quantization using params in folder name') 305 | 306 | args = parser.parse_args() 307 | quantizer = QuantAutoGPTQ(args.pretrained_model_dir, 308 | args.output_dir_base, 309 | args.dataset, 310 | num_samples=args.num_samples, 311 | trust_remote_code=args.trust_remote_code, 312 | cache_examples=args.cache_examples, 313 | use_fast=args.use_fast, 314 | use_triton=args.use_triton, 315 | bits=args.bits, 316 | group_size=args.group_size, 317 | desc_act=args.desc_act, 318 | damp=args.damp, 319 | dtype=args.dtype, 320 | seqlen=args.seqlen, 321 | batch_size=args.batch_size, 322 | stop_file=args.stop_file, 323 | make_folder=args.make_folders) 324 | quantizer.run_quantization() -------------------------------------------------------------------------------- /training/scripts/dpo/create_preference_dataset.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | import logging 3 | import os 4 | import time 5 | from typing import cast 6 | import re 7 | 8 | import torch 9 | from datasets import load_dataset 10 | from tqdm.auto import tqdm 11 | from trl import TrlParser 12 | from vllm import LLM, SamplingParams 13 | from datasets import Dataset 14 | from peft import LoraConfig, AutoPeftModelForCausalLM 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | @dataclass 19 | class CandidateArguments: 20 | generation_model_name_or_path: str = field( 21 | default=None, 22 | metadata={ 23 | 'help': 'Huggingface model name or path to model directory, for the model that will be used for generation, defaults to SFT model or previous iteration model.' 24 | }, 25 | ) 26 | dataset_id: str = field( 27 | default=None, 28 | metadata={ 29 | 'help': 'Path to the input dataset, that will be used to generate candidates, defaults to previous iteration output dataset.' 30 | }, 31 | ) 32 | sample_size: int = field( 33 | default=None, 34 | metadata={ 35 | 'help': 'Number of samples to generate, defaults to as many as possible.' 36 | }, 37 | ) 38 | prompt_column: str = field( 39 | default='question', 40 | metadata={'help': 'Column name in the input dataset that contains the messages.'}, 41 | ) 42 | answer_column: str = field( 43 | default='answer', 44 | metadata={'help': 'Column name in the input dataset that contains the answer.'}, 45 | ) 46 | system_prompt: str = field( 47 | default= """Solve the given high school math problem by providing a clear explanation of each step leading to the final solution. 48 | 49 | Provide a detailed breakdown of your calculations, beginning with an explanation of the problem and describing how you derive each formula, value, or conclusion. Use logical steps that build upon one another, to arrive at the final answer in a systematic manner. 50 | 51 | # Steps 52 | 53 | 1. **Understand the Problem**: Restate the given math problem and clearly identify the main question and any important given values. 54 | 2. **Set Up**: Identify the key formulas or concepts that could help solve the problem (e.g., algebraic manipulation, geometry formulas, trigonometric identities). 55 | 3. **Solve Step-by-Step**: Iteratively progress through each step of the math problem, justifying why each consecutive operation brings you closer to the solution. 56 | 4. **Double Check**: If applicable, double check the work for accuracy and sense, and mention potential alternative approaches if any. 57 | 5. **Final Answer**: Provide the numerical or algebraic solution clearly, accompanied by appropriate units if relevant. 58 | 59 | # Notes 60 | 61 | - Always clearly define any variable or term used. 62 | - Wherever applicable, include unit conversions or context to explain why each formula or step has been chosen. 63 | - Assume the level of mathematics is suitable for high school, and avoid overly advanced math techniques unless they are common at that level. 
64 | """, 65 | metadata={'help': 'System prompt to use for generation.'}, 66 | ) 67 | num_solutions: int = field( 68 | default=5, 69 | metadata={'help': 'Number of solutions to generate for each input.'}, 70 | ) 71 | batch_size: int = field( 72 | default=1, 73 | metadata={'help': 'Batch size for generation.'}, 74 | ) 75 | max_new_tokens: int = field( 76 | default=2048, 77 | metadata={'help': 'Maximum number of new tokens to generate.'}, 78 | ) 79 | temperature: float = field( 80 | default=0.7, 81 | metadata={'help': 'Temperature for generation.'}, 82 | ) 83 | top_p: float = field( 84 | default=1.0, 85 | metadata={'help': 'Top-p for generation.'}, 86 | ) 87 | 88 | def score_solutions( 89 | candidate_result: str, 90 | ground_truth_result: str, 91 | ) -> bool: 92 | # finds the answer in the candidate result 93 | regex_pattern = r'\b\d+\b' 94 | match = re.findall(regex_pattern, candidate_result) 95 | 96 | if match: 97 | return match[-1] == ground_truth_result 98 | else: 99 | return False 100 | 101 | 102 | def vllm_create_candidates( 103 | dataset: Dataset, 104 | model_name_or_path: str, 105 | num_solutions: int, 106 | max_new_tokens: int, 107 | batch_size: int = 1, 108 | prompt_column: str = 'prompt', 109 | system_prompt: str = None, 110 | answer_column: str = 'answer', 111 | sample_size: int = None, 112 | **kwargs, 113 | ) -> Dataset: 114 | 115 | # Loads the model on all available GPUs with vLLM 116 | llm = LLM( 117 | model=model_name_or_path, 118 | tokenizer=model_name_or_path, 119 | tensor_parallel_size=torch.cuda.device_count(), 120 | max_model_len=4096, 121 | ) 122 | # formats the prompt using the system prompt and the prompt column 123 | tokenizer = llm.get_tokenizer() 124 | def format_prompt(s): 125 | messages = [ 126 | {"role": "system", "content": system_prompt}, 127 | {"role": "user", "content": s[prompt_column]} 128 | ] 129 | return {"prompt": tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True), "messages": messages} 130 | 131 | dataset = dataset.map(format_prompt) 132 | # print the first prompt 133 | print('First prompt:', dataset['prompt'][0]) 134 | 135 | # set sampling params 136 | sampling_params = SamplingParams( 137 | max_tokens=max_new_tokens, 138 | n=num_solutions, 139 | temperature=kwargs.get('temperature', 1.0), 140 | top_p=kwargs.get('top_p', 1), 141 | ) 142 | 143 | # Iterate over the dataset with batch size to generate candidates and create preference pairs based on the correct answer and ground truth 144 | preference_dataset = [] 145 | for i in tqdm(range(0, len(dataset), batch_size), desc=f'Generating solutions: Already generated {len(preference_dataset)} preference pairs'): 146 | batch = dataset[i : i + batch_size] 147 | # Generate `num_solutions` candidates per batch 148 | result = llm.generate(batch['prompt'], sampling_params, use_tqdm=False) 149 | for j in range(0, len(batch['prompt'])): 150 | # iterate each candidate and check if it is correct 151 | preference_pair = { 152 | "system_prompt": system_prompt, 153 | "prompt": batch[prompt_column][j], 154 | "ground_truth": batch[answer_column][j], 155 | } 156 | for cand in result[j].outputs: 157 | # check if the candidate is correct 158 | cand_score = score_solutions(candidate_result=cand.text, ground_truth_result=batch[answer_column][j]) 159 | if cand_score and preference_pair.get('chosen',None) is None: 160 | preference_pair['chosen'] = cand.text 161 | elif not cand_score and preference_pair.get('rejected',None) is None: 162 | preference_pair['rejected'] = cand.text 163 | # check if 
the pair is complete to prevent overwriting 164 | if preference_pair.get('chosen',None) and preference_pair.get('rejected',None): 165 | continue 166 | 167 | # check is the generated candidates lead to a complete preference pair 168 | if preference_pair.get('chosen',None) and preference_pair.get('rejected',None): 169 | print(f'Found preference pair, adding to dataset.') 170 | preference_dataset.append(preference_pair) 171 | 172 | print(f'Generated {len(preference_dataset)} preference pairs') 173 | if len(preference_dataset) >= sample_size: 174 | break 175 | return Dataset.from_list(preference_dataset) 176 | 177 | 178 | def main(): 179 | parser = TrlParser((CandidateArguments)) 180 | script_args = parser.parse_args_and_config()[0] 181 | script_args = cast(CandidateArguments, script_args) 182 | 183 | # load dataset and tokenizer 184 | dataset = load_dataset(script_args.dataset_id, split='train') 185 | print(f'Generating {script_args.num_solutions} solutions for {len(dataset)} prompts...') 186 | 187 | start_time = time.time() 188 | candidates_ds = vllm_create_candidates( 189 | dataset, 190 | model_name_or_path=script_args.generation_model_name_or_path, 191 | num_solutions=script_args.num_solutions, 192 | max_new_tokens=script_args.max_new_tokens, 193 | batch_size=script_args.batch_size, 194 | prompt_column=script_args.prompt_column, 195 | answer_column=script_args.answer_column, 196 | system_prompt=script_args.system_prompt, 197 | temperature=script_args.temperature, 198 | top_p=script_args.top_p, 199 | sample_size=script_args.sample_size if script_args.sample_size is not None else len(dataset), 200 | ) 201 | print(f'Generated {len(dataset) * script_args.num_solutions} solutions in {time.time() - start_time:.2f} seconds.') 202 | 203 | save_dataset_id = f"{script_args.generation_model_name_or_path.replace('/', '-')[:40]}-{script_args.dataset_id.replace('/', '-')[:40]}-candidates" 204 | candidates_ds.push_to_hub(save_dataset_id) 205 | 206 | if __name__ == '__main__': 207 | main() 208 | -------------------------------------------------------------------------------- /training/scripts/dpo/run_dpo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import torch 4 | from transformers import ( 5 | AutoModelForCausalLM, 6 | set_seed, 7 | ) 8 | from dataclasses import dataclass 9 | from datetime import datetime 10 | from distutils.util import strtobool 11 | import logging 12 | import os 13 | from typing import Optional 14 | 15 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 16 | import torch 17 | from transformers import ( 18 | AutoModelForCausalLM, 19 | AutoTokenizer, 20 | set_seed, 21 | BitsAndBytesConfig, 22 | ) 23 | from transformers.trainer_utils import get_last_checkpoint 24 | from transformers.utils import is_liger_kernel_available 25 | from trl import TrlParser, ModelConfig, get_peft_config 26 | from datasets import load_dataset 27 | from trl import ( 28 | DPOTrainer, 29 | DPOConfig, 30 | TrlParser, 31 | get_peft_config, 32 | ModelConfig, 33 | ) 34 | 35 | from datasets import load_dataset 36 | 37 | 38 | ######################## 39 | # Custom dataclasses 40 | ######################## 41 | @dataclass 42 | class ScriptArguments: 43 | dataset_id_or_path: str 44 | dataset_splits: str = "train" 45 | tokenizer_name_or_path: str = None 46 | 47 | 48 | ######################## 49 | # Setup logging 50 | ######################## 51 | logging.basicConfig(level=logging.INFO) 52 | logger = logging.getLogger(__name__) 53 | 
logger.setLevel(logging.INFO) 54 | handler = logging.StreamHandler() 55 | handler.setFormatter( 56 | logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 57 | ) 58 | logger.addHandler(handler) 59 | 60 | ######################## 61 | # Helper functions 62 | ######################## 63 | 64 | 65 | def get_checkpoint(training_args: DPOConfig): 66 | last_checkpoint = None 67 | if os.path.isdir(training_args.output_dir): 68 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 69 | return last_checkpoint 70 | 71 | 72 | def dpo_function( 73 | model_args: ModelConfig, script_args: ScriptArguments, training_args: DPOConfig 74 | ): 75 | ######################### 76 | # Log parameters 77 | ######################### 78 | logger.info(f"Model parameters {model_args}") 79 | logger.info(f"Training/evaluation parameters {training_args}") 80 | 81 | ############### 82 | # Load datasets 83 | ############### 84 | if script_args.dataset_id_or_path.endswith(".json"): 85 | train_dataset = load_dataset( 86 | "json", data_files=script_args.dataset_id_or_path, split="train" 87 | ) 88 | else: 89 | train_dataset = load_dataset( 90 | script_args.dataset_id_or_path, split=script_args.dataset_splits 91 | ) 92 | 93 | logger.info( 94 | f"Loaded dataset with {len(train_dataset)} samples and the following features: {train_dataset.features}" 95 | ) 96 | 97 | ################ 98 | # Load tokenizer 99 | ################ 100 | tokenizer = AutoTokenizer.from_pretrained( 101 | ( 102 | script_args.tokenizer_name_or_path 103 | if script_args.tokenizer_name_or_path 104 | else model_args.model_name_or_path 105 | ), 106 | revision=model_args.model_revision, 107 | trust_remote_code=model_args.trust_remote_code, 108 | ) 109 | if tokenizer.pad_token is None: 110 | tokenizer.pad_token = tokenizer.eos_token 111 | 112 | ##################### 113 | # Prepare and format dataset 114 | ##################### 115 | def format_dpo_sample(sample): 116 | prompt = tokenizer.apply_chat_template( 117 | [ 118 | {"role": "system", "content": sample["system_prompt"]}, 119 | {"role": "user", "content": sample["prompt"]}, 120 | ], 121 | tokenize=False, 122 | ) 123 | chosen = tokenizer.apply_chat_template( 124 | [{"role": "user", "content": sample["chosen"]}], tokenize=False 125 | ) 126 | rejected = tokenizer.apply_chat_template( 127 | [{"role": "user", "content": sample["rejected"]}], tokenize=False 128 | ) 129 | return {"prompt": prompt, "chosen": chosen, "rejected": rejected} 130 | 131 | # For DPO/ORPO, the inputs are triples of (prompt, chosen, rejected), where `chosen` and `rejected` are the final turn of a dialogue 132 | train_dataset = train_dataset.map( 133 | format_dpo_sample, remove_columns=train_dataset.column_names 134 | ) 135 | 136 | # remove all columns except chosen, rejected 137 | print(f"Columns: {train_dataset.features.keys()}") 138 | train_dataset = train_dataset.select_columns(["prompt", "chosen", "rejected"]) 139 | 140 | ####################################### 141 | # Load the model and/or reference model 142 | ####################################### 143 | 144 | model_kwargs = dict( 145 | revision=model_args.model_revision, # What revision from Huggingface to use, defaults to main 146 | trust_remote_code=model_args.trust_remote_code, # Whether to trust the remote code, this also you to fine-tune custom architectures 147 | attn_implementation=model_args.attn_implementation, # What attention implementation to use, defaults to flash_attention_2 148 | torch_dtype=( 149 | model_args.torch_dtype 150 | if 
model_args.torch_dtype in ["auto", None] 151 | else getattr(torch, model_args.torch_dtype) 152 | ), # What torch dtype to use, defaults to auto 153 | use_cache=False if training_args.gradient_checkpointing else True, # Whether 154 | low_cpu_mem_usage=( 155 | True 156 | if not strtobool(os.environ.get("ACCELERATE_USE_DEEPSPEED", "false")) 157 | else None 158 | ), # Reduces memory usage on CPU for loading the model 159 | ) 160 | 161 | # Check which training method to use and if 4-bit quantization is needed 162 | if model_args.load_in_4bit: 163 | model_kwargs["quantization_config"] = BitsAndBytesConfig( 164 | load_in_4bit=True, 165 | bnb_4bit_use_double_quant=True, 166 | bnb_4bit_quant_type="nf4", 167 | bnb_4bit_compute_dtype=model_kwargs["torch_dtype"], 168 | bnb_4bit_quant_storage=model_kwargs["torch_dtype"], 169 | ) 170 | if model_args.use_peft: 171 | peft_config = get_peft_config(model_args) 172 | else: 173 | peft_config = None 174 | 175 | # Policy Model 176 | model = AutoModelForCausalLM.from_pretrained( 177 | model_args.model_name_or_path, **model_kwargs 178 | ) 179 | # Checks wether we use adapters for reference model or not 180 | if peft_config is None: 181 | model_ref = AutoModelForCausalLM.from_pretrained( 182 | model_args.model_name_or_path, **model_kwargs 183 | ) 184 | else: 185 | model_ref = None 186 | 187 | ######################### 188 | # Instantiate DPO trainer 189 | ######################### 190 | trainer = DPOTrainer( 191 | model, 192 | ref_model=model_ref, 193 | args=training_args, 194 | train_dataset=train_dataset, 195 | processing_class=tokenizer, 196 | peft_config=peft_config, 197 | ) 198 | 199 | ############### 200 | # Training loop 201 | ############### 202 | # Check for last checkpoint 203 | last_checkpoint = get_checkpoint(training_args) 204 | if last_checkpoint is not None and training_args.resume_from_checkpoint is None: 205 | logger.info(f"Checkpoint detected, resuming training at {last_checkpoint}.") 206 | 207 | # Train the model 208 | logger.info( 209 | f'*** Starting training {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} for {training_args.num_train_epochs} epochs***' 210 | ) 211 | train_result = trainer.train(resume_from_checkpoint=last_checkpoint) 212 | # Log and save metrics 213 | metrics = train_result.metrics 214 | metrics["train_samples"] = len(train_dataset) 215 | trainer.log_metrics("train", metrics) 216 | trainer.save_metrics("train", metrics) 217 | trainer.save_state() 218 | 219 | logger.info("*** Training complete ***") 220 | 221 | ################################## 222 | # Save model and create model card 223 | ################################## 224 | 225 | logger.info("*** Save model ***") 226 | if trainer.is_fsdp_enabled and peft_config: 227 | trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") 228 | # Restore k,v cache for fast inference 229 | trainer.model.config.use_cache = True 230 | trainer.save_model(training_args.output_dir) 231 | logger.info(f"Model saved to {training_args.output_dir}") 232 | training_args.distributed_state.wait_for_everyone() # wait for all processes to load 233 | 234 | tokenizer.save_pretrained(training_args.output_dir) 235 | logger.info(f"Tokenizer saved to {training_args.output_dir}") 236 | 237 | # Save everything else on main process 238 | if trainer.accelerator.is_main_process: 239 | trainer.create_model_card({"tags": ["sft", "tutorial", "philschmid"]}) 240 | # push to hub if needed 241 | if training_args.push_to_hub is True: 242 | logger.info("Pushing to hub...") 243 | 
trainer.push_to_hub() 244 | 245 | logger.info("*** Training complete! ***") 246 | 247 | 248 | def main(): 249 | parser = TrlParser((ModelConfig, ScriptArguments, DPOConfig)) 250 | model_args, script_args, training_args = parser.parse_args_and_config() 251 | 252 | # Set seed for reproducibility 253 | set_seed(training_args.seed) 254 | 255 | # Run the main training loop 256 | dpo_function(model_args, script_args, training_args) 257 | 258 | 259 | if __name__ == "__main__": 260 | main() 261 | -------------------------------------------------------------------------------- /training/scripts/example.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks-per-node=1 3 | #SBATCH --gres=gpu:8 4 | #SBATCH --qos=high 5 | #SBATCH --partition=hopper-prod # Adjust this for your cluster 6 | #SBATCH --output=/fsx/philipp/logs/%x-%j.out # Adjust this for your cluster 7 | #SBATCH --err=/fsx/philipp/logs/%x-%j.err # Adjust this for your cluster 8 | 9 | set -x -e 10 | 11 | source ~/.bashrc 12 | micromamba activate dpo 13 | echo "START TIME: $(date)" 14 | 15 | CONFIG_FILE=$1 16 | 17 | # Training setup 18 | NUM_NODES=$SLURM_NNODES 19 | GPUS_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) 20 | WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) 21 | 22 | # so processes know who to talk to 23 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 24 | MASTER_PORT=6000 25 | 26 | export CMD=" \ 27 | scripts/dpo/run_dpo.py --config $CONFIG_FILE 28 | " 29 | 30 | export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ 31 | --config_file configs/accelerate_configs/deepspeed_zero3.yaml \ 32 | --num_machines $NUM_NODES \ 33 | --num_processes $WORLD_SIZE \ 34 | --main_process_ip $MASTER_ADDR \ 35 | --main_process_port $MASTER_PORT \ 36 | --machine_rank \$SLURM_PROCID \ 37 | --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \ 38 | --max_restarts 1 \ 39 | --role \$(hostname -s): \ 40 | --tee 3 \ 41 | " 42 | 43 | # force crashing on nccl issues like hanging broadcast 44 | export NCCL_ASYNC_ERROR_HANDLING=1 45 | # export NCCL_DEBUG=INFO 46 | # export NCCL_DEBUG_SUBSYS=COLL 47 | # export NCCL_SOCKET_NTHREADS=1 48 | # export NCCL_NSOCKS_PERTHREAD=1 49 | # export CUDA_LAUNCH_BLOCKING=1 50 | 51 | # Specific configuration optimized for the Hugging Face Compute Cluster 52 | # Be ye warned this may not work on other clusters! 
53 | module load cuda/12.1 54 | 55 | # srun error handling: 56 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks 57 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code 58 | SRUN_ARGS=" \ 59 | --wait=60 \ 60 | --kill-on-bad-exit=1 \ 61 | " 62 | 63 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1 64 | 65 | echo "END TIME: $(date)" -------------------------------------------------------------------------------- /training/scripts/merge_adapter_weights.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | import tempfile 3 | from typing import Optional 4 | import torch 5 | from peft import AutoPeftModelForCausalLM 6 | from transformers import AutoTokenizer, HfArgumentParser 7 | from huggingface_hub import HfApi 8 | 9 | # Example usage: 10 | # python scripts/merge_adapter_weights.py --peft_model_id falcon-180b-lora-fa --output_dir merged-weights --save_tokenizer True 11 | 12 | def save_model(model_path_or_id, save_dir, save_tokenizer=True): 13 | model = AutoPeftModelForCausalLM.from_pretrained( 14 | model_path_or_id, 15 | low_cpu_mem_usage=True, 16 | torch_dtype=torch.float16, 17 | ) 18 | # Merge LoRA and base model and save 19 | model = model.merge_and_unload() 20 | model.save_pretrained(save_dir, safe_serialization=True, max_shard_size="3GB") 21 | 22 | # save tokenizer 23 | if save_tokenizer: 24 | tokenizer = AutoTokenizer.from_pretrained(model_path_or_id) 25 | tokenizer.save_pretrained(save_dir) 26 | 27 | 28 | @dataclass 29 | class ScriptArguments: 30 | peft_model_id: str = field(metadata={"help": "model id or path to model"}) 31 | output_dir: Optional[str] = field(default="merged-weights", metadata={"help": "where the merged model should be saved"}) 32 | save_tokenizer: Optional[bool] = field(default=True, metadata={"help": "whether to save the tokenizer"}) 33 | push_to_hub: Optional[bool] = field(default=False, metadata={"help": "whether to push the model to the hub"}) 34 | repository_id: Optional[str] = field(default=None, metadata={"help": "the model name"}) 35 | 36 | parser = HfArgumentParser(ScriptArguments) 37 | args = parser.parse_args_into_dataclasses()[0] 38 | api = HfApi() 39 | 40 | if args.push_to_hub: 41 | repo_id = args.repository_id if args.repository_id else args.peft_model_id.split('/')[-1] 42 | with tempfile.TemporaryDirectory() as temp_dir: 43 | save_model(args.peft_model_id, temp_dir, args.save_tokenizer) 44 | api.upload_large_folder( 45 | folder_path=temp_dir, 46 | repo_id=repo_id, 47 | repo_type="model", 48 | ) 49 | else: 50 | save_model(args.peft_model_id, args.output_dir, args.save_tokenizer) -------------------------------------------------------------------------------- /training/scripts/run_fsdp_qlora.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from dataclasses import dataclass, field 3 | import os 4 | import random 5 | import torch 6 | from datasets import load_dataset 7 | from transformers import AutoTokenizer, TrainingArguments 8 | from trl.commands.cli_utils import TrlParser 9 | from transformers import ( 10 | AutoModelForCausalLM, 11 | AutoTokenizer, 12 | BitsAndBytesConfig, 13 | set_seed, 14 | 15 | ) 16 | from trl import setup_chat_format 17 | from peft import LoraConfig 18 | 19 | 20 | from trl import ( 21 | SFTTrainer) 22 | 23 | # Comment in if you want to use the Llama 3 instruct 
template but make sure to add modules_to_save 24 | # LLAMA_3_CHAT_TEMPLATE="{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" 25 | 26 | # Anthropic/Vicuna like template without the need for special tokens 27 | LLAMA_3_CHAT_TEMPLATE = ( 28 | "{% for message in messages %}" 29 | "{% if message['role'] == 'system' %}" 30 | "{{ message['content'] }}" 31 | "{% elif message['role'] == 'user' %}" 32 | "{{ '\n\nHuman: ' + message['content'] + eos_token }}" 33 | "{% elif message['role'] == 'assistant' %}" 34 | "{{ '\n\nAssistant: ' + message['content'] + eos_token }}" 35 | "{% endif %}" 36 | "{% endfor %}" 37 | "{% if add_generation_prompt %}" 38 | "{{ '\n\nAssistant: ' }}" 39 | "{% endif %}" 40 | ) 41 | 42 | 43 | # ACCELERATE_USE_FSDP=1 FSDP_CPU_RAM_EFFICIENT_LOADING=1 torchrun --nproc_per_node=4 ./scripts/run_fsdp_qlora.py --config llama_3_70b_fsdp_qlora.yaml 44 | 45 | @dataclass 46 | class ScriptArguments: 47 | dataset_path: str = field( 48 | default=None, 49 | metadata={ 50 | "help": "Path to the dataset" 51 | }, 52 | ) 53 | model_id: str = field( 54 | default=None, metadata={"help": "Model ID to use for SFT training"} 55 | ) 56 | max_seq_length: int = field( 57 | default=512, metadata={"help": "The maximum sequence length for SFT Trainer"} 58 | ) 59 | 60 | 61 | def training_function(script_args, training_args): 62 | ################ 63 | # Dataset 64 | ################ 65 | 66 | train_dataset = load_dataset( 67 | "json", 68 | data_files=os.path.join(script_args.dataset_path, "train_dataset.json"), 69 | split="train", 70 | ) 71 | test_dataset = load_dataset( 72 | "json", 73 | data_files=os.path.join(script_args.dataset_path, "test_dataset.json"), 74 | split="train", 75 | ) 76 | 77 | ################ 78 | # Model & Tokenizer 79 | ################ 80 | 81 | # Tokenizer 82 | tokenizer = AutoTokenizer.from_pretrained(script_args.model_id, use_fast=True) 83 | tokenizer.pad_token = tokenizer.eos_token 84 | tokenizer.chat_template = LLAMA_3_CHAT_TEMPLATE 85 | 86 | # template dataset 87 | def template_dataset(examples): 88 | return{"text": tokenizer.apply_chat_template(examples["messages"], tokenize=False)} 89 | 90 | train_dataset = train_dataset.map(template_dataset, remove_columns=["messages"]) 91 | test_dataset = test_dataset.map(template_dataset, remove_columns=["messages"]) 92 | 93 | # print random sample 94 | with training_args.main_process_first( 95 | desc="Log a few random samples from the processed training set" 96 | ): 97 | for index in random.sample(range(len(train_dataset)), 2): 98 | print(train_dataset[index]["text"]) 99 | 100 | # Model 101 | torch_dtype = torch.bfloat16 102 | quant_storage_dtype = torch.bfloat16 103 | 104 | quantization_config = BitsAndBytesConfig( 105 | load_in_4bit=True, 106 | bnb_4bit_use_double_quant=True, 107 | bnb_4bit_quant_type="nf4", 108 | bnb_4bit_compute_dtype=torch_dtype, 109 | bnb_4bit_quant_storage=quant_storage_dtype, 110 | ) 111 | 112 | model = AutoModelForCausalLM.from_pretrained( 113 | script_args.model_id, 114 | quantization_config=quantization_config, 115 | attn_implementation="sdpa", # use sdpa, alternatively use "flash_attention_2" 116 | torch_dtype=quant_storage_dtype, 117 | use_cache=False if 
training_args.gradient_checkpointing else True, # this is needed for gradient checkpointing 118 | ) 119 | 120 | if training_args.gradient_checkpointing: 121 | model.gradient_checkpointing_enable() 122 | 123 | ################ 124 | # PEFT 125 | ################ 126 | 127 | # LoRA config based on QLoRA paper & Sebastian Raschka experiment 128 | peft_config = LoraConfig( 129 | lora_alpha=8, 130 | lora_dropout=0.05, 131 | r=16, 132 | bias="none", 133 | target_modules="all-linear", 134 | task_type="CAUSAL_LM", 135 | # modules_to_save = ["lm_head", "embed_tokens"] # add if you want to use the Llama 3 instruct template 136 | ) 137 | 138 | ################ 139 | # Training 140 | ################ 141 | trainer = SFTTrainer( 142 | model=model, 143 | args=training_args, 144 | train_dataset=train_dataset, 145 | dataset_text_field="text", 146 | eval_dataset=test_dataset, 147 | peft_config=peft_config, 148 | max_seq_length=script_args.max_seq_length, 149 | tokenizer=tokenizer, 150 | packing=True, 151 | dataset_kwargs={ 152 | "add_special_tokens": False, # We template with special tokens 153 | "append_concat_token": False, # No need to add additional separator token 154 | }, 155 | ) 156 | if trainer.accelerator.is_main_process: 157 | trainer.model.print_trainable_parameters() 158 | 159 | ########################## 160 | # Train model 161 | ########################## 162 | checkpoint = None 163 | if training_args.resume_from_checkpoint is not None: 164 | checkpoint = training_args.resume_from_checkpoint 165 | trainer.train(resume_from_checkpoint=checkpoint) 166 | 167 | ########################## 168 | # SAVE MODEL FOR SAGEMAKER 169 | ########################## 170 | if trainer.is_fsdp_enabled: 171 | trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") 172 | trainer.save_model() 173 | 174 | if __name__ == "__main__": 175 | parser = TrlParser((ScriptArguments, TrainingArguments)) 176 | script_args, training_args = parser.parse_args_and_config() 177 | 178 | # set use reentrant to False 179 | if training_args.gradient_checkpointing: 180 | training_args.gradient_checkpointing_kwargs = {"use_reentrant": True} 181 | # set seed 182 | set_seed(training_args.seed) 183 | 184 | # launch training 185 | training_function(script_args, training_args) 186 | -------------------------------------------------------------------------------- /training/scripts/run_r1_grpo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from dataclasses import dataclass 4 | from datetime import datetime 5 | import logging 6 | import os 7 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 8 | import random 9 | import re 10 | import torch 11 | from transformers.trainer_utils import get_last_checkpoint 12 | from transformers import AutoTokenizer 13 | from datasets import load_dataset 14 | from trl import GRPOConfig, GRPOTrainer, get_peft_config, ModelConfig, TrlParser 15 | 16 | 17 | ######################## 18 | # Custom dataclasses 19 | ######################## 20 | @dataclass 21 | class ScriptArguments: 22 | dataset_id_or_path: str = "Jiayi-Pan/Countdown-Tasks-3to4" 23 | dataset_splits: str = "train" 24 | tokenizer_name_or_path: str = None 25 | 26 | 27 | ######################## 28 | # Setup logging 29 | ######################## 30 | logging.basicConfig(level=logging.INFO) 31 | logger = logging.getLogger(__name__) 32 | logger.setLevel(logging.INFO) 33 | handler = logging.StreamHandler() 34 | handler.setFormatter( 35 | logging.Formatter("%(asctime)s - 
%(name)s - %(levelname)s - %(message)s") 36 | ) 37 | logger.addHandler(handler) 38 | 39 | ######################## 40 | # Helper functions 41 | ######################## 42 | 43 | def format_reward_func(completions, target, **kwargs): 44 | """ 45 | Format: <think>...</think><answer>...</answer> 46 | Args: 47 | completions (list[str]): Generated outputs 48 | target (list[str]): Expected answers 49 | 50 | Returns: 51 | list[float]: Reward scores 52 | """ 53 | rewards = [] 54 | 55 | for completion, gt in zip(completions, target): 56 | 57 | try: 58 | # add synthetic <think> as it is already part of the prompt and prefilled for the assistant to more easily match the regex 59 | completion = "<think>" + completion 60 | if random.random() < 0.1: # 10% chance to write samples into a file 61 | os.makedirs("completion_samples", exist_ok=True) 62 | log_file = os.path.join("completion_samples", "completion_samples.txt") 63 | with open(log_file, "a") as f: 64 | f.write(f"\n\n==============\n") 65 | f.write(completion) 66 | 67 | # Check if the format is correct 68 | regex = r"^<think>([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>\n<answer>([\s\S]*?)<\/answer>$" 69 | 70 | match = re.search(regex, completion, re.DOTALL) 71 | # if the format is not correct, reward is 0 72 | if match is None or len(match.groups()) != 2: 73 | rewards.append(0.0) 74 | else: 75 | rewards.append(1.0) 76 | except Exception: 77 | rewards.append(0.0) 78 | return rewards 79 | 80 | def equation_reward_func(completions, target, nums, **kwargs): 81 | """ 82 | Evaluates completions based on: 83 | 2. Mathematical correctness of the answer 84 | 85 | Args: 86 | completions (list[str]): Generated outputs 87 | target (list[str]): Expected answers 88 | nums (list[str]): Available numbers 89 | 90 | Returns: 91 | list[float]: Reward scores 92 | """ 93 | rewards = [] 94 | for completion, gt, numbers in zip(completions, target, nums): 95 | try: 96 | # add synthetic <think> as it is already part of the prompt and prefilled for the assistant to more easily match the regex 97 | completion = "<think>" + completion 98 | # Check if the format is correct 99 | match = re.search(r"<answer>(.*?)<\/answer>", completion) 100 | if match is None: 101 | rewards.append(0.0) 102 | continue 103 | # Extract the "answer" part from the completion 104 | equation = match.group(1).strip() 105 | # Extract all numbers from the equation 106 | used_numbers = [int(n) for n in re.findall(r'\d+', equation)] 107 | 108 | # Check if all numbers are used exactly once 109 | if sorted(used_numbers) != sorted(numbers): 110 | rewards.append(0.0) 111 | continue 112 | # Define a regex pattern that only allows numbers, operators, parentheses, and whitespace 113 | allowed_pattern = r'^[\d+\-*/().\s]+$' 114 | if not re.match(allowed_pattern, equation): 115 | rewards.append(0.0) 116 | continue 117 | 118 | # Evaluate the equation with restricted globals and locals 119 | result = eval(equation, {"__builtins__": None}, {}) 120 | # Check if the equation is correct and matches the ground truth 121 | if abs(float(result) - float(gt)) < 1e-5: 122 | rewards.append(1.0) 123 | if random.random() < 0.10: # 10% chance to write fully successful samples into a file 124 | os.makedirs("completion_samples", exist_ok=True) 125 | log_file = os.path.join("completion_samples", "success_completion_samples.txt") 126 | with open(log_file, "a") as f: 127 | f.write(f"\n\n==============\n") 128 | f.write(completion) 129 | else: 130 | rewards.append(0.0) 131 | except Exception: 132 | # If evaluation fails, reward is 0 133 | rewards.append(0.0) 134 | return rewards 135 | 136 | def get_checkpoint(training_args: 
GRPOConfig): 137 | last_checkpoint = None 138 | if os.path.isdir(training_args.output_dir): 139 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 140 | return last_checkpoint 141 | 142 | 143 | def grpo_function( 144 | model_args: ModelConfig, script_args: ScriptArguments, training_args: GRPOConfig 145 | ): 146 | ######################### 147 | # Log parameters 148 | ######################### 149 | logger.info(f"Model parameters {model_args}") 150 | logger.info(f"Training/evaluation parameters {training_args}") 151 | 152 | ################ 153 | # Load tokenizer 154 | ################ 155 | tokenizer = AutoTokenizer.from_pretrained( 156 | ( 157 | script_args.tokenizer_name_or_path 158 | if script_args.tokenizer_name_or_path 159 | else model_args.model_name_or_path 160 | ), 161 | revision=model_args.model_revision, 162 | trust_remote_code=model_args.trust_remote_code, 163 | ) 164 | if tokenizer.pad_token is None: 165 | tokenizer.pad_token = tokenizer.eos_token 166 | 167 | ############### 168 | # Load datasets 169 | ############### 170 | # Load dataset from Hugging Face Hub 171 | dataset = load_dataset(script_args.dataset_id_or_path, split=script_args.dataset_splits) 172 | # select a random subset of 50k samples 173 | dataset = dataset.shuffle(seed=42).select(range(50000)) 174 | 175 | ##################### 176 | # Prepare and format dataset 177 | ##################### 178 | 179 | # generate r1 prompt with a prefix for the model to already start with the thinking process 180 | def generate_r1_prompt(numbers, target): 181 | r1_prefix = [{ 182 | "role": "system", 183 | "content": "You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer." 184 | }, 185 | { 186 | "role": "user", 187 | "content": f"Using the numbers {numbers}, create an equation that equals {target}. You can use basic arithmetic operations (+, -, *, /) one or multiple times but each number can only be used once. Show your work in <think> </think> tags. And return the final equation in <answer> </answer> tags, for example <answer> (1 + 2) / 3 </answer>. Think step by step inside <think> tags."
188 | }, 189 | { 190 | "role": "assistant", 191 | "content": "Let me solve this step by step.\n<think>" 192 | }] 193 | return {"prompt": tokenizer.apply_chat_template(r1_prefix, tokenize=False, continue_final_message=True), "target": target, "nums": numbers} 194 | 195 | # convert our dataset to the r1 prompt 196 | dataset = dataset.map(lambda x: generate_r1_prompt(x["nums"], x["target"])) 197 | 198 | # split the dataset into train and test 199 | train_test_split = dataset.train_test_split(test_size=0.1) 200 | 201 | train_dataset = train_test_split["train"] 202 | test_dataset = train_test_split["test"] 203 | 204 | ######################### 205 | # Instantiate GRPO trainer 206 | ######################### 207 | 208 | trainer = GRPOTrainer( 209 | model=model_args.model_name_or_path, 210 | reward_funcs=[format_reward_func, equation_reward_func], 211 | args=training_args, 212 | train_dataset=train_dataset, 213 | eval_dataset=test_dataset, 214 | peft_config=get_peft_config(model_args), 215 | ) 216 | 217 | 218 | ############### 219 | # Training loop 220 | ############### 221 | # Check for last checkpoint 222 | last_checkpoint = get_checkpoint(training_args) 223 | if last_checkpoint is not None and training_args.resume_from_checkpoint is None: 224 | logger.info(f"Checkpoint detected, resuming training at {last_checkpoint}.") 225 | 226 | # Train the model 227 | logger.info( 228 | f'*** Starting training {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} for {training_args.num_train_epochs} epochs ***' 229 | ) 230 | train_result = trainer.train(resume_from_checkpoint=last_checkpoint) 231 | # Log and save metrics 232 | metrics = train_result.metrics 233 | metrics["train_samples"] = len(train_dataset) 234 | trainer.log_metrics("train", metrics) 235 | trainer.save_metrics("train", metrics) 236 | trainer.save_state() 237 | 238 | logger.info("*** Training complete ***") 239 | 240 | ################################## 241 | # Save model and create model card 242 | ################################## 243 | 244 | logger.info("*** Save model ***") 245 | trainer.model.config.use_cache = True 246 | trainer.save_model(training_args.output_dir) 247 | logger.info(f"Model saved to {training_args.output_dir}") 248 | training_args.distributed_state.wait_for_everyone() # wait for all processes to finish saving 249 | 250 | tokenizer.save_pretrained(training_args.output_dir) 251 | logger.info(f"Tokenizer saved to {training_args.output_dir}") 252 | 253 | # Save everything else on main process 254 | if trainer.accelerator.is_main_process: 255 | trainer.create_model_card({"tags": ["rl", "grpo", "tutorial", "philschmid"]}) 256 | # push to hub if needed 257 | if training_args.push_to_hub is True: 258 | logger.info("Pushing to hub...") 259 | trainer.push_to_hub() 260 | 261 | logger.info("*** Training complete!
***") 262 | 263 | 264 | def main(): 265 | parser = TrlParser((ModelConfig, ScriptArguments, GRPOConfig)) 266 | model_args, script_args, training_args = parser.parse_args_and_config() 267 | 268 | # Run the main training loop 269 | grpo_function(model_args, script_args, training_args) 270 | 271 | 272 | if __name__ == "__main__": 273 | main() -------------------------------------------------------------------------------- /training/scripts/run_seq2seq_deepspeed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | from transformers import ( 5 | AutoModelForSeq2SeqLM, 6 | DataCollatorForSeq2Seq, 7 | AutoTokenizer, 8 | set_seed, 9 | ) 10 | from datasets import load_from_disk 11 | import torch 12 | import evaluate 13 | import nltk 14 | import numpy as np 15 | 16 | from huggingface_hub import HfFolder 17 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments 18 | 19 | nltk.download("punkt", quiet=True) 20 | 21 | # Metric 22 | metric = evaluate.load("rouge") 23 | # evaluation generation args 24 | gen_kwargs = { 25 | "early_stopping": True, 26 | "length_penalty": 2.0, 27 | "max_new_tokens": 50, 28 | "min_length": 30, 29 | "no_repeat_ngram_size": 3, 30 | "num_beams": 4, 31 | } 32 | 33 | 34 | def postprocess_text(preds, labels): 35 | preds = [pred.strip() for pred in preds] 36 | labels = [label.strip() for label in labels] 37 | 38 | # rougeLSum expects newline after each sentence 39 | preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] 40 | labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] 41 | 42 | return preds, labels 43 | 44 | 45 | def parse_arge(): 46 | """Parse the arguments.""" 47 | parser = argparse.ArgumentParser() 48 | # add model id and dataset path argument 49 | parser.add_argument("--model_id", type=str, default="google/flan-t5-xl", help="Model id to use for training.") 50 | parser.add_argument("--dataset_path", type=str, default="data", help="Path to the already processed dataset.") 51 | parser.add_argument( 52 | "--repository_id", type=str, default=None, help="Hugging Face Repository id for uploading models" 53 | ) 54 | # add training hyperparameters for epochs, batch size, learning rate, and seed 55 | parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for.") 56 | parser.add_argument("--per_device_train_batch_size", type=int, default=8, help="Batch size to use for training.") 57 | parser.add_argument("--per_device_eval_batch_size", type=int, default=8, help="Batch size to use for testing.") 58 | parser.add_argument("--generation_max_length", type=int, default=140, help="Maximum length to use for generation") 59 | parser.add_argument("--generation_num_beams", type=int, default=4, help="Number of beams to use for generation.") 60 | parser.add_argument("--lr", type=float, default=3e-3, help="Learning rate to use for training.") 61 | parser.add_argument("--seed", type=int, default=42, help="Seed to use for training.") 62 | parser.add_argument("--deepspeed", type=str, default=None, help="Path to deepspeed config file.") 63 | parser.add_argument("--gradient_checkpointing", type=bool, default=True, help="Path to deepspeed config file.") 64 | parser.add_argument( 65 | "--bf16", 66 | type=bool, 67 | default=True if torch.cuda.get_device_capability()[0] == 8 else False, 68 | help="Whether to use bf16.", 69 | ) 70 | parser.add_argument( 71 | "--hf_token", 72 | type=str, 73 | default=HfFolder.get_token(), 74 | help="Token to use for 
uploading models to Hugging Face Hub.", 75 | ) 76 | args = parser.parse_known_args() 77 | return args 78 | 79 | 80 | def training_function(args): 81 | # set seed 82 | set_seed(args.seed) 83 | 84 | # load dataset from disk and tokenizer 85 | train_dataset = load_from_disk(os.path.join(args.dataset_path, "train")) 86 | eval_dataset = load_from_disk(os.path.join(args.dataset_path, "eval")) 87 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 88 | # load model from the hub 89 | model = AutoModelForSeq2SeqLM.from_pretrained( 90 | args.model_id, 91 | use_cache=False if args.gradient_checkpointing else True, # this is needed for gradient checkpointing 92 | ) 93 | 94 | # we want to ignore tokenizer pad token in the loss 95 | label_pad_token_id = -100 96 | # Data collator 97 | data_collator = DataCollatorForSeq2Seq( 98 | tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 99 | ) 100 | 101 | # Define compute metrics function 102 | def compute_metrics(eval_preds): 103 | preds, labels = eval_preds 104 | if isinstance(preds, tuple): 105 | preds = preds[0] 106 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 107 | # Replace -100 in the labels as we can't decode them. 108 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 109 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 110 | 111 | # Some simple post-processing 112 | decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) 113 | 114 | result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) 115 | result = {k: round(v * 100, 4) for k, v in result.items()} 116 | prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] 117 | result["gen_len"] = np.mean(prediction_lens) 118 | return result 119 | 120 | # Define training args 121 | # output_dir = args.repository_id if args.repository_id else args.model_id.split("/")[-1] 122 | output_dir = args.model_id.split("/")[-1] 123 | training_args = Seq2SeqTrainingArguments( 124 | output_dir=output_dir, 125 | per_device_train_batch_size=args.per_device_train_batch_size, 126 | per_device_eval_batch_size=args.per_device_eval_batch_size, 127 | predict_with_generate=True, 128 | generation_max_length=args.generation_max_length, 129 | generation_num_beams=args.generation_num_beams, 130 | fp16=False, # T5 overflows with fp16 131 | bf16=args.bf16, # Use BF16 if available 132 | learning_rate=args.lr, 133 | num_train_epochs=args.epochs, 134 | deepspeed=args.deepspeed, 135 | gradient_checkpointing=args.gradient_checkpointing, 136 | # logging & evaluation strategies 137 | logging_dir=f"{output_dir}/logs", 138 | logging_strategy="steps", 139 | logging_steps=500, 140 | evaluation_strategy="epoch", 141 | save_strategy="epoch", 142 | save_total_limit=2, 143 | load_best_model_at_end=True, 144 | # push to hub parameters 145 | report_to="tensorboard", 146 | push_to_hub=True if args.repository_id else False, 147 | hub_strategy="every_save", 148 | hub_model_id=args.repository_id if args.repository_id else None, 149 | hub_token=args.hf_token, 150 | ) 151 | 152 | # Create Trainer instance 153 | trainer = Seq2SeqTrainer( 154 | model=model, 155 | args=training_args, 156 | train_dataset=train_dataset, 157 | eval_dataset=eval_dataset, 158 | data_collator=data_collator, 159 | compute_metrics=compute_metrics, 160 | ) 161 | 162 | # Start training 163 | trainer.train() 164 | 165 | # Save our tokenizer and create model card 166 | 
tokenizer.save_pretrained(output_dir) 167 | trainer.create_model_card() 168 | # Push the results to the hub 169 | if args.repository_id: 170 | trainer.push_to_hub() 171 | 172 | 173 | def main(): 174 | args, _ = parse_arge() 175 | training_function(args) 176 | 177 | 178 | if __name__ == "__main__": 179 | main() 180 | -------------------------------------------------------------------------------- /training/scripts/run_sft.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from datetime import datetime 3 | from distutils.util import strtobool 4 | import logging 5 | import os 6 | import re 7 | from typing import Optional 8 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 9 | import torch 10 | from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, BitsAndBytesConfig 11 | from transformers.trainer_utils import get_last_checkpoint 12 | from transformers.utils import is_liger_kernel_available 13 | from trl import SFTTrainer, TrlParser, ModelConfig, SFTConfig, get_peft_config 14 | from datasets import load_dataset 15 | from peft import AutoPeftModelForCausalLM 16 | 17 | if is_liger_kernel_available(): 18 | from liger_kernel.transformers import AutoLigerKernelForCausalLM 19 | 20 | 21 | 22 | ######################## 23 | # Custom dataclasses 24 | ######################## 25 | @dataclass 26 | class ScriptArguments: 27 | dataset_id_or_path: str 28 | dataset_splits: str = "train" 29 | tokenizer_name_or_path: str = None 30 | spectrum_config_path: Optional[str] = None 31 | 32 | 33 | ######################## 34 | # Setup logging 35 | ######################## 36 | logging.basicConfig(level=logging.INFO) 37 | logger = logging.getLogger(__name__) 38 | logger.setLevel(logging.INFO) 39 | handler = logging.StreamHandler() 40 | handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) 41 | logger.addHandler(handler) 42 | 43 | ######################## 44 | # Helper functions 45 | ######################## 46 | 47 | def get_checkpoint(training_args: SFTConfig): 48 | last_checkpoint = None 49 | if os.path.isdir(training_args.output_dir): 50 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 51 | return last_checkpoint 52 | 53 | 54 | def setup_model_for_spectrum(model, spectrum_config_path): 55 | unfrozen_parameters = [] 56 | with open(spectrum_config_path, "r") as fin: 57 | yaml_parameters = fin.read() 58 | 59 | # get the unfrozen parameters from the yaml file 60 | for line in yaml_parameters.splitlines(): 61 | if line.startswith("- "): 62 | unfrozen_parameters.append(line.split("- ")[1]) 63 | 64 | # freeze all parameters 65 | for param in model.parameters(): 66 | param.requires_grad = False 67 | # unfreeze Spectrum parameters 68 | for name, param in model.named_parameters(): 69 | if any(re.match(unfrozen_param, name) for unfrozen_param in unfrozen_parameters): 70 | param.requires_grad = True 71 | 72 | # COMMENT IN: for sanity check print the trainable parameters 73 | # for name, param in model.named_parameters(): 74 | # if param.requires_grad: 75 | # print(f"Trainable parameter: {name}") 76 | 77 | return model 78 | 79 | ########################################################################################################### 80 | 81 | def train_function(model_args: ModelConfig, script_args: ScriptArguments, training_args: SFTConfig): 82 | """Main training function.""" 83 | ######################### 84 | # Log parameters 85 | ######################### 86 | 
logger.info(f'Model parameters {model_args}') 87 | logger.info(f'Script parameters {script_args}') 88 | logger.info(f'Training/evaluation parameters {training_args}') 89 | 90 | ############### 91 | # Load datasets 92 | ############### 93 | if script_args.dataset_id_or_path.endswith('.json'): 94 | train_dataset = load_dataset('json', data_files=script_args.dataset_id_or_path, split='train') 95 | else: 96 | train_dataset = load_dataset(script_args.dataset_id_or_path, split=script_args.dataset_splits) 97 | 98 | train_dataset = train_dataset.select(range(10000)) 99 | 100 | logger.info(f'Loaded dataset with {len(train_dataset)} samples and the following features: {train_dataset.features}') 101 | 102 | ################ 103 | # Load tokenizer 104 | ################ 105 | tokenizer = AutoTokenizer.from_pretrained( 106 | script_args.tokenizer_name_or_path if script_args.tokenizer_name_or_path else model_args.model_name_or_path, 107 | revision=model_args.model_revision, 108 | trust_remote_code=model_args.trust_remote_code, 109 | ) 110 | if tokenizer.pad_token is None: 111 | tokenizer.pad_token = tokenizer.eos_token 112 | # if we use peft we need to make sure we use a chat template that does not rely on special tokens, as by default the embedding layers will not be trainable 113 | 114 | 115 | ####################### 116 | # Load pretrained model 117 | ####################### 118 | 119 | # define model kwargs 120 | model_kwargs = dict( 121 | revision=model_args.model_revision, # What revision from Huggingface to use, defaults to main 122 | trust_remote_code=model_args.trust_remote_code, # Whether to trust the remote code; this also allows you to fine-tune custom architectures 123 | attn_implementation=model_args.attn_implementation, # What attention implementation to use, defaults to flash_attention_2 124 | torch_dtype=model_args.torch_dtype if model_args.torch_dtype in ['auto', None] else getattr(torch, model_args.torch_dtype), # What torch dtype to use, defaults to auto 125 | use_cache=False if training_args.gradient_checkpointing else True, # Whether to use the KV cache; it must be disabled when gradient checkpointing is enabled 126 | low_cpu_mem_usage=True if not strtobool(os.environ.get("ACCELERATE_USE_DEEPSPEED", "false")) else None, # Reduces memory usage on CPU for loading the model 127 | ) 128 | 129 | # Check which training method to use and if 4-bit quantization is needed 130 | if model_args.load_in_4bit: 131 | model_kwargs['quantization_config'] = BitsAndBytesConfig( 132 | load_in_4bit=True, 133 | bnb_4bit_use_double_quant=True, 134 | bnb_4bit_quant_type='nf4', 135 | bnb_4bit_compute_dtype=model_kwargs['torch_dtype'], 136 | bnb_4bit_quant_storage=model_kwargs['torch_dtype'], 137 | ) 138 | if model_args.use_peft: 139 | peft_config = get_peft_config(model_args) 140 | else: 141 | peft_config = None 142 | 143 | # load the model with our kwargs 144 | if training_args.use_liger: 145 | model = AutoLigerKernelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs) 146 | else: 147 | model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs) 148 | training_args.distributed_state.wait_for_everyone() # wait for all processes to load 149 | 150 | 151 | if script_args.spectrum_config_path: 152 | model = setup_model_for_spectrum(model, script_args.spectrum_config_path) 153 | 154 | ######################## 155 | # Initialize the Trainer 156 | ######################## 157 | trainer = SFTTrainer( 158 | model=model, 159 | args=training_args, 160 | train_dataset=train_dataset, 161 | tokenizer=tokenizer, 162 | peft_config=peft_config, 163 | ) 164 | if
trainer.accelerator.is_main_process and peft_config: 165 | trainer.model.print_trainable_parameters() 166 | 167 | ############### 168 | # Training loop 169 | ############### 170 | # Check for last checkpoint 171 | last_checkpoint = get_checkpoint(training_args) 172 | if last_checkpoint is not None and training_args.resume_from_checkpoint is None: 173 | logger.info(f'Checkpoint detected, resuming training at {last_checkpoint}.') 174 | 175 | logger.info(f'*** Starting training {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} for {training_args.num_train_epochs} epochs***') 176 | train_result = trainer.train(resume_from_checkpoint=last_checkpoint) 177 | # log metrics 178 | metrics = train_result.metrics 179 | metrics['train_samples'] = len(train_dataset) 180 | trainer.log_metrics('train', metrics) 181 | trainer.save_metrics('train', metrics) 182 | trainer.save_state() 183 | 184 | ################################## 185 | # Save model and create model card 186 | ################################## 187 | 188 | logger.info('*** Save model ***') 189 | if trainer.is_fsdp_enabled and peft_config: 190 | trainer.accelerator.state.fsdp_plugin.set_state_dict_type('FULL_STATE_DICT') 191 | # Restore k,v cache for fast inference 192 | trainer.model.config.use_cache = True 193 | trainer.save_model(training_args.output_dir) 194 | logger.info(f'Model saved to {training_args.output_dir}') 195 | training_args.distributed_state.wait_for_everyone() # wait for all processes to load 196 | 197 | tokenizer.save_pretrained(training_args.output_dir) 198 | logger.info(f'Tokenizer saved to {training_args.output_dir}') 199 | 200 | # Save everything else on main process 201 | if trainer.accelerator.is_main_process: 202 | trainer.create_model_card({'tags': ['sft', 'tutorial', 'philschmid']}) 203 | # push to hub if needed 204 | if training_args.push_to_hub is True: 205 | logger.info('Pushing to hub...') 206 | trainer.push_to_hub() 207 | 208 | logger.info('*** Training complete! 
***') 209 | 210 | 211 | def main(): 212 | parser = TrlParser((ModelConfig, ScriptArguments, SFTConfig)) 213 | model_args, script_args, training_args = parser.parse_args_and_config() 214 | 215 | # Set seed for reproducibility 216 | set_seed(training_args.seed) 217 | 218 | # Run the main training loop 219 | train_function(model_args, script_args, training_args) 220 | 221 | 222 | if __name__ == '__main__': 223 | main() -------------------------------------------------------------------------------- /training/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from peft import AutoPeftModelForCausalLM 3 | from transformers import AutoTokenizer, pipeline 4 | 5 | peft_model_id = "./llama-8b-hf-no-robot" 6 | 7 | # Load Model with PEFT adapter 8 | model = AutoPeftModelForCausalLM.from_pretrained( 9 | peft_model_id, 10 | device_map="auto", 11 | torch_dtype=torch.float16 12 | ) 13 | tokenizer = AutoTokenizer.from_pretrained(peft_model_id) 14 | 15 | from datasets import load_dataset 16 | from random import randint 17 | 18 | 19 | # Load our test dataset 20 | eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train") 21 | rand_idx = randint(0, len(eval_dataset)) 22 | messages = eval_dataset[rand_idx]["messages"][:2] 23 | print(tokenizer.eos_token) 24 | # stop generation on eos token or <|eot_id|> token 25 | terminators = [ 26 | tokenizer.eos_token_id, 27 | tokenizer.convert_tokens_to_ids("<|eot_id|>"), 28 | ] 29 | 30 | # Test on sample 31 | input_ids = tokenizer.apply_chat_template(messages,add_generation_prompt=True,return_tensors="pt").to(model.device) 32 | 33 | outputs = model.generate( 34 | input_ids, 35 | max_new_tokens=512, 36 | eos_token_id=terminators, 37 | do_sample=True, 38 | temperature=0.6, 39 | top_p=0.9, 40 | ) 41 | response = outputs[0][input_ids.shape[-1]:] 42 | print(f"**Query:**\n{eval_dataset[rand_idx]['messages'][1]['content']}") 43 | print(f"**Original Answer:**\n {eval_dataset[rand_idx]['messages'][2]['content']}") 44 | print(f"**Generated Answer:**\n {tokenizer.decode(response,skip_special_tokens=True)}") -------------------------------------------------------------------------------- /training/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philschmid/deep-learning-pytorch-huggingface/59b37973074de90004d10e5ff636f98160c9743a/training/utils/__init__.py -------------------------------------------------------------------------------- /training/utils/falcon_patch.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | import transformers 5 | from peft.tuners.lora import LoraLayer 6 | 7 | try: 8 | from flash_attn import flash_attn_func 9 | except Exception: 10 | raise ModuleNotFoundError( 11 | "Please install FlashAttention first, e.g., with pip install flash-attn --no-build-isolation, Learn more at https://github.com/Dao-AILab/flash-attention#installation-and-features" 12 | ) 13 | 14 | try: 15 | from einops import rearrange 16 | except Exception: 17 | raise ModuleNotFoundError("Please install einops first, e.g., with pip install einops") 18 | 19 | 20 | # ADAPTED https://github.com/pacman100/DHS-LLM-Workshop/blob/main/chat_assistant/training/falcon_flash_attn_monkey_patch.py 21 | def forward( 22 | self, 23 | hidden_states: torch.Tensor, 24 | alibi: Optional[torch.Tensor], 25 | attention_mask: torch.Tensor, 26 | 
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, 27 | head_mask: Optional[torch.Tensor] = None, 28 | use_cache: bool = False, 29 | output_attentions: bool = False, 30 | ): 31 | fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] 32 | num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads 33 | # 3 x [batch_size, seq_length, num_heads, head_dim] 34 | (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) 35 | 36 | batch_size, query_length, _, _ = query_layer.shape 37 | 38 | query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, query_length, self.head_dim) 39 | key_layer = key_layer.transpose(1, 2).reshape( 40 | batch_size * num_kv_heads, 41 | query_length, 42 | self.head_dim, 43 | ) 44 | value_layer = value_layer.transpose(1, 2).reshape(batch_size * num_kv_heads, query_length, self.head_dim) 45 | 46 | past_kv_length = 0 if layer_past is None else layer_past[0].shape[1] 47 | query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length) 48 | 49 | if layer_past is not None: 50 | past_key, past_value = layer_past 51 | # concatenate along seq_length dimension: 52 | # - key: [batch_size * self.num_heads, kv_length, head_dim] 53 | # - value: [batch_size * self.num_heads, kv_length, head_dim] 54 | key_layer = torch.cat((past_key, key_layer), dim=1) 55 | value_layer = torch.cat((past_value, value_layer), dim=1) 56 | 57 | _, kv_length, _ = key_layer.shape 58 | if use_cache: 59 | present = (key_layer, value_layer) 60 | else: 61 | present = None 62 | attention_mask_float = (attention_mask * 1.0).masked_fill(attention_mask, float("-1e9")).to(query_layer.dtype) 63 | query_layer_ = ( 64 | query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) 65 | ) 66 | key_layer_ = key_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) 67 | value_layer_ = value_layer.reshape(batch_size, num_kv_heads, -1, self.head_dim).transpose(1, 2).to(torch.bfloat16) 68 | 69 | if alibi is not None: 70 | raise ValueError("`alibi` is not supported when `use_flash_attn` is True") 71 | 72 | # below output will have shape (batch_size, seqlen, nheads, headdim) 73 | attn_output = flash_attn_func(query_layer_, key_layer_, value_layer_, causal=True) 74 | attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) 75 | output_tensor = self.dense(attn_output) 76 | return output_tensor, present 77 | 78 | 79 | def replace_attn_with_flash_attn(): 80 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 81 | if cuda_major < 8: 82 | print( 83 | "Flash attention is only supported on Ampere or Hopper GPU during training due to head dim > 64 backward." 
84 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 85 | ) 86 | transformers.models.falcon.modeling_falcon.FalconAttention.forward = forward 87 | 88 | 89 | def unplace_flash_attn_with_attn(): 90 | import importlib 91 | import transformers 92 | 93 | print("Reloading falcon model, unpatching flash attention") 94 | importlib.reload(transformers.models.falcon.modeling_falcon) 95 | 96 | 97 | # Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338 98 | def upcast_layer_for_flash_attention(model, torch_dtype): 99 | # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to 100 | # convert them back to fp16/bf16 for flash-attn compatibility. 101 | for name, module in model.named_modules(): 102 | if isinstance(module, LoraLayer): 103 | module.to(torch_dtype) 104 | if "norm" in name: 105 | module.to(torch_dtype) 106 | if "lm_head" in name or "embed_tokens" in name: 107 | if hasattr(module, "weight"): 108 | module.to(torch_dtype) 109 | 110 | return model 111 | -------------------------------------------------------------------------------- /training/utils/llama_patch.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import torch 4 | from torch import nn 5 | import warnings 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb 8 | from peft.tuners.lora import LoraLayer 9 | 10 | try: 11 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func 12 | from flash_attn.bert_padding import unpad_input, pad_input 13 | except Exception: 14 | raise ModuleNotFoundError( 15 | "Please install FlashAttention first, e.g., with pip install flash-attn --no-build-isolation, Learn more at https://github.com/Dao-AILab/flash-attention#installation-and-features" 16 | ) 17 | 18 | try: 19 | from einops import rearrange 20 | except Exception: 21 | raise ModuleNotFoundError("Please install einops first, e.g., with pip install einops") 22 | 23 | 24 | # ADAPTED from https://github.com/allenai/open-instruct/blob/main/open_instruct/llama_flash_attn_monkey_patch.py 25 | # AND https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py 26 | # AND https://github.com/LAION-AI/Open-Assistant/blob/04fa9a24b2a58c8885b8aa6a2eb02b18de6b4961/model/model_training/models/patching_llama.py 27 | # AND Sourabh https://github.com/huggingface/transformers/commit/ee81bf5aee0d65f005d157c013777e3d27d8d6bf 28 | def forward( 29 | self, 30 | hidden_states: torch.Tensor, 31 | attention_mask: Optional[torch.Tensor] = None, 32 | position_ids: Optional[torch.Tensor] = None, 33 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 34 | output_attentions: bool = False, 35 | use_cache: bool = False, 36 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 37 | """Input shape: Batch x Time x Channel 38 | 39 | attention_mask: [bsz, q_len] 40 | """ 41 | if output_attentions: 42 | warnings.warn("Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.") 43 | 44 | bsz, q_len, _ = hidden_states.size() 45 | 46 | query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) 47 | key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) 48 | value_states = self.v_proj(hidden_states).view(bsz, q_len, 
self.num_heads, self.head_dim).transpose(1, 2) 49 | # [bsz, q_len, nh, hd] 50 | # [bsz, nh, q_len, hd] 51 | 52 | kv_seq_len = key_states.shape[-2] 53 | if past_key_value is not None: 54 | kv_seq_len += past_key_value[0].shape[-2] 55 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 56 | query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) 57 | 58 | # Past Key value support 59 | if past_key_value is not None: 60 | # reuse k, v, self_attention 61 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 62 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 63 | 64 | past_key_value = (key_states, value_states) if use_cache else None 65 | 66 | # Flash attention codes from 67 | # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py 68 | 69 | # transform the data into the format required by flash attention 70 | qkv = torch.stack([query_states, key_states, value_states], dim=2) # [bsz, nh, 3, q_len, hd] 71 | qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] 72 | # We have disabled _prepare_decoder_attention_mask in LlamaModel 73 | # the attention_mask should be the same as the key_padding_mask 74 | key_padding_mask = attention_mask 75 | 76 | if key_padding_mask is None: 77 | qkv = rearrange(qkv, "b s ... -> (b s) ...") 78 | max_s = q_len 79 | cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device) 80 | output = flash_attn_varlen_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True) 81 | output = rearrange(output, "(b s) ... -> b s ...", b=bsz) 82 | else: 83 | nheads = qkv.shape[-2] 84 | x = rearrange(qkv, "b s three h d -> b s (three h d)") 85 | x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) 86 | x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads) 87 | output_unpad = flash_attn_varlen_qkvpacked_func( 88 | x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 89 | ) 90 | output = rearrange( 91 | pad_input(rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len), 92 | "b s (h d) -> b s h d", 93 | h=nheads, 94 | ) 95 | return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value 96 | 97 | 98 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 99 | # requires the attention mask to be the same as the key_padding_mask 100 | def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): 101 | # [bsz, seq_len] 102 | return attention_mask 103 | 104 | 105 | def replace_attn_with_flash_attn(): 106 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 107 | if cuda_major < 8: 108 | print( 109 | "Flash attention is only supported on Ampere or Hopper GPU during training due to head dim > 64 backward." 
110 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 111 | ) 112 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 113 | _prepare_decoder_attention_mask 114 | ) 115 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 116 | 117 | 118 | def unplace_flash_attn_with_attn(): 119 | import importlib 120 | import transformers 121 | 122 | print("Reloading llama model, unpatching flash attention") 123 | importlib.reload(transformers.models.llama.modeling_llama) 124 | 125 | 126 | # Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338 127 | def upcast_layer_for_flash_attention(model, torch_dtype): 128 | # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to 129 | # convert them back to fp16/bf16 for flash-attn compatibility. 130 | for name, module in model.named_modules(): 131 | if isinstance(module, LoraLayer): 132 | module.to(torch_dtype) 133 | if "norm" in name: 134 | module.to(torch_dtype) 135 | if "lm_head" in name or "embed_tokens" in name: 136 | if hasattr(module, "weight"): 137 | module.to(torch_dtype) 138 | 139 | return model 140 | -------------------------------------------------------------------------------- /training/utils/peft_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl 3 | from peft import LoraConfig, get_peft_model 4 | from peft.tuners.lora import LoraLayer 5 | from transformers import ( 6 | AutoModelForCausalLM, 7 | AutoTokenizer, 8 | AutoTokenizer, 9 | TrainingArguments, 10 | ) 11 | from utils.falcon_patch import replace_attn_with_flash_attn as replace_falcon_attn_with_flash_attn 12 | from utils.llama_patch import replace_attn_with_flash_attn as replace_llama_attn_with_flash_attn 13 | 14 | 15 | class SaveDeepSpeedPeftModelCallback(TrainerCallback): 16 | def __init__(self, trainer, save_steps=500): 17 | self.trainer = trainer 18 | self.save_steps = save_steps 19 | 20 | def on_step_end( 21 | self, 22 | args: TrainingArguments, 23 | state: TrainerState, 24 | control: TrainerControl, 25 | **kwargs, 26 | ): 27 | if (state.global_step + 1) % self.save_steps == 0: 28 | self.trainer.accelerator.wait_for_everyone() 29 | state_dict = self.trainer.accelerator.get_state_dict(self.trainer.deepspeed) 30 | unwrapped_model = self.trainer.accelerator.unwrap_model(self.trainer.deepspeed) 31 | if self.trainer.accelerator.is_main_process: 32 | unwrapped_model.save_pretrained(args.output_dir, state_dict=state_dict) 33 | self.trainer.accelerator.wait_for_everyone() 34 | return control 35 | 36 | 37 | def create_and_prepare_model(model_id: str, training_args: TrainingArguments, script_args): 38 | model = AutoModelForCausalLM.from_pretrained( 39 | model_id, 40 | use_cache=not training_args.gradient_checkpointing, 41 | use_flash_attention_2=script_args.use_flash_attn, 42 | ) 43 | print("model loaded") 44 | 45 | # find all linear modules in model for lora 46 | target_modules = find_all_linear_names(model) 47 | 48 | # create lora config 49 | peft_config = LoraConfig( 50 | lora_alpha=script_args.lora_alpha, 51 | lora_dropout=script_args.lora_dropout, 52 | r=script_args.lora_r, 53 | bias="none", 54 | task_type="CAUSAL_LM", 55 | target_modules=target_modules, 56 | ) 57 | # enable gradient checkpointing 58 | if training_args.gradient_checkpointing: 59 | 
model.gradient_checkpointing_enable() 60 | 61 | # pre-process the model by casting the LoRA layers, norm layers, and embedding/output layers to bfloat16 for training stability 62 | # Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338 63 | print("pre-processing model for peft") 64 | for name, module in model.named_modules(): 65 | if isinstance(module, LoraLayer): 66 | module = module.to(torch.bfloat16) 67 | if "norm" in name: 68 | module = module.to(torch.bfloat16) 69 | if any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]): 70 | if hasattr(module, "weight"): 71 | module = module.to(torch.bfloat16) 72 | 73 | # initialize peft model 74 | print("initializing peft model") 75 | model = get_peft_model(model, peft_config) 76 | 77 | # print trainable parameters 78 | model.print_trainable_parameters() 79 | 80 | # tokenizer 81 | tokenizer = AutoTokenizer.from_pretrained(model_id) 82 | tokenizer.pad_token = tokenizer.eos_token 83 | 84 | return model, peft_config, tokenizer 85 | 86 | 87 | def find_all_linear_names(model): 88 | cls = torch.nn.Linear 89 | lora_module_names = set() 90 | for name, module in model.named_modules(): 91 | if isinstance(module, cls): 92 | names = name.split(".") 93 | lora_module_names.add(names[0] if len(names) == 1 else names[-1]) 94 | 95 | if "lm_head" in lora_module_names: # needed for 16-bit 96 | lora_module_names.remove("lm_head") 97 | return list(lora_module_names) 98 | --------------------------------------------------------------------------------
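Usage note: a quick way to sanity-check the two GRPO reward functions in training/scripts/run_r1_grpo.py is to call them on a hand-written completion. The snippet below is a minimal sketch, not a file in this repository; it assumes the script's dependencies (trl, transformers, datasets) are installed and that it is executed from the training/scripts directory so the module can be imported.

# Minimal sanity-check sketch for the reward functions (assumes it is run from training/scripts).
from run_r1_grpo import format_reward_func, equation_reward_func

# The "<think>" prefix is prefilled by the prompt, so a well-formed completion starts mid-thought
# and closes both tag pairs.
completion = (
    " We need 95 from [55, 36, 7, 3]: 55 + 36 = 91, 91 + 7 = 98, 98 - 3 = 95.</think>\n"
    "<answer> 55 + 36 + 7 - 3 </answer>"
)

# Both calls should return [1.0]: the format matches, and the equation uses every number
# exactly once and evaluates to the target.
print(format_reward_func(completions=[completion], target=["95"]))
print(equation_reward_func(completions=[completion], target=["95"], nums=[[55, 36, 7, 3]]))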