├── litgpt ├── chat │ └── __init__.py ├── deploy │ └── __init__.py ├── finetune │ └── __init__.py ├── generate │ └── __init__.py ├── scripts │ ├── __init__.py │ └── convert_pretrained_checkpoint.py ├── data │ ├── microllama.py │ ├── alpaca_gpt4.py │ ├── __init__.py │ ├── alpaca_2k.py │ ├── prepare_slimpajama.py │ ├── prepare_starcoder.py │ ├── lit_data.py │ ├── dolly.py │ ├── longform.py │ ├── tinyllama.py │ └── openwebtext.py ├── __init__.py ├── __main__.py └── args.py ├── tests ├── data │ ├── __init__.py │ ├── test_longform.py │ ├── test_dolly.py │ ├── test_alpaca.py │ ├── test_tinyllama.py │ ├── test_textfiles.py │ ├── test_lit_data.py │ ├── test_openwebtext.py │ ├── test_deita.py │ ├── test_base.py │ └── test_tinystories.py ├── __init__.py ├── test_ci.py ├── test_convert_pretrained_checkpoint.py ├── test_rope.py ├── test_args.py ├── test_thunder_pretrain.py ├── test_cli.py ├── test_generate_adapter.py ├── test_config_hub.py ├── test_evaluate.py ├── test_full.py ├── run_standalone_tests.sh ├── test_batch.py ├── test_thunder_ddp.py ├── test_merge_lora.py ├── test_config.py └── test_pretrain.py ├── tutorials ├── developer-docs │ ├── README.md │ └── python-api.md ├── images │ ├── 0_to_litgpt │ │ ├── usage.webp │ │ ├── commands.webp │ │ ├── finetune.webp │ │ ├── pretrain.webp │ │ ├── instruction-1.webp │ │ └── instruction-2.webp │ └── prepare_dataset │ │ ├── lima.jpg │ │ ├── alpaca.jpg │ │ ├── deita.jpg │ │ ├── dolly.jpg │ │ ├── alpaca-2k.jpg │ │ ├── longform.jpg │ │ ├── alpacagpt4.jpg │ │ ├── alpaca_libre.jpg │ │ └── deita-multiturn.jpg ├── examples │ └── ptl-trainer │ │ ├── README.md │ │ └── litgpt_ptl_medium.py ├── convert_hf_checkpoint.md ├── deploy.md ├── finetune.md ├── finetune_full.md └── convert_lit_models.md ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── ask-a-question.md │ ├── feature-request.md │ └── bug-report.yaml ├── workflows │ ├── check-links.yml │ ├── publish.yaml │ └── cpu-tests.yml ├── azure-gpu-test.yml └── azure-gpu-test-with-thunder.yml ├── extensions └── thunder │ ├── strategies │ └── __init__.py │ └── unsloth │ └── kernels │ ├── __init__.py │ ├── utils.py │ └── swiglu.py ├── .gitignore ├── pyproject.toml └── config_hub ├── finetune ├── phi-3 │ ├── full.yaml │ ├── lora.yaml │ └── qlora.yaml ├── phi-2 │ └── full.yaml ├── gemma-2b │ └── full.yaml ├── stablelm-base-alpha-3b │ └── full.yaml ├── tiny-llama │ └── full.yaml ├── llama-2-7b │ ├── full.yaml │ └── lora.yaml ├── llama-3-8b │ └── full.yaml ├── llama-3.1-8b │ └── full.yaml ├── falcon-7b │ └── lora.yaml └── mistral-7b │ └── lora.yaml └── pretrain ├── debug.yaml ├── tinyllama.yaml └── microllama.yaml /litgpt/chat/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /litgpt/deploy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /litgpt/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /litgpt/generate/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /litgpt/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tutorials/developer-docs/README.md: -------------------------------------------------------------------------------- 1 | LitGPT developer documentation files. -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @awaelchli @rasbt @lantiga 2 | /README.md @williamfalcon @lantiga 3 | -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/usage.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/usage.webp -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/lima.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/lima.jpg -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/commands.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/commands.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/finetune.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/finetune.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/pretrain.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/pretrain.webp -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/alpaca.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/deita.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/deita.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/dolly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/dolly.jpg -------------------------------------------------------------------------------- /extensions/thunder/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | from .thunder_fsdp import ThunderFSDPStrategy 2 | from .thunder_ddp import ThunderDDPStrategy 3 | -------------------------------------------------------------------------------- 
/tutorials/images/prepare_dataset/alpaca-2k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/alpaca-2k.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/longform.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/longform.jpg -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/instruction-1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/instruction-1.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/instruction-2.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/instruction-2.webp -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpacagpt4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/alpacagpt4.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca_libre.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/alpaca_libre.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/deita-multiturn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/deita-multiturn.jpg -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ask-a-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Ask a Question 3 | about: Ask and answer questions related to LitGPT 4 | title: '' 5 | labels: question 6 | 7 | --- 8 | 9 | Please describe your question here. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Suggest a Feature 3 | about: Propose a new feature or enhancement 4 | title: '' 5 | labels: enhancement 6 | 7 | --- 8 | 9 | Please describe the feature or enhancement along with the intended usecase. -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import warnings 4 | 5 | import pytest 6 | 7 | warnings.filterwarnings("ignore", category=pytest.PytestWarning, message=r".*\(rm_rf\) error removing.*") 8 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy_loss import _cross_entropy_forward_impl, _cross_entropy_backward_impl 2 | from .rope_embedding import _rope_embedding_forward_impl, _rope_embedding_backward_impl, ROPE_GROUP_SIZE 3 | from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel 4 | from .utils import calculate_settings 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | .DS_Store 4 | *.egg-info 5 | build 6 | dist 7 | .venv 8 | .vscode 9 | 10 | # data 11 | data 12 | datasets 13 | !litgpt/data 14 | !tests/data 15 | checkpoints 16 | out 17 | wandb 18 | events.out.tfevents* 19 | 20 | # test artifacts from tests/test_readme.py 21 | **/custom_finetuning_dataset.json 22 | client.py 23 | **/custom_texts/ 24 | -------------------------------------------------------------------------------- /tests/test_ci.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from tests.conftest import RunIf 4 | from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE 5 | 6 | 7 | @RunIf(min_cuda_gpus=1) 8 | def test_gpu_ci_installs_bitsandbytes(): 9 | assert _BITSANDBYTES_AVAILABLE, str(_BITSANDBYTES_AVAILABLE) 10 | -------------------------------------------------------------------------------- /litgpt/data/microllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | from litgpt.data import TinyLlama 7 | 8 | 9 | @dataclass 10 | class MicroLlama(TinyLlama): 11 | """The MicroLlama data module is composed of only SlimPajama data.""" 12 | 13 | def __init__(self, data_path: Union[str, Path] = Path("data/"), seed: int = 42, num_workers: int = 8): 14 | super().__init__(data_path=data_path, seed=seed, num_workers=num_workers, use_starcoder=False) 15 | -------------------------------------------------------------------------------- /.github/workflows/check-links.yml: -------------------------------------------------------------------------------- 1 | name: Check hyperlinks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: '3.10' 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install pytest pytest-check-links 27 | 28 | - name: Check links 29 | run: | 30 | pytest --check-links README.md --check-links-ignore "http*" 31 | pytest --check-links tutorials --check-links-ignore "http*" -------------------------------------------------------------------------------- /tutorials/examples/ptl-trainer/README.md: -------------------------------------------------------------------------------- 1 | ## Minimal PyTorch Lightning Trainer Example 2 | 3 | 4 | 5 | The script in this folder provides minimal examples showing how to train a LitGPT model using LitGPT's `GPT` class with the [PyTorch Lightning](https://github.com/Lightning-AI/pytorch-lightning) Trainer. 6 | 7 | You can run the scripts as follows: 8 | 9 |   10 | ## Small 160M model: 11 | 12 | ```bash 13 | # Download the Pythia model 14 | litgpt download EleutherAI/pythia-160m 15 | 16 | python litgpt_ptl_small.py 17 | ``` 18 | 19 |   20 | ## Medium-sized 8B model: 21 | 22 | ```bash 23 | # Download the Llama 3.1 model 24 | litgpt download meta-llama/Meta-Llama-3.1-8B --access_token hf_... 25 | 26 | python litgpt_ptl_medium.py 27 | ``` 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /litgpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import logging 4 | import re 5 | 6 | from litgpt.api import LLM 7 | from litgpt.model import GPT # needs to be imported before config 8 | from litgpt.config import Config 9 | from litgpt.prompts import PromptStyle 10 | from litgpt.tokenizer import Tokenizer 11 | 12 | # Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 13 | pattern = re.compile(".*Profiler function .* will be ignored") 14 | logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) 15 | 16 | # Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint 17 | logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True 18 | logging.getLogger("torch.distributed.fsdp._debug_utils").disabled = True 19 | 20 | __all__ = ["LLM", "GPT", "Config", "PromptStyle", "Tokenizer"] 21 | -------------------------------------------------------------------------------- /litgpt/data/alpaca_gpt4.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | 7 | from litgpt.data.alpaca import Alpaca 8 | 9 | _URL = "https://raw.githubusercontent.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/main/data/alpaca_gpt4_data.json" 10 | 11 | 12 | @dataclass 13 | class AlpacaGPT4(Alpaca): 14 | """AlpacaGPT4 data module for supervised finetuning.""" 15 | 16 | val_split_fraction: float = 0.03847 # to get exactly 2000 test samples, 17 | """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" 18 | download_dir: Path = Path("./data/alpacagpt4") 19 | """The directory in which the downloaded datasetgets saved.""" 20 | file_url: str = field(repr=False, default=_URL) 21 | """The URL from where to download the dataset.""" 22 | file_name: str = field(repr=False, default="alpacagpt4_data_cleaned_archive.json") 23 | """The name of the dataset file to download.""" 24 | -------------------------------------------------------------------------------- /tests/test_convert_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | 5 | import torch 6 | 7 | from litgpt.scripts.convert_pretrained_checkpoint import convert_pretrained_checkpoint 8 | 9 | 10 | def test_convert_pretrained_checkpoint(tmp_path, fake_checkpoint_dir): 11 | # Pretend we made a checkpoint from pretraining 12 | pretrained_checkpoint = { 13 | "model": {"some.module.weight": torch.rand(2, 2), "_orig_mod.some.other.module.weight": torch.rand(2, 2)}, 14 | "the_optimizer": "optimizer_state", 15 | "other": 1, 16 | } 17 | torch.save(pretrained_checkpoint, fake_checkpoint_dir / "lit_model.pth") 18 | 19 | convert_pretrained_checkpoint(checkpoint_dir=fake_checkpoint_dir, output_dir=(tmp_path / "converted")) 20 | 21 | assert set(os.listdir(tmp_path / "converted")) == { 22 | "lit_model.pth", 23 | "model_config.yaml", 24 | "tokenizer_config.json", 25 | "tokenizer.json", 26 | } 27 | converted_checkpoint = torch.load(tmp_path / "converted" / "lit_model.pth") 28 | assert list(converted_checkpoint.keys()) == ["some.module.weight", "some.other.module.weight"] 29 | -------------------------------------------------------------------------------- /tests/test_rope.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import torch 4 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXRotaryEmbedding, apply_rotary_pos_emb 5 | 6 | from litgpt.model import apply_rope, build_rope_cache 7 | 8 | 9 | @torch.inference_mode() 10 | def test_rope(): 11 | bs, seq_len, n_head, n_embed = 1, 6, 2, 8 12 | head_size = n_embed // n_head 13 | x = torch.randint(0, 10000, size=(bs, n_head, seq_len, head_size)).float() 14 | position_ids = torch.arange(seq_len).unsqueeze(0) 15 | 16 | theirs = GPTNeoXRotaryEmbedding(head_size, seq_len) 17 | ours_cos_cached, ours_sin_cached = build_rope_cache(seq_len, head_size, device=x.device) 18 | # their rope cache has 2 added dimensions and the cos/sin is duplicated 19 | torch.testing.assert_close(ours_cos_cached, theirs.cos_cached.squeeze()) 20 | torch.testing.assert_close(ours_sin_cached, theirs.sin_cached.squeeze()) 21 | 22 | ours_x_rope = apply_rope(x, ours_cos_cached, ours_sin_cached) 23 | theirs_x_rope, _ = apply_rotary_pos_emb(x, x, theirs.cos_cached, theirs.sin_cached, position_ids) 24 | torch.testing.assert_close(ours_x_rope, theirs_x_rope) 25 | -------------------------------------------------------------------------------- /litgpt/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | from litgpt.data.base import DataModule, SFTDataset, get_sft_collate_fn 4 | from litgpt.data.alpaca import Alpaca 5 | from litgpt.data.alpaca_2k import Alpaca2k 6 | from litgpt.data.alpaca_gpt4 import AlpacaGPT4 7 | from litgpt.data.json_data import JSON 8 | from litgpt.data.deita import Deita 9 | from litgpt.data.dolly import Dolly 10 | from litgpt.data.flan import FLAN 11 | from litgpt.data.lima import LIMA 12 | from litgpt.data.lit_data import LitData 13 | from litgpt.data.longform import LongForm 14 | from litgpt.data.text_files import TextFiles 15 | from litgpt.data.tinyllama import TinyLlama 16 | from litgpt.data.tinystories import TinyStories 17 | from litgpt.data.openwebtext import OpenWebText 18 | from litgpt.data.microllama import MicroLlama 19 | 20 | 21 | __all__ = [ 22 | "Alpaca", 23 | "Alpaca2k", 24 | "AlpacaGPT4", 25 | "Deita", 26 | "Dolly", 27 | "FLAN", 28 | "JSON", 29 | "LIMA", 30 | "LitData", 31 | "DataModule", 32 | "LongForm", 33 | "OpenWebText", 34 | "SFTDataset", 35 | "TextFiles", 36 | "TinyLlama", 37 | "TinyStories", 38 | "MicroLlama", 39 | "get_sft_collate_fn", 40 | ] 41 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | # To create a release, create a tag and push it to GitHub: 2 | #git tag -a "v0.0.1-beta" -m "beta version testing" 3 | #git push --tags 4 | # https://dev.to/iamtekson/publish-package-to-pypi-and-release-new-version-using-github-actions-108k 5 | name: Publish LitGPT to PyPI 6 | 7 | on: 8 | push: 9 | tags: 10 | - "v*" 11 | jobs: 12 | build-n-publish: 13 | name: Build and publish to PyPI 14 | runs-on: ubuntu-latest 15 | environment: 16 | name: pypi 17 | url: https://pypi.org/p/litgpt 18 | permissions: 19 | id-token: write 20 | 21 | steps: 22 | - name: Checkout source 23 | uses: actions/checkout@v3 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: "3.x" 29 | 30 | - name: Build source and wheel distributions 31 | run: | 32 | python -m pip install --upgrade build twine 33 | pip install importlib_metadata==7.2.1 34 | python -m build 35 | twine check --strict dist/* 36 | - name: Publish distribution to PyPI 37 | uses: pypa/gh-action-pypi-publish@release/v1 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import triton 16 | 17 | MAX_FUSED_SIZE = 65536 # 2**16 18 | next_power_of_2 = triton.next_power_of_2 19 | 20 | def calculate_settings(n): 21 | BLOCK_SIZE = next_power_of_2(n) 22 | if BLOCK_SIZE > MAX_FUSED_SIZE: 23 | raise RuntimeError(f"Cannot launch Triton kernel since n = {n} exceeds "\ 24 | f"the maximum CUDA blocksize = {MAX_FUSED_SIZE}.") 25 | num_warps = 4 26 | if BLOCK_SIZE >= 32768: num_warps = 32 27 | elif BLOCK_SIZE >= 8192: num_warps = 16 28 | elif BLOCK_SIZE >= 2048: num_warps = 8 29 | return BLOCK_SIZE, num_warps 30 | pass 31 | -------------------------------------------------------------------------------- /tests/data/test_longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from litgpt.data import LongForm 3 | from litgpt.prompts import Longform as LongFormPromptStyle 4 | 5 | 6 | def test_longform(mock_tokenizer, longform_path): 7 | longform = LongForm(download_dir=longform_path, num_workers=0) 8 | assert isinstance(longform.prompt_style, LongFormPromptStyle) 9 | longform.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 10 | longform.prepare_data() 11 | longform.setup() 12 | 13 | train_dataloader = longform.train_dataloader() 14 | val_dataloader = longform.val_dataloader() 15 | 16 | assert len(train_dataloader) == 9 17 | assert len(val_dataloader) == 5 18 | 19 | train_batch = next(iter(train_dataloader)) 20 | val_batch = next(iter(val_dataloader)) 21 | 22 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"} 23 | assert all(seq.shape == (2, 10) for seq in train_batch.values()) 24 | assert all(seq.shape == (2, 10) for seq in val_batch.values()) 25 | 26 | assert isinstance(train_dataloader.dataset.prompt_style, LongFormPromptStyle) 27 | assert isinstance(val_dataloader.dataset.prompt_style, LongFormPromptStyle) 28 | 29 | # has attributes from super class `LightningDataModule` 30 | assert longform.prepare_data_per_node 31 | -------------------------------------------------------------------------------- /tests/data/test_dolly.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | from litgpt.data import Dolly 4 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 5 | 6 | 7 | def test_dolly(mock_tokenizer, dolly_path): 8 | dolly = Dolly(val_split_fraction=0.5, download_dir=dolly_path.parent, file_name=dolly_path.name, num_workers=0) 9 | assert isinstance(dolly.prompt_style, AlpacaPromptStyle) 10 | dolly.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 11 | dolly.prepare_data() 12 | dolly.setup() 13 | 14 | train_dataloader = dolly.train_dataloader() 15 | val_dataloader = dolly.val_dataloader() 16 | 17 | assert len(train_dataloader) == 3 18 | assert len(val_dataloader) == 3 19 | 20 | train_batch = next(iter(train_dataloader)) 21 | val_batch = next(iter(val_dataloader)) 22 | 23 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"} 24 | assert all(seq.shape == (2, 10) for seq in train_batch.values()) 25 | assert all(seq.shape == (2, 10) for seq in val_batch.values()) 26 | 27 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 28 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 29 | 30 | # has attributes from super class `LightningDataModule` 31 | assert dolly.prepare_data_per_node 32 | -------------------------------------------------------------------------------- /tests/data/test_alpaca.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from litgpt.data import Alpaca 3 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 4 | 5 | 6 | def test_alpaca(mock_tokenizer, alpaca_path): 7 | alpaca = Alpaca(val_split_fraction=0.5, download_dir=alpaca_path.parent, file_name=alpaca_path.name, num_workers=0) 8 | assert isinstance(alpaca.prompt_style, AlpacaPromptStyle) 9 | alpaca.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 10 | alpaca.prepare_data() 11 | alpaca.setup() 12 | 13 | train_dataloader = alpaca.train_dataloader() 14 | val_dataloader = alpaca.val_dataloader() 15 | 16 | assert len(train_dataloader) == 6 17 | assert len(val_dataloader) == 6 18 | 19 | train_batch = next(iter(train_dataloader)) 20 | val_batch = next(iter(val_dataloader)) 21 | 22 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"} 23 | assert all(seq.shape == (2, 10) for seq in train_batch.values()) 24 | assert all(seq.shape == (2, 10) for seq in val_batch.values()) 25 | 26 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 27 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 28 | 29 | # has attributes from super class `LightningDataModule` 30 | assert alpaca.prepare_data_per_node 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Report errors related to LitGPT 3 | title: "Description" 4 | labels: bug 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for taking the time to report an issue. Please fill out the details below to help us resolve it. 10 | 11 | - type: textarea 12 | id: bug_description 13 | attributes: 14 | label: Bug description 15 | description: A description of the issue. 16 | placeholder: | 17 | Please provide a description of what the bug or issue is. 
18 | validations: 19 | required: true 20 | 21 | - type: dropdown 22 | id: operating_system 23 | attributes: 24 | label: What operating system are you using? 25 | description: If applicable, please select the operating system where you experienced this issue. 26 | options: 27 | - "Unknown" 28 | - "macOS" 29 | - "Linux" 30 | - "Windows" 31 | validations: 32 | required: true 33 | 34 | - type: textarea 35 | id: version 36 | attributes: 37 | label: LitGPT Version 38 | description: | 39 | Please provide details about your LitGPT version by running the following code in your terminal: 40 | ``` 41 | pip show litgpt | grep Version: 42 | ``` 43 | You can simply copy and paste the outputs below. 44 | value: | 45 | ``` 46 | 47 | 48 | 49 | ``` 50 | validations: 51 | required: false 52 | -------------------------------------------------------------------------------- /tests/data/test_tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from unittest import mock 3 | 4 | import pytest 5 | from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset 6 | from torch.utils.data import DataLoader 7 | 8 | from litgpt.data import TinyLlama 9 | 10 | 11 | @mock.patch("litdata.streaming.dataset.subsample_streaming_dataset", return_value=([], [])) 12 | def test_tinyllama(_, tmp_path): 13 | data = TinyLlama(data_path=(tmp_path / "data")) 14 | assert data.seq_length == 2048 15 | assert data.batch_size == 1 16 | 17 | data.connect(batch_size=2, max_seq_length=1024) 18 | assert data.seq_length == 1025 19 | assert data.batch_size == 2 20 | 21 | with pytest.raises(FileNotFoundError, match="The directory .*data/slimpajama/train does not exist"): 22 | data.prepare_data() 23 | 24 | (tmp_path / "data" / "slimpajama" / "train").mkdir(parents=True) 25 | (tmp_path / "data" / "slimpajama" / "val").mkdir(parents=True) 26 | (tmp_path / "data" / "starcoder").mkdir(parents=True) 27 | 28 | data.prepare_data() 29 | data.setup() 30 | 31 | train_dataloader = data.train_dataloader() 32 | assert isinstance(train_dataloader, StreamingDataLoader) 33 | assert isinstance(train_dataloader.dataset, CombinedStreamingDataset) 34 | 35 | val_dataloader = data.val_dataloader() 36 | assert isinstance(val_dataloader, DataLoader) 37 | assert isinstance(val_dataloader.dataset, StreamingDataset) 38 | 39 | # has attributes from super class `LightningDataModule` 40 | assert data.prepare_data_per_node 41 | -------------------------------------------------------------------------------- /tutorials/convert_hf_checkpoint.md: -------------------------------------------------------------------------------- 1 | # Converting Hugging Face Transformers to LitGPT weights 2 | 3 | By default, the `litgpt download` command converts the downloaded HF checkpoint files into a LitGPT compatible format after downloading. 
For example, 4 | 5 | ```bash 6 | litgpt download EleutherAI/pythia-14m 7 | ``` 8 | 9 | creates the following files: 10 | 11 | ``` 12 | checkpoints/ 13 | └── EleutherAI/ 14 | └── pythia-14m/ 15 | ├── config.json 16 | ├── generation_config.json 17 | ├── model_config.yaml # LitGPT specific file 18 | ├── lit_model.pth # LitGPT specific file 19 | ├── pytorch_model.bin 20 | ├── tokenizer.json 21 | └── tokenizer_config.json 22 | ``` 23 | 24 | 25 | 26 | To disable the automatic conversion, which is useful for development and debugging purposes, you can run the `litgpt download` command with the `--convert_checkpoint false` flag. This will only download the checkpoint files without converting them for use in LitGPT: 27 | 28 | ```bash 29 | rm -rf checkpoints/EleutherAI/pythia-14m 30 | 31 | litgpt download EleutherAI/pythia-14m \ 32 | --convert_checkpoint false 33 | 34 | ls checkpoints/EleutherAI/pythia-14m 35 | ``` 36 | 37 | ``` 38 | checkpoints/ 39 | └── EleutherAI/ 40 | └── pythia-14m/ 41 | ├── config.json 42 | ├── generation_config.json 43 | ├── pytorch_model.bin 44 | ├── tokenizer.json 45 | └── tokenizer_config.json 46 | ``` 47 | 48 | The required `model_config.yaml` and `lit_model.pth` files can then be manually generated via the `litgpt/scripts/convert_hf_checkpoint.py` script: 49 | 50 | ```bash 51 | litgpt convert_to_litgpt checkpoints/EleutherAI/pythia-14m 52 | ``` 53 | -------------------------------------------------------------------------------- /tests/data/test_textfiles.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from litdata import optimize 4 | from torch.utils._pytree import tree_map 5 | 6 | 7 | class Tokenizer: 8 | bos_id = 0 9 | 10 | def encode(self, text, bos, eos): 11 | assert bos 12 | assert not eos 13 | return [self.bos_id] + [ord(c) for c in text] 14 | 15 | 16 | def tokenize(data): 17 | for story in data: 18 | yield torch.tensor(story) 19 | 20 | 21 | def fake_chunk(path, data): 22 | optimize(fn=tokenize, inputs=[data] * len(data), output_dir=str(path), num_workers=1, chunk_bytes="200MB") 23 | 24 | 25 | def test_textfiles_datamodule(tmp_path): 26 | from litgpt.data.text_files import TextFiles 27 | 28 | data_dir = tmp_path / "textfiles" 29 | datamodule = TextFiles(train_data_path=data_dir, num_workers=1) 30 | datamodule.connect(max_seq_length=2, tokenizer=Tokenizer()) 31 | 32 | # simulate `datamodule.prepare_data` 33 | train_data_dir = data_dir / "train" 34 | train_data_dir.mkdir(parents=True) 35 | fake_chunk(train_data_dir, [[12], [0, 23, 15, 63, 0], [73, 5, 0, 1, 1999, 0, 13]]) 36 | datamodule.setup() 37 | 38 | tr_dataloader = datamodule.train_dataloader() 39 | torch.manual_seed(123) 40 | 41 | actual = tree_map(torch.Tensor.tolist, list(tr_dataloader)) 42 | # there is 1 sample per index in the data (13) 43 | assert actual == [ 44 | [[1999, 0, 13]], 45 | [[0, 13, 12]], 46 | [[1, 1999, 0]], 47 | [[63, 0, 73]], 48 | [[5, 0, 1]], 49 | [[0, 73, 5]], 50 | [[0, 23, 15]], 51 | [[0, 1, 1999]], 52 | [[15, 63, 0]], 53 | [[73, 5, 0]], 54 | [[12, 0, 23]], 55 | [[23, 15, 63]], 56 | [[13, 12, 0]] 57 | ] 58 | -------------------------------------------------------------------------------- /tests/test_args.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
2 | import pytest 3 | 4 | from litgpt.args import TrainArgs 5 | 6 | 7 | def test_compute_warmup_iters(): 8 | # warmup disabled 9 | train = TrainArgs(lr_warmup_steps=0, lr_warmup_fraction=0) 10 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(10)) == 0 11 | 12 | # lr_warmup_steps and lr_warmup_fraction both are not allowed 13 | with pytest.raises(ValueError, match="Can't provide both `--train.lr_warmup_fraction`"): 14 | TrainArgs(lr_warmup_steps=1, lr_warmup_fraction=0.2) 15 | 16 | # lr_warmup_fraction invalid range 17 | with pytest.raises(ValueError, match=" must be between 0 and 1"): 18 | TrainArgs(lr_warmup_steps=0, lr_warmup_fraction=1.1) 19 | 20 | # lr_warmup_steps 21 | train = TrainArgs(global_batch_size=1, micro_batch_size=1, lr_warmup_steps=100, lr_warmup_fraction=0) 22 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(10)) == 100 23 | # lr_warmup_steps multiplied by accumulation factor 24 | train.global_batch_size = 4 25 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(10)) == 400 26 | assert train.warmup_iters(devices=2, max_iters=1000, train_dataloader=range(10)) == 200 27 | # lr_warmup_steps truncated by max iters 28 | assert train.warmup_iters(devices=1, max_iters=120, train_dataloader=range(10)) == 120 29 | 30 | # lr_warmup_fraction 31 | train = TrainArgs(global_batch_size=1, micro_batch_size=1, lr_warmup_steps=0, lr_warmup_fraction=0.3) 32 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(100)) == 30 33 | # lr_warmup_fraction truncated by max iters 34 | assert train.warmup_iters(devices=1, max_iters=20, train_dataloader=range(100)) == 20 35 | # lr_warmup_fraction rounds up 36 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(5)) == 2 37 | -------------------------------------------------------------------------------- /.github/workflows/cpu-tests.yml: -------------------------------------------------------------------------------- 1 | name: CPU tests 2 | 3 | on: 4 | push: 5 | branches: [main, wip] 6 | pull_request: 7 | branches: [main, wip] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} 11 | cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} 12 | 13 | defaults: 14 | run: 15 | shell: bash 16 | 17 | env: 18 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 19 | 20 | jobs: 21 | cpu-tests: 22 | runs-on: ${{ matrix.os }} 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | include: 27 | - {os: "macOS-12", python-version: "3.10"} 28 | - {os: "ubuntu-22.04", python-version: "3.11"} 29 | - {os: "ubuntu-22.04", python-version: "3.10"} 30 | - {os: "ubuntu-22.04", python-version: "3.9"} 31 | - {os: "windows-2022", python-version: "3.9"} 32 | timeout-minutes: 25 33 | 34 | steps: 35 | - uses: actions/checkout@v4 36 | 37 | - name: Set up Python ${{ matrix.python-version }} 38 | uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | cache: 'pip' 42 | cache-dependency-path: | 43 | pyproject.toml 44 | 45 | - name: Install minimal dependencies 46 | run: | 47 | # python -m pip install --upgrade pip 48 | pip install . 
49 | pip list 50 | # make sure all modules are still importable with only the minimal dependencies available 51 | modules=$( 52 | find litgpt -type f -name "*.py" | \ 53 | sed 's/\.py$//' | sed 's/\//./g' | \ 54 | sed 's/.__init__//g' | xargs -I {} echo "import {};" 55 | ) 56 | echo "$modules" 57 | python -c "$modules" 58 | 59 | - name: Install all dependencies 60 | run: | 61 | pip install '.[all,test]' 62 | pip list 63 | 64 | - name: Run tests 65 | run: | 66 | pytest -v --disable-pytest-warnings --strict-markers --color=yes --timeout 120 67 | -------------------------------------------------------------------------------- /.github/azure-gpu-test.yml: -------------------------------------------------------------------------------- 1 | name: GPU tests 2 | 3 | trigger: 4 | branches: 5 | include: 6 | - "main" 7 | - "wip" 8 | 9 | pr: 10 | branches: 11 | include: 12 | - "main" 13 | - "wip" 14 | 15 | jobs: 16 | - job: testing 17 | timeoutInMinutes: "30" 18 | cancelTimeoutInMinutes: "2" 19 | pool: "lit-rtx-3090" 20 | variables: 21 | DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) 22 | CI: "true" 23 | container: 24 | image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" 25 | options: "--gpus=all --shm-size=8gb" 26 | workspace: 27 | clean: all 28 | steps: 29 | 30 | - bash: | 31 | echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" 32 | displayName: 'set env. vars' 33 | 34 | - bash: | 35 | echo $(DEVICES) 36 | echo $CUDA_VISIBLE_DEVICES 37 | whereis nvidia 38 | nvidia-smi 39 | which python && which pip 40 | python --version 41 | pip --version 42 | pip list 43 | displayName: "Image info & NVIDIA" 44 | 45 | - script: | 46 | pip install --upgrade pip 47 | pip install '.[all,test]' 48 | pip install -U torch torchvision torchaudio 49 | displayName: 'Install dependencies' 50 | 51 | - bash: | 52 | set -e 53 | pip list 54 | python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" 55 | displayName: "Env details" 56 | 57 | - bash: pytest -v --disable-pytest-warnings --strict-markers --color=yes --ignore-glob="tests/test_thunder*" --ignore="tests/test_unsloth_executor.py" 58 | displayName: 'Ordinary tests' 59 | env: 60 | PL_RUN_CUDA_TESTS: "1" 61 | timeoutInMinutes: "5" 62 | 63 | - bash: bash run_standalone_tests.sh 64 | workingDirectory: tests 65 | env: 66 | PL_RUN_CUDA_TESTS: "1" 67 | displayName: "Standalone tests" 68 | timeoutInMinutes: "10" 69 | -------------------------------------------------------------------------------- /.github/azure-gpu-test-with-thunder.yml: -------------------------------------------------------------------------------- 1 | name: GPU tests with Thunder 2 | 3 | trigger: 4 | branches: 5 | include: 6 | - "main" 7 | - "wip" 8 | 9 | pr: 10 | branches: 11 | include: 12 | - "main" 13 | - "wip" 14 | 15 | jobs: 16 | - job: testing 17 | timeoutInMinutes: "30" 18 | cancelTimeoutInMinutes: "2" 19 | pool: "lit-rtx-3090" 20 | variables: 21 | DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) 22 | CI: "true" 23 | container: 24 | image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" 25 | options: "--gpus=all --shm-size=8gb" 26 | workspace: 27 | clean: all 28 | steps: 29 | 30 | - bash: | 31 | echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" 32 | displayName: 'set env. 
vars' 33 | 34 | - bash: | 35 | echo $(DEVICES) 36 | echo $CUDA_VISIBLE_DEVICES 37 | whereis nvidia 38 | nvidia-smi 39 | which python && which pip 40 | python --version 41 | pip --version 42 | pip list 43 | displayName: "Image info & NVIDIA" 44 | 45 | - script: | 46 | pip install --upgrade pip 47 | pip install '.[all,test]' 48 | displayName: 'Install dependencies' 49 | 50 | - script: | 51 | pip uninstall -y torchvision torchaudio 52 | pip install --pre 'nvfuser-cu121[torch]' --extra-index-url https://pypi.nvidia.com 53 | displayName: 'Install PyTorch nightly' 54 | 55 | - bash: | 56 | set -e 57 | pip list 58 | python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" 59 | displayName: "Env details" 60 | 61 | - bash: pytest -v --disable-pytest-warnings --strict-markers --color=yes 62 | displayName: 'Ordinary tests' 63 | env: 64 | PL_RUN_CUDA_TESTS: "1" 65 | timeoutInMinutes: "5" 66 | 67 | - bash: bash run_standalone_tests.sh 68 | workingDirectory: tests 69 | env: 70 | PL_RUN_CUDA_TESTS: "1" 71 | displayName: "Standalone tests" 72 | timeoutInMinutes: "10" -------------------------------------------------------------------------------- /tutorials/examples/ptl-trainer/litgpt_ptl_medium.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import litgpt 3 | from litgpt.lora import GPT, merge_lora_weights 4 | from litgpt.data import Alpaca2k 5 | import lightning as L 6 | 7 | 8 | class LitLLM(L.LightningModule): 9 | def __init__(self): 10 | super().__init__() 11 | self.model = GPT.from_name( 12 | name="Llama-3.1-8B", 13 | lora_r=32, 14 | lora_alpha=16, 15 | lora_dropout=0.05, 16 | lora_key=False, 17 | lora_value=True, 18 | ) 19 | litgpt.lora.mark_only_lora_as_trainable(self.model) 20 | 21 | def on_train_start(self): 22 | state_dict = torch.load("checkpoints/meta-llama/Meta-Llama-3.1-8B/lit_model.pth", mmap=True) 23 | self.model.load_state_dict(state_dict, strict=False) 24 | 25 | def training_step(self, batch): 26 | input_ids, targets = batch["input_ids"], batch["labels"] 27 | logits = self.model(input_ids) 28 | loss = litgpt.utils.chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:]) 29 | self.log("train_loss", loss, prog_bar=True) 30 | return loss 31 | 32 | def configure_optimizers(self): 33 | warmup_steps = 10 34 | optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.0002, weight_decay=0.0, betas=(0.9, 0.95)) 35 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) 36 | return [optimizer], [scheduler] 37 | 38 | 39 | if __name__ == "__main__": 40 | data = Alpaca2k() 41 | tokenizer = litgpt.Tokenizer("checkpoints/meta-llama/Meta-Llama-3.1-8B") 42 | data.connect(tokenizer, batch_size=1, max_seq_length=512) 43 | 44 | trainer = L.Trainer( 45 | devices=1, 46 | max_epochs=2, 47 | accumulate_grad_batches=8, 48 | precision="bf16-true", 49 | ) 50 | with trainer.init_module(empty_init=True): 51 | model = LitLLM() 52 | 53 | trainer.fit(model, data) 54 | 55 | # Save final checkpoint 56 | merge_lora_weights(model.model) 57 | trainer.save_checkpoint("checkpoints/finetuned.ckpt", weights_only=True) 58 | -------------------------------------------------------------------------------- /tutorials/deploy.md: -------------------------------------------------------------------------------- 1 | # Serve and Deploy LLMs 2 | 3 | This document shows how you can serve a LitGPT for deployment. 
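Once the server described in the steps below is running, it can be queried from any HTTP client, not only the Python `requests` example shown later in this document. As a quick illustration, here is a sketch of the same `/predict` request issued with `curl` (assuming the default host and port used below; adjust them if you start the server differently):

```bash
# Send a prompt to the running LitGPT inference server and print the JSON response
curl -X POST http://127.0.0.1:8000/predict \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Fix typos in the following sentence: Exampel input"}'
```

The server replies with a JSON object whose `output` field contains the generated text, mirroring the Python example in Step 2 below.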
4 | 5 |   6 | ## Serve an LLM 7 | 8 | This section illustrates how we can set up an inference server for a phi-2 LLM using `litgpt serve` that is minimal and highly scalable. 9 | 10 | 11 |   12 | ### Step 1: Start the inference server 13 | 14 | 15 | ```bash 16 | # 1) Download a pretrained model (alternatively, use your own finetuned model) 17 | litgpt download microsoft/phi-2 18 | 19 | # 2) Start the server 20 | litgpt serve microsoft/phi-2 21 | ``` 22 | 23 | > [!TIP] 24 | > Use `litgpt serve --help` to display additional options, including the port, devices, LLM temperature setting, and more. 25 | 26 | 27 |   28 | ### Step 2: Query the inference server 29 | 30 | You can now send requests to the inference server you started in step 2. For example, in a new Python session, we can send requests to the inference server as follows: 31 | 32 | 33 | ```python 34 | import requests, json 35 | 36 | response = requests.post( 37 | "http://127.0.0.1:8000/predict", 38 | json={"prompt": "Fix typos in the following sentence: Exampel input"} 39 | ) 40 | 41 | print(response.json()["output"]) 42 | ``` 43 | 44 | Executing the code above prints the following output: 45 | 46 | ``` 47 | Example input. 48 | ``` 49 | 50 |   51 | ## Optional streaming mode 52 | 53 | The 2-step procedure described above returns the complete response all at once. If you want to stream the response on a token-by-token basis, start the server with the streaming option enabled: 54 | 55 | ```bash 56 | litgpt serve microsoft/phi-2 --stream true 57 | ``` 58 | 59 | Then, use the following updated code to query the inference server: 60 | 61 | ```python 62 | import requests, json 63 | 64 | response = requests.post( 65 | "http://127.0.0.1:8000/predict", 66 | json={"prompt": "Fix typos in the following sentence: Exampel input"}, 67 | stream=True 68 | ) 69 | 70 | # stream the response 71 | for line in response.iter_lines(decode_unicode=True): 72 | if line: 73 | print(json.loads(line)["output"], end="") 74 | ``` 75 | 76 | ``` 77 | Sure, here is the corrected sentence: 78 | 79 | Example input 80 | ``` 81 | -------------------------------------------------------------------------------- /litgpt/data/alpaca_2k.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | 7 | from litgpt.data import SFTDataset 8 | from litgpt.data.alpaca import Alpaca 9 | 10 | 11 | @dataclass 12 | class Alpaca2k(Alpaca): 13 | """Alpaca2k data module for supervised finetuning.""" 14 | 15 | val_split_fraction: float = 0.05 # to get exactly 100 validation samples, 16 | """The fraction of the dataset to use for the validation dataset. 
The rest is used for training.""" 17 | download_dir: Path = Path("./data/alpaca2k") 18 | """The directory in which the downloaded datasetgets saved.""" 19 | repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test") 20 | """The URL from where to download the dataset.""" 21 | file_name: str = field(repr=False, default="alpaca2k_data_cleaned_archive.json") 22 | """The name of the dataset file to download.""" 23 | 24 | def prepare_data(self) -> None: 25 | from datasets import load_dataset 26 | 27 | load_dataset(self.repo_id, cache_dir=self.download_dir) 28 | 29 | def setup(self, stage: str = "") -> None: 30 | from datasets import load_dataset 31 | 32 | dataset = load_dataset(self.repo_id, cache_dir=self.download_dir) 33 | 34 | train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed) 35 | train_data = train_validation_split["train"] 36 | test_data = train_validation_split["test"] 37 | 38 | self.train_dataset = SFTDataset( 39 | data=train_data, 40 | tokenizer=self.tokenizer, 41 | prompt_style=self.prompt_style, 42 | max_seq_length=self.max_seq_length, 43 | mask_prompt=self.mask_prompt, 44 | ignore_index=self.ignore_index, 45 | ) 46 | self.test_dataset = SFTDataset( 47 | data=test_data, 48 | tokenizer=self.tokenizer, 49 | prompt_style=self.prompt_style, 50 | max_seq_length=self.max_seq_length, 51 | mask_prompt=self.mask_prompt, 52 | ignore_index=self.ignore_index, 53 | ) 54 | -------------------------------------------------------------------------------- /tests/test_thunder_pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from contextlib import redirect_stdout 4 | from io import StringIO 5 | from pathlib import Path 6 | from unittest.mock import Mock 7 | 8 | import torch 9 | from tests.conftest import RunIf 10 | from torch.utils.data import DataLoader 11 | 12 | from litgpt import Config 13 | from litgpt.args import EvalArgs, TrainArgs 14 | 15 | # support running without installing as a package 16 | wd = Path(__file__).parent.parent.resolve() 17 | sys.path.append(str(wd)) 18 | 19 | import extensions.thunder.pretrain as pretrain 20 | 21 | 22 | @RunIf(min_cuda_gpus=1, thunder=True) 23 | def test_pretrain(tmp_path, monkeypatch): 24 | model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 25 | 26 | dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) 27 | dataloader = DataLoader(dataset) 28 | monkeypatch.setattr(pretrain, "get_dataloaders", Mock(return_value=(dataloader, dataloader))) 29 | monkeypatch.setattr(pretrain, "save_hyperparameters", Mock()) 30 | 31 | out_dir = tmp_path / "out" 32 | stdout = StringIO() 33 | with redirect_stdout(stdout): 34 | pretrain.setup( 35 | devices=1, 36 | model_config=model_config, 37 | out_dir=out_dir, 38 | train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), 39 | eval=EvalArgs(interval=1, max_iters=1), 40 | optimizer="AdamW", 41 | ) 42 | 43 | out_dir_contents = set(os.listdir(out_dir)) 44 | checkpoint_dirs = {"step-00000001", "step-00000002", "step-00000003", "step-00000004"} 45 | assert checkpoint_dirs.issubset(out_dir_contents) 46 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 47 | for checkpoint_dir in checkpoint_dirs: 48 | # the `tokenizer_dir` is None by default, so only 'lit_model.pth' shows here 49 | assert set(os.listdir(out_dir / checkpoint_dir)) == {"lit_model.pth", "model_config.yaml"} 50 | 51 | assert (out_dir / 
"logs" / "tensorboard" / "version_0").is_dir() 52 | 53 | logs = stdout.getvalue() 54 | assert logs.count("(step)") == 4 55 | assert logs.count("val loss") == 4 56 | assert "Total parameters: 1,888" in logs 57 | -------------------------------------------------------------------------------- /litgpt/data/prepare_slimpajama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import os 5 | import time 6 | from pathlib import Path 7 | 8 | from litgpt.tokenizer import Tokenizer 9 | from litgpt.data.prepare_starcoder import DataChunkRecipe 10 | from litgpt.utils import CLI, extend_checkpoint_dir 11 | 12 | 13 | class SlimPajamaDataRecipe(DataChunkRecipe): 14 | is_generator = True 15 | 16 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 17 | super().__init__(chunk_size) 18 | self.tokenizer = tokenizer 19 | 20 | def prepare_structure(self, input_dir): 21 | files = Path(input_dir).rglob("*.zst") 22 | return [str(file) for file in files] 23 | 24 | def prepare_item(self, filepath): 25 | import zstandard as zstd 26 | 27 | with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: 28 | for row in f: 29 | text = json.loads(row)["text"] 30 | if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": 31 | continue # exclude the GitHub data since it overlaps with starcoder 32 | text_ids = self.tokenizer.encode(text, bos=False, eos=True) 33 | yield text_ids 34 | 35 | 36 | def prepare( 37 | input_dir: Path = Path("data/SlimPajama-627B/train"), 38 | output_dir: Path = Path("data/slimpajama/train"), 39 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 40 | chunk_size: int = (2049 * 16384), 41 | fast_dev_run: bool = False, 42 | ) -> None: 43 | from litdata.processing.data_processor import DataProcessor 44 | 45 | tokenizer_path = extend_checkpoint_dir(tokenizer_path) 46 | data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 47 | data_processor = DataProcessor( 48 | input_dir=str(input_dir), 49 | output_dir=str(output_dir), 50 | fast_dev_run=fast_dev_run, 51 | num_workers=os.cpu_count(), 52 | num_downloaders=1, 53 | ) 54 | 55 | start_time = time.time() 56 | data_processor.run(data_recipe) 57 | elapsed_time = time.time() - start_time 58 | print(f"Time taken: {elapsed_time:.2f} seconds") 59 | 60 | 61 | if __name__ == "__main__": 62 | CLI(prepare) 63 | -------------------------------------------------------------------------------- /tests/data/test_lit_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | import sys 3 | from unittest import mock 4 | from unittest.mock import ANY 5 | 6 | import pytest 7 | 8 | from litgpt.data import LitData 9 | 10 | 11 | @pytest.mark.skipif(sys.platform == "win32", reason="Needs to implement platform agnostic path/url joining") 12 | @mock.patch("litgpt.data.lit_data.LitData._dataloader") 13 | def test_input_dir_and_splits(dl_mock, tmp_path): 14 | 15 | with pytest.raises(ValueError, match="If provided `split_names` must be a tuple of two strings"): 16 | LitData(data_path=tmp_path, split_names=("train",)) 17 | 18 | # local dir, no splits 19 | data = LitData(data_path=tmp_path) 20 | data.train_dataloader() 21 | dl_mock.assert_called_with(input_dir=str(tmp_path), train=True) 22 | data.val_dataloader() 23 | dl_mock.assert_called_with(input_dir=str(tmp_path), train=False) 24 | 25 | # local dir, splits 26 | data = LitData(data_path=tmp_path, split_names=("train", "val")) 27 | data.train_dataloader() 28 | dl_mock.assert_called_with(input_dir=str(tmp_path / "train"), train=True) 29 | data.val_dataloader() 30 | dl_mock.assert_called_with(input_dir=str(tmp_path / "val"), train=False) 31 | 32 | # remote dir, splits 33 | data = LitData(data_path="s3://mydataset/data", split_names=("train", "val")) 34 | data.train_dataloader() 35 | dl_mock.assert_called_with(input_dir=str("s3://mydataset/data/train"), train=True) 36 | data.val_dataloader() 37 | dl_mock.assert_called_with(input_dir=str("s3://mydataset/data/val"), train=False) 38 | 39 | 40 | @pytest.mark.skipif(sys.platform == "win32", reason="Needs to implement platform agnostic path/url joining") 41 | @mock.patch("litdata.streaming.StreamingDataset") 42 | @mock.patch("litdata.streaming.StreamingDataLoader") 43 | def test_dataset_args(streaming_dataloader_mock, streaming_dataset_mock, tmp_path): 44 | data = LitData(data_path=tmp_path, seed=1000) 45 | data.train_dataloader() 46 | streaming_dataset_mock.assert_called_with( 47 | input_dir=str(tmp_path), 48 | item_loader=ANY, 49 | shuffle=True, 50 | seed=1000, 51 | ) 52 | streaming_dataloader_mock.assert_called_with( 53 | streaming_dataset_mock(), 54 | batch_size=1, 55 | pin_memory=True, 56 | num_workers=8, 57 | drop_last=True, 58 | ) 59 | -------------------------------------------------------------------------------- /tutorials/developer-docs/python-api.md: -------------------------------------------------------------------------------- 1 | # LitGPT High-level Python API 2 | 3 | This is a work-in-progress draft for a high-level LitGPT Python API. 4 | 5 |   6 | ## Model loading & saving 7 | 8 | The `LLM.load` command loads an `llm` object, which contains both the model object (a PyTorch module) and a preprocessor. 9 | 10 | ```python 11 | from litgpt import LLM 12 | 13 | llm = LLM.load( 14 | model="url | local_path", 15 | # high-level user only needs to care about those: 16 | memory_reduction="none | medium | strong" 17 | # advanced options for technical users: 18 | source="hf | local | other" 19 | quantize="bnb.nf4", 20 | precision="bf16-true", 21 | device=""auto | cuda | cpu", 22 | ) 23 | ``` 24 | 25 | Here, 26 | 27 | - `llm.model` contains the PyTorch Module 28 | - and `llm.preprocessor.tokenizer` contains the tokenizer 29 | 30 | The `llm.save` command saves the model weights, tokenizer, and configuration information. 
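For instance, a minimal load-and-save round trip with this draft API might look as follows (a sketch only: the argument names mirror the proposals above, and the model name and output path are placeholders):

```python
from litgpt import LLM

# Load a model through the draft high-level API (argument names as proposed above)
llm = LLM.load(model="microsoft/phi-2")

# ... run inference or finetuning with `llm` here ...

# Write the model weights, tokenizer, and config to a new checkpoint directory
llm.save("out/phi-2-copy", format="lightning")
```

The signature of `llm.save` itself is sketched below.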
31 | 32 | 33 | ```python 34 | llm.save(checkpoint_dir, format="lightning | ollama | hf") 35 | ``` 36 | 37 | 38 |   39 | ## Inference / Chat 40 | 41 | ``` 42 | response = llm.generate( 43 | prompt="What do Llamas eat?", 44 | temperature=0.1, 45 | top_p=0.8, 46 | ... 47 | ) 48 | ``` 49 | 50 | 51 |   52 | ## Dataset 53 | 54 | The `llm.prepare_dataset` command prepares a dataset for training. 55 | 56 | ``` 57 | llm.download_dataset( 58 | URL, 59 | ... 60 | ) 61 | ``` 62 | 63 | ``` 64 | dataset = llm.prepare_dataset( 65 | path, 66 | task="pretrain | instruction_finetune", 67 | test_portion=0.1, 68 | ... 69 | ) 70 | ``` 71 | 72 |   73 | ## Training 74 | 75 | 76 | ```python 77 | llm.instruction_finetune( 78 | config=None, 79 | dataset=dataset, 80 | max_iter=10, 81 | method="full | lora | adapter | adapter_v2" 82 | ) 83 | ``` 84 | 85 | ```python 86 | llm.pretrain(config=None, dataset=dataset, max_iter=10, ...) 87 | ``` 88 | 89 |   90 | ## Serving 91 | 92 | 93 | ```python 94 | llm.serve(port=8000) 95 | ``` 96 | 97 | Then in another Python session: 98 | 99 | ```python 100 | import requests, json 101 | 102 | response = requests.post( 103 | "http://127.0.0.1:8000/predict", 104 | json={"prompt": "Fix typos in the following sentence: Exampel input"} 105 | ) 106 | 107 | print(response.json()["output"]) 108 | ``` 109 | -------------------------------------------------------------------------------- /litgpt/scripts/convert_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from pathlib import Path 4 | from pprint import pprint 5 | import torch 6 | 7 | from litgpt.utils import ( 8 | copy_config_files, 9 | extend_checkpoint_dir, 10 | incremental_save 11 | ) 12 | 13 | 14 | @torch.inference_mode() 15 | def convert_pretrained_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: 16 | """Convert a checkpoint after pretraining. 17 | 18 | The pretrained checkpoint contains optimizer states and several other metadata that are not needed after training 19 | is finished. This script will export the state-dict of the model and place it in the chosen output folder, 20 | which then can be loaded by other scripts for inference, evaluation, etc. 21 | 22 | Args: 23 | checkpoint_dir: Path to a checkpoint directory produced by ``litgpt.pretrain``. 24 | output_dir: The output folder where the converted state-dict file and config files will be saved to. 25 | """ 26 | checkpoint_dir = extend_checkpoint_dir(checkpoint_dir) 27 | pprint(locals()) 28 | 29 | if output_dir.is_dir() and output_dir.glob("*"): 30 | raise FileExistsError( 31 | f"The output folder exists and is not empty: {str(output_dir)}." 32 | " Please delete it first or choose a different name." 
33 | ) 34 | 35 | output_dir.mkdir(parents=True) 36 | checkpoint_file = checkpoint_dir / "lit_model.pth" 37 | output_checkpoint_file = output_dir / "lit_model.pth" 38 | 39 | # TODO: Consolidate sharded checkpoint if applicable 40 | # Extract the model state dict and save to output folder 41 | with incremental_save(output_checkpoint_file) as saver: 42 | print("Processing", checkpoint_file) 43 | full_checkpoint = torch.load(str(checkpoint_file), mmap=True) 44 | loaded_state_dict = full_checkpoint["model"] 45 | converted_state_dict = {} 46 | for param_name, param in loaded_state_dict.items(): 47 | saver.store_early(param) 48 | # remove prefix for compiled model (if any) 49 | param_name = param_name.replace("_orig_mod.", "") 50 | converted_state_dict[param_name] = param 51 | print(f"Saving converted checkpoint to {str(output_checkpoint_file)}.") 52 | saver.save(converted_state_dict) 53 | 54 | copy_config_files(checkpoint_dir, output_dir) 55 | -------------------------------------------------------------------------------- /tests/data/test_openwebtext.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import sys 3 | from unittest import mock 4 | from unittest.mock import ANY, call 5 | 6 | import pytest 7 | from litdata.streaming import StreamingDataLoader, StreamingDataset 8 | from torch.utils.data import DataLoader 9 | 10 | from litgpt.data import OpenWebText 11 | 12 | 13 | @pytest.mark.skipif(sys.platform == "win32", reason="Not in the mood to add Windows support right now.") 14 | @mock.patch("litdata.optimize") 15 | @mock.patch("litdata.streaming.dataset.subsample_streaming_dataset", return_value=([], [])) 16 | @mock.patch("datasets.load_dataset") 17 | def test_openwebtext(_, __, optimize_mock, tmp_path, mock_tokenizer): 18 | data = OpenWebText(data_path=(tmp_path / "openwebtext")) 19 | assert data.seq_length == 2048 20 | assert data.batch_size == 1 21 | 22 | data.connect(tokenizer=mock_tokenizer, batch_size=2, max_seq_length=1024) 23 | assert data.seq_length == 1025 24 | assert data.batch_size == 2 25 | 26 | # Data does not exist, preprocess it 27 | data.prepare_data() 28 | optimize_mock.assert_has_calls( 29 | [ 30 | call( 31 | fn=ANY, 32 | num_workers=ANY, 33 | inputs=[], 34 | output_dir=str(tmp_path / "openwebtext" / "train"), 35 | chunk_bytes="200MB", 36 | ), 37 | call( 38 | fn=ANY, 39 | num_workers=ANY, 40 | inputs=[], 41 | output_dir=str(tmp_path / "openwebtext" / "val"), 42 | chunk_bytes="200MB", 43 | ), 44 | ] 45 | ) 46 | optimize_mock.reset_mock() 47 | 48 | # Data exists, already preprocessed 49 | (tmp_path / "openwebtext" / "train").mkdir(parents=True) 50 | (tmp_path / "openwebtext" / "val").mkdir(parents=True) 51 | data.prepare_data() 52 | optimize_mock.assert_not_called() 53 | 54 | data.setup() 55 | 56 | train_dataloader = data.train_dataloader() 57 | assert isinstance(train_dataloader, StreamingDataLoader) 58 | assert isinstance(train_dataloader.dataset, StreamingDataset) 59 | 60 | val_dataloader = data.val_dataloader() 61 | assert isinstance(val_dataloader, DataLoader) 62 | assert isinstance(val_dataloader.dataset, StreamingDataset) 63 | 64 | # has attributes from super class `LightningDataModule` 65 | assert data.prepare_data_per_node 66 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "litgpt" 3 | 
version = "0.4.11" 4 | description = "Hackable implementation of state-of-the-art open-source LLMs" 5 | authors = [ 6 | { name = "Lightning AI", email = "contact@lightning.ai" }, 7 | ] 8 | readme = "README.md" 9 | license = { file = "LICENSE" } 10 | 11 | dependencies = [ 12 | "torch>=2.2.0", 13 | "lightning==2.4.0.dev20240728", 14 | "jsonargparse[signatures]>=4.27.6", 15 | "huggingface_hub>=0.23.5", # download models 16 | "safetensors>=0.4.3", # download models 17 | "tokenizers>=0.15.2", # tokenization in most models 18 | "tqdm>=4.66.0", # convert_hf_checkpoint 19 | ] 20 | 21 | [project.urls] 22 | homepage = "https://github.com/lightning-AI/litgpt" 23 | documentation = "https://github.com/lightning-AI/litgpt/tutorials" 24 | 25 | [project.scripts] 26 | litgpt = "litgpt.__main__:main" 27 | 28 | [project.optional-dependencies] 29 | test = [ 30 | "pytest>=8.1.1", 31 | "pytest-rerunfailures>=14.0", 32 | "pytest-timeout>=2.3.1", 33 | "pytest-dependency>=0.6.0", 34 | "transformers>=4.38.0", # numerical comparisons 35 | "einops>=0.7.0", 36 | "protobuf>=4.23.4", 37 | "lightning-thunder @ git+https://github.com/Lightning-AI/lightning-thunder/ ; python_version >= '3.10' and sys_platform == 'linux'", 38 | ] 39 | all = [ 40 | "bitsandbytes==0.42.0", # quantization 41 | "sentencepiece>=0.2.0", # llama-based models 42 | "requests>=2.31.0", # litgpt.data 43 | "litdata==0.2.17", # litgpt.data 44 | "litserve>=0.1.5", # litgpt.deploy 45 | "zstandard>=0.22.0", # litgpt.data.prepare_slimpajama.py 46 | "pandas>=1.9.0", # litgpt.data.prepare_starcoder.py 47 | "pyarrow>=15.0.2", # litgpt.data.prepare_starcoder.py 48 | "tensorboard>=2.14.0", # litgpt.pretrain 49 | "torchmetrics>=1.3.1", # litgpt.pretrain 50 | "datasets>=2.18.0", # litgpt.evaluate 51 | "transformers>=4.38.0", # litgpt.evaluate 52 | "lm-eval>=0.4.2", # litgpt.evaluate 53 | "huggingface_hub[hf_transfer]>=0.21.0" # download 54 | ] 55 | 56 | [build-system] 57 | requires = [ 58 | "setuptools>=68.2.2", 59 | "wheel>=0.41.2", 60 | ] 61 | build-backend = "setuptools.build_meta" 62 | 63 | [tool.setuptools.packages.find] 64 | include = [ 65 | "litgpt", 66 | "litgpt.*", 67 | ] 68 | exclude = [] 69 | 70 | [tool.setuptools.package-data] 71 | litgpt = [ 72 | "LICENSE", 73 | "README.md", 74 | ] 75 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from contextlib import redirect_stdout 3 | from io import StringIO 4 | from unittest import mock 5 | 6 | import pytest 7 | from packaging.version import Version 8 | 9 | from litgpt.__main__ import main 10 | 11 | 12 | def test_cli(): 13 | out = StringIO() 14 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "-h"]): 15 | main() 16 | out = out.getvalue() 17 | assert "usage: litgpt" in out 18 | assert ("{download,chat,finetune,finetune_lora,finetune_full,finetune_adapter,finetune_adapter_v2," 19 | "pretrain,generate,generate_full,generate_adapter,generate_adapter_v2,generate_sequentially," 20 | "generate_tp,convert_to_litgpt,convert_from_litgpt,convert_pretrained_checkpoint," 21 | "merge_lora,evaluate,serve}" in out) 22 | assert ( 23 | """Available subcommands: 24 | download Download weights or tokenizer data from the Hugging 25 | Face Hub. 
26 | chat Chat with a model.""" 27 | in out 28 | ) 29 | assert """evaluate Evaluate a model with the LM Evaluation Harness.""" in out 30 | assert """serve Serve a LitGPT model using LitServe.""" in out 31 | out = StringIO() 32 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "finetune_lora", "-h"]): 33 | main() 34 | out = out.getvalue() 35 | assert ( 36 | """--lora_alpha LORA_ALPHA 37 | The LoRA alpha. (type: int, default: 16)""" 38 | in out 39 | ) 40 | 41 | if Version(f"{sys.version_info.major}.{sys.version_info.minor}") < Version("3.9"): 42 | # python 3.8 prints `Union[int, null]` instead of `Optional[int]` 43 | return 44 | 45 | out = StringIO() 46 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "pretrain", "-h"]): 47 | main() 48 | out = out.getvalue() 49 | print(out) 50 | assert ( 51 | """--train.max_tokens MAX_TOKENS 52 | Total number of tokens to train on (type: 53 | Optional[int], default: 3000000000000)""" 54 | in out 55 | ) 56 | 57 | 58 | def test_rewrite_finetune_command(): 59 | out1 = StringIO() 60 | with pytest.raises(SystemExit), redirect_stdout(out1), mock.patch("sys.argv", ["litgpt", "fineune", "-h"]): 61 | main() 62 | out2 = StringIO() 63 | with pytest.raises(SystemExit), redirect_stdout(out2), mock.patch("sys.argv", ["litgpt", "fineune_lora", "-h"]): 64 | main() 65 | assert out1.getvalue() == out2.getvalue() 66 | -------------------------------------------------------------------------------- /tests/test_generate_adapter.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import re 4 | import subprocess 5 | import sys 6 | from contextlib import redirect_stderr, redirect_stdout 7 | from io import StringIO 8 | from pathlib import Path 9 | from unittest.mock import ANY, Mock, call 10 | 11 | import pytest 12 | import torch 13 | import yaml 14 | 15 | 16 | @pytest.mark.parametrize("version", ("v1", "v2")) 17 | def test_main(fake_checkpoint_dir, monkeypatch, version, tensor_like): 18 | if version == "v1": 19 | import litgpt.generate.adapter as generate 20 | else: 21 | import litgpt.generate.adapter_v2 as generate 22 | 23 | config_path = fake_checkpoint_dir / "model_config.yaml" 24 | config = {"block_size": 128, "vocab_size": 50, "n_layer": 2, "n_head": 4, "n_embd": 8, "rotary_percentage": 1} 25 | config_path.write_text(yaml.dump(config)) 26 | 27 | monkeypatch.setattr(generate, "lazy_load", Mock()) 28 | monkeypatch.setattr(generate.GPT, "load_state_dict", Mock()) 29 | tokenizer_mock = Mock() 30 | tokenizer_mock.return_value.encode.return_value = torch.tensor([[1, 2, 3]]) 31 | tokenizer_mock.return_value.decode.return_value = "### Response:foo bar baz" 32 | monkeypatch.setattr(generate, "Tokenizer", tokenizer_mock) 33 | generate_mock = Mock() 34 | generate_mock.return_value = torch.tensor([[3, 2, 1]]) 35 | monkeypatch.setattr(generate, "generate", generate_mock) 36 | 37 | num_samples = 1 38 | out, err = StringIO(), StringIO() 39 | with redirect_stdout(out), redirect_stderr(err): 40 | generate.main(temperature=2.0, top_k=2, top_p=0.9, checkpoint_dir=fake_checkpoint_dir) 41 | 42 | assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples 43 | assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) 44 | assert generate_mock.mock_calls == [call(ANY, tensor_like, 101, temperature=2.0, top_k=2, top_p=0.9, eos_id=ANY)] * 
num_samples 45 | 46 | expected_output = "foo bar baz\n" * num_samples 47 | # Allow for the config to be printed before the expected repeated strings. 48 | pattern = rf".*^{re.escape(expected_output.strip())}$.*" 49 | assert re.match(pattern, out.getvalue().strip(), re.DOTALL | re.MULTILINE) 50 | 51 | assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4, 'head_size': 2, 'n_embd': 8" in err.getvalue() 52 | 53 | 54 | @pytest.mark.parametrize("version", ("", "_v2")) 55 | def test_cli(version): 56 | args = ["litgpt", f"generate_adapter{version}", "-h"] 57 | output = subprocess.check_output(args) 58 | output = str(output.decode()) 59 | assert "For models finetuned with" in output 60 | -------------------------------------------------------------------------------- /litgpt/data/prepare_starcoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import os 4 | import time 5 | import traceback 6 | from pathlib import Path 7 | 8 | from lightning_utilities.core.imports import RequirementCache 9 | 10 | from litgpt.tokenizer import Tokenizer 11 | from litgpt.utils import CLI, extend_checkpoint_dir 12 | 13 | _LITDATA_AVAILABLE = RequirementCache("litdata") 14 | if _LITDATA_AVAILABLE: 15 | from litdata.processing.data_processor import DataChunkRecipe 16 | else: 17 | DataChunkRecipe = object 18 | 19 | 20 | class StarcoderDataRecipe(DataChunkRecipe): 21 | is_generator = True 22 | 23 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 24 | super().__init__(chunk_size) 25 | self.tokenizer = tokenizer 26 | 27 | def prepare_structure(self, input_dir): 28 | files = Path(input_dir).rglob("*.parquet") 29 | return [str(file) for file in files] 30 | 31 | def prepare_item(self, item_metadata): 32 | import pyarrow.parquet as pq 33 | 34 | filepath = item_metadata 35 | start = time.time() 36 | 37 | try: 38 | parquet_file = pq.ParquetFile(filepath) 39 | # reduce RAM usage 40 | for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): 41 | for text in batch.to_pandas()["content"]: 42 | yield self.tokenizer.encode(text, bos=False, eos=True) 43 | 44 | except Exception: 45 | print(traceback.format_exc()) 46 | print(f"Error reading {filepath}") 47 | return 48 | 49 | parquet_file.close() 50 | end = time.time() 51 | print(f"Took {end - start:.2f} seconds total", filepath) 52 | 53 | 54 | def prepare( 55 | input_dir: Path = Path("data/starcoderdata"), 56 | output_dir: Path = Path("data/starcoder"), 57 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 58 | chunk_size: int = (2049 * 8192), 59 | fast_dev_run: bool = False, 60 | ) -> None: 61 | from litdata.processing.data_processor import DataProcessor 62 | 63 | tokenizer_path = extend_checkpoint_dir(tokenizer_path) 64 | tokenizer = Tokenizer(tokenizer_path) 65 | data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 66 | data_processor = DataProcessor( 67 | input_dir=str(input_dir), 68 | output_dir=str(output_dir), 69 | fast_dev_run=fast_dev_run, 70 | num_workers=os.cpu_count(), 71 | num_downloaders=1, 72 | ) 73 | 74 | start_time = time.time() 75 | data_processor.run(data_recipe) 76 | elapsed_time = time.time() - start_time 77 | print(f"Time taken: {elapsed_time:.2f} seconds") 78 | 79 | 80 | if __name__ == "__main__": 81 | CLI(prepare) 82 | -------------------------------------------------------------------------------- /tests/test_config_hub.py: 
-------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | from pathlib import Path 4 | from unittest import mock 5 | from unittest.mock import Mock 6 | 7 | import pytest 8 | from lightning.fabric.plugins import Precision 9 | 10 | from litgpt import Config 11 | from litgpt.utils import CLI 12 | 13 | fixed_pairs = [ 14 | ("litgpt/pretrain.py", "pretrain/debug.yaml"), 15 | ("litgpt/pretrain.py", "pretrain/tinyllama.yaml"), 16 | ("litgpt/pretrain.py", "pretrain/tinystories.yaml"), 17 | ( 18 | "litgpt/pretrain.py", 19 | "https://raw.githubusercontent.com/Lightning-AI/litgpt/4d55ab6d0aa404f0da0d03a80a8801ed60e07e83/config_hub/pretrain/tinystories.yaml", # TODO: Update with path from main after merge 20 | ), 21 | ] 22 | 23 | config_hub_path = Path(__file__).parent.parent / "config_hub" / "finetune" 24 | model_pairs = [] 25 | 26 | for model_dir in config_hub_path.iterdir(): 27 | if model_dir.is_dir(): 28 | model_name = model_dir.name 29 | for yaml_file in model_dir.glob("*.yaml"): 30 | config_name = yaml_file.stem 31 | python_file = "litgpt/finetune/full.py" if config_name == "full" else "litgpt/finetune/lora.py" 32 | relative_yaml_path = yaml_file.relative_to(config_hub_path.parent) 33 | model_pairs.append((python_file, str(relative_yaml_path))) 34 | 35 | all_pairs = fixed_pairs + model_pairs 36 | 37 | 38 | @pytest.mark.parametrize(("script_file", "config_file"), all_pairs) 39 | def test_config_help(script_file, config_file, monkeypatch): 40 | """Test that configs validate against the signature in the scripts.""" 41 | script_file = Path(__file__).parent.parent / script_file 42 | assert script_file.is_file() 43 | if "http" not in str(config_file): 44 | config_file = Path(__file__).parent.parent / "config_hub" / config_file 45 | assert config_file.is_file() 46 | 47 | spec = importlib.util.spec_from_file_location(str(script_file.parent.name), script_file) 48 | module = importlib.util.module_from_spec(spec) 49 | spec.loader.exec_module(module) 50 | 51 | monkeypatch.setattr(module, "main", Mock()) 52 | monkeypatch.setattr(module, "Tokenizer", Mock()) 53 | monkeypatch.setattr(module, "BitsandbytesPrecision", Mock(return_value=Precision()), raising=False) 54 | monkeypatch.setattr(module, "Config", Mock(return_value=Config.from_name("pythia-14m"))) 55 | monkeypatch.setattr(module, "check_valid_checkpoint_dir", Mock(), raising=False) 56 | 57 | try: 58 | with mock.patch("sys.argv", [script_file.name, "--config", str(config_file), "--devices", "1"]): 59 | CLI(module.setup) 60 | module.main.assert_called_once() 61 | except FileNotFoundError: 62 | pass 63 | # FileNotFound occurs here because we have not downloaded the model weights referenced in the config files 64 | # which is ok because here we just want to validate the config file itself. 65 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import pytest 4 | import subprocess 5 | from contextlib import redirect_stdout 6 | from dataclasses import asdict 7 | from io import StringIO 8 | from unittest import mock 9 | 10 | import torch 11 | import yaml 12 | 13 | import litgpt.eval.evaluate as module 14 | from litgpt import GPT, Config 15 | from litgpt.scripts.download import download_from_hub 16 | 17 | 18 | def test_evaluate_script(tmp_path): 19 | ours_config = Config.from_name("pythia-14m") 20 | download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) 21 | checkpoint_dir = tmp_path / "EleutherAI" / "pythia-14m" 22 | ours_model = GPT(ours_config) 23 | torch.save(ours_model.state_dict(), checkpoint_dir / "lit_model.pth") 24 | with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: 25 | yaml.dump(asdict(ours_config), fp) 26 | 27 | stdout = StringIO() 28 | with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): 29 | with pytest.raises(ValueError) as excinfo: 30 | module.convert_and_evaluate( 31 | checkpoint_dir, 32 | out_dir=tmp_path / "out_dir", 33 | device=None, 34 | dtype=torch.float32, 35 | limit=5, 36 | tasks="logiqa", 37 | batch_size=0 # Test for non-positive integer 38 | ) 39 | assert "batch_size must be a positive integer, 'auto', or in the format 'auto:N'." in str(excinfo.value) 40 | 41 | with pytest.raises(ValueError) as excinfo: 42 | module.convert_and_evaluate( 43 | checkpoint_dir, 44 | out_dir=tmp_path / "out_dir", 45 | device=None, 46 | dtype=torch.float32, 47 | limit=5, 48 | tasks="logiqa", 49 | batch_size="invalid" # Test for invalid string 50 | ) 51 | assert "batch_size must be a positive integer, 'auto', or in the format 'auto:N'." in str(excinfo.value) 52 | 53 | stdout = StringIO() 54 | with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): 55 | module.convert_and_evaluate( 56 | checkpoint_dir, 57 | out_dir=tmp_path / "out_dir", 58 | device=None, 59 | dtype=torch.float32, 60 | limit=5, 61 | tasks="logiqa", 62 | batch_size=1 # Valid case 63 | ) 64 | stdout = stdout.getvalue() 65 | assert (tmp_path / "out_dir" / "results.json").is_file() 66 | assert "logiqa" in stdout 67 | assert "Metric" in stdout 68 | assert "Loading checkpoint shards" not in stdout 69 | 70 | 71 | def test_cli(): 72 | args = ["litgpt", "evaluate", "-h"] 73 | output = subprocess.check_output(args) 74 | output = str(output.decode()) 75 | assert "Evaluate a model with the LM Evaluation Harness" in output 76 | -------------------------------------------------------------------------------- /tests/data/test_deita.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | from unittest import mock 3 | 4 | from litgpt.data import Deita, SFTDataset 5 | from litgpt.data.deita import format_dataset 6 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 7 | 8 | 9 | def test_format_dataset(): 10 | data = [ 11 | { 12 | "prompt": "prompt1", 13 | "prompt_id": "1", 14 | "messages": [ 15 | {"content": "question1", "role": "user"}, 16 | {"content": "response1", "role": "assistant"}, 17 | {"content": "question2", "role": "user"}, 18 | {"content": "response2", "role": "assistant"}, 19 | ], 20 | }, 21 | { 22 | "prompt": "prompt2", 23 | "prompt_id": "2", 24 | "messages": [ 25 | {"content": "question3", "role": "user"}, 26 | {"content": "response3", "role": "assistant"}, 27 | {"content": "question4", "role": "user"}, 28 | {"content": "response4", "role": "assistant"}, 29 | ], 30 | }, 31 | ] 32 | 33 | assert format_dataset(data, include_multi_turn_conversations=False) == [ 34 | {"instruction": "question1", "output": "response1", "input": ""}, 35 | {"instruction": "question3", "output": "response3", "input": ""}, 36 | ] 37 | assert format_dataset(data, include_multi_turn_conversations=True) == [ 38 | {"instruction": "question1", "output": "response1", "input": ""}, 39 | {"instruction": "question2", "output": "response2", "input": ""}, 40 | {"instruction": "question3", "output": "response3", "input": ""}, 41 | {"instruction": "question4", "output": "response4", "input": ""}, 42 | ] 43 | 44 | 45 | @mock.patch("litgpt.data.deita.format_dataset") 46 | @mock.patch("datasets.load_dataset") 47 | def test_deita(_, format_dataset_mock, mock_tokenizer, tmp_path): 48 | format_dataset_mock.return_value = [ 49 | {"instruction": "inst1", "output": "out1"}, 50 | {"instruction": "inst2", "output": "out2"}, 51 | {"instruction": "inst3", "output": "out3"}, 52 | ] 53 | 54 | deita = Deita(num_workers=0, download_dir=tmp_path) 55 | assert isinstance(deita.prompt_style, AlpacaPromptStyle) 56 | deita.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 57 | deita.prepare_data() 58 | deita.setup() 59 | 60 | train_dataloader = deita.train_dataloader() 61 | assert isinstance(train_dataloader.dataset, SFTDataset) 62 | assert len(train_dataloader) == 2 63 | 64 | val_dataloader = deita.val_dataloader() 65 | assert isinstance(val_dataloader.dataset, SFTDataset) 66 | assert len(val_dataloader) == 2 67 | 68 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 69 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 70 | 71 | # has attributes from super class `LightningDataModule` 72 | assert deita.prepare_data_per_node 73 | -------------------------------------------------------------------------------- /litgpt/data/lit_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import os 3 | from dataclasses import dataclass, field 4 | from pathlib import Path 5 | from typing import Optional, Tuple, Union 6 | 7 | from torch.utils.data import DataLoader 8 | 9 | from litgpt.tokenizer import Tokenizer 10 | from litgpt.data import DataModule 11 | 12 | 13 | @dataclass 14 | class LitData(DataModule): 15 | """Loads data using LitData's StreamingDataset given a path to a folder of preprocessed data (chunks).""" 16 | 17 | data_path: Union[str, Path] = Path("data/") 18 | """The path to the data directory containing the preprocessed chunks for the streaming dataset 19 | The path can also be a remote path (e.g., s3://). 
See also ``split_names`` if this path contains subfolders 20 | for training- and validation splits.""" 21 | split_names: Optional[Tuple[str, str]] = None 22 | """Optional tuple for names of subfolders for training and validation under ``data_path``. If not provided, 23 | all data under data_path will be used for training, and the validation dataloader will be identical to the 24 | train dataloader.""" 25 | seed: int = 42 26 | """The random seed for shuffling the dataset.""" 27 | num_workers: int = 8 28 | """How many DataLoader processes to use for loading.""" 29 | 30 | batch_size: int = field(init=False, repr=False, default=1) 31 | seq_length: int = field(init=False, repr=False, default=2048) 32 | 33 | def __post_init__(self) -> None: 34 | super().__init__() 35 | if self.split_names is not None and len(self.split_names) != 2: 36 | raise ValueError("If provided `split_names` must be a tuple of two strings, for example: ('train', 'val').") 37 | 38 | def connect( 39 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 40 | ) -> None: 41 | self.batch_size = batch_size 42 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 43 | 44 | def train_dataloader(self) -> DataLoader: 45 | input_dir = os.path.join(self.data_path, self.split_names[0]) if self.split_names else str(self.data_path) 46 | return self._dataloader(input_dir=input_dir, train=True) 47 | 48 | def val_dataloader(self) -> DataLoader: 49 | input_dir = os.path.join(self.data_path, self.split_names[1]) if self.split_names else str(self.data_path) 50 | return self._dataloader(input_dir=input_dir, train=False) 51 | 52 | def _dataloader(self, input_dir: str, train: bool): 53 | from litdata.streaming import StreamingDataset, StreamingDataLoader, TokensLoader 54 | 55 | dataset = StreamingDataset( 56 | input_dir=input_dir, 57 | item_loader=TokensLoader(block_size=self.seq_length), 58 | shuffle=train, 59 | seed=self.seed, 60 | ) 61 | dataloader = StreamingDataLoader( 62 | dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 63 | ) 64 | return dataloader 65 | -------------------------------------------------------------------------------- /tests/data/test_base.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import pytest 4 | import torch 5 | 6 | from litgpt.data import SFTDataset, get_sft_collate_fn 7 | from litgpt.prompts import PromptStyle 8 | 9 | 10 | @pytest.mark.parametrize("mask_prompt", [True, False]) 11 | @pytest.mark.parametrize("ignore_index", [-1, -100]) 12 | @pytest.mark.parametrize("max_seq_length", [1000, 5, -1]) 13 | def test_sft_dataset(max_seq_length, ignore_index, mask_prompt, mock_tokenizer): 14 | class Style(PromptStyle): 15 | def apply(self, prompt, **kwargs): 16 | return f"In: {prompt} Out:" 17 | 18 | i = ignore_index 19 | data = [{"instruction": "Foo", "output": "Bar"}, {"instruction": "Boo", "output": "Ahh"}] 20 | 21 | dataset = SFTDataset( 22 | data=data, 23 | tokenizer=mock_tokenizer, 24 | prompt_style=Style(), 25 | mask_prompt=mask_prompt, 26 | ignore_index=ignore_index, 27 | max_seq_length=max_seq_length, 28 | ) 29 | assert len(dataset) == len(data) 30 | 31 | expected_input_ids = torch.tensor([73, 110, 58, 32, 70, 111, 111, 32, 79, 117, 116, 58, 66, 97, 114, 1]) 32 | # If prompt is not masked, labels == input_ids 33 | expected_labels = ( 34 | torch.tensor([i, i, i, i, i, i, i, i, i, i, i, i, 66, 97, 114, 1]) if mask_prompt else expected_input_ids 35 | ) 36 | 37 | if max_seq_length == -1: 38 | assert torch.equal(dataset[0]["input_ids"], expected_input_ids) 39 | assert torch.equal(dataset[0]["labels"], expected_labels) 40 | else: 41 | assert torch.equal(dataset[0]["input_ids"], expected_input_ids[:max_seq_length]) 42 | assert torch.equal(dataset[0]["labels"], expected_labels[:max_seq_length]) 43 | 44 | 45 | @pytest.mark.parametrize("ignore_index", [-1, -100]) 46 | @pytest.mark.parametrize("pad_id", [0, 100]) 47 | def test_sft_collate_fn_padding(pad_id, ignore_index): 48 | collate = get_sft_collate_fn(pad_id=pad_id, ignore_index=ignore_index) 49 | samples = [ 50 | {"input_ids": torch.tensor([1, 2, 3]), "labels": torch.tensor([10, 20, 30])}, 51 | {"input_ids": torch.tensor([4, 5, 6, 7, 8]), "labels": torch.tensor([40, 50, 60, 70, 80])}, 52 | ] 53 | expected = { 54 | "input_ids": torch.tensor([[1, 2, 3, pad_id, pad_id], [4, 5, 6, 7, 8]]), 55 | "labels": torch.tensor([[10, 20, 30, ignore_index, ignore_index], [40, 50, 60, 70, 80]]), 56 | } 57 | batch = collate(samples) 58 | assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) 59 | 60 | 61 | def test_sft_collate_fn_truncation(): 62 | collate = get_sft_collate_fn(max_seq_length=2) 63 | samples = [ 64 | {"input_ids": torch.tensor([1, 2, 3]), "labels": torch.tensor([10, 20, 30])}, 65 | {"input_ids": torch.tensor([4, 5, 6, 7, 8]), "labels": torch.tensor([40, 50, 60, 70, 80])}, 66 | ] 67 | expected = {"input_ids": torch.tensor([[1, 2], [4, 5]]), "labels": torch.tensor([[10, 20], [40, 50]])} 68 | batch = collate(samples) 69 | assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) 70 | -------------------------------------------------------------------------------- /tutorials/finetune.md: -------------------------------------------------------------------------------- 1 | # Finetuning 2 | 3 | We provide simple finetuning commands (`litgpt finetune_*`) that instruction-finetune a pretrained model on datasets such as [Alpaca](https://github.com/tatsu-lab/stanford_alpaca), [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm), and others. For more information on the supported instruction datasets and how to prepare your own custom datasets, please see the [tutorials/prepare_dataset](prepare_dataset.md) tutorial.
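For example, a typical run combines one of the `litgpt finetune_*` commands with a model name and a dataset flag. The snippet below is only an illustration; the model and the Alpaca dataset are the same ones used in the examples later in these tutorials, and you can swap in your own:

```bash
litgpt finetune_lora stabilityai/stablelm-base-alpha-3b \
  --data Alpaca \
  --out_dir out/lora/my-model-finetuned
```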
4 | 5 | LitGPT currently supports the following finetuning methods: 6 | 7 | ```bash 8 | litgpt finetune_full 9 | litgpt finetune_lora 10 | litgpt finetune_adapter 11 | litgpt finetune_adapter_v2 12 | ``` 13 | 14 |   15 | > [!TIP] 16 | > To install all required dependencies before finetuning, first run `pip install "litgpt[all]"`. 17 |   18 | 19 | 20 | The following section provides more details about these methods, including links for additional resources. 21 | 22 | 23 |   24 | ## LitGPT finetuning commands 25 | 26 | The sections below provide additional information on each of the available finetuning commands, along with links to further resources. 27 | 28 |   29 | ### Full finetuning 30 | 31 | ```bash 32 | litgpt finetune_full 33 | ``` 34 | 35 | This method trains all model weight parameters and is the most memory-intensive finetuning technique in LitGPT. 36 | 37 | **More information and resources:** 38 | 39 | - the LitGPT [tutorials/finetune_full](finetune_full.md) tutorial 40 | 41 | 42 |   43 | ### LoRA and QLoRA finetuning 44 | 45 | ```bash 46 | litgpt finetune_lora stabilityai/stablelm-base-alpha-3b 47 | ``` 48 | 49 | LoRA and QLoRA are parameter-efficient finetuning techniques that only require updating a small number of parameters, which makes them a more memory-efficient alternative to full finetuning. 50 | 51 | **More information and resources:** 52 | 53 | - the LitGPT [tutorials/finetune_lora](finetune_lora.md) tutorial 54 | - the LoRA paper by [Hu et al. 2021](https://arxiv.org/abs/2106.09685) 55 | - the conceptual tutorial [Parameter-Efficient LLM Finetuning With Low-Rank Adaptation (LoRA)](https://lightning.ai/pages/community/tutorial/lora-llm/) 56 | 57 | 58 |   59 | ### Adapter finetuning 60 | 61 | ```bash 62 | litgpt finetune_adapter stabilityai/stablelm-base-alpha-3b 63 | ``` 64 | 65 | or 66 | 67 | ```bash 68 | litgpt finetune_adapter_v2 stabilityai/stablelm-base-alpha-3b 69 | ``` 70 | 71 | Similar to LoRA, adapter finetuning is a parameter-efficient finetuning technique that only requires training a small subset of weight parameters, making this finetuning method more memory-efficient than full-parameter finetuning. 72 | 73 | **More information and resources:** 74 | 75 | - the LitGPT [tutorials/finetune_adapter](finetune_adapter.md) tutorial 76 | - the Llama-Adapter ([Zhang et al. 2023](https://arxiv.org/abs/2303.16199)) and Llama-Adapter v2 ([Gao et al. 2023](https://arxiv.org/abs/2304.15010)) papers that originally introduced these methods 77 | - the conceptual tutorial [Understanding Parameter-Efficient Finetuning of Large Language Models: From Prefix Tuning to LLaMA-Adapters](https://lightning.ai/pages/community/article/understanding-llama-adapters/) 78 | -------------------------------------------------------------------------------- /tests/test_full.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
2 | 3 | import os 4 | from contextlib import redirect_stdout 5 | from io import StringIO 6 | from unittest import mock 7 | from unittest.mock import Mock 8 | 9 | import torch 10 | import yaml 11 | 12 | import litgpt.finetune.full as module 13 | from litgpt.args import EvalArgs, TrainArgs 14 | from litgpt.data import Alpaca 15 | 16 | 17 | @mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) 18 | def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path): 19 | model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 20 | (fake_checkpoint_dir / "model_config.yaml").write_text(yaml.dump(model_config)) 21 | monkeypatch.setattr(module, "load_checkpoint", Mock()) 22 | 23 | tokenizer_mock = Mock() 24 | tokenizer_mock.return_value = tokenizer_mock 25 | tokenizer_mock.encode = lambda *_, **__: torch.tensor([3, 2, 1]) 26 | monkeypatch.setattr(module, "Tokenizer", tokenizer_mock) 27 | 28 | out_dir = tmp_path / "out" 29 | setup_args = (fake_checkpoint_dir, ) 30 | setup_kwargs = dict( 31 | data=Alpaca(download_dir=alpaca_path.parent, file_name=alpaca_path.name, val_split_fraction=0.5, num_workers=0), 32 | out_dir=out_dir, 33 | precision="32-true", 34 | train=TrainArgs(global_batch_size=1, save_interval=2, epochs=1, max_steps=6, micro_batch_size=1), 35 | eval=EvalArgs(interval=2, max_iters=2, max_new_tokens=1), 36 | ) 37 | stdout = StringIO() 38 | with redirect_stdout(stdout), mock.patch("sys.argv", ["full.py", str(fake_checkpoint_dir)]): 39 | module.setup(*setup_args, **setup_kwargs) 40 | 41 | out_dir_contents = set(os.listdir(out_dir)) 42 | checkpoint_dirs = {"step-000002", "step-000004", "step-000006", "final"} 43 | assert checkpoint_dirs.issubset(out_dir_contents) 44 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 45 | for checkpoint_dir in checkpoint_dirs: 46 | assert set(os.listdir(out_dir / checkpoint_dir)) == { 47 | "lit_model.pth", 48 | "model_config.yaml", 49 | "tokenizer_config.json", 50 | "tokenizer.json", 51 | "hyperparameters.yaml", 52 | "prompt_style.yaml", 53 | } 54 | assert (out_dir / "logs" / "csv" / "version_0" / "metrics.csv").is_file() 55 | 56 | logs = stdout.getvalue() 57 | assert logs.count("(step)") == 6 58 | assert logs.count("val loss") == 4 # 3 validations + 1 final validation 59 | assert logs.count("Final evaluation") == 1 60 | assert "of trainable parameters: 1,888" in logs 61 | 62 | # Resume training and do 2 steps more 63 | setup_kwargs["train"].max_steps = 8 64 | setup_kwargs["resume"] = True 65 | stdout = StringIO() 66 | with redirect_stdout(stdout), mock.patch("sys.argv", ["full.py", str(fake_checkpoint_dir)]): 67 | module.setup(*setup_args, **setup_kwargs) 68 | logs = stdout.getvalue() 69 | assert f"Resuming training from {out_dir / 'step-000006' / 'lit_model.pth'}" in logs 70 | assert logs.count("(step)") == 2 71 | assert out_dir / "step-000008" in set(out_dir.iterdir()) 72 | -------------------------------------------------------------------------------- /litgpt/data/dolly.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import json 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Union 7 | 8 | import torch 9 | from torch.utils.data import random_split 10 | 11 | from litgpt.prompts import PromptStyle 12 | from litgpt.data import Alpaca, SFTDataset 13 | 14 | _URL: str = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl" 15 | 16 | 17 | @dataclass 18 | class Dolly(Alpaca): 19 | """Dolly data module for supervised finetuning.""" 20 | 21 | mask_prompt: bool = False 22 | """Whether to mask the prompt section from the label (with ``ignore_index``).""" 23 | val_split_fraction: float = 0.1 24 | """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" 25 | prompt_style: Union[str, PromptStyle] = "alpaca" 26 | """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" 27 | ignore_index: int = -100 28 | """The index to use for elements to be ignored in the label.""" 29 | seed: int = 42 30 | """The random seed for creating the train/val splits and shuffling the dataset.""" 31 | num_workers: int = 4 32 | """How many DataLoader processes to use for loading.""" 33 | download_dir: Path = Path("./data/dolly") 34 | """The directory in which the downloaded dataset gets saved.""" 35 | file_url: str = field(repr=False, default=_URL) 36 | """The URL from where to download the dataset.""" 37 | file_name: str = field(repr=False, default="dolly_data_cleaned.json") 38 | """The name of the dataset file to download.""" 39 | 40 | def setup(self, stage: str = "") -> None: 41 | with open(self.download_dir / self.file_name, "r", encoding="utf-8") as file: 42 | data = file.readlines() 43 | data = [json.loads(line) for line in data] 44 | 45 | # Partition the dataset into train and test 46 | train_data, test_data = random_split( 47 | data, 48 | [1.0 - self.val_split_fraction, self.val_split_fraction], 49 | generator=torch.Generator().manual_seed(self.seed), 50 | ) 51 | train_data, test_data = list(train_data), list(test_data) 52 | 53 | self.train_dataset = SFTDataset( 54 | data=train_data, 55 | tokenizer=self.tokenizer, 56 | prompt_style=self.prompt_style, 57 | max_seq_length=self.max_seq_length, 58 | mask_prompt=self.mask_prompt, 59 | ignore_index=self.ignore_index, 60 | transform=_transform, 61 | ) 62 | self.test_dataset = SFTDataset( 63 | data=test_data, 64 | tokenizer=self.tokenizer, 65 | prompt_style=self.prompt_style, 66 | max_seq_length=self.max_seq_length, 67 | mask_prompt=self.mask_prompt, 68 | ignore_index=self.ignore_index, 69 | transform=_transform, 70 | ) 71 | 72 | 73 | def _transform(item: dict) -> dict: 74 | item["input"] = item.pop("context") 75 | item["output"] = item.pop("response") 76 | return item 77 | -------------------------------------------------------------------------------- /tests/data/test_tinystories.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | import torch 5 | from litdata import optimize 6 | from litdata.streaming import StreamingDataset, TokensLoader 7 | from torch.utils._pytree import tree_map 8 | 9 | 10 | def tokenize(data): 11 | for story in data: 12 | yield torch.tensor(story) 13 | 14 | 15 | def fake_chunk(path, data): 16 | optimize(fn=tokenize, inputs=[data] * len(data), output_dir=str(path), num_workers=1, chunk_bytes="200MB") 17 | 18 | 19 | @pytest.mark.parametrize( 20 | ("max_seq_len", "expected"), 21 | [ 22 | (2, [[0, 23, 15], 
[63, 0, 73], [5, 0, 1], [1999, 0, 13]]), 23 | (5, [[0, 23, 15, 63, 0, 73], [5, 0, 1, 1999, 0, 13]]), 24 | (6, [[0, 23, 15, 63, 0, 73, 5]]), 25 | (7, [[0, 23, 15, 63, 0, 73, 5, 0]]), 26 | ], 27 | ) 28 | def test_pretok_dataset(tmp_path, max_seq_len, expected): 29 | fake_data = [0, 23, 15, 63, 0, 73, 5, 0, 1, 1999, 0, 13] 30 | assert len(fake_data) == 12 31 | fake_chunk(tmp_path, [fake_data]) 32 | 33 | dataset = StreamingDataset( 34 | input_dir=str(tmp_path), item_loader=TokensLoader(block_size=max_seq_len + 1), shuffle=False, drop_last=False 35 | ) 36 | actual = tree_map(torch.Tensor.tolist, list(dataset)) 37 | assert actual == expected 38 | 39 | 40 | def test_tokenize(tmp_path, monkeypatch): 41 | from litgpt.data.tinystories import tokenize 42 | 43 | story1, story2 = "foo bar", " fun " 44 | data = [{"story": story1}, {"story": story2}] 45 | shard_path = tmp_path / "data.json" 46 | with open(shard_path, "w", encoding="utf-8") as f: 47 | json.dump(data, f) 48 | 49 | class Tokenizer: 50 | bos_id = 0 51 | 52 | def encode(self, text, bos, eos): 53 | assert bos 54 | assert not eos 55 | return [self.bos_id] + [ord(c) for c in text] 56 | 57 | monkeypatch.setenv("DATA_OPTIMIZER_GLOBAL_RANK", "0") 58 | monkeypatch.setenv("DATA_OPTIMIZER_NUM_WORKERS", "1") 59 | data = tokenize(str(shard_path), Tokenizer()) 60 | assert list(data) == [[0, 102, 111, 111, 32, 98, 97, 114], [0, 102, 117, 110]] 61 | 62 | 63 | def test_tinystories_datamodule(tmp_path): 64 | from litgpt.data.tinystories import TinyStories 65 | 66 | data_dir = tmp_path / "tinystories" 67 | 68 | datamodule = TinyStories(data_dir, seed=42, num_workers=1) 69 | datamodule.connect(max_seq_length=2) 70 | 71 | # simulate `datamodule.prepare_data` 72 | train_data_dir = data_dir / "train" 73 | train_data_dir.mkdir(parents=True) 74 | fake_chunk(train_data_dir, [[12], [0, 23, 15, 63, 0], [73, 5, 0, 1, 1999, 0, 13]]) 75 | 76 | datamodule.setup() 77 | 78 | tr_dataloader = datamodule.train_dataloader() 79 | torch.manual_seed(0) 80 | actual = tree_map(torch.Tensor.tolist, list(tr_dataloader)) 81 | # there is 1 sample per index in the data (13) 82 | assert actual == [ 83 | [[1999, 0, 13]], 84 | [[0, 13, 12]], 85 | [[1, 1999, 0]], 86 | [[63, 0, 73]], 87 | [[5, 0, 1]], 88 | [[0, 73, 5]], 89 | [[0, 23, 15]], 90 | [[0, 1, 1999]], 91 | [[15, 63, 0]], 92 | [[73, 5, 0]], 93 | [[12, 0, 23]], 94 | [[23, 15, 63]], 95 | [[13, 12, 0]] 96 | ] 97 | -------------------------------------------------------------------------------- /litgpt/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import warnings 4 | import torch 5 | 6 | from litgpt.chat.base import main as chat_fn 7 | from litgpt.finetune.adapter import setup as finetune_adapter_fn 8 | from litgpt.finetune.adapter_v2 import setup as finetune_adapter_v2_fn 9 | from litgpt.finetune.full import setup as finetune_full_fn 10 | from litgpt.finetune.lora import setup as finetune_lora_fn 11 | from litgpt.generate.adapter import main as generate_adapter_fn 12 | from litgpt.generate.adapter_v2 import main as generate_adapter_v2_fn 13 | from litgpt.generate.base import main as generate_base_fn 14 | from litgpt.generate.full import main as generate_full_fn 15 | from litgpt.generate.sequentially import main as generate_sequentially_fn 16 | from litgpt.generate.tp import main as generate_tp_fn 17 | from litgpt.pretrain import setup as pretrain_fn 18 | from litgpt.scripts.convert_hf_checkpoint import convert_hf_checkpoint as convert_hf_checkpoint_fn 19 | from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint as convert_lit_checkpoint_fn 20 | from litgpt.scripts.convert_pretrained_checkpoint import ( 21 | convert_pretrained_checkpoint as convert_pretrained_checkpoint_fn, 22 | ) 23 | from litgpt.scripts.download import download_from_hub as download_fn 24 | from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn 25 | from litgpt.eval.evaluate import convert_and_evaluate as evaluate_fn 26 | from litgpt.deploy.serve import run_server as serve_fn 27 | from jsonargparse import set_config_read_mode, set_docstring_parse_options, CLI 28 | 29 | 30 | def main() -> None: 31 | parser_data = { 32 | "download": download_fn, 33 | "chat": chat_fn, 34 | "finetune": finetune_lora_fn, 35 | "finetune_lora": finetune_lora_fn, 36 | "finetune_full": finetune_full_fn, 37 | "finetune_adapter": finetune_adapter_fn, 38 | "finetune_adapter_v2": finetune_adapter_v2_fn, 39 | "pretrain": pretrain_fn, 40 | "generate": generate_base_fn, 41 | "generate_full": generate_full_fn, 42 | "generate_adapter": generate_adapter_fn, 43 | "generate_adapter_v2": generate_adapter_v2_fn, 44 | "generate_sequentially": generate_sequentially_fn, 45 | "generate_tp": generate_tp_fn, 46 | "convert_to_litgpt": convert_hf_checkpoint_fn, 47 | "convert_from_litgpt": convert_lit_checkpoint_fn, 48 | "convert_pretrained_checkpoint": convert_pretrained_checkpoint_fn, 49 | "merge_lora": merge_lora_fn, 50 | "evaluate": evaluate_fn, 51 | "serve": serve_fn 52 | } 53 | 54 | set_docstring_parse_options(attribute_docstrings=True) 55 | set_config_read_mode(urls_enabled=True) 56 | 57 | # PyTorch bug that raises a false-positive warning 58 | # More info: https://github.com/Lightning-AI/litgpt/issues/1561 59 | warning_message = ( 60 | r"The epoch parameter in `scheduler.step\(\)` was not necessary and is being deprecated.*" 61 | ) 62 | 63 | warnings.filterwarnings( 64 | action="ignore", 65 | message=warning_message, 66 | category=UserWarning, 67 | module=r'.*torch\.optim\.lr_scheduler.*' 68 | ) 69 | 70 | torch.set_float32_matmul_precision("high") 71 | CLI(parser_data) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /tests/run_standalone_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Batch size for testing: Determines how many standalone test invocations run in parallel 5 | # It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE 6 | 
test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-1}" 7 | 8 | # this environment variable allows special tests to run 9 | export PL_RUN_STANDALONE_TESTS=1 10 | # python arguments 11 | defaults="-m pytest --no-header -v --disable-pytest-warnings --strict-markers --color=yes -s --timeout 120" 12 | echo "Using defaults: ${defaults}" 13 | 14 | # find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster 15 | grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py') 16 | 17 | # file paths, remove duplicates 18 | files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) 19 | 20 | # get the list of parametrizations. we need to call them separately. the last two lines are removed. 21 | # note: if there's a syntax error, this will fail with some garbled output 22 | if [[ "$OSTYPE" == "darwin"* ]]; then 23 | parametrizations=$(python3 -m pytest $files --collect-only --quiet --disable-pytest-warnings "$@" | tail -r | sed -e '1,3d' | tail -r) 24 | else 25 | parametrizations=$(python3 -m pytest $files --collect-only --quiet --disable-pytest-warnings "$@" | head -n -2) 26 | fi 27 | # remove the "tests/" path suffix 28 | path_suffix=$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345 29 | parametrizations=${parametrizations//$path_suffix/} 30 | parametrizations_arr=($parametrizations) 31 | 32 | report='' 33 | 34 | rm -f standalone_test_output.txt # in case it exists, remove it 35 | function show_batched_output { 36 | if [ -f standalone_test_output.txt ]; then # if exists 37 | cat standalone_test_output.txt 38 | # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail 39 | if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qvE 'on_exception|xfailed'; then 40 | echo "Potential error! Stopping." 41 | rm standalone_test_output.txt 42 | exit 1 43 | fi 44 | rm standalone_test_output.txt 45 | fi 46 | } 47 | trap show_batched_output EXIT # show the output on exit 48 | 49 | for i in "${!parametrizations_arr[@]}"; do 50 | parametrization=${parametrizations_arr[$i]} 51 | prefix="$((i+1))/${#parametrizations_arr[@]}" 52 | 53 | echo "$prefix: Running $parametrization" 54 | # execute the test in the background 55 | # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them 56 | # output to std{out,err} because the outputs would be garbled together 57 | python3 ${defaults} "$parametrization" &>> standalone_test_output.txt & 58 | # save the PID in an array 59 | pids[${i}]=$! 60 | # add row to the final report 61 | report+="Ran\t$parametrization\n" 62 | 63 | if ((($i + 1) % $test_batch_size == 0)); then 64 | # wait for running tests 65 | for pid in ${pids[*]}; do wait $pid; done 66 | unset pids # empty the array 67 | show_batched_output 68 | fi 69 | done 70 | # wait for leftover tests 71 | for pid in ${pids[*]}; do wait $pid; done 72 | show_batched_output 73 | 74 | # echo test report 75 | printf '=%.s' {1..80} 76 | printf "\n$report" 77 | printf '=%.s' {1..80} 78 | printf '\n' 79 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. 
(type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-phi-3 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 1 13 | 14 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 15 | data: 16 | class_path: litgpt.data.Alpaca2k 17 | init_args: 18 | mask_prompt: false 19 | prompt_style: alpaca 20 | ignore_index: -100 21 | seed: 42 22 | num_workers: 4 23 | 24 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 25 | train: 26 | 27 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 28 | save_interval: 200 29 | 30 | # Number of iterations between logging calls (type: int, default: 1) 31 | log_interval: 1 32 | 33 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 34 | global_batch_size: 8 35 | 36 | # Number of samples per data-parallel rank (type: int, default: 1) 37 | micro_batch_size: 4 38 | 39 | # Number of iterations with learning rate warmup active (type: int, default: 100) 40 | lr_warmup_steps: 200 41 | 42 | # Number of epochs to train on (type: Optional[int], default: 5) 43 | epochs: 1 44 | 45 | # Total number of tokens to train on (type: Optional[int], default: null) 46 | max_tokens: 47 | 48 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 49 | max_steps: 50 | 51 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 52 | max_seq_length: 512 53 | 54 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 55 | tie_embeddings: 56 | 57 | # (type: Optional[float], default: null) 58 | max_norm: 59 | 60 | # (type: float, default: 6e-05) 61 | min_lr: 6.0e-05 62 | 63 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 64 | eval: 65 | 66 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 67 | interval: 25 68 | 69 | # Number of tokens to generate (type: Optional[int], default: 100) 70 | max_new_tokens: 100 71 | 72 | # Number of iterations (type: int, default: 100) 73 | max_iters: 100 74 | 75 | # Whether to evaluate on the validation set at the beginning of the training 76 | initial_validation: false 77 | 78 | # Whether to evaluate on the validation set at the end the training 79 | final_validation: true 80 | 81 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 82 | logger_name: csv 83 | 84 | # The random seed to use for reproducibility. 
(type: int, default: 1337) 85 | seed: 1337 86 | 87 | # Optimizer-related arguments 88 | optimizer: 89 | 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | 94 | # (type: float, default: 0.001) 95 | lr: 0.0002 96 | 97 | # (type: float, default: 0.01) 98 | weight_decay: 0.1 99 | 100 | # (type: tuple, default: (0.9,0.999)) 101 | betas: 102 | - 0.9 103 | - 0.95 104 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/swiglu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import triton 17 | import triton.language as tl 18 | 19 | 20 | @triton.jit 21 | def _fg_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,): 22 | block_idx = tl.program_id(0) 23 | offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 24 | mask = offsets < n_elements 25 | 26 | e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32) 27 | g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32) 28 | 29 | # f = e * sigmoid(e) 30 | f_row = e_row * tl.sigmoid(e_row) # e_row / (1 + tl.exp(-e_row)) 31 | f_row = f_row.to(g_row.dtype) # Exact copy from HF 32 | # h = f * g 33 | h_row = f_row * g_row 34 | 35 | # Store h 36 | tl.store(h + offsets, h_row, mask = mask) 37 | pass 38 | 39 | 40 | def swiglu_fg_kernel(e, g): 41 | batch, seq_len, hd = e.shape 42 | n_elements = e.numel() 43 | h = torch.empty((batch, seq_len, hd), dtype = e.dtype, device = "cuda") 44 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 45 | _fg_kernel[grid](e, g, h, n_elements, BLOCK_SIZE = 1024,) 46 | return h 47 | pass 48 | 49 | 50 | @triton.jit 51 | def _DWf_DW_dfg_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,): 52 | """ 53 | e = e.float() 54 | se = 1.0 / (1.0 + torch.exp(-e)) 55 | f = (se * e).to(dtype) 56 | h = f * g 57 | df = DW * f 58 | dg = DW * g 59 | de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype) 60 | """ 61 | block_idx = tl.program_id(0) 62 | offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 63 | mask = offsets < n_elements 64 | 65 | DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32) 66 | e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32) 67 | g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32) 68 | 69 | # e = e.float() 70 | # se = 1.0 / (1.0 + torch.exp(-e)) 71 | se_row = tl.sigmoid(e_row) # 1.0 / (1.0 + tl.exp(-e_row)) 72 | # f = (se * e).to(dtype) 73 | f_row = se_row * e_row 74 | f_row = f_row.to(DW_row.dtype) 75 | # h = f * g 76 | h_row = f_row * g_row 77 | # df = DW * f 78 | df_row = DW_row * f_row 79 | # dg = DW * g 80 | dg_row = DW_row * g_row 81 | # de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype) 82 | de_row = dg_row.to(tl.float32) * se_row * (1.0 + e_row * (1.0 - se_row)) 83 | de_row = 
de_row.to(DW_row.dtype) 84 | 85 | # Store derivatives in buffers 86 | tl.store(DW + offsets, h_row, mask = mask) # h = f * g 87 | tl.store(e + offsets, df_row, mask = mask) # df = DW * f 88 | tl.store(g + offsets, de_row, mask = mask) # de 89 | pass 90 | 91 | 92 | def swiglu_DWf_DW_dfg_kernel(DW, e, g): 93 | batch_seq_len, hd = e.shape 94 | n_elements = e.numel() 95 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 96 | _DWf_DW_dfg_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,) 97 | return DW, e, g 98 | pass 99 | -------------------------------------------------------------------------------- /tutorials/finetune_full.md: -------------------------------------------------------------------------------- 1 | # Finetuning the whole model 2 | 3 | If you are interested in parameter-efficient finetuning, check out [finetune_adapter.md](finetune_adapter.md). In contrast to parameter-efficient finetuning, this "full" approach finetunes all model parameters, which is substantially more expensive. It is typically only recommended as a baseline for comparison studies. 4 | 5 | ## Preparation 6 | 7 | The steps here only need to be done once: 8 | 9 | 1. Follow the instructions in the [README](../README.md) to install the dependencies. 10 | 2. Download and convert the weights following our [guide](download_model_weights.md). 11 | 12 | LitGPT provides common datasets for finetuning, such as Alpaca, LIMA, Dolly, and more. 13 | You can optionally [prepare your own dataset](#tune-on-your-dataset). 14 | For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial. 15 | 16 | ## Running the finetuning 17 | 18 | ```bash 19 | litgpt finetune_full tiiuae/falcon-7b \ 20 | --data Alpaca 21 | ``` 22 | 23 | Finetuning the falcon-7b model requires at least 8 GPUs with ~40 GB memory each. 24 | 25 | You can speed up training by passing the `devices` argument to the script to utilize more GPUs if available. 26 | Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently. 27 | 28 | This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name: 29 | 30 | ```bash 31 | litgpt finetune_full tiiuae/falcon-7b \ 32 | --data Alpaca \ 33 | --out_dir out/full/my-model-finetuned 34 | ``` 35 | 36 | If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument. 37 | For instance, to fine-tune on MPS (the GPU on modern Macs), you can run 38 | 39 | ```bash 40 | litgpt finetune_full tiiuae/falcon-7b \ 41 | --data Alpaca \ 42 | --out_dir out/full/my-model-finetuned \ 43 | --precision 32-true 44 | ``` 45 | 46 | Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. 47 | 48 | ## Test the model 49 | 50 | You can test the finetuned model with your own instructions by running: 51 | 52 | ```bash 53 | litgpt generate tiiuae/falcon-7b \ 54 | --prompt "Recommend a movie to watch on the weekend." \ 55 | --finetuned_path out/full/my-model-finetuned/lit_model_finetuned.pth 56 | ``` 57 | 58 | Output: 59 | 60 | ```text 61 | A good movie to watch on the weekend would be The Lion King, since it's a classic family film that everyone can enjoy... 62 | ``` 63 | 64 | If your GPU supports `bfloat16`, the script will automatically use it.
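The examples above pass every argument on the command line. Alternatively, you can start from one of the YAML files under `config_hub/finetune` (reproduced later in this repository) and only override what you need. Below is a minimal sketch; it assumes the `--config` option exposed by LitGPT's jsonargparse-based CLI and a config file present in your checkout, and the override values are illustrative only:

```bash
# Load all finetuning arguments from a config file, then override selected values
litgpt finetune_full --config config_hub/finetune/tiny-llama/full.yaml \
  --train.max_steps 100 \
  --out_dir out/full/tiny-llama-debug
```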
65 | 66 | ## Tune on your dataset 67 | 68 | You can easily train on your own instruction dataset saved in JSON format. 69 | 70 | 1. Create a JSON file in which each row holds one instruction-response pair. 71 | A row has an entry for 'instruction', 'input', and 'output', where 'input' is optional and can be 72 | the empty string if the instruction doesn't require a context. Below is an example json file: 73 | 74 | ```text 75 | [ 76 | { 77 | "instruction": "Arrange the given numbers in ascending order.", 78 | "input": "2, 4, 0, 8, 3", 79 | "output": "0, 2, 3, 4, 8" 80 | }, 81 | ... 82 | ] 83 | ``` 84 | 85 | 2. Run `litgpt finetune` by passing in the location of your data (and optionally other parameters): 86 | 87 | ```bash 88 | litgpt finetune tiiuae/falcon-7b \ 89 | --data JSON \ 90 | --data.json_path data/mydata.json \ 91 | --out_dir data/mydata-finetuned 92 | ``` 93 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-2/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/microsoft/phi-2 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-phi-2 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 2 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 18 | data: 19 | class_path: litgpt.data.Alpaca2k 20 | init_args: 21 | mask_prompt: false 22 | prompt_style: alpaca 23 | ignore_index: -100 24 | seed: 42 25 | num_workers: 4 26 | 27 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 28 | train: 29 | 30 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 31 | save_interval: 200 32 | 33 | # Number of iterations between logging calls (type: int, default: 1) 34 | log_interval: 1 35 | 36 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 37 | global_batch_size: 8 38 | 39 | # Number of samples per data-parallel rank (type: int, default: 1) 40 | micro_batch_size: 4 41 | 42 | # Number of iterations with learning rate warmup active (type: int, default: 100) 43 | lr_warmup_steps: 200 44 | 45 | # Number of epochs to train on (type: Optional[int], default: 5) 46 | epochs: 1 47 | 48 | # Total number of tokens to train on (type: Optional[int], default: null) 49 | max_tokens: 50 | 51 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 52 | max_steps: 100 53 | 54 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 55 | max_seq_length: 512 56 | 57 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 58 | tie_embeddings: 59 | 60 | # (type: Optional[float], default: null) 61 | max_norm: 62 | 63 | # (type: float, default: 6e-05) 64 | min_lr: 6.0e-05 65 | 66 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 67 | eval: 68 | 69 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 70 | interval: 25 71 | 72 | # Number of tokens to generate (type: Optional[int], default: 100) 73 | max_new_tokens: 100 74 | 75 | # Number of iterations (type: int, default: 100) 76 | max_iters: 100 77 | 78 | # Whether to evaluate on the validation set at the beginning of the training 79 | initial_validation: false 80 | 81 | # Whether to evaluate on the validation set at the end the training 82 | final_validation: true 83 | 84 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 85 | logger_name: csv 86 | 87 | # The random seed to use for reproducibility. (type: int, default: 1337) 88 | seed: 1337 89 | 90 | # Optimizer-related arguments 91 | optimizer: 92 | 93 | class_path: torch.optim.AdamW 94 | 95 | init_args: 96 | 97 | # (type: float, default: 0.001) 98 | lr: 0.0002 99 | 100 | # (type: float, default: 0.01) 101 | weight_decay: 0.1 102 | 103 | # (type: tuple, default: (0.9,0.999)) 104 | betas: 105 | - 0.9 106 | - 0.95 107 | -------------------------------------------------------------------------------- /config_hub/finetune/gemma-2b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/google/gemma-2b 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/full-gemma-2b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 12 | devices: 4 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 18 | data: 19 | class_path: litgpt.data.Alpaca2k 20 | init_args: 21 | mask_prompt: false 22 | val_split_fraction: 0.03847 23 | prompt_style: alpaca 24 | ignore_index: -100 25 | seed: 42 26 | num_workers: 4 27 | 28 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 29 | train: 30 | 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 800 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 38 | global_batch_size: 16 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 1 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 100) 44 | lr_warmup_steps: 100 45 | 46 | # Number of epochs to train on (type: Optional[int], default: 5) 47 | epochs: 1 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: null) 50 | max_tokens: 51 | 52 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 53 | max_steps: 50 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 512 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: null) 62 | max_norm: 63 | 64 | # (type: float, default: 6e-05) 65 | min_lr: 6.0e-05 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | 70 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 71 | interval: 25 72 | 73 | # Number of tokens to generate (type: Optional[int], default: 100) 74 | max_new_tokens: 100 75 | 76 | # Number of iterations (type: int, default: 100) 77 | max_iters: 100 78 | 79 | # Whether to evaluate on the validation set at the beginning of the training 80 | initial_validation: false 81 | 82 | # Whether to evaluate on the validation set at the end the training 83 | final_validation: true 84 | 85 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 86 | logger_name: csv 87 | 88 | # The random seed to use for reproducibility. (type: int, default: 1337) 89 | seed: 1337 90 | 91 | # Optimizer-related arguments 92 | optimizer: 93 | 94 | class_path: torch.optim.AdamW 95 | 96 | init_args: 97 | 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.0 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/stablelm-base-alpha-3b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/stabilityai/stablelm-base-alpha-3b 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/full-stablelm-base-alpha-3b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 12 | devices: 2 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 18 | data: 19 | class_path: litgpt.data.Alpaca2k 20 | init_args: 21 | mask_prompt: false 22 | val_split_fraction: 0.03847 23 | prompt_style: alpaca 24 | ignore_index: -100 25 | seed: 42 26 | num_workers: 4 27 | 28 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 29 | train: 30 | 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 800 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 38 | global_batch_size: 8 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 1 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 100) 44 | lr_warmup_steps: 1000 45 | 46 | # Number of epochs to train on (type: Optional[int], default: 5) 47 | epochs: 1 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: null) 50 | max_tokens: 51 | 52 | # Limits the number of optimizer steps to run. 
(type: Optional[int], default: null) 53 | max_steps: 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 512 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: null) 62 | max_norm: 63 | 64 | # (type: float, default: 6e-05) 65 | min_lr: 6.0e-05 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | 70 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 71 | interval: 25 72 | 73 | # Number of tokens to generate (type: Optional[int], default: 100) 74 | max_new_tokens: 100 75 | 76 | # Number of iterations (type: int, default: 100) 77 | max_iters: 100 78 | 79 | # Whether to evaluate on the validation set at the beginning of the training 80 | initial_validation: false 81 | 82 | # Whether to evaluate on the validation set at the end the training 83 | final_validation: true 84 | 85 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 86 | logger_name: csv 87 | 88 | # The random seed to use for reproducibility. (type: int, default: 1337) 89 | seed: 1337 90 | 91 | # Optimizer-related arguments 92 | optimizer: 93 | 94 | class_path: torch.optim.AdamW 95 | 96 | init_args: 97 | 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/tiny-llama/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/full-tiny-llama-1.1b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 12 | devices: 1 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 18 | data: 19 | class_path: litgpt.data.Alpaca2k 20 | init_args: 21 | mask_prompt: false 22 | val_split_fraction: 0.03847 23 | prompt_style: alpaca 24 | ignore_index: -100 25 | seed: 42 26 | num_workers: 4 27 | 28 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 29 | train: 30 | 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 800 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 38 | global_batch_size: 32 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 4 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 100) 44 | lr_warmup_steps: 1000 45 | 46 | # Number of epochs to train on (type: Optional[int], default: 5) 47 | epochs: 1 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: null) 50 | max_tokens: 51 | 52 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 53 | max_steps: 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 512 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: null) 62 | max_norm: 63 | 64 | # (type: float, default: 6e-05) 65 | min_lr: 6.0e-05 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | 70 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 71 | interval: 25 72 | 73 | # Number of tokens to generate (type: Optional[int], default: 100) 74 | max_new_tokens: 100 75 | 76 | # Number of iterations (type: int, default: 100) 77 | max_iters: 100 78 | 79 | # Whether to evaluate on the validation set at the beginning of the training 80 | initial_validation: false 81 | 82 | # Whether to evaluate on the validation set at the end the training 83 | final_validation: true 84 | 85 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 86 | logger_name: csv 87 | 88 | # The random seed to use for reproducibility. (type: int, default: 1337) 89 | seed: 1337 90 | 91 | # Optimizer-related arguments 92 | optimizer: 93 | 94 | class_path: torch.optim.AdamW 95 | 96 | init_args: 97 | 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.0 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /litgpt/data/longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
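# Editor's note: a minimal, hypothetical usage sketch for the LongForm data module defined
# below. It assumes LongForm is re-exported from ``litgpt.data`` (as the other data modules
# are) and that a tokenizer checkpoint is available locally; the checkpoint path, batch size,
# and sequence length are illustrative assumptions, not values taken from this file.
#
#   from pathlib import Path
#   from litgpt.data import LongForm
#   from litgpt.tokenizer import Tokenizer
#
#   data = LongForm(download_dir=Path("./data/longform"))
#   data.connect(tokenizer=Tokenizer(Path("checkpoints/EleutherAI/pythia-160m")),
#                batch_size=4, max_seq_length=512)
#   data.prepare_data()                     # downloads train.json / val.json if missing
#   train_loader = data.train_dataloader()  # shuffled DataLoader using the SFT collate fn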
2 | 3 | import json 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | 8 | import torch 9 | from torch.utils.data import DataLoader 10 | 11 | from litgpt.prompts import PromptStyle 12 | from litgpt.data import DataModule, SFTDataset, get_sft_collate_fn 13 | from litgpt.data.alpaca import download_if_missing 14 | from litgpt.tokenizer import Tokenizer 15 | 16 | _URL = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset" 17 | 18 | 19 | @dataclass 20 | class LongForm(DataModule): 21 | """LongForm data module for supervised finetuning.""" 22 | 23 | mask_prompt: bool = False 24 | """Whether to mask the prompt section from the label (with ``ignore_index``).""" 25 | prompt_style: Union[str, PromptStyle] = "longform" 26 | """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" 27 | ignore_index: int = -100 28 | """The index to use for elements to be ignored in the label.""" 29 | seed: int = 42 30 | """The random seed for shuffling the dataset.""" 31 | num_workers: int = 4 32 | """How many DataLoader processes to use for loading.""" 33 | download_dir: Path = Path("./data/longform") 34 | """The directory in which the downloaded dataset gets saved.""" 35 | 36 | tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False) 37 | batch_size: int = field(default=1, init=False, repr=False) 38 | max_seq_length: int = field(default=-1, init=False, repr=False) 39 | train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) 40 | test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) 41 | 42 | def __post_init__(self) -> None: 43 | super().__init__() 44 | if isinstance(self.prompt_style, str): 45 | self.prompt_style = PromptStyle.from_name(self.prompt_style) 46 | 47 | def connect( 48 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 49 | ) -> None: 50 | self.tokenizer = tokenizer 51 | self.batch_size = batch_size 52 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 53 | 54 | def prepare_data(self) -> None: 55 | self.download_dir.mkdir(parents=True, exist_ok=True) 56 | download_if_missing(self.download_dir / "train.json", f"{_URL}/train.json") 57 | download_if_missing(self.download_dir / "val.json", f"{_URL}/val.json") 58 | 59 | def train_dataloader(self): 60 | return self._dataloader("train") 61 | 62 | def val_dataloader(self): 63 | return self._dataloader("val") 64 | 65 | def _dataloader(self, split: str) -> DataLoader: 66 | with open(self.download_dir / f"{split}.json", "r", encoding="utf-8") as file: 67 | data = json.load(file) 68 | 69 | dataset = SFTDataset( 70 | data=data, 71 | tokenizer=self.tokenizer, 72 | prompt_style=self.prompt_style, 73 | max_seq_length=self.max_seq_length, 74 | mask_prompt=self.mask_prompt, 75 | ignore_index=self.ignore_index, 76 | transform=_transform, 77 | ) 78 | return DataLoader( 79 | dataset=dataset, 80 | batch_size=self.batch_size, 81 | shuffle=(split == "train"), 82 | generator=torch.Generator().manual_seed(self.seed), 83 | num_workers=self.num_workers, 84 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index), 85 | ) 86 | 87 | 88 | def _transform(item: dict) -> dict: 89 | item["instruction"] = item.pop("input") 90 | return item 91 | -------------------------------------------------------------------------------- /litgpt/args.py: 
-------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import math 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | import warnings 6 | 7 | 8 | @dataclass 9 | class TrainArgs: 10 | """Training-related arguments""" 11 | 12 | save_interval: Optional[int] = 1000 13 | """Number of optimizer steps between saving checkpoints""" 14 | log_interval: int = 1 15 | """Number of iterations between logging calls""" 16 | global_batch_size: int = 64 17 | """Number of samples between optimizer steps across data-parallel ranks""" 18 | micro_batch_size: int = 4 19 | """Number of samples per data-parallel rank""" 20 | lr_warmup_steps: Optional[int] = 100 21 | """Number of iterations with learning rate warmup active""" 22 | lr_warmup_fraction: Optional[float] = None 23 | """The fraction of an epoch to use for learning rate warmup""" 24 | epochs: Optional[int] = None 25 | """Number of epochs to train on""" 26 | # TODO: `pretrain` is the only script using `max_tokens` explicitly. replace it with epoch_size*epochs? 27 | max_tokens: Optional[int] = None 28 | """Total number of tokens to train on""" 29 | max_steps: Optional[int] = None 30 | """Limits the number of optimizer steps to run""" 31 | max_seq_length: Optional[int] = None 32 | """Limits the length of samples""" 33 | tie_embeddings: Optional[bool] = None 34 | """Whether to tie the embedding weights with the language modeling head weights""" 35 | 36 | # Optimization args 37 | max_norm: Optional[float] = None 38 | min_lr: float = 6e-5 39 | 40 | def __post_init__(self) -> None: 41 | if self.lr_warmup_fraction and self.lr_warmup_steps: 42 | raise ValueError( 43 | "Can't provide both `--train.lr_warmup_fraction` and `--train.lr_warmup_steps`. Choose one." 44 | ) 45 | if self.lr_warmup_fraction and not (0 <= self.lr_warmup_fraction <= 1): 46 | raise ValueError("`--train.lr_warmup_fraction` must be between 0 and 1.") 47 | 48 | if self.lr_warmup_steps and self.max_steps and (self.lr_warmup_steps >= self.max_steps): 49 | warnings.warn( 50 | "`--train.lr_warmup_steps` should be less than `--train.max_steps`." 
51 | f" Got {self.lr_warmup_steps} lr_warmup_steps and {self.max_steps} max_steps.", UserWarning) 52 | 53 | def gradient_accumulation_iters(self, devices: int) -> int: 54 | """Number of iterations between gradient synchronizations""" 55 | gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size 56 | assert gradient_accumulation_iters > 0 57 | return gradient_accumulation_iters 58 | 59 | def batch_size(self, devices: int) -> int: 60 | """Number of samples between optimizer steps per data-parallel rank""" 61 | batch_size = self.global_batch_size // devices 62 | assert batch_size > 0 63 | return batch_size 64 | 65 | def warmup_iters(self, devices: int, max_iters: int, train_dataloader) -> int: 66 | """Number of iterations to warm up the learning rate.""" 67 | if self.lr_warmup_fraction: 68 | return min(max_iters, math.ceil(self.lr_warmup_fraction * len(train_dataloader))) 69 | if self.lr_warmup_steps: 70 | return min(max_iters, self.lr_warmup_steps * self.gradient_accumulation_iters(devices)) 71 | return 0 72 | 73 | 74 | @dataclass 75 | class EvalArgs: 76 | """Evaluation-related arguments""" 77 | 78 | interval: int = 600 79 | """Number of optimizer steps between evaluation calls""" 80 | max_new_tokens: Optional[int] = None 81 | """Number of tokens to generate""" 82 | max_iters: int = 100 83 | """Number of iterations""" 84 | initial_validation: bool = False 85 | """Whether to evaluate on the validation set at the beginning of the training""" 86 | final_validation: bool = True 87 | """Whether to evaluate on the validation set at the end of the training""" 88 | -------------------------------------------------------------------------------- /tests/test_batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytest 3 | import warnings 4 | from pathlib import Path 5 | import litgpt 6 | from litgpt.generate.base import next_token, batched_next_token 7 | from litgpt.api import LLM, GPT 8 | from litgpt.scripts.download import download_from_hub 9 | from tests.conftest import RunIf 10 | 11 | warnings.filterwarnings("ignore") 12 | 13 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="Test requires a GPU.") 14 | def test_batched_equivalence(tmp_path): 15 | 16 | model_name = "microsoft/phi-2" 17 | download_from_hub(repo_id=model_name, tokenizer_only=True, checkpoint_dir=tmp_path) 18 | 19 | device = "cuda:0" 20 | batch_size = 3 21 | sample_kwargs = {"top_k": 1} 22 | 23 | llm: LLM = LLM.load( 24 | model_name, 25 | tokenizer_dir=Path(tmp_path / model_name), 26 | init="random", 27 | ) 28 | model: GPT = llm.model 29 | model.set_kv_cache(batch_size=1, max_seq_length=50, device=device) 30 | 31 | input_pos_1 = torch.tensor( 32 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=torch.int64, device=device 33 | ) 34 | input_pos_2 = torch.tensor([10], dtype=torch.int64, device=device) 35 | 36 | x = torch.tensor( 37 | [43993, 25, 1867, 466, 32660, 17485, 4483, 30, 198, 26410], 38 | device=device, 39 | dtype=torch.int64, 40 | ) 41 | 42 | batch_x1 = torch.stack([x] * batch_size, dim=0) 43 | 44 | # Single token generation baseline 45 | tok_1 = next_token(model, input_pos_1, x.unsqueeze(0), **sample_kwargs) 46 | tok_2 = next_token(model, input_pos_2, tok_1.unsqueeze(0), **sample_kwargs) 47 | 48 | assert tok_1.ndim == 1 49 | assert tok_2.ndim == 1 50 | assert tok_1.size(0) == 1 51 | assert tok_2.size(0) == 1 52 | 53 | # Switch to batched generation 54 | model.clear_kv_cache() 55 | model.set_kv_cache(batch_size=batch_size, 
max_seq_length=50, device="cuda:0") 56 | 57 | toks_1: torch.Tensor = batched_next_token(model, input_pos_1, batch_x1, sample_kwargs) 58 | toks_2: torch.Tensor = batched_next_token(model, input_pos_2, toks_1, sample_kwargs) 59 | 60 | assert toks_1.ndim == 2 61 | assert toks_2.ndim == 2 62 | assert toks_1.size(0) == batch_size 63 | assert toks_2.size(0) == batch_size 64 | 65 | # Assert that single and batched next token generation are equivalent 66 | assert all(t == tok_1 for t in toks_1), f"{tok_1} != {toks_1}" 67 | assert all(t == tok_2 for t in toks_2), f"{tok_2} != {toks_2}" 68 | 69 | 70 | @RunIf(min_cuda_gpus=1) 71 | def test_simple_batch(): 72 | old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 73 | torch.backends.cuda.matmul.allow_tf32 = False 74 | config = litgpt.Config.from_name( 75 | "Llama-3.1-8B", padded_vocab_size=10000, n_layer=2, n_head=8, n_embd=256 76 | ) 77 | with torch.device("cuda"): 78 | m = litgpt.GPT(config).requires_grad_(False).eval() 79 | x0 = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 7]]) 80 | input_pos0 = torch.tensor([[0, 1, 2, 3], [0, 1, 2, 2]]) 81 | x1 = torch.tensor([[1], [2]]) 82 | input_pos1 = torch.tensor([[4], [3]]) 83 | 84 | with torch.device("cuda"): 85 | m.set_kv_cache(2) 86 | outs0 = m(x0, input_pos0) 87 | outs1 = m(x1, input_pos1) 88 | 89 | with torch.device("cuda"): 90 | m.set_kv_cache(1) 91 | 92 | outs0_ref0 = m(x0[:1], input_pos0[0]) 93 | outs1_ref0 = m(x1[:1], input_pos1[0]) 94 | 95 | with torch.device("cuda"): 96 | m.set_kv_cache(1) 97 | 98 | outs0_ref1 = m(x0[1:], input_pos0[1]) 99 | outs1_ref1 = m(x1[1:], input_pos1[1]) 100 | 101 | outs0_ref = torch.cat([outs0_ref0, outs0_ref1]) 102 | outs1_ref = torch.cat([outs1_ref0, outs1_ref1]) 103 | 104 | print(outs0_ref - outs0) 105 | print(outs0.shape) 106 | torch.testing.assert_close(outs0, outs0_ref) 107 | torch.testing.assert_close(outs1, outs1_ref) 108 | torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 109 | -------------------------------------------------------------------------------- /tests/test_thunder_ddp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | import pytest 5 | import torch 6 | from tests.conftest import RunIf 7 | from lightning import Fabric 8 | 9 | # support running without installing as a package 10 | wd = Path(__file__).parent.parent.resolve() 11 | sys.path.append(str(wd)) 12 | 13 | from extensions.thunder.strategies.thunder_ddp import ThunderDDPStrategy 14 | from extensions.thunder.strategies.thunder_fsdp import ThunderFSDPStrategy 15 | 16 | 17 | @RunIf(thunder=True) 18 | def test_thunder_strategy_input_parsing(): 19 | with pytest.raises(ValueError, match="doesn't have an effect with `jit=False"): 20 | ThunderDDPStrategy(jit=False, executors=("python",)) 21 | 22 | 23 | @RunIf(min_cuda_gpus=2, thunder=True, standalone=True) 24 | @pytest.mark.parametrize("choice", ["ddp", "thunder_ddp", "fsdp", "thunder_fsdp"]) 25 | def test_no_backward_sync(choice): 26 | if choice == "thunder_ddp": 27 | strategy = ThunderDDPStrategy() 28 | elif choice == "thunder_fsdp": 29 | strategy = ThunderFSDPStrategy() 30 | else: 31 | strategy = choice 32 | 33 | fabric = Fabric(devices=2, accelerator="cuda", strategy=strategy) 34 | fabric.launch() 35 | 36 | # account for sharding in the case of FSDP 37 | out_features = 1 if "ddp" in choice else fabric.world_size 38 | 39 | model = torch.nn.Linear(1, out_features, bias=False, device=fabric.device) 40 | x = torch.randn(1, 1, device=fabric.device) 41 | model = 
fabric.setup(model) 42 | 43 | # 6 iters, 3 grad accumulation iters 44 | for i, enabled in enumerate((True, True, False, True, True, False), 1): 45 | x = torch.tensor([i * (fabric.local_rank + 1)], device=fabric.device, dtype=torch.float32) 46 | 47 | with fabric.no_backward_sync(model, enabled): 48 | y = model(x) 49 | fabric.backward(y.sum()) 50 | if not enabled: 51 | # Math for the first 3 iters 52 | # 53 | # DistributedDataParallel 54 | # (1*1+2*1+3*1 + 1*2+2*2+3*2) / 2 = 9 55 | # ^^^^^^^^^^^ ^^^^^^^^^^^ ^^^ 56 | # rank0 rank1 allreduce 57 | # 58 | # thunder.distributed.ddp 59 | # ((1*1+2*1) + (1*2+2*2)) / 2 + (3*1 + 3*2) / 2 = 9 60 | # ^^^^^^^ ^^^^^^^ ^^^ ^^^ ^^^ ^^^ 61 | # rank0 rank1 allreduce1 rank0 rank1 allreduce2 62 | assert model.weight.grad.shape.numel() == 1, model.weight.grad.shape 63 | assert model.weight.grad.item() == (9.0 if i == 3 else 22.5) 64 | assert not hasattr(model.weight, "_thunder_fsdp_unsharded_grad") 65 | model.weight.grad = None 66 | elif choice == "thunder_fsdp": 67 | assert model.weight._thunder_fsdp_unsharded_grad.shape == (2, 1) 68 | assert model.weight.grad is None 69 | 70 | 71 | @RunIf(min_cuda_gpus=2, thunder=True, standalone=True) 72 | @pytest.mark.parametrize("jit", (False, True)) 73 | def test_jit_before_setup(jit): 74 | import thunder 75 | 76 | fabric = Fabric(devices=2, accelerator="cuda", strategy=ThunderDDPStrategy(jit=jit)) 77 | fabric.launch() 78 | 79 | x = torch.randn(1, 1, device=fabric.device) 80 | model = torch.nn.Linear(1, 2, bias=False, device=fabric.device) 81 | 82 | tmodel = thunder.jit(model) 83 | fmodel = fabric.setup(tmodel) 84 | fmodel(x) 85 | 86 | assert "all_reduce" in thunder.last_backward_traces(tmodel)[-1].python() 87 | 88 | 89 | @RunIf(min_cuda_gpus=1, thunder=True) 90 | def test_setup_already_traced(): 91 | import thunder 92 | 93 | device = torch.device("cuda") 94 | x = torch.randn(1, 1, device=device) 95 | model = torch.nn.Linear(1, 2, bias=False, device=device) 96 | 97 | strategy = ThunderDDPStrategy() 98 | 99 | tmodel = thunder.jit(model) 100 | tmodel(x) 101 | with pytest.raises(RuntimeError, match="already called"): 102 | strategy.setup_module(tmodel) 103 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-2-7b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-llama2-7b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 4 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 18 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 19 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 20 | # (type: Union[bool, Literal["auto"], Path], default: False) 21 | resume: false 22 | 23 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 
24 | data: 25 | class_path: litgpt.data.Alpaca2k 26 | init_args: 27 | mask_prompt: false 28 | prompt_style: alpaca 29 | ignore_index: -100 30 | seed: 42 31 | num_workers: 4 32 | 33 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 34 | train: 35 | 36 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 37 | save_interval: 200 38 | 39 | # Number of iterations between logging calls (type: int, default: 1) 40 | log_interval: 1 41 | 42 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 43 | global_batch_size: 64 44 | 45 | # Number of samples per data-parallel rank (type: int, default: 1) 46 | micro_batch_size: 4 47 | 48 | # Number of iterations with learning rate warmup active (type: int, default: 100) 49 | lr_warmup_steps: 25 50 | 51 | # Number of epochs to train on (type: Optional[int], default: 5) 52 | epochs: 1 53 | 54 | # Total number of tokens to train on (type: Optional[int], default: null) 55 | max_tokens: 56 | 57 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 58 | max_steps: 59 | 60 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 61 | max_seq_length: 512 62 | 63 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 64 | tie_embeddings: 65 | 66 | # (type: Optional[float], default: null) 67 | max_norm: 68 | 69 | # (type: float, default: 6e-05) 70 | min_lr: 6.0e-05 71 | 72 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 73 | eval: 74 | 75 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 76 | interval: 25 77 | 78 | # Number of tokens to generate (type: Optional[int], default: 100) 79 | max_new_tokens: 100 80 | 81 | # Number of iterations (type: int, default: 100) 82 | max_iters: 100 83 | 84 | # Whether to evaluate on the validation set at the beginning of the training 85 | initial_validation: false 86 | 87 | # Whether to evaluate on the validation set at the end the training 88 | final_validation: true 89 | 90 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 91 | logger_name: csv 92 | 93 | # The random seed to use for reproducibility. (type: int, default: 1337) 94 | seed: 1337 95 | 96 | # Optimizer-related arguments 97 | optimizer: 98 | 99 | class_path: torch.optim.AdamW 100 | 101 | init_args: 102 | 103 | # (type: float, default: 0.001) 104 | lr: 0.0002 105 | 106 | # (type: float, default: 0.01) 107 | weight_decay: 0.0 108 | 109 | # (type: tuple, default: (0.9,0.999)) 110 | betas: 111 | - 0.9 112 | - 0.95 113 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3-8b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3-8B 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-llama-3-8b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". 
(type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 4 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 18 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 19 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 20 | # (type: Union[bool, Literal["auto"], Path], default: False) 21 | resume: false 22 | 23 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 24 | data: 25 | class_path: litgpt.data.Alpaca2k 26 | init_args: 27 | mask_prompt: false 28 | prompt_style: alpaca 29 | ignore_index: -100 30 | seed: 42 31 | num_workers: 4 32 | 33 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 34 | train: 35 | 36 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 37 | save_interval: 200 38 | 39 | # Number of iterations between logging calls (type: int, default: 1) 40 | log_interval: 1 41 | 42 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 43 | global_batch_size: 64 44 | 45 | # Number of samples per data-parallel rank (type: int, default: 1) 46 | micro_batch_size: 4 47 | 48 | # Number of iterations with learning rate warmup active (type: int, default: 100) 49 | lr_warmup_steps: 25 50 | 51 | # Number of epochs to train on (type: Optional[int], default: 5) 52 | epochs: 1 53 | 54 | # Total number of tokens to train on (type: Optional[int], default: null) 55 | max_tokens: 56 | 57 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 58 | max_steps: 59 | 60 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 61 | max_seq_length: 512 62 | 63 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 64 | tie_embeddings: 65 | 66 | # (type: Optional[float], default: null) 67 | max_norm: 68 | 69 | # (type: float, default: 6e-05) 70 | min_lr: 6.0e-05 71 | 72 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 73 | eval: 74 | 75 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 76 | interval: 25 77 | 78 | # Number of tokens to generate (type: Optional[int], default: 100) 79 | max_new_tokens: 100 80 | 81 | # Number of iterations (type: int, default: 100) 82 | max_iters: 100 83 | 84 | # Whether to evaluate on the validation set at the beginning of the training 85 | initial_validation: false 86 | 87 | # Whether to evaluate on the validation set at the end the training 88 | final_validation: true 89 | 90 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 91 | logger_name: csv 92 | 93 | # The random seed to use for reproducibility. 
(type: int, default: 1337) 94 | seed: 1337 95 | 96 | # Optimizer-related arguments 97 | optimizer: 98 | 99 | class_path: torch.optim.AdamW 100 | 101 | init_args: 102 | 103 | # (type: float, default: 0.001) 104 | lr: 0.0002 105 | 106 | # (type: float, default: 0.01) 107 | weight_decay: 0.1 108 | 109 | # (type: tuple, default: (0.9,0.999)) 110 | betas: 111 | - 0.9 112 | - 0.95 113 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3.1-8b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3.1-8B 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-llama-3.1-8b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 4 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 18 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 19 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 20 | # (type: Union[bool, Literal["auto"], Path], default: False) 21 | resume: false 22 | 23 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 24 | data: 25 | class_path: litgpt.data.Alpaca2k 26 | init_args: 27 | mask_prompt: false 28 | prompt_style: alpaca 29 | ignore_index: -100 30 | seed: 42 31 | num_workers: 4 32 | 33 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 34 | train: 35 | 36 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 37 | save_interval: 200 38 | 39 | # Number of iterations between logging calls (type: int, default: 1) 40 | log_interval: 1 41 | 42 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 43 | global_batch_size: 64 44 | 45 | # Number of samples per data-parallel rank (type: int, default: 1) 46 | micro_batch_size: 4 47 | 48 | # Number of iterations with learning rate warmup active (type: int, default: 100) 49 | lr_warmup_steps: 25 50 | 51 | # Number of epochs to train on (type: Optional[int], default: 5) 52 | epochs: 1 53 | 54 | # Total number of tokens to train on (type: Optional[int], default: null) 55 | max_tokens: 56 | 57 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 58 | max_steps: 59 | 60 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 61 | max_seq_length: 512 62 | 63 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 64 | tie_embeddings: 65 | 66 | # (type: Optional[float], default: null) 67 | max_norm: 68 | 69 | # (type: float, default: 6e-05) 70 | min_lr: 6.0e-05 71 | 72 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 73 | eval: 74 | 75 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 76 | interval: 25 77 | 78 | # Number of tokens to generate (type: Optional[int], default: 100) 79 | max_new_tokens: 100 80 | 81 | # Number of iterations (type: int, default: 100) 82 | max_iters: 100 83 | 84 | # Whether to evaluate on the validation set at the beginning of the training 85 | initial_validation: false 86 | 87 | # Whether to evaluate on the validation set at the end the training 88 | final_validation: true 89 | 90 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 91 | logger_name: csv 92 | 93 | # The random seed to use for reproducibility. (type: int, default: 1337) 94 | seed: 1337 95 | 96 | # Optimizer-related arguments 97 | optimizer: 98 | 99 | class_path: torch.optim.AdamW 100 | 101 | init_args: 102 | 103 | # (type: float, default: 0.001) 104 | lr: 0.0002 105 | 106 | # (type: float, default: 0.01) 107 | weight_decay: 0.1 108 | 109 | # (type: tuple, default: (0.9,0.999)) 110 | betas: 111 | - 0.9 112 | - 0.95 113 | -------------------------------------------------------------------------------- /tests/test_merge_lora.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import os 4 | import shutil 5 | from contextlib import redirect_stdout 6 | from io import StringIO 7 | from pathlib import Path 8 | from unittest import mock 9 | 10 | import pytest 11 | import torch 12 | import yaml 13 | 14 | from litgpt.lora import GPT as LoRAGPT 15 | from litgpt.lora import lora_filter 16 | from litgpt.model import GPT 17 | from litgpt.scripts.merge_lora import load_lora_metadata, merge_lora 18 | 19 | 20 | @mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) 21 | @pytest.mark.parametrize( 22 | ("pretrained_dtype", "lora_dtype"), [(None, None), (torch.float16, torch.float32), (torch.float16, torch.bfloat16)] 23 | ) 24 | def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype): 25 | pretrained_checkpoint_dir = tmp_path / "pretrained" 26 | lora_checkpoint_dir = tmp_path / "lora" 27 | shutil.copytree(fake_checkpoint_dir, pretrained_checkpoint_dir) 28 | shutil.copytree(fake_checkpoint_dir, lora_checkpoint_dir) 29 | (lora_checkpoint_dir / "lit_model.pth").unlink() # should not already exist 30 | shutil.rmtree(tmp_path / "checkpoints") 31 | 32 | # Create a fake pretrained checkpoint 33 | config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16) 34 | with open(pretrained_checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: 35 | yaml.dump(config, fp) 36 | base_model = GPT.from_name("pythia-14m", **config).to(dtype=pretrained_dtype) 37 | state_dict = base_model.state_dict() 38 | assert len(state_dict) == 40 39 | torch.save(state_dict, pretrained_checkpoint_dir / "lit_model.pth") 40 | 41 | # Create a fake LoRA checkpoint 42 | lora_kwargs = dict(lora_r=8, lora_alpha=16, lora_dropout=0.05, lora_query=True, lora_value=True) 43 | lora_model = LoRAGPT.from_name("pythia-14m", **config, **lora_kwargs).to(dtype=lora_dtype) 44 | state_dict = {k: v for k, v in lora_model.state_dict().items() if lora_filter(k, v)} 45 | assert len(state_dict) == 6 46 | torch.save(state_dict, lora_checkpoint_dir / "lit_model.pth.lora") 47 | hparams = dict(checkpoint_dir=str(pretrained_checkpoint_dir), **lora_kwargs) 48 | with 
open(lora_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file: 49 | yaml.dump(hparams, file) 50 | shutil.copyfile(pretrained_checkpoint_dir / "model_config.yaml", lora_checkpoint_dir / "model_config.yaml") 51 | 52 | assert set(os.listdir(tmp_path)) == {"lora", "pretrained"} 53 | merge_lora(lora_checkpoint_dir) 54 | assert set(os.listdir(tmp_path)) == {"lora", "pretrained"} 55 | assert set(os.listdir(lora_checkpoint_dir)) == { 56 | "model_config.yaml", 57 | "lit_model.pth", 58 | "lit_model.pth.lora", 59 | "tokenizer.json", 60 | "tokenizer_config.json", 61 | "hyperparameters.yaml", 62 | } 63 | 64 | # Assert that the merged weights can be loaded back into the base model 65 | merged = torch.load(lora_checkpoint_dir / "lit_model.pth") 66 | keys = base_model.load_state_dict(merged, strict=True) 67 | assert not keys.missing_keys 68 | assert not keys.unexpected_keys 69 | 70 | # Attempt to merge again 71 | stdout = StringIO() 72 | with redirect_stdout(stdout): 73 | merge_lora(lora_checkpoint_dir) 74 | assert "LoRA weights have already been merged" in stdout.getvalue() 75 | 76 | 77 | def test_load_lora_metadata(fake_checkpoint_dir): 78 | assert not (fake_checkpoint_dir / "hyperparameters.yaml").is_file() 79 | with pytest.raises(FileNotFoundError, match="missing a `hyperparameters.yaml` file"): 80 | load_lora_metadata(fake_checkpoint_dir) 81 | 82 | hparams = dict(precision="bf16-mixed", checkpoint_dir="checkpoints/meta-llama/Llama-2-7b", lora_r=8, lora_alpha=16) 83 | with open(fake_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file: 84 | yaml.dump(hparams, file) 85 | 86 | lora_args, pretrained_dir, precision = load_lora_metadata(fake_checkpoint_dir) 87 | assert lora_args == dict(lora_r=8, lora_alpha=16) 88 | assert pretrained_dir == Path("checkpoints/meta-llama/Llama-2-7b") 89 | assert precision == "bf16-mixed" 90 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
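# Editor's note: these are plain pytest tests. Assuming a development install of litgpt
# (for example ``pip install -e ".[test]"``; the extra name is an assumption based on the
# project's pyproject.toml), individual tests can be selected with pytest's -k filter:
#
#   pytest tests/test_config.py -k "from_checkpoint" -q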
2 | 3 | import pytest 4 | import yaml 5 | 6 | import litgpt.config as config_module 7 | from litgpt import Config 8 | 9 | 10 | def test_config(): 11 | config = Config() 12 | assert config.name == "" 13 | assert config.block_size == 4096 14 | 15 | config = Config(block_size=2048) 16 | assert config.block_size == 2048 17 | 18 | config = Config.from_name("pythia-14m") 19 | assert config.block_size == 512 20 | 21 | config = Config.from_name("pythia-14m", block_size=4096) 22 | assert config.block_size == 4096 23 | 24 | config = Config(hf_config={"name": "pythia-14m"}) 25 | assert config.name == "pythia-14m" 26 | 27 | 28 | def test_from_hf_name(): 29 | # by short-hand name 30 | config0 = Config.from_name("tiny-llama-1.1b") 31 | # or by huggingface hub repo name 32 | config1 = Config.from_name("TinyLlama-1.1B-intermediate-step-1431k-3T") 33 | assert config0 is not None 34 | assert config1 is not None 35 | assert config0 == config1 36 | 37 | 38 | def test_nonexisting_name(): 39 | with pytest.raises(ValueError, match="'invalid-model-name' is not a supported config name"): 40 | Config.from_name("invalid-model-name") 41 | 42 | 43 | @pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs]) 44 | def test_short_and_hf_names_are_equal_unless_on_purpose(config): 45 | # by short-hand name 46 | config0 = Config.from_name(config["name"]) 47 | # or by huggingface hub repo name 48 | config1 = Config.from_name(config["hf_config"]["name"]) 49 | assert config0.name == config1.name 50 | 51 | 52 | def test_from_hf_name_with_org_string(): 53 | # Test case 1: valid input 54 | config0 = Config.from_name("tiny-llama-1.1b") 55 | config1 = Config.from_name("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T") 56 | assert config0 is not None 57 | assert config1 is not None 58 | assert config0 == config1 59 | 60 | # Test case 2: invalid input - org not found 61 | with pytest.raises(ValueError, match="'UnknownOrg/TinyLlama-1.1B-intermediate-step-1431k-3T' is not a supported config name"): 62 | Config.from_name("UnknownOrg/TinyLlama-1.1B-intermediate-step-1431k-3T") 63 | 64 | # Test case 3: invalid input - name not found 65 | with pytest.raises(ValueError, match="'TinyLlama/TinyLlama-XYZ' is not a supported config name"): 66 | Config.from_name("TinyLlama/TinyLlama-XYZ") 67 | 68 | 69 | def test_from_checkpoint(tmp_path): 70 | # 1. Neither `model_config.yaml` nor matching config exists. 71 | with pytest.raises(FileNotFoundError, match="neither 'model_config.yaml' nor matching config exists"): 72 | Config.from_checkpoint(tmp_path / "non_existing_checkpoint") 73 | 74 | # 2. If `model_config.yaml` doesn't exist, but there is a matching config in `litgpt/config.py`. 75 | config = Config.from_checkpoint(tmp_path / "pythia-14m") 76 | assert config.name == "pythia-14m" 77 | assert config.block_size == 512 78 | assert config.n_layer == 6 79 | 80 | # 3. If only `model_config.yaml` exists. 81 | config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2} 82 | with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file: 83 | yaml.dump(config_data, file) 84 | config = Config.from_checkpoint(tmp_path) 85 | assert config.name == "pythia-14m" 86 | assert config.block_size == 24 87 | assert config.n_layer == 2 88 | 89 | # 4.
Both `model_config.yaml` and a matching config exist, but `model_config.yaml` supersedes the matching config 90 | (tmp_path / "pythia-14m").mkdir() 91 | with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file: 92 | yaml.dump(config_data, file) 93 | config = Config.from_checkpoint(tmp_path / "pythia-14m") 94 | assert config.name == "pythia-14m" 95 | assert config.block_size == 24 96 | assert config.n_layer == 2 97 | 98 | 99 | @pytest.mark.parametrize("head_size", [None, 128]) 100 | def test_head_size(head_size): 101 | config = Config(head_size=head_size) 102 | 103 | assert config.head_size == (head_size or config.n_embd // config.n_head) 104 | -------------------------------------------------------------------------------- /config_hub/pretrain/debug.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 3 | # ``model_config``. (type: Optional[str], default: null) 4 | model_name: pythia-14m 5 | 6 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 7 | # ``model_name``. (type: Optional[Config], default: null) 8 | model_config: 9 | 10 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 11 | # /teamspace/jobs//share. (type: , default: out/pretrain) 12 | out_dir: out/pretrain/debug 13 | 14 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 15 | precision: bf16-mixed 16 | 17 | # Optional path to a checkpoint directory to initialize the model from. 18 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 19 | initial_checkpoint_dir: 20 | 21 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 22 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 23 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 24 | # (type: Union[bool, Literal["auto"], Path], default: False) 25 | resume: false 26 | 27 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 28 | data: TinyStories 29 | 30 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 31 | train: 32 | 33 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 34 | save_interval: 1000 35 | 36 | # Number of iterations between logging calls (type: int, default: 1) 37 | log_interval: 1 38 | 39 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512) 40 | global_batch_size: 125 41 | 42 | # Number of samples per data-parallel rank (type: int, default: 4) 43 | micro_batch_size: 5 44 | 45 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 46 | lr_warmup_steps: 100 47 | 48 | # Number of epochs to train on (type: Optional[int], default: null) 49 | epochs: 50 | 51 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 52 | max_tokens: 100000000 53 | 54 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 55 | max_steps: 56 | 57 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 58 | max_seq_length: 59 | 60 | # Whether to tie the embedding weights with the language modeling head weights.
(type: Optional[bool], default: False) 61 | tie_embeddings: 62 | 63 | # (type: Optional[float], default: 1.0) 64 | max_norm: 1.0 65 | 66 | # (type: float, default: 4e-05) 67 | min_lr: 6e-5 68 | 69 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 70 | eval: 71 | 72 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 73 | interval: 1000 74 | 75 | # Number of tokens to generate (type: Optional[int], default: null) 76 | max_new_tokens: 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: false 86 | 87 | # Optimizer-related arguments 88 | optimizer: 89 | 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | 94 | # (type: float, default: 0.001) 95 | lr: 6e-4 96 | 97 | # (type: float, default: 0.01) 98 | weight_decay: 0.1 99 | 100 | # (type: tuple, default: (0.9,0.999)) 101 | betas: 102 | - 0.9 103 | - 0.95 104 | 105 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 106 | devices: auto 107 | 108 | # How many nodes to use. (type: int, default: 1) 109 | num_nodes: 1 110 | 111 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 112 | # module require this. (type: Optional[Path], default: null) 113 | tokenizer_dir: checkpoints/EleutherAI/pythia-14m 114 | 115 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 116 | logger_name: tensorboard 117 | 118 | # The random seed to use for reproducibility. (type: int, default: 42) 119 | seed: 42 120 | -------------------------------------------------------------------------------- /config_hub/pretrain/tinyllama.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 3 | # ``model_config``. (type: Optional[str], default: null) 4 | model_name: tiny-llama-1.1b 5 | 6 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 7 | # ``model_config``. (type: Optional[Config], default: null) 8 | model_config: 9 | 10 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 11 | # /teamspace/jobs//share. (type: , default: out/pretrain) 12 | out_dir: out/pretrain/tiny-llama 13 | 14 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 15 | precision: bf16-mixed 16 | 17 | # Optional path to a checkpoint directory to initialize the model from. 18 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 19 | initial_checkpoint_dir: 20 | 21 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 22 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 23 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 24 | # (type: Union[bool, Literal["auto"], Path], default: False) 25 | resume: false 26 | 27 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 
28 | data: TinyLlama 29 | 30 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 31 | train: 32 | 33 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 34 | save_interval: 1000 35 | 36 | # Number of iterations between logging calls (type: int, default: 1) 37 | log_interval: 1 38 | 39 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512) 40 | global_batch_size: 512 41 | 42 | # Number of samples per data-parallel rank (type: int, default: 4) 43 | micro_batch_size: 4 44 | 45 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 46 | lr_warmup_steps: 2000 47 | 48 | # Number of epochs to train on (type: Optional[int], default: null) 49 | epochs: 50 | 51 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 52 | max_tokens: 3000000000000 53 | 54 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 55 | max_steps: 56 | 57 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 58 | max_seq_length: 2048 59 | 60 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 61 | tie_embeddings: 62 | 63 | # (type: Optional[float], default: 1.0) 64 | max_norm: 1.0 65 | 66 | # (type: float, default: 4e-05) 67 | min_lr: 4.0e-05 68 | 69 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 70 | eval: 71 | 72 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 73 | interval: 1000 74 | 75 | # Number of tokens to generate (type: Optional[int], default: null) 76 | max_new_tokens: 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: false 86 | 87 | # Optimizer-related arguments 88 | optimizer: 89 | 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | 94 | # (type: float, default: 0.001) 95 | lr: 4e-4 96 | 97 | # (type: float, default: 0.01) 98 | weight_decay: 0.1 99 | 100 | # (type: tuple, default: (0.9,0.999)) 101 | betas: 102 | - 0.9 103 | - 0.95 104 | 105 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 106 | devices: auto 107 | 108 | # How many nodes to use. (type: int, default: 1) 109 | num_nodes: 1 110 | 111 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 112 | # module require this. (type: Optional[Path], default: null) 113 | tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf 114 | 115 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 116 | logger_name: tensorboard 117 | 118 | # The random seed to use for reproducibility. (type: int, default: 42) 119 | seed: 42 120 | -------------------------------------------------------------------------------- /litgpt/data/tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
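# NOTE: the snippet below is a hypothetical usage sketch (it is not part of this module) showing how
# the data module defined here is typically wired up; the tokenizer directory is a placeholder that
# must point to a downloaded checkpoint:
#
#     from litgpt.data import TinyLlama
#     from litgpt.tokenizer import Tokenizer
#
#     data = TinyLlama(data_path="data/", num_workers=8)
#     data.connect(tokenizer=Tokenizer("checkpoints/meta-llama/Llama-2-7b-hf"), batch_size=4, max_seq_length=2048)
#     data.prepare_data()
#     train_loader = data.train_dataloader()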
2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | 6 | from torch.utils.data import DataLoader 7 | 8 | from litgpt.tokenizer import Tokenizer 9 | from litgpt.data import DataModule 10 | 11 | 12 | @dataclass 13 | class TinyLlama(DataModule): 14 | """The TinyLlama data module is composed of a mix of SlimPajama and Starcoder data. 15 | 16 | Provides training and validation streaming dataloaders that return batches of tokens. 17 | """ 18 | 19 | data_path: Union[str, Path] = Path("data/") 20 | """The path to the data directory, containing two folders 'slimpajama' and 'starcoder' 21 | which are the output of the preprocessing step done in advance. See the `tutorial/pretrain_tinyllama.md` 22 | for instructions. The path can also be a remote path (e.g., s3://).""" 23 | seed: int = 42 24 | """The random seed for shuffling the dataset.""" 25 | num_workers: int = 8 26 | """How many DataLoader processes to use for loading.""" 27 | use_starcoder: bool = True 28 | """Toggle for using Starcoder data.""" 29 | 30 | batch_size: int = field(init=False, repr=False, default=1) 31 | seq_length: int = field(init=False, repr=False, default=2048) 32 | 33 | def __post_init__(self): 34 | super().__init__() 35 | # Could be a remote path (s3://) or a local path 36 | self.slimpajama_train = str(self.data_path).rstrip("/") + "/slimpajama/train" 37 | self.slimpajama_val = str(self.data_path).rstrip("/") + "/slimpajama/val" 38 | self.required_paths = [self.slimpajama_train, self.slimpajama_val] 39 | 40 | if self.use_starcoder: 41 | self.starcoder_train = str(self.data_path).rstrip("/") + "/starcoder" 42 | self.required_paths += [self.starcoder_train] 43 | 44 | def connect( 45 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 46 | ) -> None: 47 | self.batch_size = batch_size 48 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 49 | 50 | def prepare_data(self) -> None: 51 | for path in self.required_paths: 52 | if not path.startswith("s3://") and not Path(path).is_dir(): 53 | raise FileNotFoundError( 54 | "The data path for TinyLlama is expected to be the directory containing these subdirectories:" 55 | f" `slimpajama/train`, `slimpajama/val`, `starcoder`. The directory {path} does not exist." 
56 | " Set it via `--data.data_path=...`" 57 | ) 58 | 59 | def train_dataloader(self) -> DataLoader: 60 | from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader 61 | 62 | slim_train_data = StreamingDataset( 63 | input_dir=self.slimpajama_train, 64 | item_loader=TokensLoader(block_size=self.seq_length), 65 | shuffle=True, 66 | drop_last=True, 67 | ) 68 | train_data = slim_train_data 69 | 70 | if self.use_starcoder: 71 | train_datasets = [ 72 | slim_train_data, 73 | StreamingDataset( 74 | input_dir=self.starcoder_train, 75 | item_loader=TokensLoader(block_size=self.seq_length), 76 | shuffle=True, 77 | drop_last=True, 78 | ), 79 | ] 80 | 81 | # Mix SlimPajama data and Starcoder data with these proportions: 82 | weights = (0.693584, 0.306416) 83 | train_data = CombinedStreamingDataset( 84 | datasets=train_datasets, seed=self.seed, weights=weights, iterate_over_all=False 85 | ) 86 | 87 | train_dataloader = StreamingDataLoader( 88 | train_data, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 89 | ) 90 | return train_dataloader 91 | 92 | def val_dataloader(self) -> DataLoader: 93 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 94 | 95 | val_dataset = StreamingDataset( 96 | input_dir=self.slimpajama_val, 97 | item_loader=TokensLoader(block_size=self.seq_length), 98 | shuffle=True, 99 | ) 100 | val_dataloader = StreamingDataLoader( 101 | val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 102 | ) 103 | return val_dataloader 104 | -------------------------------------------------------------------------------- /tests/test_pretrain.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | from contextlib import redirect_stdout 5 | from io import StringIO 6 | from unittest import mock 7 | from unittest.mock import ANY, Mock 8 | 9 | import pytest 10 | import torch 11 | from lightning.fabric.strategies import FSDPStrategy, SingleDeviceStrategy 12 | from torch.utils.data import DataLoader 13 | 14 | from litgpt import pretrain 15 | from litgpt.args import EvalArgs, TrainArgs 16 | from litgpt.config import Config 17 | from litgpt.pretrain import initialize_weights 18 | from tests.conftest import RunIf 19 | 20 | 21 | @RunIf(min_cuda_gpus=2, standalone=True) 22 | # Set CUDA_VISIBLE_DEVICES for FSDP hybrid-shard, if fewer GPUs are used than are available 23 | @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) 24 | # If we were to use `save_hyperparameters()`, we would have to patch `sys.argv` or otherwise 25 | # the CLI would capture pytest args, but unfortunately patching would mess with subprocess 26 | # launching, so we need to mock `save_hyperparameters()` 27 | @mock.patch("litgpt.pretrain.save_hyperparameters") 28 | def test_pretrain(_, tmp_path): 29 | model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 30 | 31 | dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) 32 | dataloader = DataLoader(dataset) 33 | pretrain.get_dataloaders = Mock(return_value=(dataloader, dataloader)) 34 | 35 | out_dir = tmp_path / "out" 36 | stdout = StringIO() 37 | with redirect_stdout(stdout): 38 | pretrain.setup( 39 | "pythia-14m", 40 | devices=2, 41 | model_config=model_config, 42 | out_dir=out_dir, 43 | train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), 44 | eval=EvalArgs(interval=1, max_iters=1, final_validation=False), 45 | ) 46 | 47 | if torch.distributed.get_rank() == 0: 48 | # tmp_path is not the same across all ranks, run assert only on rank 0 49 | out_dir_contents = set(os.listdir(out_dir)) 50 | checkpoint_dirs = {"step-00000001", "step-00000002", "step-00000003", "step-00000004", "final"} 51 | assert checkpoint_dirs.issubset(out_dir_contents) 52 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 53 | for checkpoint_dir in checkpoint_dirs: 54 | # the `tokenizer_dir` is None by default, so only 'lit_model.pth' shows here 55 | assert set(os.listdir(out_dir / checkpoint_dir)) == {"lit_model.pth", "model_config.yaml"} 56 | 57 | assert (out_dir / "logs" / "tensorboard" / "version_0").is_dir() 58 | 59 | # logs only appear on rank 0 60 | logs = stdout.getvalue() 61 | assert logs.count("(step)") == 4 62 | assert logs.count("val loss") == 4 63 | assert "Total parameters: 1,888" in logs 64 | 65 | torch.distributed.barrier() 66 | 67 | 68 | @RunIf(min_cuda_gpus=2, standalone=True) 69 | # Set CUDA_VISIBLE_DEVICES for FSDP hybrid-shard, if fewer GPUs are used than are available 70 | @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) 71 | @mock.patch("litgpt.pretrain.L.Fabric.load_raw") 72 | # See comment in `test_pretrain` why we need to mock `save_hyperparameters()` 73 | @mock.patch("litgpt.pretrain.save_hyperparameters") 74 | def test_initial_checkpoint_dir(_, load_mock, tmp_path): 75 | model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 76 | 77 | dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) 78 | dataloader = DataLoader(dataset) 79 | pretrain.get_dataloaders = Mock(return_value=(dataloader, dataloader)) 80 | pretrain.fit = Mock() 81 | 82 | pretrain.setup("pythia-14m", initial_checkpoint_dir=tmp_path, 
devices=2, model_config=model_config, out_dir=tmp_path) 83 | 84 | load_mock.assert_called_once_with(tmp_path / "lit_model.pth", ANY) 85 | 86 | 87 | @pytest.mark.parametrize(("strategy", "expected"), [(SingleDeviceStrategy, True), (FSDPStrategy, False)]) 88 | def test_initialize_weights(strategy, expected): 89 | fabric_mock = Mock() 90 | fabric_mock.strategy = Mock(spec=strategy) 91 | 92 | class Child(torch.nn.Module): 93 | pass 94 | 95 | class Parent(torch.nn.Module): 96 | def __init__(self): 97 | super().__init__() 98 | self.child = Child() 99 | 100 | model = Parent() 101 | model.reset_parameters = Mock() 102 | model.child.reset_parameters = Mock() 103 | 104 | initialize_weights(fabric_mock, model, n_layer=2, n_embd=8) 105 | assert model.reset_parameters.call_count == int(expected) 106 | assert model.child.reset_parameters.call_count == int(expected) 107 | -------------------------------------------------------------------------------- /config_hub/pretrain/microllama.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 3 | # ``model_config``. (type: Optional[str], default: null) 4 | model_name: micro-llama-300M 5 | 6 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 7 | # ``model_config``. (type: Optional[Config], default: null) 8 | model_config: 9 | 10 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 11 | # /teamspace/jobs//share. (type: , default: out/pretrain) 12 | out_dir: out/pretrain/micro-llama 13 | 14 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 15 | precision: bf16-mixed 16 | 17 | # Optional path to a checkpoint directory to initialize the model from. 18 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 19 | initial_checkpoint_dir: 20 | 21 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 22 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 23 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 24 | # (type: Union[bool, Literal["auto"], Path], default: False) 25 | resume: false 26 | 27 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 28 | data: MicroLlama 29 | 30 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 31 | train: 32 | 33 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 34 | save_interval: 1000 35 | 36 | # Number of iterations between logging calls (type: int, default: 1) 37 | log_interval: 1 38 | 39 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 48) 40 | # Scale this number according to the number of GPU and memory size per GPU 41 | # For example, we used 48 for 4 x 24G 4090 42 | global_batch_size: 48 43 | 44 | # Number of samples per data-parallel rank (type: int, default: 12) 45 | # Scale this number according to the memory size per GPU 46 | # For example, we used 12 for 24G 4090 47 | micro_batch_size: 12 48 | 49 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 50 | lr_warmup_steps: 2000 51 | 52 | # Number of epochs to train on (type: Optional[int], default: null) 53 | epochs: 54 | 55 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 56 | max_tokens: 3000000000000 57 | 58 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 59 | max_steps: 60 | 61 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 62 | max_seq_length: 2048 63 | 64 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 65 | tie_embeddings: 66 | 67 | # (type: Optional[float], default: 1.0) 68 | max_norm: 1.0 69 | 70 | # (type: float, default: 4e-05) 71 | min_lr: 4.0e-05 72 | 73 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 74 | eval: 75 | 76 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 77 | interval: 1000 78 | 79 | # Number of tokens to generate (type: Optional[int], default: null) 80 | max_new_tokens: 81 | 82 | # Number of iterations (type: int, default: 100) 83 | max_iters: 100 84 | 85 | # Whether to evaluate on the validation set at the beginning of the training 86 | initial_validation: false 87 | 88 | # Optimizer-related arguments 89 | optimizer: 90 | 91 | class_path: torch.optim.AdamW 92 | 93 | init_args: 94 | 95 | # (type: float, default: 0.001) 96 | lr: 4e-4 97 | 98 | # (type: float, default: 0.01) 99 | weight_decay: 0.1 100 | 101 | # (type: tuple, default: (0.9,0.999)) 102 | betas: 103 | - 0.9 104 | - 0.95 105 | 106 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 107 | devices: auto 108 | 109 | # How many nodes to use. (type: int, default: 1) 110 | num_nodes: 1 111 | 112 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 113 | # module require this. (type: Optional[Path], default: null) 114 | tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf 115 | 116 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 117 | logger_name: tensorboard 118 | 119 | # The random seed to use for reproducibility. (type: int, default: 42) 120 | seed: 42 121 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/lora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. 
(type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/lora-phi-3 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # The LoRA rank. (type: int, default: 8) 18 | lora_r: 8 19 | 20 | # The LoRA alpha. (type: int, default: 16) 21 | lora_alpha: 16 22 | 23 | # The LoRA dropout value. (type: float, default: 0.05) 24 | lora_dropout: 0.05 25 | 26 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 27 | lora_query: true 28 | 29 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 30 | lora_key: true 31 | 32 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 33 | lora_value: true 34 | 35 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 36 | lora_projection: true 37 | 38 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 39 | lora_mlp: true 40 | 41 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 42 | lora_head: true 43 | 44 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 45 | data: 46 | class_path: litgpt.data.Alpaca2k 47 | init_args: 48 | mask_prompt: false 49 | val_split_fraction: 0.03847 50 | prompt_style: alpaca 51 | ignore_index: -100 52 | seed: 42 53 | num_workers: 4 54 | 55 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 56 | train: 57 | 58 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 59 | save_interval: 800 60 | 61 | # Number of iterations between logging calls (type: int, default: 1) 62 | log_interval: 1 63 | 64 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 65 | global_batch_size: 8 66 | 67 | # Number of samples per data-parallel rank (type: int, default: 4) 68 | micro_batch_size: 4 69 | 70 | # Number of iterations with learning rate warmup active (type: int, default: 100) 71 | lr_warmup_steps: 10 72 | 73 | # Number of epochs to train on (type: Optional[int], default: 5) 74 | epochs: 1 75 | 76 | # Total number of tokens to train on (type: Optional[int], default: null) 77 | max_tokens: 78 | 79 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 80 | max_steps: 81 | 82 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 83 | max_seq_length: 512 84 | 85 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 86 | tie_embeddings: 87 | 88 | # (type: Optional[float], default: null) 89 | max_norm: 90 | 91 | # (type: float, default: 6e-05) 92 | min_lr: 6.0e-05 93 | 94 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 95 | eval: 96 | 97 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 98 | interval: 100 99 | 100 | # Number of tokens to generate (type: Optional[int], default: 100) 101 | max_new_tokens: 100 102 | 103 | # Number of iterations (type: int, default: 100) 104 | max_iters: 100 105 | 106 | # Whether to evaluate on the validation set at the beginning of the training 107 | initial_validation: false 108 | 109 | # Whether to evaluate on the validation set at the end the training 110 | final_validation: true 111 | 112 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 113 | logger_name: csv 114 | 115 | # The random seed to use for reproducibility. (type: int, default: 1337) 116 | seed: 1337 117 | 118 | # Optimizer-related arguments 119 | optimizer: 120 | 121 | class_path: torch.optim.AdamW 122 | 123 | init_args: 124 | 125 | # (type: float, default: 0.001) 126 | lr: 0.0002 127 | 128 | # (type: float, default: 0.01) 129 | weight_decay: 0.0 130 | 131 | # (type: tuple, default: (0.9,0.999)) 132 | betas: 133 | - 0.9 134 | - 0.95 135 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/qlora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/qlora-phi-3 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: bnb.nf4 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # The LoRA rank. (type: int, default: 8) 18 | lora_r: 8 19 | 20 | # The LoRA alpha. (type: int, default: 16) 21 | lora_alpha: 16 22 | 23 | # The LoRA dropout value. (type: float, default: 0.05) 24 | lora_dropout: 0.05 25 | 26 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 27 | lora_query: true 28 | 29 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 30 | lora_key: true 31 | 32 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 33 | lora_value: true 34 | 35 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 36 | lora_projection: true 37 | 38 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 39 | lora_mlp: true 40 | 41 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 42 | lora_head: true 43 | 44 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 45 | data: 46 | class_path: litgpt.data.Alpaca2k 47 | init_args: 48 | mask_prompt: false 49 | val_split_fraction: 0.03847 50 | prompt_style: alpaca 51 | ignore_index: -100 52 | seed: 42 53 | num_workers: 4 54 | 55 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 56 | train: 57 | 58 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 59 | save_interval: 800 60 | 61 | # Number of iterations between logging calls (type: int, default: 1) 62 | log_interval: 1 63 | 64 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 65 | global_batch_size: 8 66 | 67 | # Number of samples per data-parallel rank (type: int, default: 4) 68 | micro_batch_size: 4 69 | 70 | # Number of iterations with learning rate warmup active (type: int, default: 100) 71 | lr_warmup_steps: 10 72 | 73 | # Number of epochs to train on (type: Optional[int], default: 5) 74 | epochs: 1 75 | 76 | # Total number of tokens to train on (type: Optional[int], default: null) 77 | max_tokens: 78 | 79 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 80 | max_steps: 81 | 82 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 83 | max_seq_length: 512 84 | 85 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 86 | tie_embeddings: 87 | 88 | # (type: Optional[float], default: null) 89 | max_norm: 90 | 91 | # (type: float, default: 6e-05) 92 | min_lr: 6.0e-05 93 | 94 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 95 | eval: 96 | 97 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 98 | interval: 100 99 | 100 | # Number of tokens to generate (type: Optional[int], default: 100) 101 | max_new_tokens: 100 102 | 103 | # Number of iterations (type: int, default: 100) 104 | max_iters: 100 105 | 106 | # Whether to evaluate on the validation set at the beginning of the training 107 | initial_validation: false 108 | 109 | # Whether to evaluate on the validation set at the end the training 110 | final_validation: true 111 | 112 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 113 | logger_name: csv 114 | 115 | # The random seed to use for reproducibility. (type: int, default: 1337) 116 | seed: 1337 117 | 118 | # Optimizer-related arguments 119 | optimizer: 120 | 121 | class_path: torch.optim.AdamW 122 | 123 | init_args: 124 | 125 | # (type: float, default: 0.001) 126 | lr: 0.0002 127 | 128 | # (type: float, default: 0.01) 129 | weight_decay: 0.0 130 | 131 | # (type: tuple, default: (0.9,0.999)) 132 | betas: 133 | - 0.9 134 | - 0.95 135 | -------------------------------------------------------------------------------- /tutorials/convert_lit_models.md: -------------------------------------------------------------------------------- 1 | ## Converting LitGPT weights to Hugging Face Transformers 2 | 3 | LitGPT weights need to be converted to a format that Hugging Face understands with a [conversion script](../litgpt/scripts/convert_lit_checkpoint.py) before our scripts can run. 4 | 5 | We provide a helpful command to convert models LitGPT models back to their equivalent Hugging Face Transformers format: 6 | 7 | ```bash 8 | litgpt convert_from_litgpt checkpoint_dir converted_dir 9 | ``` 10 | 11 | These paths are just placeholders, you will need to customize them based on which finetuning or pretraining command you ran and its configuration. 
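For instance, to convert the final checkpoint of a pretraining run that used the `tiny-llama` config from the config hub (whose `out_dir` is `out/pretrain/tiny-llama`), a hypothetical invocation would be:

```bash
litgpt convert_from_litgpt out/pretrain/tiny-llama/final converted/
```

The second argument is simply the directory where the converted `model.pth` file should be written.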
12 | 13 | ### Loading converted LitGPT checkpoints into transformers 14 | To load the converted checkpoint into a `transformers` model, copy the original `config.json` file from the downloaded base checkpoint directory into the output directory that contains the converted `model.pth` file. 15 | 16 | For example, 17 | 18 | ```bash 19 | cp checkpoints/repo_id/config.json converted/config.json 20 | ``` 21 | 22 | Then, you can load the checkpoint file in a Python session as follows: 23 | 24 | ```python 25 | import torch 26 | from transformers import AutoModel 27 | 28 | 29 | state_dict = torch.load("output_dir/model.pth") 30 | model = AutoModel.from_pretrained( 31 | "output_dir/", local_files_only=True, state_dict=state_dict 32 | ) 33 | ``` 34 | 35 | Alternatively, you can also load the model without copying the `config.json` file as follows: 36 | 37 | ```python 38 | model = AutoModel.from_pretrained("online_repo_id", state_dict=state_dict) 39 | ``` 40 | 41 | 42 | 43 | ### Merging LoRA weights 44 | 45 | Please note that if you want to convert a model that has been finetuned using an adapter like LoRA, these weights should be [merged](../litgpt/scripts/merge_lora.py) into the checkpoint prior to converting. 46 | 47 | ```sh 48 | litgpt merge_lora path/to/lora/checkpoint_dir 49 | ``` 50 | 51 |
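In practice, the two commands are chained: merge the LoRA weights first, then convert. A hypothetical sequence for a LoRA run whose `out_dir` was `out/finetune/lora-mistral-7b` (as in the corresponding config hub file) would be:

```bash
litgpt merge_lora out/finetune/lora-mistral-7b/final
litgpt convert_from_litgpt out/finetune/lora-mistral-7b/final converted/
```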
52 |
53 | 54 | # A finetuning and conversion tutorial 55 | 56 | This section contains a reproducible example for finetuning a LitGPT model and converting it back into a Hugging Face `transformers` model. 57 | 58 | 1. Download a model of interest: 59 | 60 | For convenience, we first specify an environment variable (optional) to avoid copying and pasting the whole path: 61 | 62 | ```bash 63 | export repo_id=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 64 | ``` 65 | 66 | Instead of using TinyLlama, you can replace the `repo_id` target with any other model repository 67 | specifier that is currently supported by LitGPT. You can get a list of supported repository specifiers 68 | by running `litgpt/scripts/download.py` without any additional arguments. 69 | 70 | Then, we download the model we specified via `$repo_id` above: 71 | 72 | ```bash 73 | litgpt download $repo_id 74 | ``` 75 | 76 | 2. Finetune the model: 77 | 78 | 79 | ```bash 80 | export finetuned_dir=out/lit-finetuned-model 81 | 82 | litgpt finetune_lora $repo_id \ 83 | --out_dir $finetuned_dir \ 84 | --train.epochs 1 \ 85 | --data Alpaca 86 | ``` 87 | 88 | 3. Merge LoRA weights: 89 | 90 | Note that this step only applies if the model was finetuned with `litgpt finetune_lora` as shown above; it is not needed if `litgpt finetune_full` was used for finetuning. 91 | 92 | ```bash 93 | litgpt merge_lora $finetuned_dir/final 94 | ``` 95 | 96 | 97 | 4. Convert the finetuned model back into the Hugging Face format: 98 | 99 | ```bash 100 | litgpt convert_from_litgpt $finetuned_dir/final/ out/hf-tinyllama/converted 101 | ``` 102 | 103 | 104 | 5. Load the model into a `transformers` model: 105 | 106 | ```python 107 | import torch 108 | from transformers import AutoModel 109 | 110 | state_dict = torch.load('out/hf-tinyllama/converted/model.pth') 111 | model = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", state_dict=state_dict) 112 | ``` 113 | 114 |   115 | ## Using the LM Evaluation Harness 116 | 117 | To evaluate LitGPT models, use the integrated evaluation utilities based on Eleuther AI's LM Evaluation Harness. For more information, please see the [evaluation](evaluation.md) documentation. 118 | 119 | Alternatively, if you wish to use converted LitGPT models with the LM Evaluation Harness from [Eleuther AI's GitHub repository](https://github.com/EleutherAI/lm-evaluation-harness), you can use the following steps. 120 | 121 | 1. Follow the instructions above to load the model into a Hugging Face transformers model. 122 | 123 | 2. Create a `model.safetensors` file: 124 | 125 | ```python 126 | model.save_pretrained("out/hf-tinyllama/converted/") 127 | ``` 128 | 129 | 3. Copy the tokenizer files into the model-containing directory: 130 | 131 | ```bash 132 | cp checkpoints/$repo_id/tokenizer* out/hf-tinyllama/converted 133 | ``` 134 | 135 | 4. Run the evaluation harness, for example: 136 | 137 | ```bash 138 | lm_eval --model hf \ 139 | --model_args pretrained=out/hf-tinyllama/converted \ 140 | --tasks "hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge" \ 141 | --device "cuda:0" \ 142 | --batch_size 4 143 | ``` -------------------------------------------------------------------------------- /config_hub/finetune/falcon-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/tiiuae/falcon-7b 4 | 5 | # Directory in which to save checkpoints and logs.
(type: , default: out/lora) 6 | out_dir: out/finetune/lora-falcon-7b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # How many nodes to use. (type: int, default: 1) 18 | num_nodes: 1 19 | 20 | # The LoRA rank. (type: int, default: 8) 21 | lora_r: 32 22 | 23 | # The LoRA alpha. (type: int, default: 16) 24 | lora_alpha: 16 25 | 26 | # The LoRA dropout value. (type: float, default: 0.05) 27 | lora_dropout: 0.05 28 | 29 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 30 | lora_query: true 31 | 32 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 33 | lora_key: false 34 | 35 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 36 | lora_value: true 37 | 38 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 39 | lora_projection: false 40 | 41 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 42 | lora_mlp: false 43 | 44 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 45 | lora_head: false 46 | 47 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 48 | data: 49 | class_path: litgpt.data.Alpaca2k 50 | init_args: 51 | mask_prompt: false 52 | prompt_style: alpaca 53 | ignore_index: -100 54 | seed: 42 55 | num_workers: 4 56 | 57 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 58 | train: 59 | 60 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 61 | save_interval: 200 62 | 63 | # Number of iterations between logging calls (type: int, default: 1) 64 | log_interval: 1 65 | 66 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 67 | global_batch_size: 8 68 | 69 | # Number of samples per data-parallel rank (type: int, default: 4) 70 | micro_batch_size: 1 71 | 72 | # Number of iterations with learning rate warmup active (type: int, default: 100) 73 | lr_warmup_steps: 10 74 | 75 | # Number of epochs to train on (type: Optional[int], default: 5) 76 | epochs: 4 77 | 78 | # Total number of tokens to train on (type: Optional[int], default: null) 79 | max_tokens: 80 | 81 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 82 | max_steps: 83 | 84 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 85 | max_seq_length: 512 86 | 87 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 88 | tie_embeddings: 89 | 90 | # (type: Optional[float], default: null) 91 | max_norm: 92 | 93 | # (type: float, default: 6e-05) 94 | min_lr: 6.0e-05 95 | 96 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 97 | eval: 98 | 99 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 100 | interval: 100 101 | 102 | # Number of tokens to generate (type: Optional[int], default: 100) 103 | max_new_tokens: 100 104 | 105 | # Number of iterations (type: int, default: 100) 106 | max_iters: 100 107 | 108 | # Whether to evaluate on the validation set at the beginning of the training 109 | initial_validation: false 110 | 111 | # Whether to evaluate on the validation set at the end the training 112 | final_validation: true 113 | 114 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 115 | logger_name: csv 116 | 117 | # The random seed to use for reproducibility. (type: int, default: 1337) 118 | seed: 1337 119 | 120 | # Optimizer-related arguments 121 | optimizer: 122 | 123 | class_path: torch.optim.AdamW 124 | 125 | init_args: 126 | 127 | # (type: float, default: 0.001) 128 | lr: 0.0002 129 | 130 | # (type: float, default: 0.01) 131 | weight_decay: 0.0 132 | 133 | # (type: tuple, default: (0.9,0.999)) 134 | betas: 135 | - 0.9 136 | - 0.95 137 | -------------------------------------------------------------------------------- /litgpt/data/openwebtext.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import os 3 | from dataclasses import dataclass, field 4 | from functools import partial 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | 8 | from torch.utils.data import DataLoader 9 | 10 | from litgpt.tokenizer import Tokenizer 11 | from litgpt.data import DataModule 12 | 13 | 14 | @dataclass 15 | class OpenWebText(DataModule): 16 | """The OpenWebText data module for pretraining.""" 17 | 18 | data_path: Union[str, Path] = Path("data/openwebtext") 19 | """The path to the data directory, containing two folders 'train' and 'val' 20 | which are the output of the preprocessing step. The path can also be a remote path (e.g., s3://).""" 21 | val_split_fraction: float = 0.0005 22 | """The fraction of data that should be put aside for validation.""" 23 | seed: int = 42 24 | """The seed to use for shuffling the training data.""" 25 | num_workers: int = 8 26 | """The number of workers to use for the dataloaders.""" 27 | 28 | tokenizer: Optional[Tokenizer] = field(default=None, repr=False, init=False) 29 | batch_size: int = field(default=1, repr=False, init=False) 30 | seq_length: int = field(default=2048, repr=False, init=False) 31 | 32 | def __post_init__(self) -> None: 33 | super().__init__() 34 | # Could be a remote path (s3://) or a local path 35 | self.data_path_train = str(self.data_path).rstrip("/") + "/train" 36 | self.data_path_val = str(self.data_path).rstrip("/") + "/val" 37 | 38 | def connect( 39 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = 2048 40 | ) -> None: 41 | self.tokenizer = tokenizer 42 | self.batch_size = batch_size 43 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 44 | 45 | def prepare_data(self) -> None: 46 | from datasets import Dataset, load_dataset 47 | from litdata import optimize 48 | 49 | if str(self.data_path).startswith("s3://"): 50 | print(f"The OpenWebText data path points to an S3 location: {self.data_path}. 
Skipping preprocessing.") 51 | return 52 | 53 | if Path(self.data_path_train).is_dir() and Path(self.data_path_val).is_dir(): 54 | print(f"Found OpenWebText train and val dir: {self.data_path}. Skipping preprocessing.") 55 | return 56 | 57 | dataset = load_dataset("openwebtext", num_proc=(os.cpu_count() // 2), trust_remote_code=True) 58 | 59 | # Split the data in training and validation 60 | split_dataset = dataset["train"].train_test_split( 61 | test_size=self.val_split_fraction, seed=self.seed, shuffle=True 62 | ) 63 | split_dataset["val"] = split_dataset.pop("test") # rename the test split to val 64 | 65 | def tokenize(data: Dataset, index: int): 66 | yield self.tokenizer.encode(data[index]["text"], eos=True) 67 | 68 | optimize( 69 | fn=partial(tokenize, split_dataset["train"]), 70 | inputs=list(range(len(split_dataset["train"]))), 71 | output_dir=self.data_path_train, 72 | num_workers=min(64, os.cpu_count() - 1), 73 | chunk_bytes="200MB", 74 | ) 75 | optimize( 76 | fn=partial(tokenize, split_dataset["val"]), 77 | inputs=list(range(len(split_dataset["val"]))), 78 | output_dir=self.data_path_val, 79 | num_workers=min(8, os.cpu_count() - 1), 80 | chunk_bytes="200MB", 81 | ) 82 | 83 | def train_dataloader(self) -> DataLoader: 84 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 85 | 86 | train_dataset = StreamingDataset( 87 | input_dir=self.data_path_train, 88 | item_loader=TokensLoader(block_size=self.seq_length), 89 | shuffle=True, 90 | ) 91 | train_dataloader = StreamingDataLoader( 92 | train_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 93 | ) 94 | return train_dataloader 95 | 96 | def val_dataloader(self) -> DataLoader: 97 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 98 | 99 | val_dataset = StreamingDataset( 100 | input_dir=self.data_path_val, 101 | item_loader=TokensLoader(block_size=self.seq_length), 102 | shuffle=True, 103 | ) 104 | val_dataloader = StreamingDataLoader( 105 | val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 106 | ) 107 | return val_dataloader 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-2-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/lora-llama2-7b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # How many nodes to use. (type: int, default: 1) 18 | num_nodes: 1 19 | 20 | # The LoRA rank. (type: int, default: 8) 21 | lora_r: 32 22 | 23 | # The LoRA alpha. (type: int, default: 16) 24 | lora_alpha: 16 25 | 26 | # The LoRA dropout value. 
(type: float, default: 0.05) 27 | lora_dropout: 0.05 28 | 29 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 30 | lora_query: true 31 | 32 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 33 | lora_key: false 34 | 35 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 36 | lora_value: true 37 | 38 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 39 | lora_projection: false 40 | 41 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 42 | lora_mlp: false 43 | 44 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 45 | lora_head: false 46 | 47 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 48 | data: 49 | class_path: litgpt.data.Alpaca2k 50 | init_args: 51 | mask_prompt: false 52 | prompt_style: alpaca 53 | ignore_index: -100 54 | seed: 42 55 | num_workers: 4 56 | 57 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 58 | train: 59 | 60 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 61 | save_interval: 200 62 | 63 | # Number of iterations between logging calls (type: int, default: 1) 64 | log_interval: 1 65 | 66 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 67 | global_batch_size: 8 68 | 69 | # Number of samples per data-parallel rank (type: int, default: 4) 70 | micro_batch_size: 2 71 | 72 | # Number of iterations with learning rate warmup active (type: int, default: 100) 73 | lr_warmup_steps: 10 74 | 75 | # Number of epochs to train on (type: Optional[int], default: 5) 76 | epochs: 4 77 | 78 | # Total number of tokens to train on (type: Optional[int], default: null) 79 | max_tokens: 80 | 81 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 82 | max_steps: 83 | 84 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 85 | max_seq_length: 512 86 | 87 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 88 | tie_embeddings: 89 | 90 | # (type: Optional[float], default: null) 91 | max_norm: 92 | 93 | # (type: float, default: 6e-05) 94 | min_lr: 6.0e-05 95 | 96 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 97 | eval: 98 | 99 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 100 | interval: 100 101 | 102 | # Number of tokens to generate (type: Optional[int], default: 100) 103 | max_new_tokens: 100 104 | 105 | # Number of iterations (type: int, default: 100) 106 | max_iters: 100 107 | 108 | # Whether to evaluate on the validation set at the beginning of the training 109 | initial_validation: false 110 | 111 | # Whether to evaluate on the validation set at the end the training 112 | final_validation: true 113 | 114 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 115 | logger_name: csv 116 | 117 | # The random seed to use for reproducibility. 
(type: int, default: 1337) 118 | seed: 1337 119 | 120 | # Optimizer-related arguments 121 | optimizer: 122 | 123 | class_path: torch.optim.AdamW 124 | 125 | init_args: 126 | 127 | # (type: float, default: 0.001) 128 | lr: 0.0002 129 | 130 | # (type: float, default: 0.01) 131 | weight_decay: 0.0 132 | 133 | # (type: tuple, default: (0.9,0.999)) 134 | betas: 135 | - 0.9 136 | - 0.95 137 | -------------------------------------------------------------------------------- /config_hub/finetune/mistral-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/mistralai/Mistral-7B-v0.1 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/lora-mistral-7b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # How many nodes to use. (type: int, default: 1) 18 | num_nodes: 1 19 | 20 | # The LoRA rank. (type: int, default: 8) 21 | lora_r: 32 22 | 23 | # The LoRA alpha. (type: int, default: 16) 24 | lora_alpha: 16 25 | 26 | # The LoRA dropout value. (type: float, default: 0.05) 27 | lora_dropout: 0.05 28 | 29 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 30 | lora_query: true 31 | 32 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 33 | lora_key: false 34 | 35 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 36 | lora_value: true 37 | 38 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 39 | lora_projection: false 40 | 41 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 42 | lora_mlp: false 43 | 44 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 45 | lora_head: false 46 | 47 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 48 | data: 49 | class_path: litgpt.data.Alpaca2k 50 | init_args: 51 | mask_prompt: false 52 | prompt_style: alpaca 53 | ignore_index: -100 54 | seed: 42 55 | num_workers: 4 56 | 57 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 58 | train: 59 | 60 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 61 | save_interval: 200 62 | 63 | # Number of iterations between logging calls (type: int, default: 1) 64 | log_interval: 1 65 | 66 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 67 | global_batch_size: 8 68 | 69 | # Number of samples per data-parallel rank (type: int, default: 4) 70 | micro_batch_size: 2 71 | 72 | # Number of iterations with learning rate warmup active (type: int, default: 100) 73 | lr_warmup_steps: 10 74 | 75 | # Number of epochs to train on (type: Optional[int], default: 5) 76 | epochs: 4 77 | 78 | # Total number of tokens to train on (type: Optional[int], default: null) 79 | max_tokens: 80 | 81 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 82 | max_steps: 83 | 84 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 85 | max_seq_length: 512 86 | 87 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 88 | tie_embeddings: 89 | 90 | # (type: Optional[float], default: null) 91 | max_norm: 92 | 93 | # (type: float, default: 6e-05) 94 | min_lr: 6.0e-05 95 | 96 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 97 | eval: 98 | 99 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 100 | interval: 100 101 | 102 | # Number of tokens to generate (type: Optional[int], default: 100) 103 | max_new_tokens: 100 104 | 105 | # Number of iterations (type: int, default: 100) 106 | max_iters: 100 107 | 108 | # Whether to evaluate on the validation set at the beginning of the training 109 | initial_validation: false 110 | 111 | # Whether to evaluate on the validation set at the end the training 112 | final_validation: true 113 | 114 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 115 | logger_name: csv 116 | 117 | # The random seed to use for reproducibility. (type: int, default: 1337) 118 | seed: 1337 119 | 120 | # Optimizer-related arguments 121 | optimizer: 122 | 123 | class_path: torch.optim.AdamW 124 | 125 | init_args: 126 | 127 | # (type: float, default: 0.001) 128 | lr: 0.0002 129 | 130 | # (type: float, default: 0.01) 131 | weight_decay: 0.0 132 | 133 | # (type: tuple, default: (0.9,0.999)) 134 | betas: 135 | - 0.9 136 | - 0.95 137 | --------------------------------------------------------------------------------
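The finetuning and pretraining YAML files above are consumed by the corresponding LitGPT commands via the `--config` flag. As a minimal sketch, assuming the base checkpoint referenced in `checkpoint_dir` has already been downloaded, the Mistral LoRA recipe could be launched like this:

```bash
litgpt download mistralai/Mistral-7B-v0.1
litgpt finetune_lora --config config_hub/finetune/mistral-7b/lora.yaml
```

Individual values from the YAML file can typically still be overridden on the command line, for example by appending `--train.max_seq_length 256`.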