├── litgpt ├── chat │ └── __init__.py ├── deploy │ └── __init__.py ├── finetune │ └── __init__.py ├── generate │ └── __init__.py ├── scripts │ ├── __init__.py │ └── convert_pretrained_checkpoint.py ├── data │ ├── microllama.py │ ├── alpaca_gpt4.py │ ├── __init__.py │ ├── alpaca_2k.py │ ├── prepare_slimpajama.py │ ├── prepare_starcoder.py │ ├── lit_data.py │ ├── dolly.py │ ├── longform.py │ ├── tinyllama.py │ └── openwebtext.py ├── __init__.py ├── __main__.py └── args.py ├── tests ├── data │ ├── __init__.py │ ├── test_longform.py │ ├── test_dolly.py │ ├── test_alpaca.py │ ├── test_tinyllama.py │ ├── test_textfiles.py │ ├── test_lit_data.py │ ├── test_openwebtext.py │ ├── test_deita.py │ ├── test_base.py │ └── test_tinystories.py ├── __init__.py ├── test_ci.py ├── test_convert_pretrained_checkpoint.py ├── test_rope.py ├── test_args.py ├── test_thunder_pretrain.py ├── test_cli.py ├── test_generate_adapter.py ├── test_config_hub.py ├── test_evaluate.py ├── test_full.py ├── run_standalone_tests.sh ├── test_batch.py ├── test_thunder_ddp.py ├── test_merge_lora.py ├── test_config.py └── test_pretrain.py ├── tutorials ├── developer-docs │ ├── README.md │ └── python-api.md ├── images │ ├── 0_to_litgpt │ │ ├── usage.webp │ │ ├── commands.webp │ │ ├── finetune.webp │ │ ├── pretrain.webp │ │ ├── instruction-1.webp │ │ └── instruction-2.webp │ └── prepare_dataset │ │ ├── lima.jpg │ │ ├── alpaca.jpg │ │ ├── deita.jpg │ │ ├── dolly.jpg │ │ ├── alpaca-2k.jpg │ │ ├── longform.jpg │ │ ├── alpacagpt4.jpg │ │ ├── alpaca_libre.jpg │ │ └── deita-multiturn.jpg ├── examples │ └── ptl-trainer │ │ ├── README.md │ │ └── litgpt_ptl_medium.py ├── convert_hf_checkpoint.md ├── deploy.md ├── finetune.md ├── finetune_full.md └── convert_lit_models.md ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── ask-a-question.md │ ├── feature-request.md │ └── bug-report.yaml ├── workflows │ ├── check-links.yml │ ├── publish.yaml │ └── cpu-tests.yml ├── azure-gpu-test.yml └── azure-gpu-test-with-thunder.yml ├── extensions └── thunder │ ├── strategies │ └── __init__.py │ └── unsloth │ └── kernels │ ├── __init__.py │ ├── utils.py │ └── swiglu.py ├── .gitignore ├── pyproject.toml └── config_hub ├── finetune ├── phi-3 │ ├── full.yaml │ ├── lora.yaml │ └── qlora.yaml ├── phi-2 │ └── full.yaml ├── gemma-2b │ └── full.yaml ├── stablelm-base-alpha-3b │ └── full.yaml ├── tiny-llama │ └── full.yaml ├── llama-2-7b │ ├── full.yaml │ └── lora.yaml ├── llama-3-8b │ └── full.yaml ├── llama-3.1-8b │ └── full.yaml ├── falcon-7b │ └── lora.yaml └── mistral-7b │ └── lora.yaml └── pretrain ├── debug.yaml ├── tinyllama.yaml └── microllama.yaml /litgpt/chat/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /litgpt/deploy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /litgpt/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /litgpt/generate/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /litgpt/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tutorials/developer-docs/README.md: -------------------------------------------------------------------------------- 1 | LitGPT developer documentation files. -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @awaelchli @rasbt @lantiga 2 | /README.md @williamfalcon @lantiga 3 | -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/usage.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/usage.webp -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/lima.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/lima.jpg -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/commands.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/commands.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/finetune.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/finetune.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/pretrain.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/pretrain.webp -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/alpaca.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/deita.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/deita.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/dolly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/dolly.jpg -------------------------------------------------------------------------------- /extensions/thunder/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | from .thunder_fsdp import ThunderFSDPStrategy 2 | from .thunder_ddp import ThunderDDPStrategy 3 | -------------------------------------------------------------------------------- 
/tutorials/images/prepare_dataset/alpaca-2k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/alpaca-2k.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/longform.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/longform.jpg -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/instruction-1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/instruction-1.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/instruction-2.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/0_to_litgpt/instruction-2.webp -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpacagpt4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/alpacagpt4.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca_libre.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/alpaca_libre.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/deita-multiturn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grangier/litgpt/main/tutorials/images/prepare_dataset/deita-multiturn.jpg -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ask-a-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Ask a Question 3 | about: Ask and answer questions related to LitGPT 4 | title: '' 5 | labels: question 6 | 7 | --- 8 | 9 | Please describe your question here. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Suggest a Feature 3 | about: Propose a new feature or enhancement 4 | title: '' 5 | labels: enhancement 6 | 7 | --- 8 | 9 | Please describe the feature or enhancement along with the intended usecase. -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import warnings 4 | 5 | import pytest 6 | 7 | warnings.filterwarnings("ignore", category=pytest.PytestWarning, message=r".*\(rm_rf\) error removing.*") 8 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy_loss import _cross_entropy_forward_impl, _cross_entropy_backward_impl 2 | from .rope_embedding import _rope_embedding_forward_impl, _rope_embedding_backward_impl, ROPE_GROUP_SIZE 3 | from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel 4 | from .utils import calculate_settings 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea 3 | .DS_Store 4 | *.egg-info 5 | build 6 | dist 7 | .venv 8 | .vscode 9 | 10 | # data 11 | data 12 | datasets 13 | !litgpt/data 14 | !tests/data 15 | checkpoints 16 | out 17 | wandb 18 | events.out.tfevents* 19 | 20 | # test artifacts from tests/test_readme.py 21 | **/custom_finetuning_dataset.json 22 | client.py 23 | **/custom_texts/ 24 | -------------------------------------------------------------------------------- /tests/test_ci.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from tests.conftest import RunIf 4 | from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE 5 | 6 | 7 | @RunIf(min_cuda_gpus=1) 8 | def test_gpu_ci_installs_bitsandbytes(): 9 | assert _BITSANDBYTES_AVAILABLE, str(_BITSANDBYTES_AVAILABLE) 10 | -------------------------------------------------------------------------------- /litgpt/data/microllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | from litgpt.data import TinyLlama 7 | 8 | 9 | @dataclass 10 | class MicroLlama(TinyLlama): 11 | """The MicroLlama data module is composed of only SlimPajama data.""" 12 | 13 | def __init__(self, data_path: Union[str, Path] = Path("data/"), seed: int = 42, num_workers: int = 8): 14 | super().__init__(data_path=data_path, seed=seed, num_workers=num_workers, use_starcoder=False) 15 | -------------------------------------------------------------------------------- /.github/workflows/check-links.yml: -------------------------------------------------------------------------------- 1 | name: Check hyperlinks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: '3.10' 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install pytest pytest-check-links 27 | 28 | - name: Check links 29 | run: | 30 | pytest --check-links README.md --check-links-ignore "http*" 31 | pytest --check-links tutorials --check-links-ignore "http*" -------------------------------------------------------------------------------- /tutorials/examples/ptl-trainer/README.md: -------------------------------------------------------------------------------- 1 | ## Minimal PyTorch Lightning Trainer Example 2 | 3 | 4 | 5 | The script in this folder provides minimal examples showing how to train a LitGPT model using LitGPT's `GPT` class with the [PyTorch Lightning](https://github.com/Lightning-AI/pytorch-lightning) Trainer. 6 | 7 | You can run the scripts as follows: 8 | 9 |   10 | ## Small 160M model: 11 | 12 | ```bash 13 | # Download the Pythia model 14 | litgpt download EleutherAI/pythia-160m 15 | 16 | python litgpt_ptl_small.py 17 | ``` 18 | 19 |   20 | ## Medium-sized 8B model: 21 | 22 | ```bash 23 | # Download the Llama 3.1 model 24 | litgpt download meta-llama/Meta-Llama-3.1-8B --access_token hf_... 25 | 26 | python litgpt_ptl_medium.py 27 | ``` 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /litgpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import logging 4 | import re 5 | 6 | from litgpt.api import LLM 7 | from litgpt.model import GPT # needs to be imported before config 8 | from litgpt.config import Config 9 | from litgpt.prompts import PromptStyle 10 | from litgpt.tokenizer import Tokenizer 11 | 12 | # Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 13 | pattern = re.compile(".*Profiler function .* will be ignored") 14 | logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) 15 | 16 | # Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint 17 | logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True 18 | logging.getLogger("torch.distributed.fsdp._debug_utils").disabled = True 19 | 20 | __all__ = ["LLM", "GPT", "Config", "PromptStyle", "Tokenizer"] 21 | -------------------------------------------------------------------------------- /litgpt/data/alpaca_gpt4.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | 7 | from litgpt.data.alpaca import Alpaca 8 | 9 | _URL = "https://raw.githubusercontent.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/main/data/alpaca_gpt4_data.json" 10 | 11 | 12 | @dataclass 13 | class AlpacaGPT4(Alpaca): 14 | """AlpacaGPT4 data module for supervised finetuning.""" 15 | 16 | val_split_fraction: float = 0.03847 # to get exactly 2000 test samples, 17 | """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" 18 | download_dir: Path = Path("./data/alpacagpt4") 19 | """The directory in which the downloaded datasetgets saved.""" 20 | file_url: str = field(repr=False, default=_URL) 21 | """The URL from where to download the dataset.""" 22 | file_name: str = field(repr=False, default="alpacagpt4_data_cleaned_archive.json") 23 | """The name of the dataset file to download.""" 24 | -------------------------------------------------------------------------------- /tests/test_convert_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | 5 | import torch 6 | 7 | from litgpt.scripts.convert_pretrained_checkpoint import convert_pretrained_checkpoint 8 | 9 | 10 | def test_convert_pretrained_checkpoint(tmp_path, fake_checkpoint_dir): 11 | # Pretend we made a checkpoint from pretraining 12 | pretrained_checkpoint = { 13 | "model": {"some.module.weight": torch.rand(2, 2), "_orig_mod.some.other.module.weight": torch.rand(2, 2)}, 14 | "the_optimizer": "optimizer_state", 15 | "other": 1, 16 | } 17 | torch.save(pretrained_checkpoint, fake_checkpoint_dir / "lit_model.pth") 18 | 19 | convert_pretrained_checkpoint(checkpoint_dir=fake_checkpoint_dir, output_dir=(tmp_path / "converted")) 20 | 21 | assert set(os.listdir(tmp_path / "converted")) == { 22 | "lit_model.pth", 23 | "model_config.yaml", 24 | "tokenizer_config.json", 25 | "tokenizer.json", 26 | } 27 | converted_checkpoint = torch.load(tmp_path / "converted" / "lit_model.pth") 28 | assert list(converted_checkpoint.keys()) == ["some.module.weight", "some.other.module.weight"] 29 | -------------------------------------------------------------------------------- /tests/test_rope.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import torch 4 | from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXRotaryEmbedding, apply_rotary_pos_emb 5 | 6 | from litgpt.model import apply_rope, build_rope_cache 7 | 8 | 9 | @torch.inference_mode() 10 | def test_rope(): 11 | bs, seq_len, n_head, n_embed = 1, 6, 2, 8 12 | head_size = n_embed // n_head 13 | x = torch.randint(0, 10000, size=(bs, n_head, seq_len, head_size)).float() 14 | position_ids = torch.arange(seq_len).unsqueeze(0) 15 | 16 | theirs = GPTNeoXRotaryEmbedding(head_size, seq_len) 17 | ours_cos_cached, ours_sin_cached = build_rope_cache(seq_len, head_size, device=x.device) 18 | # their rope cache has 2 added dimensions and the cos/sin is duplicated 19 | torch.testing.assert_close(ours_cos_cached, theirs.cos_cached.squeeze()) 20 | torch.testing.assert_close(ours_sin_cached, theirs.sin_cached.squeeze()) 21 | 22 | ours_x_rope = apply_rope(x, ours_cos_cached, ours_sin_cached) 23 | theirs_x_rope, _ = apply_rotary_pos_emb(x, x, theirs.cos_cached, theirs.sin_cached, position_ids) 24 | torch.testing.assert_close(ours_x_rope, theirs_x_rope) 25 | -------------------------------------------------------------------------------- /litgpt/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | from litgpt.data.base import DataModule, SFTDataset, get_sft_collate_fn 4 | from litgpt.data.alpaca import Alpaca 5 | from litgpt.data.alpaca_2k import Alpaca2k 6 | from litgpt.data.alpaca_gpt4 import AlpacaGPT4 7 | from litgpt.data.json_data import JSON 8 | from litgpt.data.deita import Deita 9 | from litgpt.data.dolly import Dolly 10 | from litgpt.data.flan import FLAN 11 | from litgpt.data.lima import LIMA 12 | from litgpt.data.lit_data import LitData 13 | from litgpt.data.longform import LongForm 14 | from litgpt.data.text_files import TextFiles 15 | from litgpt.data.tinyllama import TinyLlama 16 | from litgpt.data.tinystories import TinyStories 17 | from litgpt.data.openwebtext import OpenWebText 18 | from litgpt.data.microllama import MicroLlama 19 | 20 | 21 | __all__ = [ 22 | "Alpaca", 23 | "Alpaca2k", 24 | "AlpacaGPT4", 25 | "Deita", 26 | "Dolly", 27 | "FLAN", 28 | "JSON", 29 | "LIMA", 30 | "LitData", 31 | "DataModule", 32 | "LongForm", 33 | "OpenWebText", 34 | "SFTDataset", 35 | "TextFiles", 36 | "TinyLlama", 37 | "TinyStories", 38 | "MicroLlama", 39 | "get_sft_collate_fn", 40 | ] 41 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | # To create a release, create a tag and push it to GitHub: 2 | #git tag -a "v0.0.1-beta" -m "beta version testing" 3 | #git push --tags 4 | # https://dev.to/iamtekson/publish-package-to-pypi-and-release-new-version-using-github-actions-108k 5 | name: Publish LitGPT to PyPI 6 | 7 | on: 8 | push: 9 | tags: 10 | - "v*" 11 | jobs: 12 | build-n-publish: 13 | name: Build and publish to PyPI 14 | runs-on: ubuntu-latest 15 | environment: 16 | name: pypi 17 | url: https://pypi.org/p/litgpt 18 | permissions: 19 | id-token: write 20 | 21 | steps: 22 | - name: Checkout source 23 | uses: actions/checkout@v3 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: "3.x" 29 | 30 | - name: Build source and wheel distributions 31 | run: | 32 | python -m pip install --upgrade build twine 33 | pip install importlib_metadata==7.2.1 34 | python -m build 35 | twine check --strict dist/* 36 | - name: Publish distribution to PyPI 37 | uses: pypa/gh-action-pypi-publish@release/v1 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import triton 16 | 17 | MAX_FUSED_SIZE = 65536 # 2**16 18 | next_power_of_2 = triton.next_power_of_2 19 | 20 | def calculate_settings(n): 21 | BLOCK_SIZE = next_power_of_2(n) 22 | if BLOCK_SIZE > MAX_FUSED_SIZE: 23 | raise RuntimeError(f"Cannot launch Triton kernel since n = {n} exceeds "\ 24 | f"the maximum CUDA blocksize = {MAX_FUSED_SIZE}.") 25 | num_warps = 4 26 | if BLOCK_SIZE >= 32768: num_warps = 32 27 | elif BLOCK_SIZE >= 8192: num_warps = 16 28 | elif BLOCK_SIZE >= 2048: num_warps = 8 29 | return BLOCK_SIZE, num_warps 30 | pass 31 | -------------------------------------------------------------------------------- /tests/data/test_longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from litgpt.data import LongForm 3 | from litgpt.prompts import Longform as LongFormPromptStyle 4 | 5 | 6 | def test_longform(mock_tokenizer, longform_path): 7 | longform = LongForm(download_dir=longform_path, num_workers=0) 8 | assert isinstance(longform.prompt_style, LongFormPromptStyle) 9 | longform.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 10 | longform.prepare_data() 11 | longform.setup() 12 | 13 | train_dataloader = longform.train_dataloader() 14 | val_dataloader = longform.val_dataloader() 15 | 16 | assert len(train_dataloader) == 9 17 | assert len(val_dataloader) == 5 18 | 19 | train_batch = next(iter(train_dataloader)) 20 | val_batch = next(iter(val_dataloader)) 21 | 22 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"} 23 | assert all(seq.shape == (2, 10) for seq in train_batch.values()) 24 | assert all(seq.shape == (2, 10) for seq in val_batch.values()) 25 | 26 | assert isinstance(train_dataloader.dataset.prompt_style, LongFormPromptStyle) 27 | assert isinstance(val_dataloader.dataset.prompt_style, LongFormPromptStyle) 28 | 29 | # has attributes from super class `LightningDataModule` 30 | assert longform.prepare_data_per_node 31 | -------------------------------------------------------------------------------- /tests/data/test_dolly.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | from litgpt.data import Dolly 4 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 5 | 6 | 7 | def test_dolly(mock_tokenizer, dolly_path): 8 | dolly = Dolly(val_split_fraction=0.5, download_dir=dolly_path.parent, file_name=dolly_path.name, num_workers=0) 9 | assert isinstance(dolly.prompt_style, AlpacaPromptStyle) 10 | dolly.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 11 | dolly.prepare_data() 12 | dolly.setup() 13 | 14 | train_dataloader = dolly.train_dataloader() 15 | val_dataloader = dolly.val_dataloader() 16 | 17 | assert len(train_dataloader) == 3 18 | assert len(val_dataloader) == 3 19 | 20 | train_batch = next(iter(train_dataloader)) 21 | val_batch = next(iter(val_dataloader)) 22 | 23 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"} 24 | assert all(seq.shape == (2, 10) for seq in train_batch.values()) 25 | assert all(seq.shape == (2, 10) for seq in val_batch.values()) 26 | 27 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 28 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 29 | 30 | # has attributes from super class `LightningDataModule` 31 | assert dolly.prepare_data_per_node 32 | -------------------------------------------------------------------------------- /tests/data/test_alpaca.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from litgpt.data import Alpaca 3 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 4 | 5 | 6 | def test_alpaca(mock_tokenizer, alpaca_path): 7 | alpaca = Alpaca(val_split_fraction=0.5, download_dir=alpaca_path.parent, file_name=alpaca_path.name, num_workers=0) 8 | assert isinstance(alpaca.prompt_style, AlpacaPromptStyle) 9 | alpaca.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 10 | alpaca.prepare_data() 11 | alpaca.setup() 12 | 13 | train_dataloader = alpaca.train_dataloader() 14 | val_dataloader = alpaca.val_dataloader() 15 | 16 | assert len(train_dataloader) == 6 17 | assert len(val_dataloader) == 6 18 | 19 | train_batch = next(iter(train_dataloader)) 20 | val_batch = next(iter(val_dataloader)) 21 | 22 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"} 23 | assert all(seq.shape == (2, 10) for seq in train_batch.values()) 24 | assert all(seq.shape == (2, 10) for seq in val_batch.values()) 25 | 26 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 27 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 28 | 29 | # has attributes from super class `LightningDataModule` 30 | assert alpaca.prepare_data_per_node 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Report errors related to LitGPT 3 | title: "Description" 4 | labels: bug 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for taking the time to report an issue. Please fill out the details below to help us resolve it. 10 | 11 | - type: textarea 12 | id: bug_description 13 | attributes: 14 | label: Bug description 15 | description: A description of the issue. 16 | placeholder: | 17 | Please provide a description of what the bug or issue is. 
18 | validations: 19 | required: true 20 | 21 | - type: dropdown 22 | id: operating_system 23 | attributes: 24 | label: What operating system are you using? 25 | description: If applicable, please select the operating system where you experienced this issue. 26 | options: 27 | - "Unknown" 28 | - "macOS" 29 | - "Linux" 30 | - "Windows" 31 | validations: 32 | required: true 33 | 34 | - type: textarea 35 | id: version 36 | attributes: 37 | label: LitGPT Version 38 | description: | 39 | Please provide details about your LitGPT version by running the following code in your terminal: 40 | ``` 41 | pip show litgpt | grep Version: 42 | ``` 43 | You can simply copy and paste the outputs below. 44 | value: | 45 | ``` 46 | 47 | 48 | 49 | ``` 50 | validations: 51 | required: false 52 | -------------------------------------------------------------------------------- /tests/data/test_tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from unittest import mock 3 | 4 | import pytest 5 | from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset 6 | from torch.utils.data import DataLoader 7 | 8 | from litgpt.data import TinyLlama 9 | 10 | 11 | @mock.patch("litdata.streaming.dataset.subsample_streaming_dataset", return_value=([], [])) 12 | def test_tinyllama(_, tmp_path): 13 | data = TinyLlama(data_path=(tmp_path / "data")) 14 | assert data.seq_length == 2048 15 | assert data.batch_size == 1 16 | 17 | data.connect(batch_size=2, max_seq_length=1024) 18 | assert data.seq_length == 1025 19 | assert data.batch_size == 2 20 | 21 | with pytest.raises(FileNotFoundError, match="The directory .*data/slimpajama/train does not exist"): 22 | data.prepare_data() 23 | 24 | (tmp_path / "data" / "slimpajama" / "train").mkdir(parents=True) 25 | (tmp_path / "data" / "slimpajama" / "val").mkdir(parents=True) 26 | (tmp_path / "data" / "starcoder").mkdir(parents=True) 27 | 28 | data.prepare_data() 29 | data.setup() 30 | 31 | train_dataloader = data.train_dataloader() 32 | assert isinstance(train_dataloader, StreamingDataLoader) 33 | assert isinstance(train_dataloader.dataset, CombinedStreamingDataset) 34 | 35 | val_dataloader = data.val_dataloader() 36 | assert isinstance(val_dataloader, DataLoader) 37 | assert isinstance(val_dataloader.dataset, StreamingDataset) 38 | 39 | # has attributes from super class `LightningDataModule` 40 | assert data.prepare_data_per_node 41 | -------------------------------------------------------------------------------- /tutorials/convert_hf_checkpoint.md: -------------------------------------------------------------------------------- 1 | # Converting Hugging Face Transformers to LitGPT weights 2 | 3 | By default, the `litgpt download` command converts the downloaded HF checkpoint files into a LitGPT compatible format after downloading. 
For example, 4 | 5 | ```bash 6 | litgpt download EleutherAI/pythia-14m 7 | ``` 8 | 9 | creates the following files: 10 | 11 | ``` 12 | checkpoints/ 13 | └── EleutherAI/ 14 | └── pythia-14m/ 15 | ├── config.json 16 | ├── generation_config.json 17 | ├── model_config.yaml # LitGPT specific file 18 | ├── lit_model.pth # LitGPT specific file 19 | ├── pytorch_model.bin 20 | ├── tokenizer.json 21 | └── tokenizer_config.json 22 | ``` 23 | 24 | 25 | 26 | To disable the automatic conversion, which is useful for development and debugging purposes, you can run the `litgpt download` command with the `--convert_checkpoint false` flag. This will only download the checkpoint files without converting them for use in LitGPT: 27 | 28 | ```bash 29 | rm -rf checkpoints/EleutherAI/pythia-14m 30 | 31 | litgpt download EleutherAI/pythia-14m \ 32 | --convert_checkpoint false 33 | 34 | ls checkpoints/EleutherAI/pythia-14m 35 | ``` 36 | 37 | ``` 38 | checkpoints/ 39 | └── EleutherAI/ 40 | └── pythia-14m/ 41 | ├── config.json 42 | ├── generation_config.json 43 | ├── pytorch_model.bin 44 | ├── tokenizer.json 45 | └── tokenizer_config.json 46 | ``` 47 | 48 | The required `model_config.yaml` and `lit_model.pth` files can then be manually generated via the `litgpt/scripts/convert_hf_checkpoint.py` script: 49 | 50 | ```bash 51 | litgpt convert_to_litgpt checkpoints/EleutherAI/pythia-14m 52 | ``` 53 | -------------------------------------------------------------------------------- /tests/data/test_textfiles.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from litdata import optimize 4 | from torch.utils._pytree import tree_map 5 | 6 | 7 | class Tokenizer: 8 | bos_id = 0 9 | 10 | def encode(self, text, bos, eos): 11 | assert bos 12 | assert not eos 13 | return [self.bos_id] + [ord(c) for c in text] 14 | 15 | 16 | def tokenize(data): 17 | for story in data: 18 | yield torch.tensor(story) 19 | 20 | 21 | def fake_chunk(path, data): 22 | optimize(fn=tokenize, inputs=[data] * len(data), output_dir=str(path), num_workers=1, chunk_bytes="200MB") 23 | 24 | 25 | def test_textfiles_datamodule(tmp_path): 26 | from litgpt.data.text_files import TextFiles 27 | 28 | data_dir = tmp_path / "textfiles" 29 | datamodule = TextFiles(train_data_path=data_dir, num_workers=1) 30 | datamodule.connect(max_seq_length=2, tokenizer=Tokenizer()) 31 | 32 | # simulate `datamodule.prepare_data` 33 | train_data_dir = data_dir / "train" 34 | train_data_dir.mkdir(parents=True) 35 | fake_chunk(train_data_dir, [[12], [0, 23, 15, 63, 0], [73, 5, 0, 1, 1999, 0, 13]]) 36 | datamodule.setup() 37 | 38 | tr_dataloader = datamodule.train_dataloader() 39 | torch.manual_seed(123) 40 | 41 | actual = tree_map(torch.Tensor.tolist, list(tr_dataloader)) 42 | # there is 1 sample per index in the data (13) 43 | assert actual == [ 44 | [[1999, 0, 13]], 45 | [[0, 13, 12]], 46 | [[1, 1999, 0]], 47 | [[63, 0, 73]], 48 | [[5, 0, 1]], 49 | [[0, 73, 5]], 50 | [[0, 23, 15]], 51 | [[0, 1, 1999]], 52 | [[15, 63, 0]], 53 | [[73, 5, 0]], 54 | [[12, 0, 23]], 55 | [[23, 15, 63]], 56 | [[13, 12, 0]] 57 | ] 58 | -------------------------------------------------------------------------------- /tests/test_args.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
2 | import pytest 3 | 4 | from litgpt.args import TrainArgs 5 | 6 | 7 | def test_compute_warmup_iters(): 8 | # warmup disabled 9 | train = TrainArgs(lr_warmup_steps=0, lr_warmup_fraction=0) 10 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(10)) == 0 11 | 12 | # lr_warmup_steps and lr_warmup_fraction both are not allowed 13 | with pytest.raises(ValueError, match="Can't provide both `--train.lr_warmup_fraction`"): 14 | TrainArgs(lr_warmup_steps=1, lr_warmup_fraction=0.2) 15 | 16 | # lr_warmup_fraction invalid range 17 | with pytest.raises(ValueError, match=" must be between 0 and 1"): 18 | TrainArgs(lr_warmup_steps=0, lr_warmup_fraction=1.1) 19 | 20 | # lr_warmup_steps 21 | train = TrainArgs(global_batch_size=1, micro_batch_size=1, lr_warmup_steps=100, lr_warmup_fraction=0) 22 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(10)) == 100 23 | # lr_warmup_steps multiplied by accumulation factor 24 | train.global_batch_size = 4 25 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(10)) == 400 26 | assert train.warmup_iters(devices=2, max_iters=1000, train_dataloader=range(10)) == 200 27 | # lr_warmup_steps truncated by max iters 28 | assert train.warmup_iters(devices=1, max_iters=120, train_dataloader=range(10)) == 120 29 | 30 | # lr_warmup_fraction 31 | train = TrainArgs(global_batch_size=1, micro_batch_size=1, lr_warmup_steps=0, lr_warmup_fraction=0.3) 32 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(100)) == 30 33 | # lr_warmup_fraction truncated by max iters 34 | assert train.warmup_iters(devices=1, max_iters=20, train_dataloader=range(100)) == 20 35 | # lr_warmup_fraction rounds up 36 | assert train.warmup_iters(devices=1, max_iters=1000, train_dataloader=range(5)) == 2 37 | -------------------------------------------------------------------------------- /.github/workflows/cpu-tests.yml: -------------------------------------------------------------------------------- 1 | name: CPU tests 2 | 3 | on: 4 | push: 5 | branches: [main, wip] 6 | pull_request: 7 | branches: [main, wip] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} 11 | cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} 12 | 13 | defaults: 14 | run: 15 | shell: bash 16 | 17 | env: 18 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 19 | 20 | jobs: 21 | cpu-tests: 22 | runs-on: ${{ matrix.os }} 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | include: 27 | - {os: "macOS-12", python-version: "3.10"} 28 | - {os: "ubuntu-22.04", python-version: "3.11"} 29 | - {os: "ubuntu-22.04", python-version: "3.10"} 30 | - {os: "ubuntu-22.04", python-version: "3.9"} 31 | - {os: "windows-2022", python-version: "3.9"} 32 | timeout-minutes: 25 33 | 34 | steps: 35 | - uses: actions/checkout@v4 36 | 37 | - name: Set up Python ${{ matrix.python-version }} 38 | uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | cache: 'pip' 42 | cache-dependency-path: | 43 | pyproject.toml 44 | 45 | - name: Install minimal dependencies 46 | run: | 47 | # python -m pip install --upgrade pip 48 | pip install . 
49 | pip list 50 | # make sure all modules are still importable with only the minimal dependencies available 51 | modules=$( 52 | find litgpt -type f -name "*.py" | \ 53 | sed 's/\.py$//' | sed 's/\//./g' | \ 54 | sed 's/.__init__//g' | xargs -I {} echo "import {};" 55 | ) 56 | echo "$modules" 57 | python -c "$modules" 58 | 59 | - name: Install all dependencies 60 | run: | 61 | pip install '.[all,test]' 62 | pip list 63 | 64 | - name: Run tests 65 | run: | 66 | pytest -v --disable-pytest-warnings --strict-markers --color=yes --timeout 120 67 | -------------------------------------------------------------------------------- /.github/azure-gpu-test.yml: -------------------------------------------------------------------------------- 1 | name: GPU tests 2 | 3 | trigger: 4 | branches: 5 | include: 6 | - "main" 7 | - "wip" 8 | 9 | pr: 10 | branches: 11 | include: 12 | - "main" 13 | - "wip" 14 | 15 | jobs: 16 | - job: testing 17 | timeoutInMinutes: "30" 18 | cancelTimeoutInMinutes: "2" 19 | pool: "lit-rtx-3090" 20 | variables: 21 | DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) 22 | CI: "true" 23 | container: 24 | image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" 25 | options: "--gpus=all --shm-size=8gb" 26 | workspace: 27 | clean: all 28 | steps: 29 | 30 | - bash: | 31 | echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" 32 | displayName: 'set env. vars' 33 | 34 | - bash: | 35 | echo $(DEVICES) 36 | echo $CUDA_VISIBLE_DEVICES 37 | whereis nvidia 38 | nvidia-smi 39 | which python && which pip 40 | python --version 41 | pip --version 42 | pip list 43 | displayName: "Image info & NVIDIA" 44 | 45 | - script: | 46 | pip install --upgrade pip 47 | pip install '.[all,test]' 48 | pip install -U torch torchvision torchaudio 49 | displayName: 'Install dependencies' 50 | 51 | - bash: | 52 | set -e 53 | pip list 54 | python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" 55 | displayName: "Env details" 56 | 57 | - bash: pytest -v --disable-pytest-warnings --strict-markers --color=yes --ignore-glob="tests/test_thunder*" --ignore="tests/test_unsloth_executor.py" 58 | displayName: 'Ordinary tests' 59 | env: 60 | PL_RUN_CUDA_TESTS: "1" 61 | timeoutInMinutes: "5" 62 | 63 | - bash: bash run_standalone_tests.sh 64 | workingDirectory: tests 65 | env: 66 | PL_RUN_CUDA_TESTS: "1" 67 | displayName: "Standalone tests" 68 | timeoutInMinutes: "10" 69 | -------------------------------------------------------------------------------- /.github/azure-gpu-test-with-thunder.yml: -------------------------------------------------------------------------------- 1 | name: GPU tests with Thunder 2 | 3 | trigger: 4 | branches: 5 | include: 6 | - "main" 7 | - "wip" 8 | 9 | pr: 10 | branches: 11 | include: 12 | - "main" 13 | - "wip" 14 | 15 | jobs: 16 | - job: testing 17 | timeoutInMinutes: "30" 18 | cancelTimeoutInMinutes: "2" 19 | pool: "lit-rtx-3090" 20 | variables: 21 | DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) 22 | CI: "true" 23 | container: 24 | image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" 25 | options: "--gpus=all --shm-size=8gb" 26 | workspace: 27 | clean: all 28 | steps: 29 | 30 | - bash: | 31 | echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" 32 | displayName: 'set env. 
vars' 33 | 34 | - bash: | 35 | echo $(DEVICES) 36 | echo $CUDA_VISIBLE_DEVICES 37 | whereis nvidia 38 | nvidia-smi 39 | which python && which pip 40 | python --version 41 | pip --version 42 | pip list 43 | displayName: "Image info & NVIDIA" 44 | 45 | - script: | 46 | pip install --upgrade pip 47 | pip install '.[all,test]' 48 | displayName: 'Install dependencies' 49 | 50 | - script: | 51 | pip uninstall -y torchvision torchaudio 52 | pip install --pre 'nvfuser-cu121[torch]' --extra-index-url https://pypi.nvidia.com 53 | displayName: 'Install PyTorch nightly' 54 | 55 | - bash: | 56 | set -e 57 | pip list 58 | python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" 59 | displayName: "Env details" 60 | 61 | - bash: pytest -v --disable-pytest-warnings --strict-markers --color=yes 62 | displayName: 'Ordinary tests' 63 | env: 64 | PL_RUN_CUDA_TESTS: "1" 65 | timeoutInMinutes: "5" 66 | 67 | - bash: bash run_standalone_tests.sh 68 | workingDirectory: tests 69 | env: 70 | PL_RUN_CUDA_TESTS: "1" 71 | displayName: "Standalone tests" 72 | timeoutInMinutes: "10" -------------------------------------------------------------------------------- /tutorials/examples/ptl-trainer/litgpt_ptl_medium.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import litgpt 3 | from litgpt.lora import GPT, merge_lora_weights 4 | from litgpt.data import Alpaca2k 5 | import lightning as L 6 | 7 | 8 | class LitLLM(L.LightningModule): 9 | def __init__(self): 10 | super().__init__() 11 | self.model = GPT.from_name( 12 | name="Llama-3.1-8B", 13 | lora_r=32, 14 | lora_alpha=16, 15 | lora_dropout=0.05, 16 | lora_key=False, 17 | lora_value=True, 18 | ) 19 | litgpt.lora.mark_only_lora_as_trainable(self.model) 20 | 21 | def on_train_start(self): 22 | state_dict = torch.load("checkpoints/meta-llama/Meta-Llama-3.1-8B/lit_model.pth", mmap=True) 23 | self.model.load_state_dict(state_dict, strict=False) 24 | 25 | def training_step(self, batch): 26 | input_ids, targets = batch["input_ids"], batch["labels"] 27 | logits = self.model(input_ids) 28 | loss = litgpt.utils.chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:]) 29 | self.log("train_loss", loss, prog_bar=True) 30 | return loss 31 | 32 | def configure_optimizers(self): 33 | warmup_steps = 10 34 | optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.0002, weight_decay=0.0, betas=(0.9, 0.95)) 35 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) 36 | return [optimizer], [scheduler] 37 | 38 | 39 | if __name__ == "__main__": 40 | data = Alpaca2k() 41 | tokenizer = litgpt.Tokenizer("checkpoints/meta-llama/Meta-Llama-3.1-8B") 42 | data.connect(tokenizer, batch_size=1, max_seq_length=512) 43 | 44 | trainer = L.Trainer( 45 | devices=1, 46 | max_epochs=2, 47 | accumulate_grad_batches=8, 48 | precision="bf16-true", 49 | ) 50 | with trainer.init_module(empty_init=True): 51 | model = LitLLM() 52 | 53 | trainer.fit(model, data) 54 | 55 | # Save final checkpoint 56 | merge_lora_weights(model.model) 57 | trainer.save_checkpoint("checkpoints/finetuned.ckpt", weights_only=True) 58 | -------------------------------------------------------------------------------- /tutorials/deploy.md: -------------------------------------------------------------------------------- 1 | # Serve and Deploy LLMs 2 | 3 | This document shows how you can serve a LitGPT for deployment. 
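Once the server described in the steps below is running, it can be queried from any HTTP client, not only the Python `requests` example shown later in this document. As a quick illustration, here is a sketch of the same `/predict` request issued with `curl` (assuming the default host and port used below; adjust them if you start the server differently):

```bash
# Send a prompt to the running LitGPT inference server and print the JSON response
curl -X POST http://127.0.0.1:8000/predict \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Fix typos in the following sentence: Exampel input"}'
```

The server replies with a JSON object whose `output` field contains the generated text, mirroring the Python example in Step 2 below.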
4 | 5 |   6 | ## Serve an LLM 7 | 8 | This section illustrates how we can set up an inference server for a phi-2 LLM using `litgpt serve` that is minimal and highly scalable. 9 | 10 | 11 |   12 | ### Step 1: Start the inference server 13 | 14 | 15 | ```bash 16 | # 1) Download a pretrained model (alternatively, use your own finetuned model) 17 | litgpt download microsoft/phi-2 18 | 19 | # 2) Start the server 20 | litgpt serve microsoft/phi-2 21 | ``` 22 | 23 | > [!TIP] 24 | > Use `litgpt serve --help` to display additional options, including the port, devices, LLM temperature setting, and more. 25 | 26 | 27 |   28 | ### Step 2: Query the inference server 29 | 30 | You can now send requests to the inference server you started in step 2. For example, in a new Python session, we can send requests to the inference server as follows: 31 | 32 | 33 | ```python 34 | import requests, json 35 | 36 | response = requests.post( 37 | "http://127.0.0.1:8000/predict", 38 | json={"prompt": "Fix typos in the following sentence: Exampel input"} 39 | ) 40 | 41 | print(response.json()["output"]) 42 | ``` 43 | 44 | Executing the code above prints the following output: 45 | 46 | ``` 47 | Example input. 48 | ``` 49 | 50 |   51 | ## Optional streaming mode 52 | 53 | The 2-step procedure described above returns the complete response all at once. If you want to stream the response on a token-by-token basis, start the server with the streaming option enabled: 54 | 55 | ```bash 56 | litgpt serve microsoft/phi-2 --stream true 57 | ``` 58 | 59 | Then, use the following updated code to query the inference server: 60 | 61 | ```python 62 | import requests, json 63 | 64 | response = requests.post( 65 | "http://127.0.0.1:8000/predict", 66 | json={"prompt": "Fix typos in the following sentence: Exampel input"}, 67 | stream=True 68 | ) 69 | 70 | # stream the response 71 | for line in response.iter_lines(decode_unicode=True): 72 | if line: 73 | print(json.loads(line)["output"], end="") 74 | ``` 75 | 76 | ``` 77 | Sure, here is the corrected sentence: 78 | 79 | Example input 80 | ``` 81 | -------------------------------------------------------------------------------- /litgpt/data/alpaca_2k.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | 7 | from litgpt.data import SFTDataset 8 | from litgpt.data.alpaca import Alpaca 9 | 10 | 11 | @dataclass 12 | class Alpaca2k(Alpaca): 13 | """Alpaca2k data module for supervised finetuning.""" 14 | 15 | val_split_fraction: float = 0.05 # to get exactly 100 validation samples, 16 | """The fraction of the dataset to use for the validation dataset. 
The rest is used for training.""" 17 | download_dir: Path = Path("./data/alpaca2k") 18 | """The directory in which the downloaded datasetgets saved.""" 19 | repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test") 20 | """The URL from where to download the dataset.""" 21 | file_name: str = field(repr=False, default="alpaca2k_data_cleaned_archive.json") 22 | """The name of the dataset file to download.""" 23 | 24 | def prepare_data(self) -> None: 25 | from datasets import load_dataset 26 | 27 | load_dataset(self.repo_id, cache_dir=self.download_dir) 28 | 29 | def setup(self, stage: str = "") -> None: 30 | from datasets import load_dataset 31 | 32 | dataset = load_dataset(self.repo_id, cache_dir=self.download_dir) 33 | 34 | train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed) 35 | train_data = train_validation_split["train"] 36 | test_data = train_validation_split["test"] 37 | 38 | self.train_dataset = SFTDataset( 39 | data=train_data, 40 | tokenizer=self.tokenizer, 41 | prompt_style=self.prompt_style, 42 | max_seq_length=self.max_seq_length, 43 | mask_prompt=self.mask_prompt, 44 | ignore_index=self.ignore_index, 45 | ) 46 | self.test_dataset = SFTDataset( 47 | data=test_data, 48 | tokenizer=self.tokenizer, 49 | prompt_style=self.prompt_style, 50 | max_seq_length=self.max_seq_length, 51 | mask_prompt=self.mask_prompt, 52 | ignore_index=self.ignore_index, 53 | ) 54 | -------------------------------------------------------------------------------- /tests/test_thunder_pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from contextlib import redirect_stdout 4 | from io import StringIO 5 | from pathlib import Path 6 | from unittest.mock import Mock 7 | 8 | import torch 9 | from tests.conftest import RunIf 10 | from torch.utils.data import DataLoader 11 | 12 | from litgpt import Config 13 | from litgpt.args import EvalArgs, TrainArgs 14 | 15 | # support running without installing as a package 16 | wd = Path(__file__).parent.parent.resolve() 17 | sys.path.append(str(wd)) 18 | 19 | import extensions.thunder.pretrain as pretrain 20 | 21 | 22 | @RunIf(min_cuda_gpus=1, thunder=True) 23 | def test_pretrain(tmp_path, monkeypatch): 24 | model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 25 | 26 | dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) 27 | dataloader = DataLoader(dataset) 28 | monkeypatch.setattr(pretrain, "get_dataloaders", Mock(return_value=(dataloader, dataloader))) 29 | monkeypatch.setattr(pretrain, "save_hyperparameters", Mock()) 30 | 31 | out_dir = tmp_path / "out" 32 | stdout = StringIO() 33 | with redirect_stdout(stdout): 34 | pretrain.setup( 35 | devices=1, 36 | model_config=model_config, 37 | out_dir=out_dir, 38 | train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), 39 | eval=EvalArgs(interval=1, max_iters=1), 40 | optimizer="AdamW", 41 | ) 42 | 43 | out_dir_contents = set(os.listdir(out_dir)) 44 | checkpoint_dirs = {"step-00000001", "step-00000002", "step-00000003", "step-00000004"} 45 | assert checkpoint_dirs.issubset(out_dir_contents) 46 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 47 | for checkpoint_dir in checkpoint_dirs: 48 | # the `tokenizer_dir` is None by default, so only 'lit_model.pth' shows here 49 | assert set(os.listdir(out_dir / checkpoint_dir)) == {"lit_model.pth", "model_config.yaml"} 50 | 51 | assert (out_dir / 
"logs" / "tensorboard" / "version_0").is_dir() 52 | 53 | logs = stdout.getvalue() 54 | assert logs.count("(step)") == 4 55 | assert logs.count("val loss") == 4 56 | assert "Total parameters: 1,888" in logs 57 | -------------------------------------------------------------------------------- /litgpt/data/prepare_slimpajama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import os 5 | import time 6 | from pathlib import Path 7 | 8 | from litgpt.tokenizer import Tokenizer 9 | from litgpt.data.prepare_starcoder import DataChunkRecipe 10 | from litgpt.utils import CLI, extend_checkpoint_dir 11 | 12 | 13 | class SlimPajamaDataRecipe(DataChunkRecipe): 14 | is_generator = True 15 | 16 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 17 | super().__init__(chunk_size) 18 | self.tokenizer = tokenizer 19 | 20 | def prepare_structure(self, input_dir): 21 | files = Path(input_dir).rglob("*.zst") 22 | return [str(file) for file in files] 23 | 24 | def prepare_item(self, filepath): 25 | import zstandard as zstd 26 | 27 | with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: 28 | for row in f: 29 | text = json.loads(row)["text"] 30 | if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": 31 | continue # exclude the GitHub data since it overlaps with starcoder 32 | text_ids = self.tokenizer.encode(text, bos=False, eos=True) 33 | yield text_ids 34 | 35 | 36 | def prepare( 37 | input_dir: Path = Path("data/SlimPajama-627B/train"), 38 | output_dir: Path = Path("data/slimpajama/train"), 39 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 40 | chunk_size: int = (2049 * 16384), 41 | fast_dev_run: bool = False, 42 | ) -> None: 43 | from litdata.processing.data_processor import DataProcessor 44 | 45 | tokenizer_path = extend_checkpoint_dir(tokenizer_path) 46 | data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 47 | data_processor = DataProcessor( 48 | input_dir=str(input_dir), 49 | output_dir=str(output_dir), 50 | fast_dev_run=fast_dev_run, 51 | num_workers=os.cpu_count(), 52 | num_downloaders=1, 53 | ) 54 | 55 | start_time = time.time() 56 | data_processor.run(data_recipe) 57 | elapsed_time = time.time() - start_time 58 | print(f"Time taken: {elapsed_time:.2f} seconds") 59 | 60 | 61 | if __name__ == "__main__": 62 | CLI(prepare) 63 | -------------------------------------------------------------------------------- /tests/data/test_lit_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | import sys 3 | from unittest import mock 4 | from unittest.mock import ANY 5 | 6 | import pytest 7 | 8 | from litgpt.data import LitData 9 | 10 | 11 | @pytest.mark.skipif(sys.platform == "win32", reason="Needs to implement platform agnostic path/url joining") 12 | @mock.patch("litgpt.data.lit_data.LitData._dataloader") 13 | def test_input_dir_and_splits(dl_mock, tmp_path): 14 | 15 | with pytest.raises(ValueError, match="If provided `split_names` must be a tuple of two strings"): 16 | LitData(data_path=tmp_path, split_names=("train",)) 17 | 18 | # local dir, no splits 19 | data = LitData(data_path=tmp_path) 20 | data.train_dataloader() 21 | dl_mock.assert_called_with(input_dir=str(tmp_path), train=True) 22 | data.val_dataloader() 23 | dl_mock.assert_called_with(input_dir=str(tmp_path), train=False) 24 | 25 | # local dir, splits 26 | data = LitData(data_path=tmp_path, split_names=("train", "val")) 27 | data.train_dataloader() 28 | dl_mock.assert_called_with(input_dir=str(tmp_path / "train"), train=True) 29 | data.val_dataloader() 30 | dl_mock.assert_called_with(input_dir=str(tmp_path / "val"), train=False) 31 | 32 | # remote dir, splits 33 | data = LitData(data_path="s3://mydataset/data", split_names=("train", "val")) 34 | data.train_dataloader() 35 | dl_mock.assert_called_with(input_dir=str("s3://mydataset/data/train"), train=True) 36 | data.val_dataloader() 37 | dl_mock.assert_called_with(input_dir=str("s3://mydataset/data/val"), train=False) 38 | 39 | 40 | @pytest.mark.skipif(sys.platform == "win32", reason="Needs to implement platform agnostic path/url joining") 41 | @mock.patch("litdata.streaming.StreamingDataset") 42 | @mock.patch("litdata.streaming.StreamingDataLoader") 43 | def test_dataset_args(streaming_dataloader_mock, streaming_dataset_mock, tmp_path): 44 | data = LitData(data_path=tmp_path, seed=1000) 45 | data.train_dataloader() 46 | streaming_dataset_mock.assert_called_with( 47 | input_dir=str(tmp_path), 48 | item_loader=ANY, 49 | shuffle=True, 50 | seed=1000, 51 | ) 52 | streaming_dataloader_mock.assert_called_with( 53 | streaming_dataset_mock(), 54 | batch_size=1, 55 | pin_memory=True, 56 | num_workers=8, 57 | drop_last=True, 58 | ) 59 | -------------------------------------------------------------------------------- /tutorials/developer-docs/python-api.md: -------------------------------------------------------------------------------- 1 | # LitGPT High-level Python API 2 | 3 | This is a work-in-progress draft for a high-level LitGPT Python API. 4 | 5 |   6 | ## Model loading & saving 7 | 8 | The `LLM.load` command loads an `llm` object, which contains both the model object (a PyTorch module) and a preprocessor. 9 | 10 | ```python 11 | from litgpt import LLM 12 | 13 | llm = LLM.load( 14 | model="url | local_path", 15 | # high-level user only needs to care about those: 16 | memory_reduction="none | medium | strong" 17 | # advanced options for technical users: 18 | source="hf | local | other" 19 | quantize="bnb.nf4", 20 | precision="bf16-true", 21 | device=""auto | cuda | cpu", 22 | ) 23 | ``` 24 | 25 | Here, 26 | 27 | - `llm.model` contains the PyTorch Module 28 | - and `llm.preprocessor.tokenizer` contains the tokenizer 29 | 30 | The `llm.save` command saves the model weights, tokenizer, and configuration information. 
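For instance, a minimal load-and-save round trip with this draft API might look as follows (a sketch only: the argument names mirror the proposals above, and the model name and output path are placeholders):

```python
from litgpt import LLM

# Load a model through the draft high-level API (argument names as proposed above)
llm = LLM.load(model="microsoft/phi-2")

# ... run inference or finetuning with `llm` here ...

# Write the model weights, tokenizer, and config to a new checkpoint directory
llm.save("out/phi-2-copy", format="lightning")
```

The signature of `llm.save` itself is sketched below.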
31 | 32 | 33 | ```python 34 | llm.save(checkpoint_dir, format="lightning | ollama | hf") 35 | ``` 36 | 37 | 38 |   39 | ## Inference / Chat 40 | 41 | ``` 42 | response = llm.generate( 43 | prompt="What do Llamas eat?", 44 | temperature=0.1, 45 | top_p=0.8, 46 | ... 47 | ) 48 | ``` 49 | 50 | 51 |   52 | ## Dataset 53 | 54 | The `llm.prepare_dataset` command prepares a dataset for training. 55 | 56 | ``` 57 | llm.download_dataset( 58 | URL, 59 | ... 60 | ) 61 | ``` 62 | 63 | ``` 64 | dataset = llm.prepare_dataset( 65 | path, 66 | task="pretrain | instruction_finetune", 67 | test_portion=0.1, 68 | ... 69 | ) 70 | ``` 71 | 72 |   73 | ## Training 74 | 75 | 76 | ```python 77 | llm.instruction_finetune( 78 | config=None, 79 | dataset=dataset, 80 | max_iter=10, 81 | method="full | lora | adapter | adapter_v2" 82 | ) 83 | ``` 84 | 85 | ```python 86 | llm.pretrain(config=None, dataset=dataset, max_iter=10, ...) 87 | ``` 88 | 89 |   90 | ## Serving 91 | 92 | 93 | ```python 94 | llm.serve(port=8000) 95 | ``` 96 | 97 | Then in another Python session: 98 | 99 | ```python 100 | import requests, json 101 | 102 | response = requests.post( 103 | "http://127.0.0.1:8000/predict", 104 | json={"prompt": "Fix typos in the following sentence: Exampel input"} 105 | ) 106 | 107 | print(response.json()["output"]) 108 | ``` 109 | -------------------------------------------------------------------------------- /litgpt/scripts/convert_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from pathlib import Path 4 | from pprint import pprint 5 | import torch 6 | 7 | from litgpt.utils import ( 8 | copy_config_files, 9 | extend_checkpoint_dir, 10 | incremental_save 11 | ) 12 | 13 | 14 | @torch.inference_mode() 15 | def convert_pretrained_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: 16 | """Convert a checkpoint after pretraining. 17 | 18 | The pretrained checkpoint contains optimizer states and several other metadata that are not needed after training 19 | is finished. This script will export the state-dict of the model and place it in the chosen output folder, 20 | which then can be loaded by other scripts for inference, evaluation, etc. 21 | 22 | Args: 23 | checkpoint_dir: Path to a checkpoint directory produced by ``litgpt.pretrain``. 24 | output_dir: The output folder where the converted state-dict file and config files will be saved to. 25 | """ 26 | checkpoint_dir = extend_checkpoint_dir(checkpoint_dir) 27 | pprint(locals()) 28 | 29 | if output_dir.is_dir() and output_dir.glob("*"): 30 | raise FileExistsError( 31 | f"The output folder exists and is not empty: {str(output_dir)}." 32 | " Please delete it first or choose a different name." 
33 | ) 34 | 35 | output_dir.mkdir(parents=True) 36 | checkpoint_file = checkpoint_dir / "lit_model.pth" 37 | output_checkpoint_file = output_dir / "lit_model.pth" 38 | 39 | # TODO: Consolidate sharded checkpoint if applicable 40 | # Extract the model state dict and save to output folder 41 | with incremental_save(output_checkpoint_file) as saver: 42 | print("Processing", checkpoint_file) 43 | full_checkpoint = torch.load(str(checkpoint_file), mmap=True) 44 | loaded_state_dict = full_checkpoint["model"] 45 | converted_state_dict = {} 46 | for param_name, param in loaded_state_dict.items(): 47 | saver.store_early(param) 48 | # remove prefix for compiled model (if any) 49 | param_name = param_name.replace("_orig_mod.", "") 50 | converted_state_dict[param_name] = param 51 | print(f"Saving converted checkpoint to {str(output_checkpoint_file)}.") 52 | saver.save(converted_state_dict) 53 | 54 | copy_config_files(checkpoint_dir, output_dir) 55 | -------------------------------------------------------------------------------- /tests/data/test_openwebtext.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import sys 3 | from unittest import mock 4 | from unittest.mock import ANY, call 5 | 6 | import pytest 7 | from litdata.streaming import StreamingDataLoader, StreamingDataset 8 | from torch.utils.data import DataLoader 9 | 10 | from litgpt.data import OpenWebText 11 | 12 | 13 | @pytest.mark.skipif(sys.platform == "win32", reason="Not in the mood to add Windows support right now.") 14 | @mock.patch("litdata.optimize") 15 | @mock.patch("litdata.streaming.dataset.subsample_streaming_dataset", return_value=([], [])) 16 | @mock.patch("datasets.load_dataset") 17 | def test_openwebtext(_, __, optimize_mock, tmp_path, mock_tokenizer): 18 | data = OpenWebText(data_path=(tmp_path / "openwebtext")) 19 | assert data.seq_length == 2048 20 | assert data.batch_size == 1 21 | 22 | data.connect(tokenizer=mock_tokenizer, batch_size=2, max_seq_length=1024) 23 | assert data.seq_length == 1025 24 | assert data.batch_size == 2 25 | 26 | # Data does not exist, preprocess it 27 | data.prepare_data() 28 | optimize_mock.assert_has_calls( 29 | [ 30 | call( 31 | fn=ANY, 32 | num_workers=ANY, 33 | inputs=[], 34 | output_dir=str(tmp_path / "openwebtext" / "train"), 35 | chunk_bytes="200MB", 36 | ), 37 | call( 38 | fn=ANY, 39 | num_workers=ANY, 40 | inputs=[], 41 | output_dir=str(tmp_path / "openwebtext" / "val"), 42 | chunk_bytes="200MB", 43 | ), 44 | ] 45 | ) 46 | optimize_mock.reset_mock() 47 | 48 | # Data exists, already preprocessed 49 | (tmp_path / "openwebtext" / "train").mkdir(parents=True) 50 | (tmp_path / "openwebtext" / "val").mkdir(parents=True) 51 | data.prepare_data() 52 | optimize_mock.assert_not_called() 53 | 54 | data.setup() 55 | 56 | train_dataloader = data.train_dataloader() 57 | assert isinstance(train_dataloader, StreamingDataLoader) 58 | assert isinstance(train_dataloader.dataset, StreamingDataset) 59 | 60 | val_dataloader = data.val_dataloader() 61 | assert isinstance(val_dataloader, DataLoader) 62 | assert isinstance(val_dataloader.dataset, StreamingDataset) 63 | 64 | # has attributes from super class `LightningDataModule` 65 | assert data.prepare_data_per_node 66 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "litgpt" 3 | 
version = "0.4.11" 4 | description = "Hackable implementation of state-of-the-art open-source LLMs" 5 | authors = [ 6 | { name = "Lightning AI", email = "contact@lightning.ai" }, 7 | ] 8 | readme = "README.md" 9 | license = { file = "LICENSE" } 10 | 11 | dependencies = [ 12 | "torch>=2.2.0", 13 | "lightning==2.4.0.dev20240728", 14 | "jsonargparse[signatures]>=4.27.6", 15 | "huggingface_hub>=0.23.5", # download models 16 | "safetensors>=0.4.3", # download models 17 | "tokenizers>=0.15.2", # tokenization in most models 18 | "tqdm>=4.66.0", # convert_hf_checkpoint 19 | ] 20 | 21 | [project.urls] 22 | homepage = "https://github.com/lightning-AI/litgpt" 23 | documentation = "https://github.com/lightning-AI/litgpt/tutorials" 24 | 25 | [project.scripts] 26 | litgpt = "litgpt.__main__:main" 27 | 28 | [project.optional-dependencies] 29 | test = [ 30 | "pytest>=8.1.1", 31 | "pytest-rerunfailures>=14.0", 32 | "pytest-timeout>=2.3.1", 33 | "pytest-dependency>=0.6.0", 34 | "transformers>=4.38.0", # numerical comparisons 35 | "einops>=0.7.0", 36 | "protobuf>=4.23.4", 37 | "lightning-thunder @ git+https://github.com/Lightning-AI/lightning-thunder/ ; python_version >= '3.10' and sys_platform == 'linux'", 38 | ] 39 | all = [ 40 | "bitsandbytes==0.42.0", # quantization 41 | "sentencepiece>=0.2.0", # llama-based models 42 | "requests>=2.31.0", # litgpt.data 43 | "litdata==0.2.17", # litgpt.data 44 | "litserve>=0.1.5", # litgpt.deploy 45 | "zstandard>=0.22.0", # litgpt.data.prepare_slimpajama.py 46 | "pandas>=1.9.0", # litgpt.data.prepare_starcoder.py 47 | "pyarrow>=15.0.2", # litgpt.data.prepare_starcoder.py 48 | "tensorboard>=2.14.0", # litgpt.pretrain 49 | "torchmetrics>=1.3.1", # litgpt.pretrain 50 | "datasets>=2.18.0", # litgpt.evaluate 51 | "transformers>=4.38.0", # litgpt.evaluate 52 | "lm-eval>=0.4.2", # litgpt.evaluate 53 | "huggingface_hub[hf_transfer]>=0.21.0" # download 54 | ] 55 | 56 | [build-system] 57 | requires = [ 58 | "setuptools>=68.2.2", 59 | "wheel>=0.41.2", 60 | ] 61 | build-backend = "setuptools.build_meta" 62 | 63 | [tool.setuptools.packages.find] 64 | include = [ 65 | "litgpt", 66 | "litgpt.*", 67 | ] 68 | exclude = [] 69 | 70 | [tool.setuptools.package-data] 71 | litgpt = [ 72 | "LICENSE", 73 | "README.md", 74 | ] 75 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from contextlib import redirect_stdout 3 | from io import StringIO 4 | from unittest import mock 5 | 6 | import pytest 7 | from packaging.version import Version 8 | 9 | from litgpt.__main__ import main 10 | 11 | 12 | def test_cli(): 13 | out = StringIO() 14 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "-h"]): 15 | main() 16 | out = out.getvalue() 17 | assert "usage: litgpt" in out 18 | assert ("{download,chat,finetune,finetune_lora,finetune_full,finetune_adapter,finetune_adapter_v2," 19 | "pretrain,generate,generate_full,generate_adapter,generate_adapter_v2,generate_sequentially," 20 | "generate_tp,convert_to_litgpt,convert_from_litgpt,convert_pretrained_checkpoint," 21 | "merge_lora,evaluate,serve}" in out) 22 | assert ( 23 | """Available subcommands: 24 | download Download weights or tokenizer data from the Hugging 25 | Face Hub. 
26 | chat Chat with a model.""" 27 | in out 28 | ) 29 | assert """evaluate Evaluate a model with the LM Evaluation Harness.""" in out 30 | assert """serve Serve a LitGPT model using LitServe.""" in out 31 | out = StringIO() 32 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "finetune_lora", "-h"]): 33 | main() 34 | out = out.getvalue() 35 | assert ( 36 | """--lora_alpha LORA_ALPHA 37 | The LoRA alpha. (type: int, default: 16)""" 38 | in out 39 | ) 40 | 41 | if Version(f"{sys.version_info.major}.{sys.version_info.minor}") < Version("3.9"): 42 | # python 3.8 prints `Union[int, null]` instead of `Optional[int]` 43 | return 44 | 45 | out = StringIO() 46 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "pretrain", "-h"]): 47 | main() 48 | out = out.getvalue() 49 | print(out) 50 | assert ( 51 | """--train.max_tokens MAX_TOKENS 52 | Total number of tokens to train on (type: 53 | Optional[int], default: 3000000000000)""" 54 | in out 55 | ) 56 | 57 | 58 | def test_rewrite_finetune_command(): 59 | out1 = StringIO() 60 | with pytest.raises(SystemExit), redirect_stdout(out1), mock.patch("sys.argv", ["litgpt", "fineune", "-h"]): 61 | main() 62 | out2 = StringIO() 63 | with pytest.raises(SystemExit), redirect_stdout(out2), mock.patch("sys.argv", ["litgpt", "fineune_lora", "-h"]): 64 | main() 65 | assert out1.getvalue() == out2.getvalue() 66 | -------------------------------------------------------------------------------- /tests/test_generate_adapter.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import re 4 | import subprocess 5 | import sys 6 | from contextlib import redirect_stderr, redirect_stdout 7 | from io import StringIO 8 | from pathlib import Path 9 | from unittest.mock import ANY, Mock, call 10 | 11 | import pytest 12 | import torch 13 | import yaml 14 | 15 | 16 | @pytest.mark.parametrize("version", ("v1", "v2")) 17 | def test_main(fake_checkpoint_dir, monkeypatch, version, tensor_like): 18 | if version == "v1": 19 | import litgpt.generate.adapter as generate 20 | else: 21 | import litgpt.generate.adapter_v2 as generate 22 | 23 | config_path = fake_checkpoint_dir / "model_config.yaml" 24 | config = {"block_size": 128, "vocab_size": 50, "n_layer": 2, "n_head": 4, "n_embd": 8, "rotary_percentage": 1} 25 | config_path.write_text(yaml.dump(config)) 26 | 27 | monkeypatch.setattr(generate, "lazy_load", Mock()) 28 | monkeypatch.setattr(generate.GPT, "load_state_dict", Mock()) 29 | tokenizer_mock = Mock() 30 | tokenizer_mock.return_value.encode.return_value = torch.tensor([[1, 2, 3]]) 31 | tokenizer_mock.return_value.decode.return_value = "### Response:foo bar baz" 32 | monkeypatch.setattr(generate, "Tokenizer", tokenizer_mock) 33 | generate_mock = Mock() 34 | generate_mock.return_value = torch.tensor([[3, 2, 1]]) 35 | monkeypatch.setattr(generate, "generate", generate_mock) 36 | 37 | num_samples = 1 38 | out, err = StringIO(), StringIO() 39 | with redirect_stdout(out), redirect_stderr(err): 40 | generate.main(temperature=2.0, top_k=2, top_p=0.9, checkpoint_dir=fake_checkpoint_dir) 41 | 42 | assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples 43 | assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) 44 | assert generate_mock.mock_calls == [call(ANY, tensor_like, 101, temperature=2.0, top_k=2, top_p=0.9, eos_id=ANY)] * 
num_samples 45 | 46 | expected_output = "foo bar baz\n" * num_samples 47 | # Allow for the config to be printed before the expected repeated strings. 48 | pattern = rf".*^{re.escape(expected_output.strip())}$.*" 49 | assert re.match(pattern, out.getvalue().strip(), re.DOTALL | re.MULTILINE) 50 | 51 | assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4, 'head_size': 2, 'n_embd': 8" in err.getvalue() 52 | 53 | 54 | @pytest.mark.parametrize("version", ("", "_v2")) 55 | def test_cli(version): 56 | args = ["litgpt", f"generate_adapter{version}", "-h"] 57 | output = subprocess.check_output(args) 58 | output = str(output.decode()) 59 | assert "For models finetuned with" in output 60 | -------------------------------------------------------------------------------- /litgpt/data/prepare_starcoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import os 4 | import time 5 | import traceback 6 | from pathlib import Path 7 | 8 | from lightning_utilities.core.imports import RequirementCache 9 | 10 | from litgpt.tokenizer import Tokenizer 11 | from litgpt.utils import CLI, extend_checkpoint_dir 12 | 13 | _LITDATA_AVAILABLE = RequirementCache("litdata") 14 | if _LITDATA_AVAILABLE: 15 | from litdata.processing.data_processor import DataChunkRecipe 16 | else: 17 | DataChunkRecipe = object 18 | 19 | 20 | class StarcoderDataRecipe(DataChunkRecipe): 21 | is_generator = True 22 | 23 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 24 | super().__init__(chunk_size) 25 | self.tokenizer = tokenizer 26 | 27 | def prepare_structure(self, input_dir): 28 | files = Path(input_dir).rglob("*.parquet") 29 | return [str(file) for file in files] 30 | 31 | def prepare_item(self, item_metadata): 32 | import pyarrow.parquet as pq 33 | 34 | filepath = item_metadata 35 | start = time.time() 36 | 37 | try: 38 | parquet_file = pq.ParquetFile(filepath) 39 | # reduce RAM usage 40 | for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): 41 | for text in batch.to_pandas()["content"]: 42 | yield self.tokenizer.encode(text, bos=False, eos=True) 43 | 44 | except Exception: 45 | print(traceback.format_exc()) 46 | print(f"Error reading {filepath}") 47 | return 48 | 49 | parquet_file.close() 50 | end = time.time() 51 | print(f"Took {end - start:.2f} seconds total", filepath) 52 | 53 | 54 | def prepare( 55 | input_dir: Path = Path("data/starcoderdata"), 56 | output_dir: Path = Path("data/starcoder"), 57 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 58 | chunk_size: int = (2049 * 8192), 59 | fast_dev_run: bool = False, 60 | ) -> None: 61 | from litdata.processing.data_processor import DataProcessor 62 | 63 | tokenizer_path = extend_checkpoint_dir(tokenizer_path) 64 | tokenizer = Tokenizer(tokenizer_path) 65 | data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 66 | data_processor = DataProcessor( 67 | input_dir=str(input_dir), 68 | output_dir=str(output_dir), 69 | fast_dev_run=fast_dev_run, 70 | num_workers=os.cpu_count(), 71 | num_downloaders=1, 72 | ) 73 | 74 | start_time = time.time() 75 | data_processor.run(data_recipe) 76 | elapsed_time = time.time() - start_time 77 | print(f"Time taken: {elapsed_time:.2f} seconds") 78 | 79 | 80 | if __name__ == "__main__": 81 | CLI(prepare) 82 | -------------------------------------------------------------------------------- /tests/test_config_hub.py: 
-------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | from pathlib import Path 4 | from unittest import mock 5 | from unittest.mock import Mock 6 | 7 | import pytest 8 | from lightning.fabric.plugins import Precision 9 | 10 | from litgpt import Config 11 | from litgpt.utils import CLI 12 | 13 | fixed_pairs = [ 14 | ("litgpt/pretrain.py", "pretrain/debug.yaml"), 15 | ("litgpt/pretrain.py", "pretrain/tinyllama.yaml"), 16 | ("litgpt/pretrain.py", "pretrain/tinystories.yaml"), 17 | ( 18 | "litgpt/pretrain.py", 19 | "https://raw.githubusercontent.com/Lightning-AI/litgpt/4d55ab6d0aa404f0da0d03a80a8801ed60e07e83/config_hub/pretrain/tinystories.yaml", # TODO: Update with path from main after merge 20 | ), 21 | ] 22 | 23 | config_hub_path = Path(__file__).parent.parent / "config_hub" / "finetune" 24 | model_pairs = [] 25 | 26 | for model_dir in config_hub_path.iterdir(): 27 | if model_dir.is_dir(): 28 | model_name = model_dir.name 29 | for yaml_file in model_dir.glob("*.yaml"): 30 | config_name = yaml_file.stem 31 | python_file = "litgpt/finetune/full.py" if config_name == "full" else "litgpt/finetune/lora.py" 32 | relative_yaml_path = yaml_file.relative_to(config_hub_path.parent) 33 | model_pairs.append((python_file, str(relative_yaml_path))) 34 | 35 | all_pairs = fixed_pairs + model_pairs 36 | 37 | 38 | @pytest.mark.parametrize(("script_file", "config_file"), all_pairs) 39 | def test_config_help(script_file, config_file, monkeypatch): 40 | """Test that configs validate against the signature in the scripts.""" 41 | script_file = Path(__file__).parent.parent / script_file 42 | assert script_file.is_file() 43 | if "http" not in str(config_file): 44 | config_file = Path(__file__).parent.parent / "config_hub" / config_file 45 | assert config_file.is_file() 46 | 47 | spec = importlib.util.spec_from_file_location(str(script_file.parent.name), script_file) 48 | module = importlib.util.module_from_spec(spec) 49 | spec.loader.exec_module(module) 50 | 51 | monkeypatch.setattr(module, "main", Mock()) 52 | monkeypatch.setattr(module, "Tokenizer", Mock()) 53 | monkeypatch.setattr(module, "BitsandbytesPrecision", Mock(return_value=Precision()), raising=False) 54 | monkeypatch.setattr(module, "Config", Mock(return_value=Config.from_name("pythia-14m"))) 55 | monkeypatch.setattr(module, "check_valid_checkpoint_dir", Mock(), raising=False) 56 | 57 | try: 58 | with mock.patch("sys.argv", [script_file.name, "--config", str(config_file), "--devices", "1"]): 59 | CLI(module.setup) 60 | module.main.assert_called_once() 61 | except FileNotFoundError: 62 | pass 63 | # FileNotFound occurs here because we have not downloaded the model weights referenced in the config files 64 | # which is ok because here we just want to validate the config file itself. 65 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import pytest 4 | import subprocess 5 | from contextlib import redirect_stdout 6 | from dataclasses import asdict 7 | from io import StringIO 8 | from unittest import mock 9 | 10 | import torch 11 | import yaml 12 | 13 | import litgpt.eval.evaluate as module 14 | from litgpt import GPT, Config 15 | from litgpt.scripts.download import download_from_hub 16 | 17 | 18 | def test_evaluate_script(tmp_path): 19 | ours_config = Config.from_name("pythia-14m") 20 | download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) 21 | checkpoint_dir = tmp_path / "EleutherAI" / "pythia-14m" 22 | ours_model = GPT(ours_config) 23 | torch.save(ours_model.state_dict(), checkpoint_dir / "lit_model.pth") 24 | with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: 25 | yaml.dump(asdict(ours_config), fp) 26 | 27 | stdout = StringIO() 28 | with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): 29 | with pytest.raises(ValueError) as excinfo: 30 | module.convert_and_evaluate( 31 | checkpoint_dir, 32 | out_dir=tmp_path / "out_dir", 33 | device=None, 34 | dtype=torch.float32, 35 | limit=5, 36 | tasks="logiqa", 37 | batch_size=0 # Test for non-positive integer 38 | ) 39 | assert "batch_size must be a positive integer, 'auto', or in the format 'auto:N'." in str(excinfo.value) 40 | 41 | with pytest.raises(ValueError) as excinfo: 42 | module.convert_and_evaluate( 43 | checkpoint_dir, 44 | out_dir=tmp_path / "out_dir", 45 | device=None, 46 | dtype=torch.float32, 47 | limit=5, 48 | tasks="logiqa", 49 | batch_size="invalid" # Test for invalid string 50 | ) 51 | assert "batch_size must be a positive integer, 'auto', or in the format 'auto:N'." in str(excinfo.value) 52 | 53 | stdout = StringIO() 54 | with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): 55 | module.convert_and_evaluate( 56 | checkpoint_dir, 57 | out_dir=tmp_path / "out_dir", 58 | device=None, 59 | dtype=torch.float32, 60 | limit=5, 61 | tasks="logiqa", 62 | batch_size=1 # Valid case 63 | ) 64 | stdout = stdout.getvalue() 65 | assert (tmp_path / "out_dir" / "results.json").is_file() 66 | assert "logiqa" in stdout 67 | assert "Metric" in stdout 68 | assert "Loading checkpoint shards" not in stdout 69 | 70 | 71 | def test_cli(): 72 | args = ["litgpt", "evaluate", "-h"] 73 | output = subprocess.check_output(args) 74 | output = str(output.decode()) 75 | assert "Evaluate a model with the LM Evaluation Harness" in output 76 | -------------------------------------------------------------------------------- /tests/data/test_deita.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | from unittest import mock 3 | 4 | from litgpt.data import Deita, SFTDataset 5 | from litgpt.data.deita import format_dataset 6 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 7 | 8 | 9 | def test_format_dataset(): 10 | data = [ 11 | { 12 | "prompt": "prompt1", 13 | "prompt_id": "1", 14 | "messages": [ 15 | {"content": "question1", "role": "user"}, 16 | {"content": "response1", "role": "assistant"}, 17 | {"content": "question2", "role": "user"}, 18 | {"content": "response2", "role": "assistant"}, 19 | ], 20 | }, 21 | { 22 | "prompt": "prompt2", 23 | "prompt_id": "2", 24 | "messages": [ 25 | {"content": "question3", "role": "user"}, 26 | {"content": "response3", "role": "assistant"}, 27 | {"content": "question4", "role": "user"}, 28 | {"content": "response4", "role": "assistant"}, 29 | ], 30 | }, 31 | ] 32 | 33 | assert format_dataset(data, include_multi_turn_conversations=False) == [ 34 | {"instruction": "question1", "output": "response1", "input": ""}, 35 | {"instruction": "question3", "output": "response3", "input": ""}, 36 | ] 37 | assert format_dataset(data, include_multi_turn_conversations=True) == [ 38 | {"instruction": "question1", "output": "response1", "input": ""}, 39 | {"instruction": "question2", "output": "response2", "input": ""}, 40 | {"instruction": "question3", "output": "response3", "input": ""}, 41 | {"instruction": "question4", "output": "response4", "input": ""}, 42 | ] 43 | 44 | 45 | @mock.patch("litgpt.data.deita.format_dataset") 46 | @mock.patch("datasets.load_dataset") 47 | def test_deita(_, format_dataset_mock, mock_tokenizer, tmp_path): 48 | format_dataset_mock.return_value = [ 49 | {"instruction": "inst1", "output": "out1"}, 50 | {"instruction": "inst2", "output": "out2"}, 51 | {"instruction": "inst3", "output": "out3"}, 52 | ] 53 | 54 | deita = Deita(num_workers=0, download_dir=tmp_path) 55 | assert isinstance(deita.prompt_style, AlpacaPromptStyle) 56 | deita.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 57 | deita.prepare_data() 58 | deita.setup() 59 | 60 | train_dataloader = deita.train_dataloader() 61 | assert isinstance(train_dataloader.dataset, SFTDataset) 62 | assert len(train_dataloader) == 2 63 | 64 | val_dataloader = deita.val_dataloader() 65 | assert isinstance(val_dataloader.dataset, SFTDataset) 66 | assert len(val_dataloader) == 2 67 | 68 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 69 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 70 | 71 | # has attributes from super class `LightningDataModule` 72 | assert deita.prepare_data_per_node 73 | -------------------------------------------------------------------------------- /litgpt/data/lit_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import os 3 | from dataclasses import dataclass, field 4 | from pathlib import Path 5 | from typing import Optional, Tuple, Union 6 | 7 | from torch.utils.data import DataLoader 8 | 9 | from litgpt.tokenizer import Tokenizer 10 | from litgpt.data import DataModule 11 | 12 | 13 | @dataclass 14 | class LitData(DataModule): 15 | """Loads data using LitData's StreamingDataset given a path to a folder of preprocessed data (chunks).""" 16 | 17 | data_path: Union[str, Path] = Path("data/") 18 | """The path to the data directory containing the preprocessed chunks for the streaming dataset 19 | The path can also be a remote path (e.g., s3://). 
See also ``split_names`` if this path contains subfolders 20 | for training- and validation splits.""" 21 | split_names: Optional[Tuple[str, str]] = None 22 | """Optional tuple for names of subfolders for training and validation under ``data_path``. If not provided, 23 | all data under data_path will be used for training, and the validation dataloader will be identical to the 24 | train dataloader.""" 25 | seed: int = 42 26 | """The random seed for shuffling the dataset.""" 27 | num_workers: int = 8 28 | """How many DataLoader processes to use for loading.""" 29 | 30 | batch_size: int = field(init=False, repr=False, default=1) 31 | seq_length: int = field(init=False, repr=False, default=2048) 32 | 33 | def __post_init__(self) -> None: 34 | super().__init__() 35 | if self.split_names is not None and len(self.split_names) != 2: 36 | raise ValueError("If provided `split_names` must be a tuple of two strings, for example: ('train', 'val').") 37 | 38 | def connect( 39 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 40 | ) -> None: 41 | self.batch_size = batch_size 42 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 43 | 44 | def train_dataloader(self) -> DataLoader: 45 | input_dir = os.path.join(self.data_path, self.split_names[0]) if self.split_names else str(self.data_path) 46 | return self._dataloader(input_dir=input_dir, train=True) 47 | 48 | def val_dataloader(self) -> DataLoader: 49 | input_dir = os.path.join(self.data_path, self.split_names[1]) if self.split_names else str(self.data_path) 50 | return self._dataloader(input_dir=input_dir, train=False) 51 | 52 | def _dataloader(self, input_dir: str, train: bool): 53 | from litdata.streaming import StreamingDataset, StreamingDataLoader, TokensLoader 54 | 55 | dataset = StreamingDataset( 56 | input_dir=input_dir, 57 | item_loader=TokensLoader(block_size=self.seq_length), 58 | shuffle=train, 59 | seed=self.seed, 60 | ) 61 | dataloader = StreamingDataLoader( 62 | dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 63 | ) 64 | return dataloader 65 | -------------------------------------------------------------------------------- /tests/data/test_base.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import pytest 4 | import torch 5 | 6 | from litgpt.data import SFTDataset, get_sft_collate_fn 7 | from litgpt.prompts import PromptStyle 8 | 9 | 10 | @pytest.mark.parametrize("mask_prompt", [True, False]) 11 | @pytest.mark.parametrize("ignore_index", [-1, -100]) 12 | @pytest.mark.parametrize("max_seq_length", [1000, 5, -1]) 13 | def test_sft_dataset(max_seq_length, ignore_index, mask_prompt, mock_tokenizer): 14 | class Style(PromptStyle): 15 | def apply(self, prompt, **kwargs): 16 | return f"In: {prompt} Out:" 17 | 18 | i = ignore_index 19 | data = [{"instruction": "Foo", "output": "Bar"}, {"instruction": "Boo", "output": "Ahh"}] 20 | 21 | dataset = SFTDataset( 22 | data=data, 23 | tokenizer=mock_tokenizer, 24 | prompt_style=Style(), 25 | mask_prompt=mask_prompt, 26 | ignore_index=ignore_index, 27 | max_seq_length=max_seq_length, 28 | ) 29 | assert len(dataset) == len(data) 30 | 31 | expected_input_ids = torch.tensor([73, 110, 58, 32, 70, 111, 111, 32, 79, 117, 116, 58, 66, 97, 114, 1]) 32 | # If prompt is not masked, labels == input_ids 33 | expected_labels = ( 34 | torch.tensor([i, i, i, i, i, i, i, i, i, i, i, i, 66, 97, 114, 1]) if mask_prompt else expected_input_ids 35 | ) 36 | 37 | if max_seq_length == -1: 38 | assert torch.equal(dataset[0]["input_ids"], expected_input_ids) 39 | assert torch.equal(dataset[0]["labels"], expected_labels) 40 | else: 41 | assert torch.equal(dataset[0]["input_ids"], expected_input_ids[:max_seq_length]) 42 | assert torch.equal(dataset[0]["labels"], expected_labels[:max_seq_length]) 43 | 44 | 45 | @pytest.mark.parametrize("ignore_index", [-1, -100]) 46 | @pytest.mark.parametrize("pad_id", [0, 100]) 47 | def test_sft_collate_fn_padding(pad_id, ignore_index): 48 | collate = get_sft_collate_fn(pad_id=pad_id, ignore_index=ignore_index) 49 | samples = [ 50 | {"input_ids": torch.tensor([1, 2, 3]), "labels": torch.tensor([10, 20, 30])}, 51 | {"input_ids": torch.tensor([4, 5, 6, 7, 8]), "labels": torch.tensor([40, 50, 60, 70, 80])}, 52 | ] 53 | expected = { 54 | "input_ids": torch.tensor([[1, 2, 3, pad_id, pad_id], [4, 5, 6, 7, 8]]), 55 | "labels": torch.tensor([[10, 20, 30, ignore_index, ignore_index], [40, 50, 60, 70, 80]]), 56 | } 57 | batch = collate(samples) 58 | assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) 59 | 60 | 61 | def test_sft_collate_fn_truncation(): 62 | collate = get_sft_collate_fn(max_seq_length=2) 63 | samples = [ 64 | {"input_ids": torch.tensor([1, 2, 3]), "labels": torch.tensor([10, 20, 30])}, 65 | {"input_ids": torch.tensor([4, 5, 6, 7, 8]), "labels": torch.tensor([40, 50, 60, 70, 80])}, 66 | ] 67 | expected = {"input_ids": torch.tensor([[1, 2], [4, 5]]), "labels": torch.tensor([[10, 20], [40, 50]])} 68 | batch = collate(samples) 69 | assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) 70 | -------------------------------------------------------------------------------- /tutorials/finetune.md: -------------------------------------------------------------------------------- 1 | # Finetuning 2 | 3 | We provide simple finetuning commands (`litgpt finetune_*`) that instruction-finetune a pretrained model on datasets such as [Alpaca](https://github.com/tatsu-lab/stanford_alpaca), [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm), and others. For more information on the supported instruction datasets and how to prepare your own custom datasets, please see the [tutorials/prepare_dataset](prepare_dataset.md) tutorial.
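For example, a typical run combines one of the `litgpt finetune_*` commands with a model name and a dataset flag. The snippet below is only an illustration; the model and the Alpaca dataset are the same ones used in the examples later in these tutorials, and you can swap in your own:

```bash
litgpt finetune_lora stabilityai/stablelm-base-alpha-3b \
  --data Alpaca \
  --out_dir out/lora/my-model-finetuned
```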
4 | 5 | LitGPT currently supports the following finetuning methods: 6 | 7 | ```bash 8 | litgpt finetune_full 9 | litgpt finetune_lora 10 | litgpt finetune_adapter 11 | litgpt finetune_adapter_v2 12 | ``` 13 | 14 |   15 | > [!TIP] 16 | > To install all required dependencies before finetuning, first run `pip install "litgpt[all]"`. 17 |   18 | 19 | 20 | The following section provides more details about these methods, including links for additional resources. 21 | 22 | 23 |   24 | ## LitGPT finetuning commands 25 | 26 | The sections below provide additional information on each of the available finetuning commands, along with links to further resources. 27 | 28 |   29 | ### Full finetuning 30 | 31 | ```bash 32 | litgpt finetune_full 33 | ``` 34 | 35 | This method trains all model weight parameters and is the most memory-intensive finetuning technique in LitGPT. 36 | 37 | **More information and resources:** 38 | 39 | - the LitGPT [tutorials/finetune_full](finetune_full.md) tutorial 40 | 41 | 42 |   43 | ### LoRA and QLoRA finetuning 44 | 45 | ```bash 46 | litgpt finetune_lora stabilityai/stablelm-base-alpha-3b 47 | ``` 48 | 49 | LoRA and QLoRA are parameter-efficient finetuning techniques that only require updating a small number of parameters, which makes them a more memory-efficient alternative to full finetuning. 50 | 51 | **More information and resources:** 52 | 53 | - the LitGPT [tutorials/finetune_lora](finetune_lora.md) tutorial 54 | - the LoRA paper by [Hu et al. 2021](https://arxiv.org/abs/2106.09685) 55 | - the conceptual tutorial [Parameter-Efficient LLM Finetuning With Low-Rank Adaptation (LoRA)](https://lightning.ai/pages/community/tutorial/lora-llm/) 56 | 57 | 58 |   59 | ### Adapter finetuning 60 | 61 | ```bash 62 | litgpt finetune_adapter stabilityai/stablelm-base-alpha-3b 63 | ``` 64 | 65 | or 66 | 67 | ```bash 68 | litgpt finetune_adapter_v2 stabilityai/stablelm-base-alpha-3b 69 | ``` 70 | 71 | Similar to LoRA, adapter finetuning is a parameter-efficient finetuning technique that only requires training a small subset of weight parameters, making this finetuning method more memory-efficient than full-parameter finetuning. 72 | 73 | **More information and resources:** 74 | 75 | - the LitGPT [tutorials/finetune_adapter](finetune_adapter.md) tutorial 76 | - the Llama-Adapter ([Zhang et al. 2023](https://arxiv.org/abs/2303.16199)) and Llama-Adapter v2 ([Gao et al. 2023](https://arxiv.org/abs/2304.15010)) papers that originally introduced these methods 77 | - the conceptual tutorial [Understanding Parameter-Efficient Finetuning of Large Language Models: From Prefix Tuning to LLaMA-Adapters](https://lightning.ai/pages/community/article/understanding-llama-adapters/) 78 | -------------------------------------------------------------------------------- /tests/test_full.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
2 | 3 | import os 4 | from contextlib import redirect_stdout 5 | from io import StringIO 6 | from unittest import mock 7 | from unittest.mock import Mock 8 | 9 | import torch 10 | import yaml 11 | 12 | import litgpt.finetune.full as module 13 | from litgpt.args import EvalArgs, TrainArgs 14 | from litgpt.data import Alpaca 15 | 16 | 17 | @mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) 18 | def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path): 19 | model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 20 | (fake_checkpoint_dir / "model_config.yaml").write_text(yaml.dump(model_config)) 21 | monkeypatch.setattr(module, "load_checkpoint", Mock()) 22 | 23 | tokenizer_mock = Mock() 24 | tokenizer_mock.return_value = tokenizer_mock 25 | tokenizer_mock.encode = lambda *_, **__: torch.tensor([3, 2, 1]) 26 | monkeypatch.setattr(module, "Tokenizer", tokenizer_mock) 27 | 28 | out_dir = tmp_path / "out" 29 | setup_args = (fake_checkpoint_dir, ) 30 | setup_kwargs = dict( 31 | data=Alpaca(download_dir=alpaca_path.parent, file_name=alpaca_path.name, val_split_fraction=0.5, num_workers=0), 32 | out_dir=out_dir, 33 | precision="32-true", 34 | train=TrainArgs(global_batch_size=1, save_interval=2, epochs=1, max_steps=6, micro_batch_size=1), 35 | eval=EvalArgs(interval=2, max_iters=2, max_new_tokens=1), 36 | ) 37 | stdout = StringIO() 38 | with redirect_stdout(stdout), mock.patch("sys.argv", ["full.py", str(fake_checkpoint_dir)]): 39 | module.setup(*setup_args, **setup_kwargs) 40 | 41 | out_dir_contents = set(os.listdir(out_dir)) 42 | checkpoint_dirs = {"step-000002", "step-000004", "step-000006", "final"} 43 | assert checkpoint_dirs.issubset(out_dir_contents) 44 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 45 | for checkpoint_dir in checkpoint_dirs: 46 | assert set(os.listdir(out_dir / checkpoint_dir)) == { 47 | "lit_model.pth", 48 | "model_config.yaml", 49 | "tokenizer_config.json", 50 | "tokenizer.json", 51 | "hyperparameters.yaml", 52 | "prompt_style.yaml", 53 | } 54 | assert (out_dir / "logs" / "csv" / "version_0" / "metrics.csv").is_file() 55 | 56 | logs = stdout.getvalue() 57 | assert logs.count("(step)") == 6 58 | assert logs.count("val loss") == 4 # 3 validations + 1 final validation 59 | assert logs.count("Final evaluation") == 1 60 | assert "of trainable parameters: 1,888" in logs 61 | 62 | # Resume training and do 2 steps more 63 | setup_kwargs["train"].max_steps = 8 64 | setup_kwargs["resume"] = True 65 | stdout = StringIO() 66 | with redirect_stdout(stdout), mock.patch("sys.argv", ["full.py", str(fake_checkpoint_dir)]): 67 | module.setup(*setup_args, **setup_kwargs) 68 | logs = stdout.getvalue() 69 | assert f"Resuming training from {out_dir / 'step-000006' / 'lit_model.pth'}" in logs 70 | assert logs.count("(step)") == 2 71 | assert out_dir / "step-000008" in set(out_dir.iterdir()) 72 | -------------------------------------------------------------------------------- /litgpt/data/dolly.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import json 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Union 7 | 8 | import torch 9 | from torch.utils.data import random_split 10 | 11 | from litgpt.prompts import PromptStyle 12 | from litgpt.data import Alpaca, SFTDataset 13 | 14 | _URL: str = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl" 15 | 16 | 17 | @dataclass 18 | class Dolly(Alpaca): 19 | """Dolly data module for supervised finetuning.""" 20 | 21 | mask_prompt: bool = False 22 | """Whether to mask the prompt section from the label (with ``ignore_index``).""" 23 | val_split_fraction: float = 0.1 24 | """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" 25 | prompt_style: Union[str, PromptStyle] = "alpaca" 26 | """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" 27 | ignore_index: int = -100 28 | """The index to use for elements to be ignored in the label.""" 29 | seed: int = 42 30 | """The random seed for creating the train/val splits and shuffling the dataset.""" 31 | num_workers: int = 4 32 | """How many DataLoader processes to use for loading.""" 33 | download_dir: Path = Path("./data/dolly") 34 | """The directory in which the downloaded dataset gets saved.""" 35 | file_url: str = field(repr=False, default=_URL) 36 | """The URL from where to download the dataset.""" 37 | file_name: str = field(repr=False, default="dolly_data_cleaned.json") 38 | """The name of the dataset file to download.""" 39 | 40 | def setup(self, stage: str = "") -> None: 41 | with open(self.download_dir / self.file_name, "r", encoding="utf-8") as file: 42 | data = file.readlines() 43 | data = [json.loads(line) for line in data] 44 | 45 | # Partition the dataset into train and test 46 | train_data, test_data = random_split( 47 | data, 48 | [1.0 - self.val_split_fraction, self.val_split_fraction], 49 | generator=torch.Generator().manual_seed(self.seed), 50 | ) 51 | train_data, test_data = list(train_data), list(test_data) 52 | 53 | self.train_dataset = SFTDataset( 54 | data=train_data, 55 | tokenizer=self.tokenizer, 56 | prompt_style=self.prompt_style, 57 | max_seq_length=self.max_seq_length, 58 | mask_prompt=self.mask_prompt, 59 | ignore_index=self.ignore_index, 60 | transform=_transform, 61 | ) 62 | self.test_dataset = SFTDataset( 63 | data=test_data, 64 | tokenizer=self.tokenizer, 65 | prompt_style=self.prompt_style, 66 | max_seq_length=self.max_seq_length, 67 | mask_prompt=self.mask_prompt, 68 | ignore_index=self.ignore_index, 69 | transform=_transform, 70 | ) 71 | 72 | 73 | def _transform(item: dict) -> dict: 74 | item["input"] = item.pop("context") 75 | item["output"] = item.pop("response") 76 | return item 77 | -------------------------------------------------------------------------------- /tests/data/test_tinystories.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | import torch 5 | from litdata import optimize 6 | from litdata.streaming import StreamingDataset, TokensLoader 7 | from torch.utils._pytree import tree_map 8 | 9 | 10 | def tokenize(data): 11 | for story in data: 12 | yield torch.tensor(story) 13 | 14 | 15 | def fake_chunk(path, data): 16 | optimize(fn=tokenize, inputs=[data] * len(data), output_dir=str(path), num_workers=1, chunk_bytes="200MB") 17 | 18 | 19 | @pytest.mark.parametrize( 20 | ("max_seq_len", "expected"), 21 | [ 22 | (2, [[0, 23, 15], 
[63, 0, 73], [5, 0, 1], [1999, 0, 13]]), 23 | (5, [[0, 23, 15, 63, 0, 73], [5, 0, 1, 1999, 0, 13]]), 24 | (6, [[0, 23, 15, 63, 0, 73, 5]]), 25 | (7, [[0, 23, 15, 63, 0, 73, 5, 0]]), 26 | ], 27 | ) 28 | def test_pretok_dataset(tmp_path, max_seq_len, expected): 29 | fake_data = [0, 23, 15, 63, 0, 73, 5, 0, 1, 1999, 0, 13] 30 | assert len(fake_data) == 12 31 | fake_chunk(tmp_path, [fake_data]) 32 | 33 | dataset = StreamingDataset( 34 | input_dir=str(tmp_path), item_loader=TokensLoader(block_size=max_seq_len + 1), shuffle=False, drop_last=False 35 | ) 36 | actual = tree_map(torch.Tensor.tolist, list(dataset)) 37 | assert actual == expected 38 | 39 | 40 | def test_tokenize(tmp_path, monkeypatch): 41 | from litgpt.data.tinystories import tokenize 42 | 43 | story1, story2 = "foo bar", " fun " 44 | data = [{"story": story1}, {"story": story2}] 45 | shard_path = tmp_path / "data.json" 46 | with open(shard_path, "w", encoding="utf-8") as f: 47 | json.dump(data, f) 48 | 49 | class Tokenizer: 50 | bos_id = 0 51 | 52 | def encode(self, text, bos, eos): 53 | assert bos 54 | assert not eos 55 | return [self.bos_id] + [ord(c) for c in text] 56 | 57 | monkeypatch.setenv("DATA_OPTIMIZER_GLOBAL_RANK", "0") 58 | monkeypatch.setenv("DATA_OPTIMIZER_NUM_WORKERS", "1") 59 | data = tokenize(str(shard_path), Tokenizer()) 60 | assert list(data) == [[0, 102, 111, 111, 32, 98, 97, 114], [0, 102, 117, 110]] 61 | 62 | 63 | def test_tinystories_datamodule(tmp_path): 64 | from litgpt.data.tinystories import TinyStories 65 | 66 | data_dir = tmp_path / "tinystories" 67 | 68 | datamodule = TinyStories(data_dir, seed=42, num_workers=1) 69 | datamodule.connect(max_seq_length=2) 70 | 71 | # simulate `datamodule.prepare_data` 72 | train_data_dir = data_dir / "train" 73 | train_data_dir.mkdir(parents=True) 74 | fake_chunk(train_data_dir, [[12], [0, 23, 15, 63, 0], [73, 5, 0, 1, 1999, 0, 13]]) 75 | 76 | datamodule.setup() 77 | 78 | tr_dataloader = datamodule.train_dataloader() 79 | torch.manual_seed(0) 80 | actual = tree_map(torch.Tensor.tolist, list(tr_dataloader)) 81 | # there is 1 sample per index in the data (13) 82 | assert actual == [ 83 | [[1999, 0, 13]], 84 | [[0, 13, 12]], 85 | [[1, 1999, 0]], 86 | [[63, 0, 73]], 87 | [[5, 0, 1]], 88 | [[0, 73, 5]], 89 | [[0, 23, 15]], 90 | [[0, 1, 1999]], 91 | [[15, 63, 0]], 92 | [[73, 5, 0]], 93 | [[12, 0, 23]], 94 | [[23, 15, 63]], 95 | [[13, 12, 0]] 96 | ] 97 | -------------------------------------------------------------------------------- /litgpt/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import warnings 4 | import torch 5 | 6 | from litgpt.chat.base import main as chat_fn 7 | from litgpt.finetune.adapter import setup as finetune_adapter_fn 8 | from litgpt.finetune.adapter_v2 import setup as finetune_adapter_v2_fn 9 | from litgpt.finetune.full import setup as finetune_full_fn 10 | from litgpt.finetune.lora import setup as finetune_lora_fn 11 | from litgpt.generate.adapter import main as generate_adapter_fn 12 | from litgpt.generate.adapter_v2 import main as generate_adapter_v2_fn 13 | from litgpt.generate.base import main as generate_base_fn 14 | from litgpt.generate.full import main as generate_full_fn 15 | from litgpt.generate.sequentially import main as generate_sequentially_fn 16 | from litgpt.generate.tp import main as generate_tp_fn 17 | from litgpt.pretrain import setup as pretrain_fn 18 | from litgpt.scripts.convert_hf_checkpoint import convert_hf_checkpoint as convert_hf_checkpoint_fn 19 | from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint as convert_lit_checkpoint_fn 20 | from litgpt.scripts.convert_pretrained_checkpoint import ( 21 | convert_pretrained_checkpoint as convert_pretrained_checkpoint_fn, 22 | ) 23 | from litgpt.scripts.download import download_from_hub as download_fn 24 | from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn 25 | from litgpt.eval.evaluate import convert_and_evaluate as evaluate_fn 26 | from litgpt.deploy.serve import run_server as serve_fn 27 | from jsonargparse import set_config_read_mode, set_docstring_parse_options, CLI 28 | 29 | 30 | def main() -> None: 31 | parser_data = { 32 | "download": download_fn, 33 | "chat": chat_fn, 34 | "finetune": finetune_lora_fn, 35 | "finetune_lora": finetune_lora_fn, 36 | "finetune_full": finetune_full_fn, 37 | "finetune_adapter": finetune_adapter_fn, 38 | "finetune_adapter_v2": finetune_adapter_v2_fn, 39 | "pretrain": pretrain_fn, 40 | "generate": generate_base_fn, 41 | "generate_full": generate_full_fn, 42 | "generate_adapter": generate_adapter_fn, 43 | "generate_adapter_v2": generate_adapter_v2_fn, 44 | "generate_sequentially": generate_sequentially_fn, 45 | "generate_tp": generate_tp_fn, 46 | "convert_to_litgpt": convert_hf_checkpoint_fn, 47 | "convert_from_litgpt": convert_lit_checkpoint_fn, 48 | "convert_pretrained_checkpoint": convert_pretrained_checkpoint_fn, 49 | "merge_lora": merge_lora_fn, 50 | "evaluate": evaluate_fn, 51 | "serve": serve_fn 52 | } 53 | 54 | set_docstring_parse_options(attribute_docstrings=True) 55 | set_config_read_mode(urls_enabled=True) 56 | 57 | # PyTorch bug that raises a false-positive warning 58 | # More info: https://github.com/Lightning-AI/litgpt/issues/1561 59 | warning_message = ( 60 | r"The epoch parameter in `scheduler.step\(\)` was not necessary and is being deprecated.*" 61 | ) 62 | 63 | warnings.filterwarnings( 64 | action="ignore", 65 | message=warning_message, 66 | category=UserWarning, 67 | module=r'.*torch\.optim\.lr_scheduler.*' 68 | ) 69 | 70 | torch.set_float32_matmul_precision("high") 71 | CLI(parser_data) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /tests/run_standalone_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Batch size for testing: Determines how many standalone test invocations run in parallel 5 | # It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE 6 | 
test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-1}" 7 | 8 | # this environment variable allows special tests to run 9 | export PL_RUN_STANDALONE_TESTS=1 10 | # python arguments 11 | defaults="-m pytest --no-header -v --disable-pytest-warnings --strict-markers --color=yes -s --timeout 120" 12 | echo "Using defaults: ${defaults}" 13 | 14 | # find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster 15 | grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py') 16 | 17 | # file paths, remove duplicates 18 | files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) 19 | 20 | # get the list of parametrizations. we need to call them separately. the last two lines are removed. 21 | # note: if there's a syntax error, this will fail with some garbled output 22 | if [[ "$OSTYPE" == "darwin"* ]]; then 23 | parametrizations=$(python3 -m pytest $files --collect-only --quiet --disable-pytest-warnings "$@" | tail -r | sed -e '1,3d' | tail -r) 24 | else 25 | parametrizations=$(python3 -m pytest $files --collect-only --quiet --disable-pytest-warnings "$@" | head -n -2) 26 | fi 27 | # remove the "tests/" path suffix 28 | path_suffix=$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345 29 | parametrizations=${parametrizations//$path_suffix/} 30 | parametrizations_arr=($parametrizations) 31 | 32 | report='' 33 | 34 | rm -f standalone_test_output.txt # in case it exists, remove it 35 | function show_batched_output { 36 | if [ -f standalone_test_output.txt ]; then # if exists 37 | cat standalone_test_output.txt 38 | # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail 39 | if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qvE 'on_exception|xfailed'; then 40 | echo "Potential error! Stopping." 41 | rm standalone_test_output.txt 42 | exit 1 43 | fi 44 | rm standalone_test_output.txt 45 | fi 46 | } 47 | trap show_batched_output EXIT # show the output on exit 48 | 49 | for i in "${!parametrizations_arr[@]}"; do 50 | parametrization=${parametrizations_arr[$i]} 51 | prefix="$((i+1))/${#parametrizations_arr[@]}" 52 | 53 | echo "$prefix: Running $parametrization" 54 | # execute the test in the background 55 | # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them 56 | # output to std{out,err} because the outputs would be garbled together 57 | python3 ${defaults} "$parametrization" &>> standalone_test_output.txt & 58 | # save the PID in an array 59 | pids[${i}]=$! 60 | # add row to the final report 61 | report+="Ran\t$parametrization\n" 62 | 63 | if ((($i + 1) % $test_batch_size == 0)); then 64 | # wait for running tests 65 | for pid in ${pids[*]}; do wait $pid; done 66 | unset pids # empty the array 67 | show_batched_output 68 | fi 69 | done 70 | # wait for leftover tests 71 | for pid in ${pids[*]}; do wait $pid; done 72 | show_batched_output 73 | 74 | # echo test report 75 | printf '=%.s' {1..80} 76 | printf "\n$report" 77 | printf '=%.s' {1..80} 78 | printf '\n' 79 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. 
(type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-phi-3 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 1 13 | 14 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 15 | data: 16 | class_path: litgpt.data.Alpaca2k 17 | init_args: 18 | mask_prompt: false 19 | prompt_style: alpaca 20 | ignore_index: -100 21 | seed: 42 22 | num_workers: 4 23 | 24 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 25 | train: 26 | 27 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 28 | save_interval: 200 29 | 30 | # Number of iterations between logging calls (type: int, default: 1) 31 | log_interval: 1 32 | 33 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 34 | global_batch_size: 8 35 | 36 | # Number of samples per data-parallel rank (type: int, default: 1) 37 | micro_batch_size: 4 38 | 39 | # Number of iterations with learning rate warmup active (type: int, default: 100) 40 | lr_warmup_steps: 200 41 | 42 | # Number of epochs to train on (type: Optional[int], default: 5) 43 | epochs: 1 44 | 45 | # Total number of tokens to train on (type: Optional[int], default: null) 46 | max_tokens: 47 | 48 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 49 | max_steps: 50 | 51 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 52 | max_seq_length: 512 53 | 54 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 55 | tie_embeddings: 56 | 57 | # (type: Optional[float], default: null) 58 | max_norm: 59 | 60 | # (type: float, default: 6e-05) 61 | min_lr: 6.0e-05 62 | 63 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 64 | eval: 65 | 66 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 67 | interval: 25 68 | 69 | # Number of tokens to generate (type: Optional[int], default: 100) 70 | max_new_tokens: 100 71 | 72 | # Number of iterations (type: int, default: 100) 73 | max_iters: 100 74 | 75 | # Whether to evaluate on the validation set at the beginning of the training 76 | initial_validation: false 77 | 78 | # Whether to evaluate on the validation set at the end the training 79 | final_validation: true 80 | 81 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 82 | logger_name: csv 83 | 84 | # The random seed to use for reproducibility. 
(type: int, default: 1337) 85 | seed: 1337 86 | 87 | # Optimizer-related arguments 88 | optimizer: 89 | 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | 94 | # (type: float, default: 0.001) 95 | lr: 0.0002 96 | 97 | # (type: float, default: 0.01) 98 | weight_decay: 0.1 99 | 100 | # (type: tuple, default: (0.9,0.999)) 101 | betas: 102 | - 0.9 103 | - 0.95 104 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/swiglu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import triton 17 | import triton.language as tl 18 | 19 | 20 | @triton.jit 21 | def _fg_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,): 22 | block_idx = tl.program_id(0) 23 | offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 24 | mask = offsets < n_elements 25 | 26 | e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32) 27 | g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32) 28 | 29 | # f = e * sigmoid(e) 30 | f_row = e_row * tl.sigmoid(e_row) # e_row / (1 + tl.exp(-e_row)) 31 | f_row = f_row.to(g_row.dtype) # Exact copy from HF 32 | # h = f * g 33 | h_row = f_row * g_row 34 | 35 | # Store h 36 | tl.store(h + offsets, h_row, mask = mask) 37 | pass 38 | 39 | 40 | def swiglu_fg_kernel(e, g): 41 | batch, seq_len, hd = e.shape 42 | n_elements = e.numel() 43 | h = torch.empty((batch, seq_len, hd), dtype = e.dtype, device = "cuda") 44 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 45 | _fg_kernel[grid](e, g, h, n_elements, BLOCK_SIZE = 1024,) 46 | return h 47 | pass 48 | 49 | 50 | @triton.jit 51 | def _DWf_DW_dfg_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,): 52 | """ 53 | e = e.float() 54 | se = 1.0 / (1.0 + torch.exp(-e)) 55 | f = (se * e).to(dtype) 56 | h = f * g 57 | df = DW * f 58 | dg = DW * g 59 | de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype) 60 | """ 61 | block_idx = tl.program_id(0) 62 | offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 63 | mask = offsets < n_elements 64 | 65 | DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32) 66 | e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32) 67 | g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32) 68 | 69 | # e = e.float() 70 | # se = 1.0 / (1.0 + torch.exp(-e)) 71 | se_row = tl.sigmoid(e_row) # 1.0 / (1.0 + tl.exp(-e_row)) 72 | # f = (se * e).to(dtype) 73 | f_row = se_row * e_row 74 | f_row = f_row.to(DW_row.dtype) 75 | # h = f * g 76 | h_row = f_row * g_row 77 | # df = DW * f 78 | df_row = DW_row * f_row 79 | # dg = DW * g 80 | dg_row = DW_row * g_row 81 | # de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype) 82 | de_row = dg_row.to(tl.float32) * se_row * (1.0 + e_row * (1.0 - se_row)) 83 | de_row = 
de_row.to(DW_row.dtype) 84 | 85 | # Store derivatives in buffers 86 | tl.store(DW + offsets, h_row, mask = mask) # h = f * g 87 | tl.store(e + offsets, df_row, mask = mask) # df = DW * f 88 | tl.store(g + offsets, de_row, mask = mask) # de 89 | pass 90 | 91 | 92 | def swiglu_DWf_DW_dfg_kernel(DW, e, g): 93 | batch_seq_len, hd = e.shape 94 | n_elements = e.numel() 95 | grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) 96 | _DWf_DW_dfg_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,) 97 | return DW, e, g 98 | pass 99 | -------------------------------------------------------------------------------- /tutorials/finetune_full.md: -------------------------------------------------------------------------------- 1 | # Finetuning the whole model 2 | 3 | If you are interested in parameter-efficient finetuning, check out [finetune_adapter.md](finetune_adapter.md). In contrast to parameter-efficient finetuning, this "full" approach finetunes all model parameters, which is substantially more expensive. It is typically only recommended as a baseline for comparison studies. 4 | 5 | ## Preparation 6 | 7 | The steps here only need to be done once: 8 | 9 | 1. Follow the instructions in the [README](../README.md) to install the dependencies. 10 | 2. Download and convert the weights following our [guide](download_model_weights.md). 11 | 12 | LitGPT provides common datasets for finetuning, such as Alpaca, LIMA, Dolly, and more. 13 | You can optionally [prepare your own dataset](#tune-on-your-dataset). 14 | For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial. 15 | 16 | ## Running the finetuning 17 | 18 | ```bash 19 | litgpt finetune_full tiiuae/falcon-7b \ 20 | --data Alpaca 21 | ``` 22 | 23 | Finetuning the falcon-7b model requires at least 8 GPUs with ~40 GB memory each. 24 | 25 | You can speed up training by passing the `devices` argument to the script to utilize more GPUs if available. 26 | Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently. 27 | 28 | This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name: 29 | 30 | ```bash 31 | litgpt finetune_full tiiuae/falcon-7b \ 32 | --data Alpaca \ 33 | --out_dir out/full/my-model-finetuned 34 | ``` 35 | 36 | If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument. 37 | For instance, to fine-tune on MPS (the GPU on modern Macs), you can run 38 | 39 | ```bash 40 | litgpt finetune_full tiiuae/falcon-7b \ 41 | --data Alpaca \ 42 | --out_dir out/full/my-model-finetuned \ 43 | --precision 32-true 44 | ``` 45 | 46 | Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. 47 | 48 | ## Test the model 49 | 50 | You can test the finetuned model with your own instructions by running: 51 | 52 | ```bash 53 | litgpt generate tiiuae/falcon-7b \ 54 | --prompt "Recommend a movie to watch on the weekend." \ 55 | --finetuned_path out/full/my-model-finetuned/lit_model_finetuned.pth 56 | ``` 57 | 58 | Output: 59 | 60 | ```text 61 | A good movie to watch on the weekend would be The Lion King, since it's a classic family film that everyone can enjoy... 62 | ``` 63 | 64 | If your GPU supports `bfloat16`, the script will automatically use it.
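The examples above pass every argument on the command line. Alternatively, you can start from one of the YAML files under `config_hub/finetune` (reproduced later in this repository) and only override what you need. Below is a minimal sketch; it assumes the `--config` option exposed by LitGPT's jsonargparse-based CLI and a config file present in your checkout, and the override values are illustrative only:

```bash
# Load all finetuning arguments from a config file, then override selected values
litgpt finetune_full --config config_hub/finetune/tiny-llama/full.yaml \
  --train.max_steps 100 \
  --out_dir out/full/tiny-llama-debug
```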
65 | 66 | ## Tune on your dataset 67 | 68 | You can easily train on your own instruction dataset saved in JSON format. 69 | 70 | 1. Create a JSON file in which each row holds one instruction-response pair. 71 | A row has an entry for 'instruction', 'input', and 'output', where 'input' is optional and can be 72 | the empty string if the instruction doesn't require a context. Below is an example json file: 73 | 74 | ```text 75 | [ 76 | { 77 | "instruction": "Arrange the given numbers in ascending order.", 78 | "input": "2, 4, 0, 8, 3", 79 | "output": "0, 2, 3, 4, 8" 80 | }, 81 | ... 82 | ] 83 | ``` 84 | 85 | 2. Run `litgpt finetune` by passing in the location of your data (and optionally other parameters): 86 | 87 | ```bash 88 | litgpt finetune tiiuae/falcon-7b \ 89 | --data JSON \ 90 | --data.json_path data/mydata.json \ 91 | --out_dir data/mydata-finetuned 92 | ``` 93 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-2/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/microsoft/phi-2 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-phi-2 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 2 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 18 | data: 19 | class_path: litgpt.data.Alpaca2k 20 | init_args: 21 | mask_prompt: false 22 | prompt_style: alpaca 23 | ignore_index: -100 24 | seed: 42 25 | num_workers: 4 26 | 27 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 28 | train: 29 | 30 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 31 | save_interval: 200 32 | 33 | # Number of iterations between logging calls (type: int, default: 1) 34 | log_interval: 1 35 | 36 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 37 | global_batch_size: 8 38 | 39 | # Number of samples per data-parallel rank (type: int, default: 1) 40 | micro_batch_size: 4 41 | 42 | # Number of iterations with learning rate warmup active (type: int, default: 100) 43 | lr_warmup_steps: 200 44 | 45 | # Number of epochs to train on (type: Optional[int], default: 5) 46 | epochs: 1 47 | 48 | # Total number of tokens to train on (type: Optional[int], default: null) 49 | max_tokens: 50 | 51 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 52 | max_steps: 100 53 | 54 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 55 | max_seq_length: 512 56 | 57 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 58 | tie_embeddings: 59 | 60 | # (type: Optional[float], default: null) 61 | max_norm: 62 | 63 | # (type: float, default: 6e-05) 64 | min_lr: 6.0e-05 65 | 66 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 67 | eval: 68 | 69 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 70 | interval: 25 71 | 72 | # Number of tokens to generate (type: Optional[int], default: 100) 73 | max_new_tokens: 100 74 | 75 | # Number of iterations (type: int, default: 100) 76 | max_iters: 100 77 | 78 | # Whether to evaluate on the validation set at the beginning of the training 79 | initial_validation: false 80 | 81 | # Whether to evaluate on the validation set at the end the training 82 | final_validation: true 83 | 84 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 85 | logger_name: csv 86 | 87 | # The random seed to use for reproducibility. (type: int, default: 1337) 88 | seed: 1337 89 | 90 | # Optimizer-related arguments 91 | optimizer: 92 | 93 | class_path: torch.optim.AdamW 94 | 95 | init_args: 96 | 97 | # (type: float, default: 0.001) 98 | lr: 0.0002 99 | 100 | # (type: float, default: 0.01) 101 | weight_decay: 0.1 102 | 103 | # (type: tuple, default: (0.9,0.999)) 104 | betas: 105 | - 0.9 106 | - 0.95 107 | -------------------------------------------------------------------------------- /config_hub/finetune/gemma-2b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/google/gemma-2b 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/full-gemma-2b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 12 | devices: 4 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 18 | data: 19 | class_path: litgpt.data.Alpaca2k 20 | init_args: 21 | mask_prompt: false 22 | val_split_fraction: 0.03847 23 | prompt_style: alpaca 24 | ignore_index: -100 25 | seed: 42 26 | num_workers: 4 27 | 28 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 29 | train: 30 | 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 800 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 38 | global_batch_size: 16 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 1 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 100) 44 | lr_warmup_steps: 100 45 | 46 | # Number of epochs to train on (type: Optional[int], default: 5) 47 | epochs: 1 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: null) 50 | max_tokens: 51 | 52 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 53 | max_steps: 50 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 512 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: null) 62 | max_norm: 63 | 64 | # (type: float, default: 6e-05) 65 | min_lr: 6.0e-05 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | 70 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 71 | interval: 25 72 | 73 | # Number of tokens to generate (type: Optional[int], default: 100) 74 | max_new_tokens: 100 75 | 76 | # Number of iterations (type: int, default: 100) 77 | max_iters: 100 78 | 79 | # Whether to evaluate on the validation set at the beginning of the training 80 | initial_validation: false 81 | 82 | # Whether to evaluate on the validation set at the end the training 83 | final_validation: true 84 | 85 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 86 | logger_name: csv 87 | 88 | # The random seed to use for reproducibility. (type: int, default: 1337) 89 | seed: 1337 90 | 91 | # Optimizer-related arguments 92 | optimizer: 93 | 94 | class_path: torch.optim.AdamW 95 | 96 | init_args: 97 | 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.0 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/stablelm-base-alpha-3b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/stabilityai/stablelm-base-alpha-3b 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/full-stablelm-base-alpha-3b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 12 | devices: 2 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 18 | data: 19 | class_path: litgpt.data.Alpaca2k 20 | init_args: 21 | mask_prompt: false 22 | val_split_fraction: 0.03847 23 | prompt_style: alpaca 24 | ignore_index: -100 25 | seed: 42 26 | num_workers: 4 27 | 28 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 29 | train: 30 | 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 800 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 38 | global_batch_size: 8 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 1 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 100) 44 | lr_warmup_steps: 1000 45 | 46 | # Number of epochs to train on (type: Optional[int], default: 5) 47 | epochs: 1 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: null) 50 | max_tokens: 51 | 52 | # Limits the number of optimizer steps to run. 
(type: Optional[int], default: null) 53 | max_steps: 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 512 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: null) 62 | max_norm: 63 | 64 | # (type: float, default: 6e-05) 65 | min_lr: 6.0e-05 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | 70 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 71 | interval: 25 72 | 73 | # Number of tokens to generate (type: Optional[int], default: 100) 74 | max_new_tokens: 100 75 | 76 | # Number of iterations (type: int, default: 100) 77 | max_iters: 100 78 | 79 | # Whether to evaluate on the validation set at the beginning of the training 80 | initial_validation: false 81 | 82 | # Whether to evaluate on the validation set at the end the training 83 | final_validation: true 84 | 85 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 86 | logger_name: csv 87 | 88 | # The random seed to use for reproducibility. (type: int, default: 1337) 89 | seed: 1337 90 | 91 | # Optimizer-related arguments 92 | optimizer: 93 | 94 | class_path: torch.optim.AdamW 95 | 96 | init_args: 97 | 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/tiny-llama/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/full-tiny-llama-1.1b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 12 | devices: 1 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 18 | data: 19 | class_path: litgpt.data.Alpaca2k 20 | init_args: 21 | mask_prompt: false 22 | val_split_fraction: 0.03847 23 | prompt_style: alpaca 24 | ignore_index: -100 25 | seed: 42 26 | num_workers: 4 27 | 28 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 29 | train: 30 | 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 800 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 38 | global_batch_size: 32 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 4 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 100) 44 | lr_warmup_steps: 1000 45 | 46 | # Number of epochs to train on (type: Optional[int], default: 5) 47 | epochs: 1 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: null) 50 | max_tokens: 51 | 52 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 53 | max_steps: 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 512 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: null) 62 | max_norm: 63 | 64 | # (type: float, default: 6e-05) 65 | min_lr: 6.0e-05 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | 70 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 71 | interval: 25 72 | 73 | # Number of tokens to generate (type: Optional[int], default: 100) 74 | max_new_tokens: 100 75 | 76 | # Number of iterations (type: int, default: 100) 77 | max_iters: 100 78 | 79 | # Whether to evaluate on the validation set at the beginning of the training 80 | initial_validation: false 81 | 82 | # Whether to evaluate on the validation set at the end the training 83 | final_validation: true 84 | 85 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 86 | logger_name: csv 87 | 88 | # The random seed to use for reproducibility. (type: int, default: 1337) 89 | seed: 1337 90 | 91 | # Optimizer-related arguments 92 | optimizer: 93 | 94 | class_path: torch.optim.AdamW 95 | 96 | init_args: 97 | 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.0 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /litgpt/data/longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
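# Editor's note: a minimal, hypothetical usage sketch for the LongForm data module defined
# below. It assumes LongForm is re-exported from ``litgpt.data`` (as the other data modules
# are) and that a tokenizer checkpoint is available locally; the checkpoint path, batch size,
# and sequence length are illustrative assumptions, not values taken from this file.
#
#   from pathlib import Path
#   from litgpt.data import LongForm
#   from litgpt.tokenizer import Tokenizer
#
#   data = LongForm(download_dir=Path("./data/longform"))
#   data.connect(tokenizer=Tokenizer(Path("checkpoints/EleutherAI/pythia-160m")),
#                batch_size=4, max_seq_length=512)
#   data.prepare_data()                     # downloads train.json / val.json if missing
#   train_loader = data.train_dataloader()  # shuffled DataLoader using the SFT collate fn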
2 | 3 | import json 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | 8 | import torch 9 | from torch.utils.data import DataLoader 10 | 11 | from litgpt.prompts import PromptStyle 12 | from litgpt.data import DataModule, SFTDataset, get_sft_collate_fn 13 | from litgpt.data.alpaca import download_if_missing 14 | from litgpt.tokenizer import Tokenizer 15 | 16 | _URL = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset" 17 | 18 | 19 | @dataclass 20 | class LongForm(DataModule): 21 | """LongForm data module for supervised finetuning.""" 22 | 23 | mask_prompt: bool = False 24 | """Whether to mask the prompt section from the label (with ``ignore_index``).""" 25 | prompt_style: Union[str, PromptStyle] = "longform" 26 | """The style to apply to instruction prompts. See `litgpt.prompts` for a list of available styles.""" 27 | ignore_index: int = -100 28 | """The index to use for elements to be ignored in the label.""" 29 | seed: int = 42 30 | """The random seed for shuffling the dataset.""" 31 | num_workers: int = 4 32 | """How many DataLoader processes to use for loading.""" 33 | download_dir: Path = Path("./data/longform") 34 | """The directory in which the downloaded dataset gets saved.""" 35 | 36 | tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False) 37 | batch_size: int = field(default=1, init=False, repr=False) 38 | max_seq_length: int = field(default=-1, init=False, repr=False) 39 | train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) 40 | test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) 41 | 42 | def __post_init__(self) -> None: 43 | super().__init__() 44 | if isinstance(self.prompt_style, str): 45 | self.prompt_style = PromptStyle.from_name(self.prompt_style) 46 | 47 | def connect( 48 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 49 | ) -> None: 50 | self.tokenizer = tokenizer 51 | self.batch_size = batch_size 52 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 53 | 54 | def prepare_data(self) -> None: 55 | self.download_dir.mkdir(parents=True, exist_ok=True) 56 | download_if_missing(self.download_dir / "train.json", f"{_URL}/train.json") 57 | download_if_missing(self.download_dir / "val.json", f"{_URL}/val.json") 58 | 59 | def train_dataloader(self): 60 | return self._dataloader("train") 61 | 62 | def val_dataloader(self): 63 | return self._dataloader("val") 64 | 65 | def _dataloader(self, split: str) -> DataLoader: 66 | with open(self.download_dir / f"{split}.json", "r", encoding="utf-8") as file: 67 | data = json.load(file) 68 | 69 | dataset = SFTDataset( 70 | data=data, 71 | tokenizer=self.tokenizer, 72 | prompt_style=self.prompt_style, 73 | max_seq_length=self.max_seq_length, 74 | mask_prompt=self.mask_prompt, 75 | ignore_index=self.ignore_index, 76 | transform=_transform, 77 | ) 78 | return DataLoader( 79 | dataset=dataset, 80 | batch_size=self.batch_size, 81 | shuffle=(split == "train"), 82 | generator=torch.Generator().manual_seed(self.seed), 83 | num_workers=self.num_workers, 84 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index), 85 | ) 86 | 87 | 88 | def _transform(item: dict) -> dict: 89 | item["instruction"] = item.pop("input") 90 | return item 91 | -------------------------------------------------------------------------------- /litgpt/args.py: 
-------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import math 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | import warnings 6 | 7 | 8 | @dataclass 9 | class TrainArgs: 10 | """Training-related arguments""" 11 | 12 | save_interval: Optional[int] = 1000 13 | """Number of optimizer steps between saving checkpoints""" 14 | log_interval: int = 1 15 | """Number of iterations between logging calls""" 16 | global_batch_size: int = 64 17 | """Number of samples between optimizer steps across data-parallel ranks""" 18 | micro_batch_size: int = 4 19 | """Number of samples per data-parallel rank""" 20 | lr_warmup_steps: Optional[int] = 100 21 | """Number of iterations with learning rate warmup active""" 22 | lr_warmup_fraction: Optional[float] = None 23 | """The fraction of an epoch to use for learning rate warmup""" 24 | epochs: Optional[int] = None 25 | """Number of epochs to train on""" 26 | # TODO: `pretrain` is the only script using `max_tokens` explicitly. replace it with epoch_size*epochs? 27 | max_tokens: Optional[int] = None 28 | """Total number of tokens to train on""" 29 | max_steps: Optional[int] = None 30 | """Limits the number of optimizer steps to run""" 31 | max_seq_length: Optional[int] = None 32 | """Limits the length of samples""" 33 | tie_embeddings: Optional[bool] = None 34 | """Whether to tie the embedding weights with the language modeling head weights""" 35 | 36 | # Optimization args 37 | max_norm: Optional[float] = None 38 | min_lr: float = 6e-5 39 | 40 | def __post_init__(self) -> None: 41 | if self.lr_warmup_fraction and self.lr_warmup_steps: 42 | raise ValueError( 43 | "Can't provide both `--train.lr_warmup_fraction` and `--train.lr_warmup_steps`. Choose one." 44 | ) 45 | if self.lr_warmup_fraction and not (0 <= self.lr_warmup_fraction <= 1): 46 | raise ValueError("`--train.lr_warmup_fraction` must be between 0 and 1.") 47 | 48 | if self.lr_warmup_steps and self.max_steps and (self.lr_warmup_steps >= self.max_steps): 49 | warnings.warn( 50 | "`--train.lr_warmup_steps` should be less than `--train.max_steps`." 
51 | f" Got {self.lr_warmup_steps} lr_warmup_steps and {self.max_steps} max_steps.", UserWarning) 52 | 53 | def gradient_accumulation_iters(self, devices: int) -> int: 54 | """Number of iterations between gradient synchronizations""" 55 | gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size 56 | assert gradient_accumulation_iters > 0 57 | return gradient_accumulation_iters 58 | 59 | def batch_size(self, devices: int) -> int: 60 | """Number of samples between optimizer steps per data-parallel rank""" 61 | batch_size = self.global_batch_size // devices 62 | assert batch_size > 0 63 | return batch_size 64 | 65 | def warmup_iters(self, devices: int, max_iters: int, train_dataloader) -> int: 66 | """Number of iterations to warm up the learning rate.""" 67 | if self.lr_warmup_fraction: 68 | return min(max_iters, math.ceil(self.lr_warmup_fraction * len(train_dataloader))) 69 | if self.lr_warmup_steps: 70 | return min(max_iters, self.lr_warmup_steps * self.gradient_accumulation_iters(devices)) 71 | return 0 72 | 73 | 74 | @dataclass 75 | class EvalArgs: 76 | """Evaluation-related arguments""" 77 | 78 | interval: int = 600 79 | """Number of optimizer steps between evaluation calls""" 80 | max_new_tokens: Optional[int] = None 81 | """Number of tokens to generate""" 82 | max_iters: int = 100 83 | """Number of iterations""" 84 | initial_validation: bool = False 85 | """Whether to evaluate on the validation set at the beginning of the training""" 86 | final_validation: bool = True 87 | """Whether to evaluate on the validation set at the end of the training""" 88 | -------------------------------------------------------------------------------- /tests/test_batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytest 3 | import warnings 4 | from pathlib import Path 5 | import litgpt 6 | from litgpt.generate.base import next_token, batched_next_token 7 | from litgpt.api import LLM, GPT 8 | from litgpt.scripts.download import download_from_hub 9 | from tests.conftest import RunIf 10 | 11 | warnings.filterwarnings("ignore") 12 | 13 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="Test requires a GPU.") 14 | def test_batched_equivalence(tmp_path): 15 | 16 | model_name = "microsoft/phi-2" 17 | download_from_hub(repo_id=model_name, tokenizer_only=True, checkpoint_dir=tmp_path) 18 | 19 | device = "cuda:0" 20 | batch_size = 3 21 | sample_kwargs = {"top_k": 1} 22 | 23 | llm: LLM = LLM.load( 24 | model_name, 25 | tokenizer_dir=Path(tmp_path / model_name), 26 | init="random", 27 | ) 28 | model: GPT = llm.model 29 | model.set_kv_cache(batch_size=1, max_seq_length=50, device=device) 30 | 31 | input_pos_1 = torch.tensor( 32 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=torch.int64, device=device 33 | ) 34 | input_pos_2 = torch.tensor([10], dtype=torch.int64, device=device) 35 | 36 | x = torch.tensor( 37 | [43993, 25, 1867, 466, 32660, 17485, 4483, 30, 198, 26410], 38 | device=device, 39 | dtype=torch.int64, 40 | ) 41 | 42 | batch_x1 = torch.stack([x] * batch_size, dim=0) 43 | 44 | # Single token generation baseline 45 | tok_1 = next_token(model, input_pos_1, x.unsqueeze(0), **sample_kwargs) 46 | tok_2 = next_token(model, input_pos_2, tok_1.unsqueeze(0), **sample_kwargs) 47 | 48 | assert tok_1.ndim == 1 49 | assert tok_2.ndim == 1 50 | assert tok_1.size(0) == 1 51 | assert tok_2.size(0) == 1 52 | 53 | # Switch to batched generation 54 | model.clear_kv_cache() 55 | model.set_kv_cache(batch_size=batch_size, 
max_seq_length=50, device="cuda:0") 56 | 57 | toks_1: torch.Tensor = batched_next_token(model, input_pos_1, batch_x1, sample_kwargs) 58 | toks_2: torch.Tensor = batched_next_token(model, input_pos_2, toks_1, sample_kwargs) 59 | 60 | assert toks_1.ndim == 2 61 | assert toks_2.ndim == 2 62 | assert toks_1.size(0) == batch_size 63 | assert toks_2.size(0) == batch_size 64 | 65 | # Assert that single and batched next token generation are equivalent 66 | assert all(t == tok_1 for t in toks_1), f"{tok_1} != {toks_1}" 67 | assert all(t == tok_2 for t in toks_2), f"{tok_2} != {toks_2}" 68 | 69 | 70 | @RunIf(min_cuda_gpus=1) 71 | def test_simple_batch(): 72 | old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 73 | torch.backends.cuda.matmul.allow_tf32 = False 74 | config = litgpt.Config.from_name( 75 | "Llama-3.1-8B", padded_vocab_size=10000, n_layer=2, n_head=8, n_embd=256 76 | ) 77 | with torch.device("cuda"): 78 | m = litgpt.GPT(config).requires_grad_(False).eval() 79 | x0 = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 7]]) 80 | input_pos0 = torch.tensor([[0, 1, 2, 3], [0, 1, 2, 2]]) 81 | x1 = torch.tensor([[1], [2]]) 82 | input_pos1 = torch.tensor([[4], [3]]) 83 | 84 | with torch.device("cuda"): 85 | m.set_kv_cache(2) 86 | outs0 = m(x0, input_pos0) 87 | outs1 = m(x1, input_pos1) 88 | 89 | with torch.device("cuda"): 90 | m.set_kv_cache(1) 91 | 92 | outs0_ref0 = m(x0[:1], input_pos0[0]) 93 | outs1_ref0 = m(x1[:1], input_pos1[0]) 94 | 95 | with torch.device("cuda"): 96 | m.set_kv_cache(1) 97 | 98 | outs0_ref1 = m(x0[1:], input_pos0[1]) 99 | outs1_ref1 = m(x1[1:], input_pos1[1]) 100 | 101 | outs0_ref = torch.cat([outs0_ref0, outs0_ref1]) 102 | outs1_ref = torch.cat([outs1_ref0, outs1_ref1]) 103 | 104 | print(outs0_ref - outs0) 105 | print(outs0.shape) 106 | torch.testing.assert_close(outs0, outs0_ref) 107 | torch.testing.assert_close(outs1, outs1_ref) 108 | torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 109 | -------------------------------------------------------------------------------- /tests/test_thunder_ddp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | import pytest 5 | import torch 6 | from tests.conftest import RunIf 7 | from lightning import Fabric 8 | 9 | # support running without installing as a package 10 | wd = Path(__file__).parent.parent.resolve() 11 | sys.path.append(str(wd)) 12 | 13 | from extensions.thunder.strategies.thunder_ddp import ThunderDDPStrategy 14 | from extensions.thunder.strategies.thunder_fsdp import ThunderFSDPStrategy 15 | 16 | 17 | @RunIf(thunder=True) 18 | def test_thunder_strategy_input_parsing(): 19 | with pytest.raises(ValueError, match="doesn't have an effect with `jit=False"): 20 | ThunderDDPStrategy(jit=False, executors=("python",)) 21 | 22 | 23 | @RunIf(min_cuda_gpus=2, thunder=True, standalone=True) 24 | @pytest.mark.parametrize("choice", ["ddp", "thunder_ddp", "fsdp", "thunder_fsdp"]) 25 | def test_no_backward_sync(choice): 26 | if choice == "thunder_ddp": 27 | strategy = ThunderDDPStrategy() 28 | elif choice == "thunder_fsdp": 29 | strategy = ThunderFSDPStrategy() 30 | else: 31 | strategy = choice 32 | 33 | fabric = Fabric(devices=2, accelerator="cuda", strategy=strategy) 34 | fabric.launch() 35 | 36 | # account for sharding in the case of FSDP 37 | out_features = 1 if "ddp" in choice else fabric.world_size 38 | 39 | model = torch.nn.Linear(1, out_features, bias=False, device=fabric.device) 40 | x = torch.randn(1, 1, device=fabric.device) 41 | model = 
fabric.setup(model) 42 | 43 | # 6 iters, 3 grad accumulation iters 44 | for i, enabled in enumerate((True, True, False, True, True, False), 1): 45 | x = torch.tensor([i * (fabric.local_rank + 1)], device=fabric.device, dtype=torch.float32) 46 | 47 | with fabric.no_backward_sync(model, enabled): 48 | y = model(x) 49 | fabric.backward(y.sum()) 50 | if not enabled: 51 | # Math for the first 3 iters 52 | # 53 | # DistributedDataParallel 54 | # (1*1+2*1+3*1 + 1*2+2*2+3*2) / 2 = 9 55 | # ^^^^^^^^^^^ ^^^^^^^^^^^ ^^^ 56 | # rank0 rank1 allreduce 57 | # 58 | # thunder.distributed.ddp 59 | # ((1*1+2*1) + (1*2+2*2)) / 2 + (3*1 + 3*2) / 2 = 9 60 | # ^^^^^^^ ^^^^^^^ ^^^ ^^^ ^^^ ^^^ 61 | # rank0 rank1 allreduce1 rank0 rank1 allreduce2 62 | assert model.weight.grad.shape.numel() == 1, model.weight.grad.shape 63 | assert model.weight.grad.item() == (9.0 if i == 3 else 22.5) 64 | assert not hasattr(model.weight, "_thunder_fsdp_unsharded_grad") 65 | model.weight.grad = None 66 | elif choice == "thunder_fsdp": 67 | assert model.weight._thunder_fsdp_unsharded_grad.shape == (2, 1) 68 | assert model.weight.grad is None 69 | 70 | 71 | @RunIf(min_cuda_gpus=2, thunder=True, standalone=True) 72 | @pytest.mark.parametrize("jit", (False, True)) 73 | def test_jit_before_setup(jit): 74 | import thunder 75 | 76 | fabric = Fabric(devices=2, accelerator="cuda", strategy=ThunderDDPStrategy(jit=jit)) 77 | fabric.launch() 78 | 79 | x = torch.randn(1, 1, device=fabric.device) 80 | model = torch.nn.Linear(1, 2, bias=False, device=fabric.device) 81 | 82 | tmodel = thunder.jit(model) 83 | fmodel = fabric.setup(tmodel) 84 | fmodel(x) 85 | 86 | assert "all_reduce" in thunder.last_backward_traces(tmodel)[-1].python() 87 | 88 | 89 | @RunIf(min_cuda_gpus=1, thunder=True) 90 | def test_setup_already_traced(): 91 | import thunder 92 | 93 | device = torch.device("cuda") 94 | x = torch.randn(1, 1, device=device) 95 | model = torch.nn.Linear(1, 2, bias=False, device=device) 96 | 97 | strategy = ThunderDDPStrategy() 98 | 99 | tmodel = thunder.jit(model) 100 | tmodel(x) 101 | with pytest.raises(RuntimeError, match="already called"): 102 | strategy.setup_module(tmodel) 103 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-2-7b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-llama2-7b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 4 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 18 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 19 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 20 | # (type: Union[bool, Literal["auto"], Path], default: False) 21 | resume: false 22 | 23 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 
24 | data: 25 | class_path: litgpt.data.Alpaca2k 26 | init_args: 27 | mask_prompt: false 28 | prompt_style: alpaca 29 | ignore_index: -100 30 | seed: 42 31 | num_workers: 4 32 | 33 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 34 | train: 35 | 36 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 37 | save_interval: 200 38 | 39 | # Number of iterations between logging calls (type: int, default: 1) 40 | log_interval: 1 41 | 42 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 43 | global_batch_size: 64 44 | 45 | # Number of samples per data-parallel rank (type: int, default: 1) 46 | micro_batch_size: 4 47 | 48 | # Number of iterations with learning rate warmup active (type: int, default: 100) 49 | lr_warmup_steps: 25 50 | 51 | # Number of epochs to train on (type: Optional[int], default: 5) 52 | epochs: 1 53 | 54 | # Total number of tokens to train on (type: Optional[int], default: null) 55 | max_tokens: 56 | 57 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 58 | max_steps: 59 | 60 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 61 | max_seq_length: 512 62 | 63 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 64 | tie_embeddings: 65 | 66 | # (type: Optional[float], default: null) 67 | max_norm: 68 | 69 | # (type: float, default: 6e-05) 70 | min_lr: 6.0e-05 71 | 72 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 73 | eval: 74 | 75 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 76 | interval: 25 77 | 78 | # Number of tokens to generate (type: Optional[int], default: 100) 79 | max_new_tokens: 100 80 | 81 | # Number of iterations (type: int, default: 100) 82 | max_iters: 100 83 | 84 | # Whether to evaluate on the validation set at the beginning of the training 85 | initial_validation: false 86 | 87 | # Whether to evaluate on the validation set at the end the training 88 | final_validation: true 89 | 90 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 91 | logger_name: csv 92 | 93 | # The random seed to use for reproducibility. (type: int, default: 1337) 94 | seed: 1337 95 | 96 | # Optimizer-related arguments 97 | optimizer: 98 | 99 | class_path: torch.optim.AdamW 100 | 101 | init_args: 102 | 103 | # (type: float, default: 0.001) 104 | lr: 0.0002 105 | 106 | # (type: float, default: 0.01) 107 | weight_decay: 0.0 108 | 109 | # (type: tuple, default: (0.9,0.999)) 110 | betas: 111 | - 0.9 112 | - 0.95 113 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3-8b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3-8B 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-llama-3-8b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". 
(type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 4 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 18 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 19 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 20 | # (type: Union[bool, Literal["auto"], Path], default: False) 21 | resume: false 22 | 23 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 24 | data: 25 | class_path: litgpt.data.Alpaca2k 26 | init_args: 27 | mask_prompt: false 28 | prompt_style: alpaca 29 | ignore_index: -100 30 | seed: 42 31 | num_workers: 4 32 | 33 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 34 | train: 35 | 36 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 37 | save_interval: 200 38 | 39 | # Number of iterations between logging calls (type: int, default: 1) 40 | log_interval: 1 41 | 42 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 43 | global_batch_size: 64 44 | 45 | # Number of samples per data-parallel rank (type: int, default: 1) 46 | micro_batch_size: 4 47 | 48 | # Number of iterations with learning rate warmup active (type: int, default: 100) 49 | lr_warmup_steps: 25 50 | 51 | # Number of epochs to train on (type: Optional[int], default: 5) 52 | epochs: 1 53 | 54 | # Total number of tokens to train on (type: Optional[int], default: null) 55 | max_tokens: 56 | 57 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 58 | max_steps: 59 | 60 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 61 | max_seq_length: 512 62 | 63 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 64 | tie_embeddings: 65 | 66 | # (type: Optional[float], default: null) 67 | max_norm: 68 | 69 | # (type: float, default: 6e-05) 70 | min_lr: 6.0e-05 71 | 72 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 73 | eval: 74 | 75 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 76 | interval: 25 77 | 78 | # Number of tokens to generate (type: Optional[int], default: 100) 79 | max_new_tokens: 100 80 | 81 | # Number of iterations (type: int, default: 100) 82 | max_iters: 100 83 | 84 | # Whether to evaluate on the validation set at the beginning of the training 85 | initial_validation: false 86 | 87 | # Whether to evaluate on the validation set at the end the training 88 | final_validation: true 89 | 90 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 91 | logger_name: csv 92 | 93 | # The random seed to use for reproducibility. 
(type: int, default: 1337) 94 | seed: 1337 95 | 96 | # Optimizer-related arguments 97 | optimizer: 98 | 99 | class_path: torch.optim.AdamW 100 | 101 | init_args: 102 | 103 | # (type: float, default: 0.001) 104 | lr: 0.0002 105 | 106 | # (type: float, default: 0.01) 107 | weight_decay: 0.1 108 | 109 | # (type: tuple, default: (0.9,0.999)) 110 | betas: 111 | - 0.9 112 | - 0.95 113 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3.1-8b/full.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3.1-8B 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 6 | out_dir: out/finetune/full-llama-3.1-8b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 12 | devices: 4 13 | 14 | # How many nodes to use. (type: int, default: 1) 15 | num_nodes: 1 16 | 17 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 18 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 19 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 20 | # (type: Union[bool, Literal["auto"], Path], default: False) 21 | resume: false 22 | 23 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 24 | data: 25 | class_path: litgpt.data.Alpaca2k 26 | init_args: 27 | mask_prompt: false 28 | prompt_style: alpaca 29 | ignore_index: -100 30 | seed: 42 31 | num_workers: 4 32 | 33 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 34 | train: 35 | 36 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 37 | save_interval: 200 38 | 39 | # Number of iterations between logging calls (type: int, default: 1) 40 | log_interval: 1 41 | 42 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 43 | global_batch_size: 64 44 | 45 | # Number of samples per data-parallel rank (type: int, default: 1) 46 | micro_batch_size: 4 47 | 48 | # Number of iterations with learning rate warmup active (type: int, default: 100) 49 | lr_warmup_steps: 25 50 | 51 | # Number of epochs to train on (type: Optional[int], default: 5) 52 | epochs: 1 53 | 54 | # Total number of tokens to train on (type: Optional[int], default: null) 55 | max_tokens: 56 | 57 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 58 | max_steps: 59 | 60 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 61 | max_seq_length: 512 62 | 63 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 64 | tie_embeddings: 65 | 66 | # (type: Optional[float], default: null) 67 | max_norm: 68 | 69 | # (type: float, default: 6e-05) 70 | min_lr: 6.0e-05 71 | 72 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 73 | eval: 74 | 75 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 76 | interval: 25 77 | 78 | # Number of tokens to generate (type: Optional[int], default: 100) 79 | max_new_tokens: 100 80 | 81 | # Number of iterations (type: int, default: 100) 82 | max_iters: 100 83 | 84 | # Whether to evaluate on the validation set at the beginning of the training 85 | initial_validation: false 86 | 87 | # Whether to evaluate on the validation set at the end the training 88 | final_validation: true 89 | 90 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 91 | logger_name: csv 92 | 93 | # The random seed to use for reproducibility. (type: int, default: 1337) 94 | seed: 1337 95 | 96 | # Optimizer-related arguments 97 | optimizer: 98 | 99 | class_path: torch.optim.AdamW 100 | 101 | init_args: 102 | 103 | # (type: float, default: 0.001) 104 | lr: 0.0002 105 | 106 | # (type: float, default: 0.01) 107 | weight_decay: 0.1 108 | 109 | # (type: tuple, default: (0.9,0.999)) 110 | betas: 111 | - 0.9 112 | - 0.95 113 | -------------------------------------------------------------------------------- /tests/test_merge_lora.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import os 4 | import shutil 5 | from contextlib import redirect_stdout 6 | from io import StringIO 7 | from pathlib import Path 8 | from unittest import mock 9 | 10 | import pytest 11 | import torch 12 | import yaml 13 | 14 | from litgpt.lora import GPT as LoRAGPT 15 | from litgpt.lora import lora_filter 16 | from litgpt.model import GPT 17 | from litgpt.scripts.merge_lora import load_lora_metadata, merge_lora 18 | 19 | 20 | @mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) 21 | @pytest.mark.parametrize( 22 | ("pretrained_dtype", "lora_dtype"), [(None, None), (torch.float16, torch.float32), (torch.float16, torch.bfloat16)] 23 | ) 24 | def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype): 25 | pretrained_checkpoint_dir = tmp_path / "pretrained" 26 | lora_checkpoint_dir = tmp_path / "lora" 27 | shutil.copytree(fake_checkpoint_dir, pretrained_checkpoint_dir) 28 | shutil.copytree(fake_checkpoint_dir, lora_checkpoint_dir) 29 | (lora_checkpoint_dir / "lit_model.pth").unlink() # should not already exist 30 | shutil.rmtree(tmp_path / "checkpoints") 31 | 32 | # Create a fake pretrained checkpoint 33 | config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16) 34 | with open(pretrained_checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: 35 | yaml.dump(config, fp) 36 | base_model = GPT.from_name("pythia-14m", **config).to(dtype=pretrained_dtype) 37 | state_dict = base_model.state_dict() 38 | assert len(state_dict) == 40 39 | torch.save(state_dict, pretrained_checkpoint_dir / "lit_model.pth") 40 | 41 | # Create a fake LoRA checkpoint 42 | lora_kwargs = dict(lora_r=8, lora_alpha=16, lora_dropout=0.05, lora_query=True, lora_value=True) 43 | lora_model = LoRAGPT.from_name("pythia-14m", **config, **lora_kwargs).to(dtype=lora_dtype) 44 | state_dict = {k: v for k, v in lora_model.state_dict().items() if lora_filter(k, v)} 45 | assert len(state_dict) == 6 46 | torch.save(state_dict, lora_checkpoint_dir / "lit_model.pth.lora") 47 | hparams = dict(checkpoint_dir=str(pretrained_checkpoint_dir), **lora_kwargs) 48 | with 
open(lora_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file: 49 | yaml.dump(hparams, file) 50 | shutil.copyfile(pretrained_checkpoint_dir / "model_config.yaml", lora_checkpoint_dir / "model_config.yaml") 51 | 52 | assert set(os.listdir(tmp_path)) == {"lora", "pretrained"} 53 | merge_lora(lora_checkpoint_dir) 54 | assert set(os.listdir(tmp_path)) == {"lora", "pretrained"} 55 | assert set(os.listdir(lora_checkpoint_dir)) == { 56 | "model_config.yaml", 57 | "lit_model.pth", 58 | "lit_model.pth.lora", 59 | "tokenizer.json", 60 | "tokenizer_config.json", 61 | "hyperparameters.yaml", 62 | } 63 | 64 | # Assert that the merged weights can be loaded back into the base model 65 | merged = torch.load(lora_checkpoint_dir / "lit_model.pth") 66 | keys = base_model.load_state_dict(merged, strict=True) 67 | assert not keys.missing_keys 68 | assert not keys.unexpected_keys 69 | 70 | # Attempt to merge again 71 | stdout = StringIO() 72 | with redirect_stdout(stdout): 73 | merge_lora(lora_checkpoint_dir) 74 | assert "LoRA weights have already been merged" in stdout.getvalue() 75 | 76 | 77 | def test_load_lora_metadata(fake_checkpoint_dir): 78 | assert not (fake_checkpoint_dir / "hyperparameters.yaml").is_file() 79 | with pytest.raises(FileNotFoundError, match="missing a `hyperparameters.yaml` file"): 80 | load_lora_metadata(fake_checkpoint_dir) 81 | 82 | hparams = dict(precision="bf16-mixed", checkpoint_dir="checkpoints/meta-llama/Llama-2-7b", lora_r=8, lora_alpha=16) 83 | with open(fake_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file: 84 | yaml.dump(hparams, file) 85 | 86 | lora_args, pretrained_dir, precision = load_lora_metadata(fake_checkpoint_dir) 87 | assert lora_args == dict(lora_r=8, lora_alpha=16) 88 | assert pretrained_dir == Path("checkpoints/meta-llama/Llama-2-7b") 89 | assert precision == "bf16-mixed" 90 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
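# Editor's note: these are plain pytest tests. Assuming a development install of litgpt
# (for example ``pip install -e ".[test]"``; the extra name is an assumption based on the
# project's pyproject.toml), individual tests can be selected with pytest's -k filter:
#
#   pytest tests/test_config.py -k "from_checkpoint" -q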
2 | 3 | import pytest 4 | import yaml 5 | 6 | import litgpt.config as config_module 7 | from litgpt import Config 8 | 9 | 10 | def test_config(): 11 | config = Config() 12 | assert config.name == "" 13 | assert config.block_size == 4096 14 | 15 | config = Config(block_size=2048) 16 | assert config.block_size == 2048 17 | 18 | config = Config.from_name("pythia-14m") 19 | assert config.block_size == 512 20 | 21 | config = Config.from_name("pythia-14m", block_size=4096) 22 | assert config.block_size == 4096 23 | 24 | config = Config(hf_config={"name": "pythia-14m"}) 25 | assert config.name == "pythia-14m" 26 | 27 | 28 | def test_from_hf_name(): 29 | # by short-hand name 30 | config0 = Config.from_name("tiny-llama-1.1b") 31 | # or by huggingface hub repo name 32 | config1 = Config.from_name("TinyLlama-1.1B-intermediate-step-1431k-3T") 33 | assert config0 is not None 34 | assert config1 is not None 35 | assert config0 == config1 36 | 37 | 38 | def test_nonexisting_name(): 39 | with pytest.raises(ValueError, match="'invalid-model-name' is not a supported config name"): 40 | Config.from_name("invalid-model-name") 41 | 42 | 43 | @pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs]) 44 | def test_short_and_hf_names_are_equal_unless_on_purpose(config): 45 | # by short-hand name 46 | config0 = Config.from_name(config["name"]) 47 | # or by huggingface hub repo name 48 | config1 = Config.from_name(config["hf_config"]["name"]) 49 | assert config0.name == config1.name 50 | 51 | 52 | def test_from_hf_name_with_org_string(): 53 | # Test case 1: valid input 54 | config0 = Config.from_name("tiny-llama-1.1b") 55 | config1 = Config.from_name("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T") 56 | assert config0 is not None 57 | assert config1 is not None 58 | assert config0 == config1 59 | 60 | # Test case 2: invalid input - org not found 61 | with pytest.raises(ValueError, match="'UnknownOrg/TinyLlama-1.1B-intermediate-step-1431k-3T' is not a supported config name"): 62 | Config.from_name("UnknownOrg/TinyLlama-1.1B-intermediate-step-1431k-3T") 63 | 64 | # Test case 3: invalid input - name not found 65 | with pytest.raises(ValueError, match="'TinyLlama/TinyLlama-XYZ' is not a supported config name"): 66 | Config.from_name("TinyLlama/TinyLlama-XYZ") 67 | 68 | 69 | def test_from_checkpoint(tmp_path): 70 | # 1. Neither `model_config.yaml` nor matching config exists. 71 | with pytest.raises(FileNotFoundError, match="neither 'model_config.yaml' nor matching config exists"): 72 | Config.from_checkpoint(tmp_path / "non_existing_checkpoint") 73 | 74 | # 2. If `model_config.yaml` doesn't exist, but there is a matching config in `litgpt/config.py`. 75 | config = Config.from_checkpoint(tmp_path / "pythia-14m") 76 | assert config.name == "pythia-14m" 77 | assert config.block_size == 512 78 | assert config.n_layer == 6 79 | 80 | # 3. If only `model_config.yaml` exists. 81 | config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2} 82 | with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file: 83 | yaml.dump(config_data, file) 84 | config = Config.from_checkpoint(tmp_path) 85 | assert config.name == "pythia-14m" 86 | assert config.block_size == 24 87 | assert config.n_layer == 2 88 | 89 | # 4.
Both `model_config.yaml` and a matching config exist, but `model_config.yaml` supersedes the matching config 90 | (tmp_path / "pythia-14m").mkdir() 91 | with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file: 92 | yaml.dump(config_data, file) 93 | config = Config.from_checkpoint(tmp_path / "pythia-14m") 94 | assert config.name == "pythia-14m" 95 | assert config.block_size == 24 96 | assert config.n_layer == 2 97 | 98 | 99 | @pytest.mark.parametrize("head_size", [None, 128]) 100 | def test_head_size(head_size): 101 | config = Config(head_size=head_size) 102 | 103 | assert config.head_size == (head_size or config.n_embd // config.n_head) 104 | -------------------------------------------------------------------------------- /config_hub/pretrain/debug.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 3 | # ``model_config``. (type: Optional[str], default: null) 4 | model_name: pythia-14m 5 | 6 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 7 | # ``model_name``. (type: Optional[Config], default: null) 8 | model_config: 9 | 10 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 11 | # /teamspace/jobs//share. (type: , default: out/pretrain) 12 | out_dir: out/pretrain/debug 13 | 14 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 15 | precision: bf16-mixed 16 | 17 | # Optional path to a checkpoint directory to initialize the model from. 18 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 19 | initial_checkpoint_dir: 20 | 21 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 22 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 23 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 24 | # (type: Union[bool, Literal["auto"], Path], default: False) 25 | resume: false 26 | 27 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 28 | data: TinyStories 29 | 30 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 31 | train: 32 | 33 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 34 | save_interval: 1000 35 | 36 | # Number of iterations between logging calls (type: int, default: 1) 37 | log_interval: 1 38 | 39 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512) 40 | global_batch_size: 125 41 | 42 | # Number of samples per data-parallel rank (type: int, default: 4) 43 | micro_batch_size: 5 44 | 45 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 46 | lr_warmup_steps: 100 47 | 48 | # Number of epochs to train on (type: Optional[int], default: null) 49 | epochs: 50 | 51 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 52 | max_tokens: 100000000 53 | 54 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 55 | max_steps: 56 | 57 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 58 | max_seq_length: 59 | 60 | # Whether to tie the embedding weights with the language modeling head weights.
(type: Optional[bool], default: False) 61 | tie_embeddings: 62 | 63 | # (type: Optional[float], default: 1.0) 64 | max_norm: 1.0 65 | 66 | # (type: float, default: 4e-05) 67 | min_lr: 6e-5 68 | 69 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 70 | eval: 71 | 72 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 73 | interval: 1000 74 | 75 | # Number of tokens to generate (type: Optional[int], default: null) 76 | max_new_tokens: 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: false 86 | 87 | # Optimizer-related arguments 88 | optimizer: 89 | 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | 94 | # (type: float, default: 0.001) 95 | lr: 6e-4 96 | 97 | # (type: float, default: 0.01) 98 | weight_decay: 0.1 99 | 100 | # (type: tuple, default: (0.9,0.999)) 101 | betas: 102 | - 0.9 103 | - 0.95 104 | 105 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 106 | devices: auto 107 | 108 | # How many nodes to use. (type: int, default: 1) 109 | num_nodes: 1 110 | 111 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 112 | # module require this. (type: Optional[Path], default: null) 113 | tokenizer_dir: checkpoints/EleutherAI/pythia-14m 114 | 115 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 116 | logger_name: tensorboard 117 | 118 | # The random seed to use for reproducibility. (type: int, default: 42) 119 | seed: 42 120 | -------------------------------------------------------------------------------- /config_hub/pretrain/tinyllama.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 3 | # ``model_config``. (type: Optional[str], default: null) 4 | model_name: tiny-llama-1.1b 5 | 6 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 7 | # ``model_config``. (type: Optional[Config], default: null) 8 | model_config: 9 | 10 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 11 | # /teamspace/jobs//share. (type: , default: out/pretrain) 12 | out_dir: out/pretrain/tiny-llama 13 | 14 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 15 | precision: bf16-mixed 16 | 17 | # Optional path to a checkpoint directory to initialize the model from. 18 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 19 | initial_checkpoint_dir: 20 | 21 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 22 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 23 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 24 | # (type: Union[bool, Literal["auto"], Path], default: False) 25 | resume: false 26 | 27 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 
28 | data: TinyLlama 29 | 30 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 31 | train: 32 | 33 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 34 | save_interval: 1000 35 | 36 | # Number of iterations between logging calls (type: int, default: 1) 37 | log_interval: 1 38 | 39 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512) 40 | global_batch_size: 512 41 | 42 | # Number of samples per data-parallel rank (type: int, default: 4) 43 | micro_batch_size: 4 44 | 45 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 46 | lr_warmup_steps: 2000 47 | 48 | # Number of epochs to train on (type: Optional[int], default: null) 49 | epochs: 50 | 51 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 52 | max_tokens: 3000000000000 53 | 54 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 55 | max_steps: 56 | 57 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 58 | max_seq_length: 2048 59 | 60 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 61 | tie_embeddings: 62 | 63 | # (type: Optional[float], default: 1.0) 64 | max_norm: 1.0 65 | 66 | # (type: float, default: 4e-05) 67 | min_lr: 4.0e-05 68 | 69 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 70 | eval: 71 | 72 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 73 | interval: 1000 74 | 75 | # Number of tokens to generate (type: Optional[int], default: null) 76 | max_new_tokens: 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: false 86 | 87 | # Optimizer-related arguments 88 | optimizer: 89 | 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | 94 | # (type: float, default: 0.001) 95 | lr: 4e-4 96 | 97 | # (type: float, default: 0.01) 98 | weight_decay: 0.1 99 | 100 | # (type: tuple, default: (0.9,0.999)) 101 | betas: 102 | - 0.9 103 | - 0.95 104 | 105 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 106 | devices: auto 107 | 108 | # How many nodes to use. (type: int, default: 1) 109 | num_nodes: 1 110 | 111 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 112 | # module require this. (type: Optional[Path], default: null) 113 | tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf 114 | 115 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 116 | logger_name: tensorboard 117 | 118 | # The random seed to use for reproducibility. (type: int, default: 42) 119 | seed: 42 120 | -------------------------------------------------------------------------------- /litgpt/data/tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
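# NOTE: the snippet below is a hypothetical usage sketch (it is not part of this module) showing how
# the data module defined here is typically wired up; the tokenizer directory is a placeholder that
# must point to a downloaded checkpoint:
#
#     from litgpt.data import TinyLlama
#     from litgpt.tokenizer import Tokenizer
#
#     data = TinyLlama(data_path="data/", num_workers=8)
#     data.connect(tokenizer=Tokenizer("checkpoints/meta-llama/Llama-2-7b-hf"), batch_size=4, max_seq_length=2048)
#     data.prepare_data()
#     train_loader = data.train_dataloader()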
2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | 6 | from torch.utils.data import DataLoader 7 | 8 | from litgpt.tokenizer import Tokenizer 9 | from litgpt.data import DataModule 10 | 11 | 12 | @dataclass 13 | class TinyLlama(DataModule): 14 | """The TinyLlama data module is composed of a mix of SlimPajama and Starcoder data. 15 | 16 | Provides training and validation streaming dataloaders that return batches of tokens. 17 | """ 18 | 19 | data_path: Union[str, Path] = Path("data/") 20 | """The path to the data directory, containing two folders 'slimpajama' and 'starcoder' 21 | which are the output of the preprocessing step done in advance. See the `tutorial/pretrain_tinyllama.md` 22 | for instructions. The path can also be a remote path (e.g., s3://).""" 23 | seed: int = 42 24 | """The random seed for shuffling the dataset.""" 25 | num_workers: int = 8 26 | """How many DataLoader processes to use for loading.""" 27 | use_starcoder: bool = True 28 | """Toggle for using Starcoder data.""" 29 | 30 | batch_size: int = field(init=False, repr=False, default=1) 31 | seq_length: int = field(init=False, repr=False, default=2048) 32 | 33 | def __post_init__(self): 34 | super().__init__() 35 | # Could be a remote path (s3://) or a local path 36 | self.slimpajama_train = str(self.data_path).rstrip("/") + "/slimpajama/train" 37 | self.slimpajama_val = str(self.data_path).rstrip("/") + "/slimpajama/val" 38 | self.required_paths = [self.slimpajama_train, self.slimpajama_val] 39 | 40 | if self.use_starcoder: 41 | self.starcoder_train = str(self.data_path).rstrip("/") + "/starcoder" 42 | self.required_paths += [self.starcoder_train] 43 | 44 | def connect( 45 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 46 | ) -> None: 47 | self.batch_size = batch_size 48 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 49 | 50 | def prepare_data(self) -> None: 51 | for path in self.required_paths: 52 | if not path.startswith("s3://") and not Path(path).is_dir(): 53 | raise FileNotFoundError( 54 | "The data path for TinyLlama is expected to be the directory containing these subdirectories:" 55 | f" `slimpajama/train`, `slimpajama/val`, `starcoder`. The directory {path} does not exist." 
56 | " Set it via `--data.data_path=...`" 57 | ) 58 | 59 | def train_dataloader(self) -> DataLoader: 60 | from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader 61 | 62 | slim_train_data = StreamingDataset( 63 | input_dir=self.slimpajama_train, 64 | item_loader=TokensLoader(block_size=self.seq_length), 65 | shuffle=True, 66 | drop_last=True, 67 | ) 68 | train_data = slim_train_data 69 | 70 | if self.use_starcoder: 71 | train_datasets = [ 72 | slim_train_data, 73 | StreamingDataset( 74 | input_dir=self.starcoder_train, 75 | item_loader=TokensLoader(block_size=self.seq_length), 76 | shuffle=True, 77 | drop_last=True, 78 | ), 79 | ] 80 | 81 | # Mix SlimPajama data and Starcoder data with these proportions: 82 | weights = (0.693584, 0.306416) 83 | train_data = CombinedStreamingDataset( 84 | datasets=train_datasets, seed=self.seed, weights=weights, iterate_over_all=False 85 | ) 86 | 87 | train_dataloader = StreamingDataLoader( 88 | train_data, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 89 | ) 90 | return train_dataloader 91 | 92 | def val_dataloader(self) -> DataLoader: 93 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 94 | 95 | val_dataset = StreamingDataset( 96 | input_dir=self.slimpajama_val, 97 | item_loader=TokensLoader(block_size=self.seq_length), 98 | shuffle=True, 99 | ) 100 | val_dataloader = StreamingDataLoader( 101 | val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 102 | ) 103 | return val_dataloader 104 | -------------------------------------------------------------------------------- /tests/test_pretrain.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | from contextlib import redirect_stdout 5 | from io import StringIO 6 | from unittest import mock 7 | from unittest.mock import ANY, Mock 8 | 9 | import pytest 10 | import torch 11 | from lightning.fabric.strategies import FSDPStrategy, SingleDeviceStrategy 12 | from torch.utils.data import DataLoader 13 | 14 | from litgpt import pretrain 15 | from litgpt.args import EvalArgs, TrainArgs 16 | from litgpt.config import Config 17 | from litgpt.pretrain import initialize_weights 18 | from tests.conftest import RunIf 19 | 20 | 21 | @RunIf(min_cuda_gpus=2, standalone=True) 22 | # Set CUDA_VISIBLE_DEVICES for FSDP hybrid-shard, if fewer GPUs are used than are available 23 | @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) 24 | # If we were to use `save_hyperparameters()`, we would have to patch `sys.argv` or otherwise 25 | # the CLI would capture pytest args, but unfortunately patching would mess with subprocess 26 | # launching, so we need to mock `save_hyperparameters()` 27 | @mock.patch("litgpt.pretrain.save_hyperparameters") 28 | def test_pretrain(_, tmp_path): 29 | model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 30 | 31 | dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) 32 | dataloader = DataLoader(dataset) 33 | pretrain.get_dataloaders = Mock(return_value=(dataloader, dataloader)) 34 | 35 | out_dir = tmp_path / "out" 36 | stdout = StringIO() 37 | with redirect_stdout(stdout): 38 | pretrain.setup( 39 | "pythia-14m", 40 | devices=2, 41 | model_config=model_config, 42 | out_dir=out_dir, 43 | train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), 44 | eval=EvalArgs(interval=1, max_iters=1, final_validation=False), 45 | ) 46 | 47 | if torch.distributed.get_rank() == 0: 48 | # tmp_path is not the same across all ranks, run assert only on rank 0 49 | out_dir_contents = set(os.listdir(out_dir)) 50 | checkpoint_dirs = {"step-00000001", "step-00000002", "step-00000003", "step-00000004", "final"} 51 | assert checkpoint_dirs.issubset(out_dir_contents) 52 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 53 | for checkpoint_dir in checkpoint_dirs: 54 | # the `tokenizer_dir` is None by default, so only 'lit_model.pth' shows here 55 | assert set(os.listdir(out_dir / checkpoint_dir)) == {"lit_model.pth", "model_config.yaml"} 56 | 57 | assert (out_dir / "logs" / "tensorboard" / "version_0").is_dir() 58 | 59 | # logs only appear on rank 0 60 | logs = stdout.getvalue() 61 | assert logs.count("(step)") == 4 62 | assert logs.count("val loss") == 4 63 | assert "Total parameters: 1,888" in logs 64 | 65 | torch.distributed.barrier() 66 | 67 | 68 | @RunIf(min_cuda_gpus=2, standalone=True) 69 | # Set CUDA_VISIBLE_DEVICES for FSDP hybrid-shard, if fewer GPUs are used than are available 70 | @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) 71 | @mock.patch("litgpt.pretrain.L.Fabric.load_raw") 72 | # See comment in `test_pretrain` why we need to mock `save_hyperparameters()` 73 | @mock.patch("litgpt.pretrain.save_hyperparameters") 74 | def test_initial_checkpoint_dir(_, load_mock, tmp_path): 75 | model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 76 | 77 | dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) 78 | dataloader = DataLoader(dataset) 79 | pretrain.get_dataloaders = Mock(return_value=(dataloader, dataloader)) 80 | pretrain.fit = Mock() 81 | 82 | pretrain.setup("pythia-14m", initial_checkpoint_dir=tmp_path, 
devices=2, model_config=model_config, out_dir=tmp_path) 83 | 84 | load_mock.assert_called_once_with(tmp_path / "lit_model.pth", ANY) 85 | 86 | 87 | @pytest.mark.parametrize(("strategy", "expected"), [(SingleDeviceStrategy, True), (FSDPStrategy, False)]) 88 | def test_initialize_weights(strategy, expected): 89 | fabric_mock = Mock() 90 | fabric_mock.strategy = Mock(spec=strategy) 91 | 92 | class Child(torch.nn.Module): 93 | pass 94 | 95 | class Parent(torch.nn.Module): 96 | def __init__(self): 97 | super().__init__() 98 | self.child = Child() 99 | 100 | model = Parent() 101 | model.reset_parameters = Mock() 102 | model.child.reset_parameters = Mock() 103 | 104 | initialize_weights(fabric_mock, model, n_layer=2, n_embd=8) 105 | assert model.reset_parameters.call_count == int(expected) 106 | assert model.child.reset_parameters.call_count == int(expected) 107 | -------------------------------------------------------------------------------- /config_hub/pretrain/microllama.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 3 | # ``model_config``. (type: Optional[str], default: null) 4 | model_name: micro-llama-300M 5 | 6 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 7 | # ``model_config``. (type: Optional[Config], default: null) 8 | model_config: 9 | 10 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 11 | # /teamspace/jobs//share. (type: , default: out/pretrain) 12 | out_dir: out/pretrain/micro-llama 13 | 14 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 15 | precision: bf16-mixed 16 | 17 | # Optional path to a checkpoint directory to initialize the model from. 18 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 19 | initial_checkpoint_dir: 20 | 21 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 22 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 23 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 24 | # (type: Union[bool, Literal["auto"], Path], default: False) 25 | resume: false 26 | 27 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 28 | data: MicroLlama 29 | 30 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 31 | train: 32 | 33 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 34 | save_interval: 1000 35 | 36 | # Number of iterations between logging calls (type: int, default: 1) 37 | log_interval: 1 38 | 39 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 48) 40 | # Scale this number according to the number of GPU and memory size per GPU 41 | # For example, we used 48 for 4 x 24G 4090 42 | global_batch_size: 48 43 | 44 | # Number of samples per data-parallel rank (type: int, default: 12) 45 | # Scale this number according to the memory size per GPU 46 | # For example, we used 12 for 24G 4090 47 | micro_batch_size: 12 48 | 49 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 50 | lr_warmup_steps: 2000 51 | 52 | # Number of epochs to train on (type: Optional[int], default: null) 53 | epochs: 54 | 55 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 56 | max_tokens: 3000000000000 57 | 58 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 59 | max_steps: 60 | 61 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 62 | max_seq_length: 2048 63 | 64 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 65 | tie_embeddings: 66 | 67 | # (type: Optional[float], default: 1.0) 68 | max_norm: 1.0 69 | 70 | # (type: float, default: 4e-05) 71 | min_lr: 4.0e-05 72 | 73 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 74 | eval: 75 | 76 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 77 | interval: 1000 78 | 79 | # Number of tokens to generate (type: Optional[int], default: null) 80 | max_new_tokens: 81 | 82 | # Number of iterations (type: int, default: 100) 83 | max_iters: 100 84 | 85 | # Whether to evaluate on the validation set at the beginning of the training 86 | initial_validation: false 87 | 88 | # Optimizer-related arguments 89 | optimizer: 90 | 91 | class_path: torch.optim.AdamW 92 | 93 | init_args: 94 | 95 | # (type: float, default: 0.001) 96 | lr: 4e-4 97 | 98 | # (type: float, default: 0.01) 99 | weight_decay: 0.1 100 | 101 | # (type: tuple, default: (0.9,0.999)) 102 | betas: 103 | - 0.9 104 | - 0.95 105 | 106 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 107 | devices: auto 108 | 109 | # How many nodes to use. (type: int, default: 1) 110 | num_nodes: 1 111 | 112 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 113 | # module require this. (type: Optional[Path], default: null) 114 | tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf 115 | 116 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 117 | logger_name: tensorboard 118 | 119 | # The random seed to use for reproducibility. (type: int, default: 42) 120 | seed: 42 121 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/lora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. 
(type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/lora-phi-3 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # The LoRA rank. (type: int, default: 8) 18 | lora_r: 8 19 | 20 | # The LoRA alpha. (type: int, default: 16) 21 | lora_alpha: 16 22 | 23 | # The LoRA dropout value. (type: float, default: 0.05) 24 | lora_dropout: 0.05 25 | 26 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 27 | lora_query: true 28 | 29 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 30 | lora_key: true 31 | 32 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 33 | lora_value: true 34 | 35 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 36 | lora_projection: true 37 | 38 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 39 | lora_mlp: true 40 | 41 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 42 | lora_head: true 43 | 44 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 45 | data: 46 | class_path: litgpt.data.Alpaca2k 47 | init_args: 48 | mask_prompt: false 49 | val_split_fraction: 0.03847 50 | prompt_style: alpaca 51 | ignore_index: -100 52 | seed: 42 53 | num_workers: 4 54 | 55 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 56 | train: 57 | 58 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 59 | save_interval: 800 60 | 61 | # Number of iterations between logging calls (type: int, default: 1) 62 | log_interval: 1 63 | 64 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 65 | global_batch_size: 8 66 | 67 | # Number of samples per data-parallel rank (type: int, default: 4) 68 | micro_batch_size: 4 69 | 70 | # Number of iterations with learning rate warmup active (type: int, default: 100) 71 | lr_warmup_steps: 10 72 | 73 | # Number of epochs to train on (type: Optional[int], default: 5) 74 | epochs: 1 75 | 76 | # Total number of tokens to train on (type: Optional[int], default: null) 77 | max_tokens: 78 | 79 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 80 | max_steps: 81 | 82 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 83 | max_seq_length: 512 84 | 85 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 86 | tie_embeddings: 87 | 88 | # (type: Optional[float], default: null) 89 | max_norm: 90 | 91 | # (type: float, default: 6e-05) 92 | min_lr: 6.0e-05 93 | 94 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 95 | eval: 96 | 97 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 98 | interval: 100 99 | 100 | # Number of tokens to generate (type: Optional[int], default: 100) 101 | max_new_tokens: 100 102 | 103 | # Number of iterations (type: int, default: 100) 104 | max_iters: 100 105 | 106 | # Whether to evaluate on the validation set at the beginning of the training 107 | initial_validation: false 108 | 109 | # Whether to evaluate on the validation set at the end the training 110 | final_validation: true 111 | 112 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 113 | logger_name: csv 114 | 115 | # The random seed to use for reproducibility. (type: int, default: 1337) 116 | seed: 1337 117 | 118 | # Optimizer-related arguments 119 | optimizer: 120 | 121 | class_path: torch.optim.AdamW 122 | 123 | init_args: 124 | 125 | # (type: float, default: 0.001) 126 | lr: 0.0002 127 | 128 | # (type: float, default: 0.01) 129 | weight_decay: 0.0 130 | 131 | # (type: tuple, default: (0.9,0.999)) 132 | betas: 133 | - 0.9 134 | - 0.95 135 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/qlora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/qlora-phi-3 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: bnb.nf4 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # The LoRA rank. (type: int, default: 8) 18 | lora_r: 8 19 | 20 | # The LoRA alpha. (type: int, default: 16) 21 | lora_alpha: 16 22 | 23 | # The LoRA dropout value. (type: float, default: 0.05) 24 | lora_dropout: 0.05 25 | 26 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 27 | lora_query: true 28 | 29 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 30 | lora_key: true 31 | 32 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 33 | lora_value: true 34 | 35 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 36 | lora_projection: true 37 | 38 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 39 | lora_mlp: true 40 | 41 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 42 | lora_head: true 43 | 44 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 45 | data: 46 | class_path: litgpt.data.Alpaca2k 47 | init_args: 48 | mask_prompt: false 49 | val_split_fraction: 0.03847 50 | prompt_style: alpaca 51 | ignore_index: -100 52 | seed: 42 53 | num_workers: 4 54 | 55 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 56 | train: 57 | 58 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 59 | save_interval: 800 60 | 61 | # Number of iterations between logging calls (type: int, default: 1) 62 | log_interval: 1 63 | 64 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 65 | global_batch_size: 8 66 | 67 | # Number of samples per data-parallel rank (type: int, default: 4) 68 | micro_batch_size: 4 69 | 70 | # Number of iterations with learning rate warmup active (type: int, default: 100) 71 | lr_warmup_steps: 10 72 | 73 | # Number of epochs to train on (type: Optional[int], default: 5) 74 | epochs: 1 75 | 76 | # Total number of tokens to train on (type: Optional[int], default: null) 77 | max_tokens: 78 | 79 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 80 | max_steps: 81 | 82 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 83 | max_seq_length: 512 84 | 85 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 86 | tie_embeddings: 87 | 88 | # (type: Optional[float], default: null) 89 | max_norm: 90 | 91 | # (type: float, default: 6e-05) 92 | min_lr: 6.0e-05 93 | 94 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 95 | eval: 96 | 97 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 98 | interval: 100 99 | 100 | # Number of tokens to generate (type: Optional[int], default: 100) 101 | max_new_tokens: 100 102 | 103 | # Number of iterations (type: int, default: 100) 104 | max_iters: 100 105 | 106 | # Whether to evaluate on the validation set at the beginning of the training 107 | initial_validation: false 108 | 109 | # Whether to evaluate on the validation set at the end the training 110 | final_validation: true 111 | 112 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 113 | logger_name: csv 114 | 115 | # The random seed to use for reproducibility. (type: int, default: 1337) 116 | seed: 1337 117 | 118 | # Optimizer-related arguments 119 | optimizer: 120 | 121 | class_path: torch.optim.AdamW 122 | 123 | init_args: 124 | 125 | # (type: float, default: 0.001) 126 | lr: 0.0002 127 | 128 | # (type: float, default: 0.01) 129 | weight_decay: 0.0 130 | 131 | # (type: tuple, default: (0.9,0.999)) 132 | betas: 133 | - 0.9 134 | - 0.95 135 | -------------------------------------------------------------------------------- /tutorials/convert_lit_models.md: -------------------------------------------------------------------------------- 1 | ## Converting LitGPT weights to Hugging Face Transformers 2 | 3 | LitGPT weights need to be converted to a format that Hugging Face understands with a [conversion script](../litgpt/scripts/convert_lit_checkpoint.py) before our scripts can run. 4 | 5 | We provide a helpful command to convert models LitGPT models back to their equivalent Hugging Face Transformers format: 6 | 7 | ```bash 8 | litgpt convert_from_litgpt checkpoint_dir converted_dir 9 | ``` 10 | 11 | These paths are just placeholders, you will need to customize them based on which finetuning or pretraining command you ran and its configuration. 
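For instance, to convert the final checkpoint of a pretraining run that used the `tiny-llama` config from the config hub (whose `out_dir` is `out/pretrain/tiny-llama`), a hypothetical invocation would be:

```bash
litgpt convert_from_litgpt out/pretrain/tiny-llama/final converted/
```

The second argument is simply the directory where the converted `model.pth` file should be written.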
12 | 13 | ### Loading converted LitGPT checkpoints into transformers 14 | To load the converted checkpoint into a `transformers` model, copy the original `config.json` file from the downloaded base checkpoint directory into the output directory that contains the converted `model.pth` file. 15 | 16 | For example, 17 | 18 | ```bash 19 | cp checkpoints/repo_id/config.json converted/config.json 20 | ``` 21 | 22 | Then, you can load the checkpoint file in a Python session as follows: 23 | 24 | ```python 25 | import torch 26 | from transformers import AutoModel 27 | 28 | 29 | state_dict = torch.load("output_dir/model.pth") 30 | model = AutoModel.from_pretrained( 31 | "output_dir/", local_files_only=True, state_dict=state_dict 32 | ) 33 | ``` 34 | 35 | Alternatively, you can also load the model without copying the `config.json` file as follows: 36 | 37 | ```python 38 | model = AutoModel.from_pretrained("online_repo_id", state_dict=state_dict) 39 | ``` 40 | 41 | 42 | 43 | ### Merging LoRA weights 44 | 45 | Please note that if you want to convert a model that has been finetuned using an adapter like LoRA, these weights should be [merged](../litgpt/scripts/merge_lora.py) into the checkpoint prior to converting. 46 | 47 | ```sh 48 | litgpt merge_lora path/to/lora/checkpoint_dir 49 | ``` 50 | 51 |
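In practice, the two commands are chained: merge the LoRA weights first, then convert. A hypothetical sequence for a LoRA run whose `out_dir` was `out/finetune/lora-mistral-7b` (as in the corresponding config hub file) would be:

```bash
litgpt merge_lora out/finetune/lora-mistral-7b/final
litgpt convert_from_litgpt out/finetune/lora-mistral-7b/final converted/
```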
52 |
53 | 54 | # A finetuning and conversion tutorial 55 | 56 | This section contains a reproducible example for finetuning a LitGPT model and converting it back into a Hugging Face `transformers` model. 57 | 58 | 1. Download a model of interest: 59 | 60 | For convenience, we first specify an environment variable (optional) to avoid copying and pasting the whole path: 61 | 62 | ```bash 63 | export repo_id=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 64 | ``` 65 | 66 | Instead of using TinyLlama, you can replace the `repo_id` target with any other model repository 67 | specifier that is currently supported by LitGPT. You can get a list of supported repository specifiers 68 | by running `litgpt/scripts/download.py` without any additional arguments. 69 | 70 | Then, we download the model we specified via `$repo_id` above: 71 | 72 | ```bash 73 | litgpt download $repo_id 74 | ``` 75 | 76 | 2. Finetune the model: 77 | 78 | 79 | ```bash 80 | export finetuned_dir=out/lit-finetuned-model 81 | 82 | litgpt finetune_lora $repo_id \ 83 | --out_dir $finetuned_dir \ 84 | --train.epochs 1 \ 85 | --data Alpaca 86 | ``` 87 | 88 | 3. Merge LoRA weights: 89 | 90 | Note that this step only applies if the model was finetuned with `litgpt finetune_lora` as shown above; it is not needed if `litgpt finetune_full` was used for finetuning. 91 | 92 | ```bash 93 | litgpt merge_lora $finetuned_dir/final 94 | ``` 95 | 96 | 97 | 4. Convert the finetuned model back into the Hugging Face format: 98 | 99 | ```bash 100 | litgpt convert_from_litgpt $finetuned_dir/final/ out/hf-tinyllama/converted 101 | ``` 102 | 103 | 104 | 5. Load the model into a `transformers` model: 105 | 106 | ```python 107 | import torch 108 | from transformers import AutoModel 109 | 110 | state_dict = torch.load('out/hf-tinyllama/converted/model.pth') 111 | model = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", state_dict=state_dict) 112 | ``` 113 | 114 |   115 | ## Using the LM Evaluation Harness 116 | 117 | To evaluate LitGPT models, use the integrated evaluation utilities based on Eleuther AI's LM Evaluation Harness. For more information, please see the [evaluation](evaluation.md) documentation. 118 | 119 | Alternatively, if you wish to use converted LitGPT models with the LM Evaluation Harness from [Eleuther AI's GitHub repository](https://github.com/EleutherAI/lm-evaluation-harness), you can use the following steps. 120 | 121 | 1. Follow the instructions above to load the model into a Hugging Face transformers model. 122 | 123 | 2. Create a `model.safetensors` file: 124 | 125 | ```python 126 | model.save_pretrained("out/hf-tinyllama/converted/") 127 | ``` 128 | 129 | 3. Copy the tokenizer files into the model-containing directory: 130 | 131 | ```bash 132 | cp checkpoints/$repo_id/tokenizer* out/hf-tinyllama/converted 133 | ``` 134 | 135 | 4. Run the evaluation harness, for example: 136 | 137 | ```bash 138 | lm_eval --model hf \ 139 | --model_args pretrained=out/hf-tinyllama/converted \ 140 | --tasks "hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge" \ 141 | --device "cuda:0" \ 142 | --batch_size 4 143 | ``` -------------------------------------------------------------------------------- /config_hub/finetune/falcon-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/tiiuae/falcon-7b 4 | 5 | # Directory in which to save checkpoints and logs.
(type: , default: out/lora) 6 | out_dir: out/finetune/lora-falcon-7b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # How many nodes to use. (type: int, default: 1) 18 | num_nodes: 1 19 | 20 | # The LoRA rank. (type: int, default: 8) 21 | lora_r: 32 22 | 23 | # The LoRA alpha. (type: int, default: 16) 24 | lora_alpha: 16 25 | 26 | # The LoRA dropout value. (type: float, default: 0.05) 27 | lora_dropout: 0.05 28 | 29 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 30 | lora_query: true 31 | 32 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 33 | lora_key: false 34 | 35 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 36 | lora_value: true 37 | 38 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 39 | lora_projection: false 40 | 41 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 42 | lora_mlp: false 43 | 44 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 45 | lora_head: false 46 | 47 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 48 | data: 49 | class_path: litgpt.data.Alpaca2k 50 | init_args: 51 | mask_prompt: false 52 | prompt_style: alpaca 53 | ignore_index: -100 54 | seed: 42 55 | num_workers: 4 56 | 57 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 58 | train: 59 | 60 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 61 | save_interval: 200 62 | 63 | # Number of iterations between logging calls (type: int, default: 1) 64 | log_interval: 1 65 | 66 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 67 | global_batch_size: 8 68 | 69 | # Number of samples per data-parallel rank (type: int, default: 4) 70 | micro_batch_size: 1 71 | 72 | # Number of iterations with learning rate warmup active (type: int, default: 100) 73 | lr_warmup_steps: 10 74 | 75 | # Number of epochs to train on (type: Optional[int], default: 5) 76 | epochs: 4 77 | 78 | # Total number of tokens to train on (type: Optional[int], default: null) 79 | max_tokens: 80 | 81 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 82 | max_steps: 83 | 84 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 85 | max_seq_length: 512 86 | 87 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 88 | tie_embeddings: 89 | 90 | # (type: Optional[float], default: null) 91 | max_norm: 92 | 93 | # (type: float, default: 6e-05) 94 | min_lr: 6.0e-05 95 | 96 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 97 | eval: 98 | 99 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 100 | interval: 100 101 | 102 | # Number of tokens to generate (type: Optional[int], default: 100) 103 | max_new_tokens: 100 104 | 105 | # Number of iterations (type: int, default: 100) 106 | max_iters: 100 107 | 108 | # Whether to evaluate on the validation set at the beginning of the training 109 | initial_validation: false 110 | 111 | # Whether to evaluate on the validation set at the end the training 112 | final_validation: true 113 | 114 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 115 | logger_name: csv 116 | 117 | # The random seed to use for reproducibility. (type: int, default: 1337) 118 | seed: 1337 119 | 120 | # Optimizer-related arguments 121 | optimizer: 122 | 123 | class_path: torch.optim.AdamW 124 | 125 | init_args: 126 | 127 | # (type: float, default: 0.001) 128 | lr: 0.0002 129 | 130 | # (type: float, default: 0.01) 131 | weight_decay: 0.0 132 | 133 | # (type: tuple, default: (0.9,0.999)) 134 | betas: 135 | - 0.9 136 | - 0.95 137 | -------------------------------------------------------------------------------- /litgpt/data/openwebtext.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import os 3 | from dataclasses import dataclass, field 4 | from functools import partial 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | 8 | from torch.utils.data import DataLoader 9 | 10 | from litgpt.tokenizer import Tokenizer 11 | from litgpt.data import DataModule 12 | 13 | 14 | @dataclass 15 | class OpenWebText(DataModule): 16 | """The OpenWebText data module for pretraining.""" 17 | 18 | data_path: Union[str, Path] = Path("data/openwebtext") 19 | """The path to the data directory, containing two folders 'train' and 'val' 20 | which are the output of the preprocessing step. The path can also be a remote path (e.g., s3://).""" 21 | val_split_fraction: float = 0.0005 22 | """The fraction of data that should be put aside for validation.""" 23 | seed: int = 42 24 | """The seed to use for shuffling the training data.""" 25 | num_workers: int = 8 26 | """The number of workers to use for the dataloaders.""" 27 | 28 | tokenizer: Optional[Tokenizer] = field(default=None, repr=False, init=False) 29 | batch_size: int = field(default=1, repr=False, init=False) 30 | seq_length: int = field(default=2048, repr=False, init=False) 31 | 32 | def __post_init__(self) -> None: 33 | super().__init__() 34 | # Could be a remote path (s3://) or a local path 35 | self.data_path_train = str(self.data_path).rstrip("/") + "/train" 36 | self.data_path_val = str(self.data_path).rstrip("/") + "/val" 37 | 38 | def connect( 39 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = 2048 40 | ) -> None: 41 | self.tokenizer = tokenizer 42 | self.batch_size = batch_size 43 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 44 | 45 | def prepare_data(self) -> None: 46 | from datasets import Dataset, load_dataset 47 | from litdata import optimize 48 | 49 | if str(self.data_path).startswith("s3://"): 50 | print(f"The OpenWebText data path points to an S3 location: {self.data_path}. 
Skipping preprocessing.") 51 | return 52 | 53 | if Path(self.data_path_train).is_dir() and Path(self.data_path_val).is_dir(): 54 | print(f"Found OpenWebText train and val dir: {self.data_path}. Skipping preprocessing.") 55 | return 56 | 57 | dataset = load_dataset("openwebtext", num_proc=(os.cpu_count() // 2), trust_remote_code=True) 58 | 59 | # Split the data in training and validation 60 | split_dataset = dataset["train"].train_test_split( 61 | test_size=self.val_split_fraction, seed=self.seed, shuffle=True 62 | ) 63 | split_dataset["val"] = split_dataset.pop("test") # rename the test split to val 64 | 65 | def tokenize(data: Dataset, index: int): 66 | yield self.tokenizer.encode(data[index]["text"], eos=True) 67 | 68 | optimize( 69 | fn=partial(tokenize, split_dataset["train"]), 70 | inputs=list(range(len(split_dataset["train"]))), 71 | output_dir=self.data_path_train, 72 | num_workers=min(64, os.cpu_count() - 1), 73 | chunk_bytes="200MB", 74 | ) 75 | optimize( 76 | fn=partial(tokenize, split_dataset["val"]), 77 | inputs=list(range(len(split_dataset["val"]))), 78 | output_dir=self.data_path_val, 79 | num_workers=min(8, os.cpu_count() - 1), 80 | chunk_bytes="200MB", 81 | ) 82 | 83 | def train_dataloader(self) -> DataLoader: 84 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 85 | 86 | train_dataset = StreamingDataset( 87 | input_dir=self.data_path_train, 88 | item_loader=TokensLoader(block_size=self.seq_length), 89 | shuffle=True, 90 | ) 91 | train_dataloader = StreamingDataLoader( 92 | train_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 93 | ) 94 | return train_dataloader 95 | 96 | def val_dataloader(self) -> DataLoader: 97 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 98 | 99 | val_dataset = StreamingDataset( 100 | input_dir=self.data_path_val, 101 | item_loader=TokensLoader(block_size=self.seq_length), 102 | shuffle=True, 103 | ) 104 | val_dataloader = StreamingDataLoader( 105 | val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 106 | ) 107 | return val_dataloader 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-2-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/lora-llama2-7b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # How many nodes to use. (type: int, default: 1) 18 | num_nodes: 1 19 | 20 | # The LoRA rank. (type: int, default: 8) 21 | lora_r: 32 22 | 23 | # The LoRA alpha. (type: int, default: 16) 24 | lora_alpha: 16 25 | 26 | # The LoRA dropout value. 
(type: float, default: 0.05) 27 | lora_dropout: 0.05 28 | 29 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 30 | lora_query: true 31 | 32 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 33 | lora_key: false 34 | 35 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 36 | lora_value: true 37 | 38 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 39 | lora_projection: false 40 | 41 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 42 | lora_mlp: false 43 | 44 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 45 | lora_head: false 46 | 47 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 48 | data: 49 | class_path: litgpt.data.Alpaca2k 50 | init_args: 51 | mask_prompt: false 52 | prompt_style: alpaca 53 | ignore_index: -100 54 | seed: 42 55 | num_workers: 4 56 | 57 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 58 | train: 59 | 60 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 61 | save_interval: 200 62 | 63 | # Number of iterations between logging calls (type: int, default: 1) 64 | log_interval: 1 65 | 66 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 67 | global_batch_size: 8 68 | 69 | # Number of samples per data-parallel rank (type: int, default: 4) 70 | micro_batch_size: 2 71 | 72 | # Number of iterations with learning rate warmup active (type: int, default: 100) 73 | lr_warmup_steps: 10 74 | 75 | # Number of epochs to train on (type: Optional[int], default: 5) 76 | epochs: 4 77 | 78 | # Total number of tokens to train on (type: Optional[int], default: null) 79 | max_tokens: 80 | 81 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 82 | max_steps: 83 | 84 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 85 | max_seq_length: 512 86 | 87 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 88 | tie_embeddings: 89 | 90 | # (type: Optional[float], default: null) 91 | max_norm: 92 | 93 | # (type: float, default: 6e-05) 94 | min_lr: 6.0e-05 95 | 96 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 97 | eval: 98 | 99 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 100 | interval: 100 101 | 102 | # Number of tokens to generate (type: Optional[int], default: 100) 103 | max_new_tokens: 100 104 | 105 | # Number of iterations (type: int, default: 100) 106 | max_iters: 100 107 | 108 | # Whether to evaluate on the validation set at the beginning of the training 109 | initial_validation: false 110 | 111 | # Whether to evaluate on the validation set at the end the training 112 | final_validation: true 113 | 114 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 115 | logger_name: csv 116 | 117 | # The random seed to use for reproducibility. 
(type: int, default: 1337) 118 | seed: 1337 119 | 120 | # Optimizer-related arguments 121 | optimizer: 122 | 123 | class_path: torch.optim.AdamW 124 | 125 | init_args: 126 | 127 | # (type: float, default: 0.001) 128 | lr: 0.0002 129 | 130 | # (type: float, default: 0.01) 131 | weight_decay: 0.0 132 | 133 | # (type: tuple, default: (0.9,0.999)) 134 | betas: 135 | - 0.9 136 | - 0.95 137 | -------------------------------------------------------------------------------- /config_hub/finetune/mistral-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | 2 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 3 | checkpoint_dir: checkpoints/mistralai/Mistral-7B-v0.1 4 | 5 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 6 | out_dir: out/finetune/lora-mistral-7b 7 | 8 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 9 | precision: bf16-true 10 | 11 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 12 | quantize: 13 | 14 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 15 | devices: 1 16 | 17 | # How many nodes to use. (type: int, default: 1) 18 | num_nodes: 1 19 | 20 | # The LoRA rank. (type: int, default: 8) 21 | lora_r: 32 22 | 23 | # The LoRA alpha. (type: int, default: 16) 24 | lora_alpha: 16 25 | 26 | # The LoRA dropout value. (type: float, default: 0.05) 27 | lora_dropout: 0.05 28 | 29 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 30 | lora_query: true 31 | 32 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 33 | lora_key: false 34 | 35 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 36 | lora_value: true 37 | 38 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 39 | lora_projection: false 40 | 41 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 42 | lora_mlp: false 43 | 44 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 45 | lora_head: false 46 | 47 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 48 | data: 49 | class_path: litgpt.data.Alpaca2k 50 | init_args: 51 | mask_prompt: false 52 | prompt_style: alpaca 53 | ignore_index: -100 54 | seed: 42 55 | num_workers: 4 56 | 57 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 58 | train: 59 | 60 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 61 | save_interval: 200 62 | 63 | # Number of iterations between logging calls (type: int, default: 1) 64 | log_interval: 1 65 | 66 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 67 | global_batch_size: 8 68 | 69 | # Number of samples per data-parallel rank (type: int, default: 4) 70 | micro_batch_size: 2 71 | 72 | # Number of iterations with learning rate warmup active (type: int, default: 100) 73 | lr_warmup_steps: 10 74 | 75 | # Number of epochs to train on (type: Optional[int], default: 5) 76 | epochs: 4 77 | 78 | # Total number of tokens to train on (type: Optional[int], default: null) 79 | max_tokens: 80 | 81 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 82 | max_steps: 83 | 84 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 85 | max_seq_length: 512 86 | 87 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 88 | tie_embeddings: 89 | 90 | # (type: Optional[float], default: null) 91 | max_norm: 92 | 93 | # (type: float, default: 6e-05) 94 | min_lr: 6.0e-05 95 | 96 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 97 | eval: 98 | 99 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 100 | interval: 100 101 | 102 | # Number of tokens to generate (type: Optional[int], default: 100) 103 | max_new_tokens: 100 104 | 105 | # Number of iterations (type: int, default: 100) 106 | max_iters: 100 107 | 108 | # Whether to evaluate on the validation set at the beginning of the training 109 | initial_validation: false 110 | 111 | # Whether to evaluate on the validation set at the end the training 112 | final_validation: true 113 | 114 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 115 | logger_name: csv 116 | 117 | # The random seed to use for reproducibility. (type: int, default: 1337) 118 | seed: 1337 119 | 120 | # Optimizer-related arguments 121 | optimizer: 122 | 123 | class_path: torch.optim.AdamW 124 | 125 | init_args: 126 | 127 | # (type: float, default: 0.001) 128 | lr: 0.0002 129 | 130 | # (type: float, default: 0.01) 131 | weight_decay: 0.0 132 | 133 | # (type: tuple, default: (0.9,0.999)) 134 | betas: 135 | - 0.9 136 | - 0.95 137 | --------------------------------------------------------------------------------
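The finetuning and pretraining YAML files above are consumed by the corresponding LitGPT commands via the `--config` flag. As a minimal sketch, assuming the base checkpoint referenced in `checkpoint_dir` has already been downloaded, the Mistral LoRA recipe could be launched like this:

```bash
litgpt download mistralai/Mistral-7B-v0.1
litgpt finetune_lora --config config_hub/finetune/mistral-7b/lora.yaml
```

Individual values from the YAML file can typically still be overridden on the command line, for example by appending `--train.max_seq_length 256`.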