├── .azure
└── gpu-test.yml
├── .devcontainer
├── Dockerfile
└── devcontainer.json
├── .github
├── CODEOWNERS
├── ISSUE_TEMPLATE
│ ├── ask-a-question.md
│ ├── bug-report.yaml
│ └── feature-request.md
└── workflows
│ ├── check-links.yml
│ ├── cpu-tests.yml
│ ├── mkdocs-deploy.yml
│ └── publish-pkg.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CITATION.cff
├── LICENSE
├── README.md
├── config_hub
├── finetune
│ ├── README.md
│ ├── falcon-7b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── gemma-2b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── gemma-7b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── gemma2-2b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── gemma2-9b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-2-7b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-3-8b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-3.1-8b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-3.2-1B
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-3.2-3B
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── mistral-7b-v0.2
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── mistral-7b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── phi-2
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── phi-3
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── stablelm-base-alpha-3b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ └── tiny-llama
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
└── pretrain
│ ├── debug.yaml
│ ├── microllama.yaml
│ ├── tinyllama.yaml
│ └── tinystories.yaml
├── extensions
├── thunder
│ ├── README.md
│ ├── __init__.py
│ ├── pretrain.py
│ ├── strategies
│ │ ├── __init__.py
│ │ ├── thunder_ddp.py
│ │ └── thunder_fsdp.py
│ └── unsloth
│ │ ├── __init__.py
│ │ ├── executor.py
│ │ └── kernels
│ │ ├── __init__.py
│ │ ├── cross_entropy_loss.py
│ │ ├── rope_embedding.py
│ │ ├── swiglu.py
│ │ └── utils.py
└── xla
│ ├── README.md
│ ├── __init__
│ ├── finetune
│ ├── __init__
│ └── adapter.py
│ ├── generate
│ ├── __init__
│ ├── adapter.py
│ └── base.py
│ ├── scripts
│ ├── __init__
│ └── prepare_alpaca.py
│ └── utils.py
├── litgpt
├── __init__.py
├── __main__.py
├── adapter.py
├── adapter_v2.py
├── api.py
├── args.py
├── chat
│ ├── __init__.py
│ └── base.py
├── config.py
├── data
│ ├── __init__.py
│ ├── alpaca.py
│ ├── alpaca_2k.py
│ ├── alpaca_gpt4.py
│ ├── base.py
│ ├── deita.py
│ ├── flan.py
│ ├── json_data.py
│ ├── lima.py
│ ├── lit_data.py
│ ├── longform.py
│ ├── microllama.py
│ ├── openwebtext.py
│ ├── prepare_slimpajama.py
│ ├── prepare_starcoder.py
│ ├── text_files.py
│ ├── tinyllama.py
│ └── tinystories.py
├── deploy
│ ├── __init__.py
│ └── serve.py
├── eval
│ └── evaluate.py
├── finetune
│ ├── __init__.py
│ ├── adapter.py
│ ├── adapter_v2.py
│ ├── full.py
│ └── lora.py
├── generate
│ ├── __init__.py
│ ├── adapter.py
│ ├── adapter_v2.py
│ ├── base.py
│ ├── full.py
│ ├── sequentially.py
│ ├── speculative_decoding.py
│ └── tp.py
├── lora.py
├── model.py
├── pretrain.py
├── prompts.py
├── scripts
│ ├── __init__.py
│ ├── convert_hf_checkpoint.py
│ ├── convert_lit_checkpoint.py
│ ├── convert_pretrained_checkpoint.py
│ ├── download.py
│ └── merge_lora.py
├── tokenizer.py
└── utils.py
├── pyproject.toml
├── tests
├── conftest.py
├── convert
│ ├── __init__.py
│ ├── test_hf_checkpoint.py
│ ├── test_lit_checkpoint.py
│ └── test_pretrained_checkpoint.py
├── data
│ ├── __init__.py
│ ├── _fixtures
│ │ ├── alpaca.json
│ │ ├── dolly.json
│ │ ├── longform_train.json
│ │ └── longform_val.json
│ ├── test_alpaca.py
│ ├── test_base.py
│ ├── test_deita.py
│ ├── test_json.py
│ ├── test_lit_data.py
│ ├── test_longform.py
│ ├── test_openwebtext.py
│ ├── test_textfiles.py
│ ├── test_tinyllama.py
│ └── test_tinystories.py
├── ext_thunder
│ ├── __init__.py
│ ├── test_thunder_distributed.py
│ ├── test_thunder_networks.py
│ ├── test_thunder_pretrain.py
│ └── test_unsloth_executor.py
├── generate
│ ├── __init__.py
│ ├── test_adapter.py
│ ├── test_main.py
│ ├── test_sequentially.py
│ ├── test_tp.py
│ └── utils.py
├── test_adapter.py
├── test_adapter_v2.py
├── test_api.py
├── test_args.py
├── test_batch.py
├── test_chat.py
├── test_ci.py
├── test_cli.py
├── test_config.py
├── test_config_hub.py
├── test_distributed.py
├── test_evaluate.py
├── test_full.py
├── test_generate_speculatively.py
├── test_lora.py
├── test_merge_lora.py
├── test_model.py
├── test_pretrain.py
├── test_prompts.py
├── test_readme.py
├── test_rope.py
├── test_serve.py
├── test_tokenizer.py
├── test_trainer_support.py
└── test_utils.py
└── tutorials
├── 0_to_litgpt.md
├── convert_hf_checkpoint.md
├── convert_lit_models.md
├── deploy.md
├── developer-docs
├── README.md
├── adding-models.md
└── python-api.md
├── download_model_weights.md
├── evaluation.md
├── examples
└── ptl-trainer
│ ├── README.md
│ ├── litgpt_ptl_medium.py
│ └── litgpt_ptl_small.py
├── finetune.md
├── finetune_adapter.md
├── finetune_full.md
├── finetune_lora.md
├── full_finetune_example.py
├── images
├── 0_to_litgpt
│ ├── commands.webp
│ ├── finetune.webp
│ ├── instruction-1.webp
│ ├── instruction-2.webp
│ ├── pretrain.webp
│ └── usage.webp
└── prepare_dataset
│ ├── alpaca-2k.jpg
│ ├── alpaca.jpg
│ ├── alpaca_libre.jpg
│ ├── alpacagpt4.jpg
│ ├── deita-multiturn.jpg
│ ├── deita.jpg
│ ├── dolly.jpg
│ ├── lima.jpg
│ └── longform.jpg
├── inference.md
├── mkdocs.yml
├── oom.md
├── prepare_dataset.md
├── pretrain.md
├── pretrain_tinyllama.md
├── python-api.md
├── quantize.md
└── resource-tables.md
/.azure/gpu-test.yml: -------------------------------------------------------------------------------- 1 | name: GPU tests 2 | 3 | trigger: 4 | branches: 5 | include: 6 | - "main" 7 | - "wip" 8 | 9 | pr: 10 | branches: 11 | include: 12 | - "main" 13 | - "wip" 14 | 15 | jobs: 16 | - job: testing 17 | strategy: 18 | matrix: 19 | "ordinary": 20 | #image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.7-cuda12.6.3" 21 | dependency: "" 22 | "w. 
Thunder": 23 | #image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.7-cuda12.6.3" 24 | dependency: "compiler" 25 | variables: 26 | DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) 27 | PL_RUN_CUDA_TESTS: "1" 28 | TRANSFORMERS_CACHE: "/var/tmp/hf/transformers" 29 | HF_HOME: "/var/tmp/hf/home" 30 | HF_HUB_CACHE: "/var/tmp/hf/hub" 31 | CI: "true" 32 | PYTHON_VERSION: "3.10" 33 | CUDA_VERSION: "12.6.3" 34 | TORCH_VERSION: "2.7.0" 35 | CUDNN_FRONTEND_VERSION: "1.10.0" 36 | container: 37 | # image: "pytorchlightning/pytorch_lightning:base-cuda-py$(PYTHON_VERSION)-torch$(TORCH_VERSION)-cuda$(CUDA_VERSION)" 38 | # pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.5.0-py3.10-pt_main-dev 39 | image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND_VERSION)-py$(PYTHON_VERSION)-pt_$(TORCH_VERSION)-dev" 40 | options: "--gpus=all --shm-size=8gb -v /var/tmp:/var/tmp" 41 | workspace: 42 | clean: all 43 | pool: "lit-rtx-3090" 44 | timeoutInMinutes: "35" 45 | cancelTimeoutInMinutes: "2" 46 | steps: 47 | - bash: | 48 | echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" 49 | displayName: "set env. vars" 50 | 51 | - bash: | 52 | echo $(DEVICES) 53 | echo $CUDA_VISIBLE_DEVICES 54 | whereis nvidia 55 | nvidia-smi 56 | which python && which pip 57 | python --version 58 | pip --version 59 | pip list 60 | displayName: "Image info & NVIDIA" 61 | 62 | - script: | 63 | pip install --upgrade pip 64 | pip install '.[extra,test]' cffi -U 65 | displayName: "Install package & dependencies" 66 | 67 | - script: | 68 | set -e 69 | pip uninstall -y torchvision torchaudio 70 | pip install '.[compiler]' 71 | python -c "from thunder.executors import nvfuser_available ; assert nvfuser_available(), 'nvFuser is missing!'" 72 | python -c "from thunder.executors.triton_utils import triton_version ; assert triton_version() is not None, 'triton is missing!'" 73 | condition: eq(variables['dependency'], 'compiler') 74 | displayName: "Install `compiler` [nvFuser & Thunder]" 75 | 76 | - bash: | 77 | set -e 78 | pip list 79 | python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" 80 | python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$(TORCH_VERSION)', f'PyTorch: installed {ver} but expected $(TORCH_VERSION)'" 81 | displayName: "Env details" 82 | 83 | - bash: pytest -v 84 | displayName: "All tests" 85 | #condition: eq(variables['dependency'], 'compiler') 86 | timeoutInMinutes: "15" 87 | 88 | - bash: | 89 | wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh 90 | bash run_standalone_tests.sh "tests" 91 | displayName: "Standalone tests" 92 | env: 93 | PL_RUN_STANDALONE_TESTS: "1" 94 | # NUM_PARALLEL_TESTS: "10" 95 | timeoutInMinutes: "10" 96 | 97 | - bash: | 98 | pip uninstall -y lightning-thunder 99 | # install thunder from source, so that, thunder.tests will be available 100 | pip install -U "lightning-thunder[test] @ git+https://github.com/Lightning-AI/lightning-thunder.git" 101 | displayName: "Re-install Thunder [main branch]" 102 | condition: eq(variables['dependency'], 'compiler') 103 | 104 | - bash: | 105 | # without env var, it filters out all tests 106 | PL_RUN_CUDA_TESTS=0 pytest tests/ext_thunder/test_thunder_networks.py -v 107 | displayName: "Extra tests for Thunder [main branch]" 108 | condition: eq(variables['dependency'], 'compiler') 109 | env: 110 | TORCHDYNAMO_VERBOSE: "1" 111 | timeoutInMinutes: 
"10" 112 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/devcontainers/images/blob/main/src/python/.devcontainer/Dockerfile 2 | 3 | # [Choice] Python version (use -bookworm or -bullseye variants on local arm64/Apple Silicon): 3, 3.12, 3.11, 3.10, 3.9, 3.8, 3-bookworm, 3.12-bookworm, 3.11-bookworm, 3.10-bookworm, 3.9-bookworm, 3.8-bookworm, 3-bullseye, 3.12-bullseye, 3.11-bullseye, 3.10-bullseye, 3.9-bullseye, 3.8-bullseye, 3-buster, 3.12-buster, 3.11-buster, 3.10-buster, 3.9-buster, 3.8-buster 4 | ARG VARIANT=3-bookworm 5 | FROM mcr.microsoft.com/devcontainers/python:1-${VARIANT} 6 | 7 | # Temporary: Upgrade python packages due to https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-40897 8 | # They are installed by the base image (python) which does not have the patch. 9 | RUN python3 -m pip install --upgrade pip setuptools 10 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.194.0/containers/python-3 3 | { 4 | "name": "Python 3 (litgpt)", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "context": "..", 8 | "args": { 9 | "VARIANT": "3.11-bookworm" 10 | } 11 | }, 12 | "runArgs": [ 13 | // Enable GPU passthrough, requires WSL2 on Windows 14 | //"--gpus=all", 15 | // One of the following options is required for torch multiprocessing 16 | //"--ipc=host", 17 | //"--shm-size=4gb", 18 | ], 19 | // Features to add to the dev container. More info: https://containers.dev/features. 20 | "features": { 21 | "ghcr.io/devcontainers/features/git:1": {}, 22 | "ghcr.io/devcontainers/features/git-lfs:1": {}, 23 | //"ghcr.io/devcontainers/features/nvidia-cuda:1": {}, 24 | "ghcr.io/devcontainers-extra/features/actionlint:1": {}, 25 | "ghcr.io/devcontainers-extra/features/pre-commit:2": {}, 26 | "ghcr.io/dhoeric/features/act:1": {}, 27 | "ghcr.io/devcontainers/features/docker-in-docker:2": { 28 | "version": "latest", 29 | "moby": true 30 | } 31 | }, 32 | // Set *default* container specific settings.json values on container create. 
33 | "customizations": { 34 | "vscode": { 35 | "settings": { 36 | "editor.tabSize": 4, 37 | "editor.renderWhitespace": "all", 38 | "editor.formatOnSave": true, 39 | "editor.rulers": [120], 40 | "files.exclude": { 41 | "**/__pycache__": true 42 | }, 43 | "python.pythonPath": "/usr/local/bin/python", 44 | "python.defaultInterpreterPath": "/usr/local/bin/python", 45 | "python.languageServer": "Pylance", 46 | "python.analysis.autoImportCompletions": true, 47 | "python.analysis.completeFunctionParens": true, 48 | "python.analysis.autoSearchPaths": true, 49 | "python.testing.pytestArgs": ["tests"], 50 | "python.testing.unittestEnabled": false, 51 | "python.testing.pytestEnabled": true, 52 | "code-eol.highlightNonDefault": true, 53 | "code-eol.highlightExtraWhitespace": true, 54 | "autoDocstring.docstringFormat": "google-notypes", 55 | "autoDocstring.guessTypes": true, 56 | "autoDocstring.generateDocstringOnEnter": true, 57 | "autoDocstring.startOnNewLine": true, 58 | "telemetry.telemetryLevel": "off", 59 | "[python]": { 60 | "editor.formatOnSave": true, 61 | "editor.defaultFormatter": "charliermarsh.ruff", 62 | "editor.codeActionsOnSave": { 63 | "source.organizeImports": "always", 64 | "source.fixAll": "always" 65 | } 66 | } 67 | }, 68 | // Add the IDs of extensions you want installed when the container is created. 69 | "extensions": [ 70 | "ms-python.python", 71 | "ms-python.vscode-pylance", 72 | "ms-toolsai.jupyter", 73 | "GitHub.copilot", 74 | "GitHub.copilot-chat", 75 | "github.vscode-github-actions", 76 | "SanjulaGanepola.github-local-actions", 77 | "charliermarsh.ruff", 78 | "esbenp.prettier-vscode", 79 | "ms-vscode.test-adapter-converter", 80 | "njqdev.vscode-python-typehint", 81 | "KevinRose.vsc-python-indent", 82 | "medo64.render-crlf", 83 | "shardulm94.trailing-spaces", 84 | "nhoizey.gremlins", 85 | "wayou.vscode-todo-highlight", 86 | "Gruntfuggly.todo-tree", 87 | "njpwerner.autodocstring", 88 | "rodolphebarbanneau.python-docstring-highlighter", 89 | "mechatroner.rainbow-csv", 90 | "uctakeoff.vscode-counter", 91 | "bierner.github-markdown-preview", 92 | "yahyabatulu.vscode-markdown-alert", 93 | "ms-vscode-remote.vscode-remote-extensionpack", 94 | "ms-azuretools.vscode-docker", 95 | "redhat.vscode-yaml" 96 | ] 97 | } 98 | }, 99 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 100 | // "forwardPorts": [], 101 | // Use 'postCreateCommand' to run commands after the container is created. 102 | "postCreateCommand": "pre-commit install && pip install '.[extra,compiler,test]' -U", 103 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 104 | "remoteUser": "vscode" 105 | } 106 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @lantiga @t-vi @borda 2 | /README.md @williamfalcon @lantiga 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ask-a-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Ask a Question 3 | about: Ask and answer questions related to LitGPT 4 | title: '' 5 | labels: question 6 | 7 | --- 8 | 9 | Please describe your question here. 
10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Report errors related to LitGPT 3 | title: "Description" 4 | labels: bug 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for taking the time to report an issue. Please fill out the details below to help us resolve it. 10 | 11 | - type: textarea 12 | id: bug_description 13 | attributes: 14 | label: Bug description 15 | description: A description of the issue. 16 | placeholder: | 17 | Please provide a description of what the bug or issue is. 18 | validations: 19 | required: true 20 | 21 | - type: dropdown 22 | id: operating_system 23 | attributes: 24 | label: What operating system are you using? 25 | description: If applicable, please select the operating system where you experienced this issue. 26 | options: 27 | - "Unknown" 28 | - "macOS" 29 | - "Linux" 30 | - "Windows" 31 | validations: 32 | required: true 33 | 34 | - type: textarea 35 | id: version 36 | attributes: 37 | label: LitGPT Version 38 | description: | 39 | Please provide details about your LitGPT version by running the following code in your terminal: 40 | ``` 41 | pip show litgpt | grep Version: 42 | ``` 43 | You can simply copy and paste the outputs below. 44 | value: | 45 | ``` 46 | 47 | 48 | 49 | ``` 50 | validations: 51 | required: false 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Suggest a Feature 3 | about: Propose a new feature or enhancement 4 | title: '' 5 | labels: enhancement 6 | 7 | --- 8 | 9 | Please describe the feature or enhancement along with the intended usecase. 
10 | -------------------------------------------------------------------------------- /.github/workflows/check-links.yml: -------------------------------------------------------------------------------- 1 | name: Check hyperlinks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.10" 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install "mistune<3.1" # a newer version is incompatible with nbconvert 27 | pip install pytest pytest-check-links 28 | 29 | - name: Check links 30 | run: | 31 | pytest --check-links README.md --check-links-ignore "http*" 32 | pytest --check-links tutorials --check-links-ignore "http*" 33 | -------------------------------------------------------------------------------- /.github/workflows/mkdocs-deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy MkDocs 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-24.04 13 | steps: 14 | # Step 1: Checkout the repository 15 | - uses: actions/checkout@v4 16 | 17 | # Step 2: Set up Python 18 | - uses: actions/setup-python@v5 19 | with: 20 | python-version: "3.x" 21 | cache: "pip" 22 | 23 | # Step 3: Install MkDocs and dependencies 24 | - run: pip install mkdocs mkdocs-material mkdocs-pagetree-plugin 25 | # Step 4: Deploy to GitHub Pages 26 | - run: | 27 | mkdir -p gh-pages/docs 28 | cp -r tutorials/* gh-pages/docs 29 | cd gh-pages 30 | mv docs/mkdocs.yml mkdocs.yml 31 | echo "{{ pagetree }}" > docs/index.md 32 | mkdocs gh-deploy --force 33 | -------------------------------------------------------------------------------- /.github/workflows/publish-pkg.yml: -------------------------------------------------------------------------------- 1 | # To create a release, create a tag and push it to GitHub: 2 | #git tag -a "v0.0.1-beta" -m "beta version testing" 3 | #git push --tags 4 | # https://dev.to/iamtekson/publish-package-to-pypi-and-release-new-version-using-github-actions-108k 5 | name: Publish LitGPT to PyPI 6 | 7 | on: 8 | push: 9 | tags: 10 | - "v*" 11 | jobs: 12 | build-n-publish: 13 | name: Build and publish to PyPI 14 | runs-on: ubuntu-latest 15 | environment: 16 | name: pypi 17 | url: https://pypi.org/p/litgpt 18 | permissions: 19 | id-token: write 20 | 21 | steps: 22 | - name: Checkout source 23 | uses: actions/checkout@v3 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: "3.x" 29 | cache: "pip" 30 | 31 | - name: Build source and wheel distributions 32 | run: | 33 | python -m pip install --upgrade build twine 34 | pip install importlib_metadata==7.2.1 35 | python -m build 36 | twine check --strict dist/* 37 | - name: Publish distribution to PyPI 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | with: 40 | user: __token__ 41 | password: ${{ secrets.PYPI_API_TOKEN }} 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | __pycache__ 3 | .idea 4 | .DS_Store 5 | *.egg-info 6 | build 7 | dist 8 | .venv 9 | .vscode 10 | 11 | # data 12 | data 13 | datasets 14 | !litgpt/data 15 | !tests/data 16 
| checkpoints 17 | out 18 | wandb 19 | events.out.tfevents* 20 | 21 | # test artifacts from tests/test_readme.py 22 | **/custom_finetuning_dataset.json 23 | client.py 24 | **/custom_texts/ 25 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright The Lightning team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | default_language_version: 16 | python: python3 17 | 18 | ci: 19 | autofix_prs: true 20 | autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions" 21 | autoupdate_schedule: quarterly 22 | # submodules: true 23 | 24 | repos: 25 | - repo: https://github.com/pre-commit/pre-commit-hooks 26 | rev: v5.0.0 27 | hooks: 28 | - id: end-of-file-fixer 29 | - id: trailing-whitespace 30 | exclude: README.md 31 | - id: check-yaml 32 | - id: check-toml 33 | #- id: check-docstring-first 34 | #- id: check-executables-have-shebangs 35 | - id: check-case-conflict 36 | - id: check-added-large-files 37 | args: ["--maxkb=250", "--enforce-all"] 38 | - id: detect-private-key 39 | 40 | - repo: https://github.com/codespell-project/codespell 41 | rev: v2.4.1 42 | hooks: 43 | - id: codespell 44 | additional_dependencies: [tomli] 45 | args: ["--write-changes"] 46 | exclude: pyproject.toml 47 | 48 | #- repo: https://github.com/crate-ci/typos 49 | # rev: dictgen-v0.3.1 50 | # hooks: 51 | # - id: typos 52 | # args: [] # empty to do not write fixes 53 | # exclude: pyproject.toml 54 | 55 | #- repo: https://github.com/executablebooks/mdformat 56 | # rev: 0.7.21 57 | # hooks: 58 | # - id: mdformat 59 | # args: ["--number"] 60 | # additional_dependencies: 61 | # - mdformat-gfm 62 | # - mdformat-black 63 | # - mdformat_frontmatter 64 | 65 | - repo: https://github.com/pre-commit/mirrors-prettier 66 | rev: v3.1.0 67 | hooks: 68 | - id: prettier 69 | files: \.(json|yml|yaml|toml) 70 | # https://prettier.io/docs/en/options.html#print-width 71 | args: ["--print-width=140"] 72 | 73 | - repo: https://github.com/astral-sh/ruff-pre-commit 74 | rev: v0.11.4 75 | hooks: 76 | - id: ruff 77 | args: ["--fix"] 78 | - id: ruff-format 79 | - id: ruff 80 | 81 | - repo: https://github.com/tox-dev/pyproject-fmt 82 | rev: v2.5.1 83 | hooks: 84 | - id: pyproject-fmt 85 | additional_dependencies: [tox] 86 | - repo: https://github.com/abravalheri/validate-pyproject 87 | rev: v0.24.1 88 | hooks: 89 | - id: validate-pyproject 90 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, you can cite it as shown below." 3 | title: "LitGPT" 4 | abstract: "20+ high-performance LLMs with recipes to pretrain, finetune and deploy at scale." 
5 | date-released: 2023-03-22 6 | authors: 7 | - name: "The Lightning AI team" 8 | license: "Apache-2.0" 9 | url: "https://github.com/Lightning-AI/litgpt" 10 | -------------------------------------------------------------------------------- /config_hub/finetune/falcon-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/tiiuae/falcon-7b 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/lora-falcon-7b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 11 | quantize: 12 | 13 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 14 | devices: 1 15 | 16 | # How many nodes to use. (type: int, default: 1) 17 | num_nodes: 1 18 | 19 | # The LoRA rank. (type: int, default: 8) 20 | lora_r: 32 21 | 22 | # The LoRA alpha. (type: int, default: 16) 23 | lora_alpha: 16 24 | 25 | # The LoRA dropout value. (type: float, default: 0.05) 26 | lora_dropout: 0.05 27 | 28 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 29 | lora_query: true 30 | 31 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 32 | lora_key: false 33 | 34 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 35 | lora_value: true 36 | 37 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 38 | lora_projection: false 39 | 40 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 41 | lora_mlp: false 42 | 43 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 44 | lora_head: false 45 | 46 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 47 | data: 48 | class_path: litgpt.data.Alpaca2k 49 | init_args: 50 | mask_prompt: false 51 | prompt_style: alpaca 52 | ignore_index: -100 53 | seed: 42 54 | num_workers: 4 55 | 56 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 57 | train: 58 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 59 | save_interval: 200 60 | 61 | # Number of iterations between logging calls (type: int, default: 1) 62 | log_interval: 1 63 | 64 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 65 | global_batch_size: 8 66 | 67 | # Number of samples per data-parallel rank (type: int, default: 4) 68 | micro_batch_size: 1 69 | 70 | # Number of iterations with learning rate warmup active (type: int, default: 100) 71 | lr_warmup_steps: 10 72 | 73 | # Number of epochs to train on (type: Optional[int], default: 5) 74 | epochs: 4 75 | 76 | # Total number of tokens to train on (type: Optional[int], default: null) 77 | max_tokens: 78 | 79 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 80 | max_steps: 81 | 82 | # Limits the length of samples. 
Off by default (type: Optional[int], default: null) 83 | max_seq_length: 512 84 | 85 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 86 | tie_embeddings: 87 | 88 | # (type: Optional[float], default: null) 89 | max_norm: 90 | 91 | # (type: float, default: 6e-05) 92 | min_lr: 6.0e-05 93 | 94 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 95 | eval: 96 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 97 | interval: 100 98 | 99 | # Number of tokens to generate (type: Optional[int], default: 100) 100 | max_new_tokens: 100 101 | 102 | # Number of iterations (type: int, default: 100) 103 | max_iters: 100 104 | 105 | # Whether to evaluate on the validation set at the beginning of the training 106 | initial_validation: false 107 | 108 | # Whether to evaluate on the validation set at the end the training 109 | final_validation: true 110 | 111 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 112 | logger_name: csv 113 | 114 | # The random seed to use for reproducibility. (type: int, default: 1337) 115 | seed: 1337 116 | 117 | # Optimizer-related arguments 118 | optimizer: 119 | class_path: torch.optim.AdamW 120 | 121 | init_args: 122 | # (type: float, default: 0.001) 123 | lr: 0.0002 124 | 125 | # (type: float, default: 0.01) 126 | weight_decay: 0.0 127 | 128 | # (type: tuple, default: (0.9,0.999)) 129 | betas: 130 | - 0.9 131 | - 0.95 132 | -------------------------------------------------------------------------------- /config_hub/finetune/gemma-2b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/google/gemma-2b 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/full-gemma-2b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 11 | devices: 4 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 17 | data: 18 | class_path: litgpt.data.Alpaca2k 19 | init_args: 20 | mask_prompt: false 21 | val_split_fraction: 0.03847 22 | prompt_style: alpaca 23 | ignore_index: -100 24 | seed: 42 25 | num_workers: 4 26 | 27 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 28 | train: 29 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 30 | save_interval: 800 31 | 32 | # Number of iterations between logging calls (type: int, default: 1) 33 | log_interval: 1 34 | 35 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 36 | global_batch_size: 16 37 | 38 | # Number of samples per data-parallel rank (type: int, default: 4) 39 | micro_batch_size: 1 40 | 41 | # Number of iterations with learning rate warmup active (type: int, default: 100) 42 | lr_warmup_steps: 100 43 | 44 | # Number of epochs to train on (type: Optional[int], default: 5) 45 | epochs: 1 46 | 47 | # Total number of tokens to train on (type: Optional[int], default: null) 48 | max_tokens: 49 | 50 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 51 | max_steps: 50 52 | 53 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 54 | max_seq_length: 512 55 | 56 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 57 | tie_embeddings: 58 | 59 | # (type: Optional[float], default: null) 60 | max_norm: 61 | 62 | # (type: float, default: 6e-05) 63 | min_lr: 6.0e-05 64 | 65 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 66 | eval: 67 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 68 | interval: 25 69 | 70 | # Number of tokens to generate (type: Optional[int], default: 100) 71 | max_new_tokens: 100 72 | 73 | # Number of iterations (type: int, default: 100) 74 | max_iters: 100 75 | 76 | # Whether to evaluate on the validation set at the beginning of the training 77 | initial_validation: false 78 | 79 | # Whether to evaluate on the validation set at the end the training 80 | final_validation: true 81 | 82 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 83 | logger_name: csv 84 | 85 | # The random seed to use for reproducibility. (type: int, default: 1337) 86 | seed: 1337 87 | 88 | # Optimizer-related arguments 89 | optimizer: 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | # (type: float, default: 0.001) 94 | lr: 0.0002 95 | 96 | # (type: float, default: 0.01) 97 | weight_decay: 0.0 98 | 99 | # (type: tuple, default: (0.9,0.999)) 100 | betas: 101 | - 0.9 102 | - 0.95 103 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-2-7b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama2-7b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 4 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. 
An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. (type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.0 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-2-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf 3 | 4 | # Directory in which to save checkpoints and logs. 
(type: , default: out/lora) 5 | out_dir: out/finetune/lora-llama2-7b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 11 | quantize: 12 | 13 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 14 | devices: 1 15 | 16 | # How many nodes to use. (type: int, default: 1) 17 | num_nodes: 1 18 | 19 | # The LoRA rank. (type: int, default: 8) 20 | lora_r: 32 21 | 22 | # The LoRA alpha. (type: int, default: 16) 23 | lora_alpha: 16 24 | 25 | # The LoRA dropout value. (type: float, default: 0.05) 26 | lora_dropout: 0.05 27 | 28 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 29 | lora_query: true 30 | 31 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 32 | lora_key: false 33 | 34 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 35 | lora_value: true 36 | 37 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 38 | lora_projection: false 39 | 40 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 41 | lora_mlp: false 42 | 43 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 44 | lora_head: false 45 | 46 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 47 | data: 48 | class_path: litgpt.data.Alpaca2k 49 | init_args: 50 | mask_prompt: false 51 | prompt_style: alpaca 52 | ignore_index: -100 53 | seed: 42 54 | num_workers: 4 55 | 56 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 57 | train: 58 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 59 | save_interval: 200 60 | 61 | # Number of iterations between logging calls (type: int, default: 1) 62 | log_interval: 1 63 | 64 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 65 | global_batch_size: 8 66 | 67 | # Number of samples per data-parallel rank (type: int, default: 4) 68 | micro_batch_size: 2 69 | 70 | # Number of iterations with learning rate warmup active (type: int, default: 100) 71 | lr_warmup_steps: 10 72 | 73 | # Number of epochs to train on (type: Optional[int], default: 5) 74 | epochs: 4 75 | 76 | # Total number of tokens to train on (type: Optional[int], default: null) 77 | max_tokens: 78 | 79 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 80 | max_steps: 81 | 82 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 83 | max_seq_length: 512 84 | 85 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 86 | tie_embeddings: 87 | 88 | # (type: Optional[float], default: null) 89 | max_norm: 90 | 91 | # (type: float, default: 6e-05) 92 | min_lr: 6.0e-05 93 | 94 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 95 | eval: 96 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 97 | interval: 100 98 | 99 | # Number of tokens to generate (type: Optional[int], default: 100) 100 | max_new_tokens: 100 101 | 102 | # Number of iterations (type: int, default: 100) 103 | max_iters: 100 104 | 105 | # Whether to evaluate on the validation set at the beginning of the training 106 | initial_validation: false 107 | 108 | # Whether to evaluate on the validation set at the end the training 109 | final_validation: true 110 | 111 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 112 | logger_name: csv 113 | 114 | # The random seed to use for reproducibility. (type: int, default: 1337) 115 | seed: 1337 116 | 117 | # Optimizer-related arguments 118 | optimizer: 119 | class_path: torch.optim.AdamW 120 | 121 | init_args: 122 | # (type: float, default: 0.001) 123 | lr: 0.0002 124 | 125 | # (type: float, default: 0.01) 126 | weight_decay: 0.0 127 | 128 | # (type: tuple, default: (0.9,0.999)) 129 | betas: 130 | - 0.9 131 | - 0.95 132 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3-8b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3-8B 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama-3-8b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 4 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. (type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3.1-8b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3.1-8B 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama-3.1-8b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 4 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. 
An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. (type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3.2-1B/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B 3 | 4 | # Directory in which to save checkpoints and logs. 
(type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama-3.2-1B 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 1 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | # resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. 
(type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3.2-3B/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Llama-3.2-3B 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama-3.2-3B 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 1 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | # resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. (type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-2/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/microsoft/phi-2 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-phi-2 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 2 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 17 | data: 18 | class_path: litgpt.data.Alpaca2k 19 | init_args: 20 | mask_prompt: false 21 | prompt_style: alpaca 22 | ignore_index: -100 23 | seed: 42 24 | num_workers: 4 25 | 26 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 27 | train: 28 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 29 | save_interval: 200 30 | 31 | # Number of iterations between logging calls (type: int, default: 1) 32 | log_interval: 1 33 | 34 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 35 | global_batch_size: 8 36 | 37 | # Number of samples per data-parallel rank (type: int, default: 1) 38 | micro_batch_size: 4 39 | 40 | # Number of iterations with learning rate warmup active (type: int, default: 100) 41 | lr_warmup_steps: 200 42 | 43 | # Number of epochs to train on (type: Optional[int], default: 5) 44 | epochs: 1 45 | 46 | # Total number of tokens to train on (type: Optional[int], default: null) 47 | max_tokens: 48 | 49 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 50 | max_steps: 100 51 | 52 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 53 | max_seq_length: 512 54 | 55 | # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) 56 | tie_embeddings: 57 | 58 | # (type: Optional[float], default: null) 59 | max_norm: 60 | 61 | # (type: float, default: 6e-05) 62 | min_lr: 6.0e-05 63 | 64 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 65 | eval: 66 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 67 | interval: 25 68 | 69 | # Number of tokens to generate (type: Optional[int], default: 100) 70 | max_new_tokens: 100 71 | 72 | # Number of iterations (type: int, default: 100) 73 | max_iters: 100 74 | 75 | # Whether to evaluate on the validation set at the beginning of the training 76 | initial_validation: false 77 | 78 | # Whether to evaluate on the validation set at the end the training 79 | final_validation: true 80 | 81 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 82 | logger_name: csv 83 | 84 | # The random seed to use for reproducibility. (type: int, default: 1337) 85 | seed: 1337 86 | 87 | # Optimizer-related arguments 88 | optimizer: 89 | class_path: torch.optim.AdamW 90 | 91 | init_args: 92 | # (type: float, default: 0.001) 93 | lr: 0.0002 94 | 95 | # (type: float, default: 0.01) 96 | weight_decay: 0.1 97 | 98 | # (type: tuple, default: (0.9,0.999)) 99 | betas: 100 | - 0.9 101 | - 0.95 102 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-phi-3 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 1 12 | 13 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 14 | data: 15 | class_path: litgpt.data.Alpaca2k 16 | init_args: 17 | mask_prompt: false 18 | prompt_style: alpaca 19 | ignore_index: -100 20 | seed: 42 21 | num_workers: 4 22 | 23 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 24 | train: 25 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 26 | save_interval: 200 27 | 28 | # Number of iterations between logging calls (type: int, default: 1) 29 | log_interval: 1 30 | 31 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 32 | global_batch_size: 8 33 | 34 | # Number of samples per data-parallel rank (type: int, default: 1) 35 | micro_batch_size: 4 36 | 37 | # Number of iterations with learning rate warmup active (type: int, default: 100) 38 | lr_warmup_steps: 200 39 | 40 | # Number of epochs to train on (type: Optional[int], default: 5) 41 | epochs: 1 42 | 43 | # Total number of tokens to train on (type: Optional[int], default: null) 44 | max_tokens: 45 | 46 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 47 | max_steps: 48 | 49 | # Limits the length of samples. 
Off by default (type: Optional[int], default: null) 50 | max_seq_length: 512 51 | 52 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 53 | tie_embeddings: 54 | 55 | # (type: Optional[float], default: null) 56 | max_norm: 57 | 58 | # (type: float, default: 6e-05) 59 | min_lr: 6.0e-05 60 | 61 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 62 | eval: 63 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 64 | interval: 25 65 | 66 | # Number of tokens to generate (type: Optional[int], default: 100) 67 | max_new_tokens: 100 68 | 69 | # Number of iterations (type: int, default: 100) 70 | max_iters: 100 71 | 72 | # Whether to evaluate on the validation set at the beginning of the training 73 | initial_validation: false 74 | 75 | # Whether to evaluate on the validation set at the end the training 76 | final_validation: true 77 | 78 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 79 | logger_name: csv 80 | 81 | # The random seed to use for reproducibility. (type: int, default: 1337) 82 | seed: 1337 83 | 84 | # Optimizer-related arguments 85 | optimizer: 86 | class_path: torch.optim.AdamW 87 | 88 | init_args: 89 | # (type: float, default: 0.001) 90 | lr: 0.0002 91 | 92 | # (type: float, default: 0.01) 93 | weight_decay: 0.1 94 | 95 | # (type: tuple, default: (0.9,0.999)) 96 | betas: 97 | - 0.9 98 | - 0.95 99 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/lora.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/lora-phi-3 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 11 | quantize: 12 | 13 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 14 | devices: 1 15 | 16 | # The LoRA rank. (type: int, default: 8) 17 | lora_r: 8 18 | 19 | # The LoRA alpha. (type: int, default: 16) 20 | lora_alpha: 16 21 | 22 | # The LoRA dropout value. (type: float, default: 0.05) 23 | lora_dropout: 0.05 24 | 25 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 26 | lora_query: true 27 | 28 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 29 | lora_key: true 30 | 31 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 32 | lora_value: true 33 | 34 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 35 | lora_projection: true 36 | 37 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 38 | lora_mlp: true 39 | 40 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 41 | lora_head: true 42 | 43 | # Data-related arguments. 
If not provided, the default is ``litgpt.data.Alpaca``. 44 | data: 45 | class_path: litgpt.data.Alpaca2k 46 | init_args: 47 | mask_prompt: false 48 | val_split_fraction: 0.03847 49 | prompt_style: alpaca 50 | ignore_index: -100 51 | seed: 42 52 | num_workers: 4 53 | 54 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 55 | train: 56 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 57 | save_interval: 800 58 | 59 | # Number of iterations between logging calls (type: int, default: 1) 60 | log_interval: 1 61 | 62 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 63 | global_batch_size: 8 64 | 65 | # Number of samples per data-parallel rank (type: int, default: 4) 66 | micro_batch_size: 4 67 | 68 | # Number of iterations with learning rate warmup active (type: int, default: 100) 69 | lr_warmup_steps: 10 70 | 71 | # Number of epochs to train on (type: Optional[int], default: 5) 72 | epochs: 1 73 | 74 | # Total number of tokens to train on (type: Optional[int], default: null) 75 | max_tokens: 76 | 77 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 78 | max_steps: 79 | 80 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 81 | max_seq_length: 512 82 | 83 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 84 | tie_embeddings: 85 | 86 | # (type: Optional[float], default: null) 87 | max_norm: 88 | 89 | # (type: float, default: 6e-05) 90 | min_lr: 6.0e-05 91 | 92 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 93 | eval: 94 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 95 | interval: 100 96 | 97 | # Number of tokens to generate (type: Optional[int], default: 100) 98 | max_new_tokens: 100 99 | 100 | # Number of iterations (type: int, default: 100) 101 | max_iters: 100 102 | 103 | # Whether to evaluate on the validation set at the beginning of the training 104 | initial_validation: false 105 | 106 | # Whether to evaluate on the validation set at the end the training 107 | final_validation: true 108 | 109 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 110 | logger_name: csv 111 | 112 | # The random seed to use for reproducibility. (type: int, default: 1337) 113 | seed: 1337 114 | 115 | # Optimizer-related arguments 116 | optimizer: 117 | class_path: torch.optim.AdamW 118 | 119 | init_args: 120 | # (type: float, default: 0.001) 121 | lr: 0.0002 122 | 123 | # (type: float, default: 0.01) 124 | weight_decay: 0.0 125 | 126 | # (type: tuple, default: (0.9,0.999)) 127 | betas: 128 | - 0.9 129 | - 0.95 130 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/qlora.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/qlora-phi-3 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". 
(type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 11 | quantize: bnb.nf4 12 | 13 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 14 | devices: 1 15 | 16 | # The LoRA rank. (type: int, default: 8) 17 | lora_r: 8 18 | 19 | # The LoRA alpha. (type: int, default: 16) 20 | lora_alpha: 16 21 | 22 | # The LoRA dropout value. (type: float, default: 0.05) 23 | lora_dropout: 0.05 24 | 25 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 26 | lora_query: true 27 | 28 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 29 | lora_key: true 30 | 31 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 32 | lora_value: true 33 | 34 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 35 | lora_projection: true 36 | 37 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 38 | lora_mlp: true 39 | 40 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 41 | lora_head: true 42 | 43 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 44 | data: 45 | class_path: litgpt.data.Alpaca2k 46 | init_args: 47 | mask_prompt: false 48 | val_split_fraction: 0.03847 49 | prompt_style: alpaca 50 | ignore_index: -100 51 | seed: 42 52 | num_workers: 4 53 | 54 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 55 | train: 56 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 57 | save_interval: 800 58 | 59 | # Number of iterations between logging calls (type: int, default: 1) 60 | log_interval: 1 61 | 62 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 63 | global_batch_size: 8 64 | 65 | # Number of samples per data-parallel rank (type: int, default: 4) 66 | micro_batch_size: 4 67 | 68 | # Number of iterations with learning rate warmup active (type: int, default: 100) 69 | lr_warmup_steps: 10 70 | 71 | # Number of epochs to train on (type: Optional[int], default: 5) 72 | epochs: 1 73 | 74 | # Total number of tokens to train on (type: Optional[int], default: null) 75 | max_tokens: 76 | 77 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 78 | max_steps: 79 | 80 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 81 | max_seq_length: 512 82 | 83 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 84 | tie_embeddings: 85 | 86 | # (type: Optional[float], default: null) 87 | max_norm: 88 | 89 | # (type: float, default: 6e-05) 90 | min_lr: 6.0e-05 91 | 92 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 93 | eval: 94 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 95 | interval: 100 96 | 97 | # Number of tokens to generate (type: Optional[int], default: 100) 98 | max_new_tokens: 100 99 | 100 | # Number of iterations (type: int, default: 100) 101 | max_iters: 100 102 | 103 | # Whether to evaluate on the validation set at the beginning of the training 104 | initial_validation: false 105 | 106 | # Whether to evaluate on the validation set at the end the training 107 | final_validation: true 108 | 109 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 110 | logger_name: csv 111 | 112 | # The random seed to use for reproducibility. (type: int, default: 1337) 113 | seed: 1337 114 | 115 | # Optimizer-related arguments 116 | optimizer: 117 | class_path: torch.optim.AdamW 118 | 119 | init_args: 120 | # (type: float, default: 0.001) 121 | lr: 0.0002 122 | 123 | # (type: float, default: 0.01) 124 | weight_decay: 0.0 125 | 126 | # (type: tuple, default: (0.9,0.999)) 127 | betas: 128 | - 0.9 129 | - 0.95 130 | -------------------------------------------------------------------------------- /config_hub/finetune/stablelm-base-alpha-3b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/stabilityai/stablelm-base-alpha-3b 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/full-stablelm-base-alpha-3b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 11 | devices: 2 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 17 | data: 18 | class_path: litgpt.data.Alpaca2k 19 | init_args: 20 | mask_prompt: false 21 | val_split_fraction: 0.03847 22 | prompt_style: alpaca 23 | ignore_index: -100 24 | seed: 42 25 | num_workers: 4 26 | 27 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 28 | train: 29 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 30 | save_interval: 800 31 | 32 | # Number of iterations between logging calls (type: int, default: 1) 33 | log_interval: 1 34 | 35 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 36 | global_batch_size: 8 37 | 38 | # Number of samples per data-parallel rank (type: int, default: 4) 39 | micro_batch_size: 1 40 | 41 | # Number of iterations with learning rate warmup active (type: int, default: 100) 42 | lr_warmup_steps: 1000 43 | 44 | # Number of epochs to train on (type: Optional[int], default: 5) 45 | epochs: 1 46 | 47 | # Total number of tokens to train on (type: Optional[int], default: null) 48 | max_tokens: 49 | 50 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 51 | max_steps: 52 | 53 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 54 | max_seq_length: 512 55 | 56 | # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) 57 | tie_embeddings: 58 | 59 | # (type: Optional[float], default: null) 60 | max_norm: 61 | 62 | # (type: float, default: 6e-05) 63 | min_lr: 6.0e-05 64 | 65 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 66 | eval: 67 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 68 | interval: 25 69 | 70 | # Number of tokens to generate (type: Optional[int], default: 100) 71 | max_new_tokens: 100 72 | 73 | # Number of iterations (type: int, default: 100) 74 | max_iters: 100 75 | 76 | # Whether to evaluate on the validation set at the beginning of the training 77 | initial_validation: false 78 | 79 | # Whether to evaluate on the validation set at the end the training 80 | final_validation: true 81 | 82 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 83 | logger_name: csv 84 | 85 | # The random seed to use for reproducibility. (type: int, default: 1337) 86 | seed: 1337 87 | 88 | # Optimizer-related arguments 89 | optimizer: 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | # (type: float, default: 0.001) 94 | lr: 0.0002 95 | 96 | # (type: float, default: 0.01) 97 | weight_decay: 0.1 98 | 99 | # (type: tuple, default: (0.9,0.999)) 100 | betas: 101 | - 0.9 102 | - 0.95 103 | -------------------------------------------------------------------------------- /config_hub/finetune/tiny-llama/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/full-tiny-llama-1.1b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 11 | devices: 1 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 17 | data: 18 | class_path: litgpt.data.Alpaca2k 19 | init_args: 20 | mask_prompt: false 21 | val_split_fraction: 0.03847 22 | prompt_style: alpaca 23 | ignore_index: -100 24 | seed: 42 25 | num_workers: 4 26 | 27 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 28 | train: 29 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 30 | save_interval: 800 31 | 32 | # Number of iterations between logging calls (type: int, default: 1) 33 | log_interval: 1 34 | 35 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 36 | global_batch_size: 32 37 | 38 | # Number of samples per data-parallel rank (type: int, default: 4) 39 | micro_batch_size: 4 40 | 41 | # Number of iterations with learning rate warmup active (type: int, default: 100) 42 | lr_warmup_steps: 1000 43 | 44 | # Number of epochs to train on (type: Optional[int], default: 5) 45 | epochs: 1 46 | 47 | # Total number of tokens to train on (type: Optional[int], default: null) 48 | max_tokens: 49 | 50 | # Limits the number of optimizer steps to run. 
(type: Optional[int], default: null) 51 | max_steps: 52 | 53 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 54 | max_seq_length: 512 55 | 56 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 57 | tie_embeddings: 58 | 59 | # (type: Optional[float], default: null) 60 | max_norm: 61 | 62 | # (type: float, default: 6e-05) 63 | min_lr: 6.0e-05 64 | 65 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 66 | eval: 67 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 68 | interval: 25 69 | 70 | # Number of tokens to generate (type: Optional[int], default: 100) 71 | max_new_tokens: 100 72 | 73 | # Number of iterations (type: int, default: 100) 74 | max_iters: 100 75 | 76 | # Whether to evaluate on the validation set at the beginning of the training 77 | initial_validation: false 78 | 79 | # Whether to evaluate on the validation set at the end the training 80 | final_validation: true 81 | 82 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 83 | logger_name: csv 84 | 85 | # The random seed to use for reproducibility. (type: int, default: 1337) 86 | seed: 1337 87 | 88 | # Optimizer-related arguments 89 | optimizer: 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | # (type: float, default: 0.001) 94 | lr: 0.0002 95 | 96 | # (type: float, default: 0.01) 97 | weight_decay: 0.0 98 | 99 | # (type: tuple, default: (0.9,0.999)) 100 | betas: 101 | - 0.9 102 | - 0.95 103 | -------------------------------------------------------------------------------- /config_hub/pretrain/debug.yaml: -------------------------------------------------------------------------------- 1 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 2 | # ``model_config``. (type: Optional[str], default: null) 3 | model_name: pythia-14m 4 | 5 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 6 | # ``model_config``. (type: Optional[Config], default: null) 7 | model_config: 8 | 9 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 10 | # /teamspace/jobs//share. (type: , default: out/pretrain) 11 | out_dir: out/pretrain/debug 12 | 13 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 14 | precision: bf16-mixed 15 | 16 | # Optional path to a checkpoint directory to initialize the model from. 17 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 18 | initial_checkpoint_dir: 19 | 20 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 21 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 22 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 23 | # (type: Union[bool, Literal["auto"], Path], default: False) 24 | resume: false 25 | 26 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 27 | data: TinyStories 28 | 29 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 30 | train: 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 1000 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512) 38 | global_batch_size: 125 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 5 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 44 | lr_warmup_steps: 100 45 | 46 | # Number of epochs to train on (type: Optional[int], default: null) 47 | epochs: 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 50 | max_tokens: 100000000 51 | 52 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 53 | max_steps: 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: 1.0) 62 | max_norm: 1.0 63 | 64 | # (type: float, default: 4e-05) 65 | min_lr: 6e-5 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 70 | interval: 1000 71 | 72 | # Number of tokens to generate (type: Optional[int], default: null) 73 | max_new_tokens: 74 | 75 | # Number of iterations (type: int, default: 100) 76 | max_iters: 100 77 | 78 | # Whether to evaluate on the validation set at the beginning of the training 79 | initial_validation: false 80 | 81 | # Whether to evaluate on the validation set at the end the training 82 | final_validation: false 83 | 84 | # Optimizer-related arguments 85 | optimizer: 86 | class_path: torch.optim.AdamW 87 | 88 | init_args: 89 | # (type: float, default: 0.001) 90 | lr: 6e-4 91 | 92 | # (type: float, default: 0.01) 93 | weight_decay: 0.1 94 | 95 | # (type: tuple, default: (0.9,0.999)) 96 | betas: 97 | - 0.9 98 | - 0.95 99 | 100 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 101 | devices: auto 102 | 103 | # How many nodes to use. (type: int, default: 1) 104 | num_nodes: 1 105 | 106 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 107 | # module require this. (type: Optional[Path], default: null) 108 | tokenizer_dir: checkpoints/EleutherAI/pythia-14m 109 | 110 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 111 | logger_name: tensorboard 112 | 113 | # The random seed to use for reproducibility. (type: int, default: 42) 114 | seed: 42 115 | -------------------------------------------------------------------------------- /config_hub/pretrain/microllama.yaml: -------------------------------------------------------------------------------- 1 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 2 | # ``model_config``. (type: Optional[str], default: null) 3 | model_name: micro-llama-300M 4 | 5 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 6 | # ``model_config``. 
(type: Optional[Config], default: null) 7 | model_config: 8 | 9 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 10 | # /teamspace/jobs//share. (type: , default: out/pretrain) 11 | out_dir: out/pretrain/micro-llama 12 | 13 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 14 | precision: bf16-mixed 15 | 16 | # Optional path to a checkpoint directory to initialize the model from. 17 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 18 | initial_checkpoint_dir: 19 | 20 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 21 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 22 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 23 | # (type: Union[bool, Literal["auto"], Path], default: False) 24 | resume: false 25 | 26 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 27 | data: MicroLlama 28 | 29 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 30 | train: 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 1000 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 48) 38 | # Scale this number according to the number of GPU and memory size per GPU 39 | # For example, we used 48 for 4 x 24G 4090 40 | global_batch_size: 48 41 | 42 | # Number of samples per data-parallel rank (type: int, default: 12) 43 | # Scale this number according to the memory size per GPU 44 | # For example, we used 12 for 24G 4090 45 | micro_batch_size: 12 46 | 47 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 48 | lr_warmup_steps: 2000 49 | 50 | # Number of epochs to train on (type: Optional[int], default: null) 51 | epochs: 52 | 53 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 54 | max_tokens: 3000000000000 55 | 56 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 57 | max_steps: 58 | 59 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 60 | max_seq_length: 2048 61 | 62 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 63 | tie_embeddings: 64 | 65 | # (type: Optional[float], default: 1.0) 66 | max_norm: 1.0 67 | 68 | # (type: float, default: 4e-05) 69 | min_lr: 4.0e-05 70 | 71 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 72 | eval: 73 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 74 | interval: 1000 75 | 76 | # Number of tokens to generate (type: Optional[int], default: null) 77 | max_new_tokens: 78 | 79 | # Number of iterations (type: int, default: 100) 80 | max_iters: 100 81 | 82 | # Whether to evaluate on the validation set at the beginning of the training 83 | initial_validation: false 84 | 85 | # Optimizer-related arguments 86 | optimizer: 87 | class_path: torch.optim.AdamW 88 | 89 | init_args: 90 | # (type: float, default: 0.001) 91 | lr: 4e-4 92 | 93 | # (type: float, default: 0.01) 94 | weight_decay: 0.1 95 | 96 | # (type: tuple, default: (0.9,0.999)) 97 | betas: 98 | - 0.9 99 | - 0.95 100 | 101 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 102 | devices: auto 103 | 104 | # How many nodes to use. (type: int, default: 1) 105 | num_nodes: 1 106 | 107 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 108 | # module require this. (type: Optional[Path], default: null) 109 | tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf 110 | 111 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 112 | logger_name: tensorboard 113 | 114 | # The random seed to use for reproducibility. (type: int, default: 42) 115 | seed: 42 116 | -------------------------------------------------------------------------------- /config_hub/pretrain/tinyllama.yaml: -------------------------------------------------------------------------------- 1 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 2 | # ``model_config``. (type: Optional[str], default: null) 3 | model_name: tiny-llama-1.1b 4 | 5 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 6 | # ``model_config``. (type: Optional[Config], default: null) 7 | model_config: 8 | 9 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 10 | # /teamspace/jobs//share. (type: , default: out/pretrain) 11 | out_dir: out/pretrain/tiny-llama 12 | 13 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 14 | precision: bf16-mixed 15 | 16 | # Optional path to a checkpoint directory to initialize the model from. 17 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 18 | initial_checkpoint_dir: 19 | 20 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 21 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 22 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 23 | # (type: Union[bool, Literal["auto"], Path], default: False) 24 | resume: false 25 | 26 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 27 | data: TinyLlama 28 | 29 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 30 | train: 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 1000 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512) 38 | global_batch_size: 512 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 4 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 44 | lr_warmup_steps: 2000 45 | 46 | # Number of epochs to train on (type: Optional[int], default: null) 47 | epochs: 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 50 | max_tokens: 3000000000000 51 | 52 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 53 | max_steps: 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 2048 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: 1.0) 62 | max_norm: 1.0 63 | 64 | # (type: float, default: 4e-05) 65 | min_lr: 4.0e-05 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 70 | interval: 1000 71 | 72 | # Number of tokens to generate (type: Optional[int], default: null) 73 | max_new_tokens: 74 | 75 | # Number of iterations (type: int, default: 100) 76 | max_iters: 100 77 | 78 | # Whether to evaluate on the validation set at the beginning of the training 79 | initial_validation: false 80 | 81 | # Whether to evaluate on the validation set at the end the training 82 | final_validation: false 83 | 84 | # Optimizer-related arguments 85 | optimizer: 86 | class_path: torch.optim.AdamW 87 | 88 | init_args: 89 | # (type: float, default: 0.001) 90 | lr: 4e-4 91 | 92 | # (type: float, default: 0.01) 93 | weight_decay: 0.1 94 | 95 | # (type: tuple, default: (0.9,0.999)) 96 | betas: 97 | - 0.9 98 | - 0.95 99 | 100 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 101 | devices: auto 102 | 103 | # How many nodes to use. (type: int, default: 1) 104 | num_nodes: 1 105 | 106 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 107 | # module require this. (type: Optional[Path], default: null) 108 | tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf 109 | 110 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 111 | logger_name: tensorboard 112 | 113 | # The random seed to use for reproducibility. 
(type: int, default: 42) 114 | seed: 42 115 | -------------------------------------------------------------------------------- /extensions/thunder/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # support running without installing as a package, adding extensions to the Python path 5 | wd = Path(__file__).parent.parent.resolve() 6 | sys.path.append(str(wd)) 7 | -------------------------------------------------------------------------------- /extensions/thunder/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | from .thunder_ddp import ThunderDDPStrategy # noqa: F401 2 | from .thunder_fsdp import ThunderFSDPStrategy # noqa: F401 3 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/extensions/thunder/unsloth/__init__.py -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy_loss import _cross_entropy_backward_impl, _cross_entropy_forward_impl # noqa: F401 2 | from .rope_embedding import ROPE_GROUP_SIZE, _rope_embedding_backward_impl, _rope_embedding_forward_impl # noqa: F401 3 | from .swiglu import swiglu_DWf_DW_dfg_kernel, swiglu_fg_kernel # noqa: F401 4 | from .utils import calculate_settings # noqa: F401 5 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/swiglu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
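# The kernels in this file implement the SwiGLU activation used in LLaMA-style
# MLPs. For a gate projection ``e`` and an up projection ``g`` of the same shape:
#
#     f = e * sigmoid(e)    # SiLU/"swish", computed in float32 inside the kernel
#     h = f * g             # forward output produced by ``swiglu_fg_kernel``
#
# The backward kernel recomputes ``sigmoid(e)`` and overwrites its ``DW``, ``e``
# and ``g`` buffers in place with ``h``, ``df`` and ``de`` so that no extra
# memory is allocated. A rough usage sketch, assuming Triton and a CUDA device
# are available (the shapes are only an example):
#
#     e = torch.randn(1, 16, 128, device="cuda", dtype=torch.bfloat16)
#     g = torch.randn(1, 16, 128, device="cuda", dtype=torch.bfloat16)
#     h = swiglu_fg_kernel(e, g)   # same shape and dtype as ``e``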
14 | 15 | import torch 16 | 17 | from litgpt.utils import _TRITON_AVAILABLE 18 | 19 | if _TRITON_AVAILABLE: 20 | import triton 21 | import triton.language as tl 22 | 23 | 24 | @triton.jit 25 | def _fg_kernel( 26 | e, 27 | g, 28 | h, 29 | n_elements, 30 | BLOCK_SIZE: tl.constexpr, 31 | ): 32 | block_idx = tl.program_id(0) 33 | offsets = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 34 | mask = offsets < n_elements 35 | 36 | e_row = tl.load(e + offsets, mask=mask, other=0).to(tl.float32) 37 | g_row = tl.load(g + offsets, mask=mask, other=0) # .to(tl.float32) 38 | 39 | # f = e * sigmoid(e) 40 | f_row = e_row * tl.sigmoid(e_row) # e_row / (1 + tl.exp(-e_row)) 41 | f_row = f_row.to(g_row.dtype) # Exact copy from HF 42 | # h = f * g 43 | h_row = f_row * g_row 44 | 45 | # Store h 46 | tl.store(h + offsets, h_row, mask=mask) 47 | 48 | 49 | pass 50 | 51 | 52 | def swiglu_fg_kernel(e, g): 53 | batch, seq_len, hd = e.shape 54 | n_elements = e.numel() 55 | h = torch.empty((batch, seq_len, hd), dtype=e.dtype, device="cuda") 56 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) 57 | _fg_kernel[grid]( 58 | e, 59 | g, 60 | h, 61 | n_elements, 62 | BLOCK_SIZE=1024, 63 | ) 64 | return h 65 | 66 | 67 | pass 68 | 69 | 70 | @triton.jit 71 | def _DWf_DW_dfg_kernel( 72 | DW, 73 | e, 74 | g, 75 | n_elements, 76 | BLOCK_SIZE: tl.constexpr, 77 | ): 78 | """ 79 | e = e.float() 80 | se = 1.0 / (1.0 + torch.exp(-e)) 81 | f = (se * e).to(dtype) 82 | h = f * g 83 | df = DW * f 84 | dg = DW * g 85 | de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype) 86 | """ 87 | block_idx = tl.program_id(0) 88 | offsets = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 89 | mask = offsets < n_elements 90 | 91 | DW_row = tl.load(DW + offsets, mask=mask, other=0) # .to(tl.float32) 92 | e_row = tl.load(e + offsets, mask=mask, other=0).to(tl.float32) 93 | g_row = tl.load(g + offsets, mask=mask, other=0) # .to(tl.float32) 94 | 95 | # e = e.float() 96 | # se = 1.0 / (1.0 + torch.exp(-e)) 97 | se_row = tl.sigmoid(e_row) # 1.0 / (1.0 + tl.exp(-e_row)) 98 | # f = (se * e).to(dtype) 99 | f_row = se_row * e_row 100 | f_row = f_row.to(DW_row.dtype) 101 | # h = f * g 102 | h_row = f_row * g_row 103 | # df = DW * f 104 | df_row = DW_row * f_row 105 | # dg = DW * g 106 | dg_row = DW_row * g_row 107 | # de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype) 108 | de_row = dg_row.to(tl.float32) * se_row * (1.0 + e_row * (1.0 - se_row)) 109 | de_row = de_row.to(DW_row.dtype) 110 | 111 | # Store derivatives in buffers 112 | tl.store(DW + offsets, h_row, mask=mask) # h = f * g 113 | tl.store(e + offsets, df_row, mask=mask) # df = DW * f 114 | tl.store(g + offsets, de_row, mask=mask) # de 115 | 116 | 117 | pass 118 | 119 | 120 | def swiglu_DWf_DW_dfg_kernel(DW, e, g): 121 | batch_seq_len, hd = e.shape 122 | n_elements = e.numel() 123 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) 124 | _DWf_DW_dfg_kernel[grid]( 125 | DW, 126 | e, 127 | g, 128 | n_elements, 129 | BLOCK_SIZE=1024, 130 | ) 131 | return DW, e, g 132 | 133 | 134 | pass 135 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from litgpt.utils import _TRITON_AVAILABLE 17 | 18 | if _TRITON_AVAILABLE: 19 | import triton 20 | 21 | MAX_FUSED_SIZE = 65536 # 2**16 22 | next_power_of_2 = triton.next_power_of_2 23 | 24 | 25 | def calculate_settings(n): 26 | BLOCK_SIZE = next_power_of_2(n) 27 | if BLOCK_SIZE > MAX_FUSED_SIZE: 28 | raise RuntimeError( 29 | f"Cannot launch Triton kernel since n = {n} exceeds the maximum CUDA blocksize = {MAX_FUSED_SIZE}." 30 | ) 31 | num_warps = 4 32 | if BLOCK_SIZE >= 32768: 33 | num_warps = 32 34 | elif BLOCK_SIZE >= 8192: 35 | num_warps = 16 36 | elif BLOCK_SIZE >= 2048: 37 | num_warps = 8 38 | return BLOCK_SIZE, num_warps 39 | 40 | 41 | pass 42 | -------------------------------------------------------------------------------- /extensions/xla/__init__: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # support running without installing as a package, adding extensions to the Python path 5 | wd = Path(__file__).parent.parent.resolve() 6 | sys.path.append(str(wd)) 7 | -------------------------------------------------------------------------------- /extensions/xla/finetune/__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/extensions/xla/finetune/__init__ -------------------------------------------------------------------------------- /extensions/xla/generate/__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/extensions/xla/generate/__init__ -------------------------------------------------------------------------------- /extensions/xla/scripts/__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/extensions/xla/scripts/__init__ -------------------------------------------------------------------------------- /litgpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
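# This module re-exports the user-facing entry points of the package (``LLM``,
# ``GPT``, ``Config``, ``PromptStyle`` and ``Tokenizer``; see ``__all__`` below).
# A minimal sketch of the high-level Python API, assuming the referenced
# checkpoint can be downloaded or is already present under ``checkpoints/``:
#
#     from litgpt import LLM
#
#     llm = LLM.load("microsoft/phi-2")
#     print(llm.generate("What do llamas eat?"))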
2 | 3 | import logging 4 | import re 5 | 6 | from litgpt.api import LLM 7 | from litgpt.config import Config 8 | from litgpt.model import GPT # needs to be imported before config 9 | from litgpt.prompts import PromptStyle 10 | from litgpt.tokenizer import Tokenizer 11 | 12 | # Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 13 | pattern = re.compile(".*Profiler function .* will be ignored") 14 | logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) 15 | 16 | # Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint 17 | logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True 18 | logging.getLogger("torch.distributed.fsdp._debug_utils").disabled = True 19 | 20 | __all__ = ["LLM", "GPT", "Config", "PromptStyle", "Tokenizer"] 21 | -------------------------------------------------------------------------------- /litgpt/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import warnings 4 | 5 | import torch 6 | from jsonargparse import CLI, set_config_read_mode, set_docstring_parse_options 7 | 8 | from litgpt.chat.base import main as chat_fn 9 | from litgpt.deploy.serve import run_server as serve_fn 10 | from litgpt.eval.evaluate import convert_and_evaluate as evaluate_fn 11 | from litgpt.finetune.adapter import setup as finetune_adapter_fn 12 | from litgpt.finetune.adapter_v2 import setup as finetune_adapter_v2_fn 13 | from litgpt.finetune.full import setup as finetune_full_fn 14 | from litgpt.finetune.lora import setup as finetune_lora_fn 15 | from litgpt.generate.adapter import main as generate_adapter_fn 16 | from litgpt.generate.adapter_v2 import main as generate_adapter_v2_fn 17 | from litgpt.generate.base import main as generate_base_fn 18 | from litgpt.generate.full import main as generate_full_fn 19 | from litgpt.generate.sequentially import main as generate_sequentially_fn 20 | from litgpt.generate.speculative_decoding import main as generate_speculatively_fn 21 | from litgpt.generate.tp import main as generate_tp_fn 22 | from litgpt.pretrain import setup as pretrain_fn 23 | from litgpt.scripts.convert_hf_checkpoint import convert_hf_checkpoint as convert_hf_checkpoint_fn 24 | from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint as convert_lit_checkpoint_fn 25 | from litgpt.scripts.convert_pretrained_checkpoint import ( 26 | convert_pretrained_checkpoint as convert_pretrained_checkpoint_fn, 27 | ) 28 | from litgpt.scripts.download import download_from_hub as download_fn 29 | from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn 30 | 31 | 32 | def main() -> None: 33 | parser_data = { 34 | "download": download_fn, 35 | "chat": chat_fn, 36 | "finetune": finetune_lora_fn, 37 | "finetune_lora": finetune_lora_fn, 38 | "finetune_full": finetune_full_fn, 39 | "finetune_adapter": finetune_adapter_fn, 40 | "finetune_adapter_v2": finetune_adapter_v2_fn, 41 | "pretrain": pretrain_fn, 42 | "generate": generate_base_fn, 43 | "generate_full": generate_full_fn, 44 | "generate_adapter": generate_adapter_fn, 45 | "generate_adapter_v2": generate_adapter_v2_fn, 46 | "generate_sequentially": generate_sequentially_fn, 47 | "generate_speculatively": generate_speculatively_fn, 48 | "generate_tp": generate_tp_fn, 49 | "convert_to_litgpt": convert_hf_checkpoint_fn, 50 | "convert_from_litgpt": 
convert_lit_checkpoint_fn, 51 | "convert_pretrained_checkpoint": convert_pretrained_checkpoint_fn, 52 | "merge_lora": merge_lora_fn, 53 | "evaluate": evaluate_fn, 54 | "serve": serve_fn, 55 | } 56 | 57 | set_docstring_parse_options(attribute_docstrings=True) 58 | set_config_read_mode(urls_enabled=True) 59 | 60 | # PyTorch bug that raises a false-positive warning 61 | # More info: https://github.com/Lightning-AI/litgpt/issues/1561 62 | warning_message = r"The epoch parameter in `scheduler.step\(\)` was not necessary and is being deprecated.*" 63 | 64 | warnings.filterwarnings( 65 | action="ignore", message=warning_message, category=UserWarning, module=r".*torch\.optim\.lr_scheduler.*" 66 | ) 67 | 68 | torch.set_float32_matmul_precision("high") 69 | CLI(parser_data) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /litgpt/args.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import math 3 | import warnings 4 | from dataclasses import dataclass 5 | from typing import Optional, Union 6 | 7 | 8 | @dataclass 9 | class TrainArgs: 10 | """Training-related arguments""" 11 | 12 | save_interval: Optional[int] = 1000 13 | """Number of optimizer steps between saving checkpoints""" 14 | log_interval: int = 1 15 | """Number of iterations between logging calls""" 16 | global_batch_size: int = 64 17 | """Number of samples between optimizer steps across data-parallel ranks""" 18 | micro_batch_size: int = 4 19 | """Number of samples per data-parallel rank""" 20 | lr_warmup_steps: Optional[int] = 100 21 | """Number of iterations with learning rate warmup active""" 22 | lr_warmup_fraction: Optional[float] = None 23 | """The fraction of an epoch to use for learning rate warmup""" 24 | epochs: Optional[int] = None 25 | """Number of epochs to train on""" 26 | # TODO: `pretrain` is the only script using `max_tokens` explicitly. replace it with epoch_size*epochs? 27 | max_tokens: Optional[int] = None 28 | """Total number of tokens to train on""" 29 | max_steps: Optional[int] = None 30 | """Limits the number of optimizer steps to run""" 31 | max_seq_length: Optional[int] = None 32 | """Limits the length of samples""" 33 | tie_embeddings: Optional[bool] = None 34 | """Whether to tie the embedding weights with the language modeling head weights""" 35 | 36 | # Optimization args 37 | max_norm: Optional[float] = None 38 | min_lr: float = 6e-5 39 | 40 | def __post_init__(self) -> None: 41 | if self.lr_warmup_fraction and self.lr_warmup_steps: 42 | raise ValueError( 43 | "Can't provide both `--train.lr_warmup_fraction` and `--train.lr_warmup_steps`. Choose one." 44 | ) 45 | if self.lr_warmup_fraction and not (0 <= self.lr_warmup_fraction <= 1): 46 | raise ValueError("`--train.lr_warmup_fraction` must be between 0 and 1.") 47 | 48 | if self.lr_warmup_steps and self.max_steps and (self.lr_warmup_steps >= self.max_steps): 49 | warnings.warn( 50 | "`--train.lr_warmup_steps` should be less than `--train.max_steps`." 
51 | f" Got {self.lr_warmup_steps} lr_warmup_steps and {self.max_steps} max_steps.", 52 | UserWarning, 53 | ) 54 | 55 | def gradient_accumulation_iters(self, devices: int, num_nodes: int = 1) -> int: 56 | """Number of iterations between gradient synchronizations""" 57 | gradient_accumulation_iters = self.batch_size(devices, num_nodes) // self.micro_batch_size 58 | assert gradient_accumulation_iters > 0 59 | return gradient_accumulation_iters 60 | 61 | def batch_size(self, devices: int, num_nodes: int = 1) -> int: 62 | """Number of samples between optimizer steps per data-parallel rank""" 63 | batch_size = self.global_batch_size // (devices * num_nodes) 64 | assert batch_size > 0 65 | return batch_size 66 | 67 | def warmup_iters(self, devices: int, num_nodes: int, max_iters: int, train_dataloader) -> int: 68 | """Number of iterations to warm up the learning rate.""" 69 | if self.lr_warmup_fraction: 70 | return min(max_iters, math.ceil(self.lr_warmup_fraction * len(train_dataloader))) 71 | if self.lr_warmup_steps: 72 | return min(max_iters, self.lr_warmup_steps * self.gradient_accumulation_iters(devices, num_nodes)) 73 | return 0 74 | 75 | 76 | @dataclass 77 | class EvalArgs: 78 | """Evaluation-related arguments""" 79 | 80 | interval: int = 600 81 | """Number of optimizer steps between evaluation calls""" 82 | max_new_tokens: Optional[int] = None 83 | """Number of tokens to generate""" 84 | max_iters: int = 100 85 | """Number of iterations""" 86 | initial_validation: bool = False 87 | """Whether to evaluate on the validation set at the beginning of the training""" 88 | final_validation: bool = True 89 | """Whether to evaluate on the validation set at the end of the training""" 90 | evaluate_example: Union[str, int] = "first" 91 | """How to pick an example instruction to evaluate periodically during training. 92 | Can be "first", "random", or an integer index to pick a specific example.""" 93 | 94 | 95 | @dataclass 96 | class LogArgs: 97 | """Logging-related arguments""" 98 | 99 | project: Optional[str] = None 100 | """Project name""" 101 | run: Optional[str] = None 102 | """Run name""" 103 | group: Optional[str] = None 104 | """Group name""" 105 | -------------------------------------------------------------------------------- /litgpt/chat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/chat/__init__.py -------------------------------------------------------------------------------- /litgpt/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
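# The data modules imported below are the classes that the ``data:`` section of
# the config_hub YAML files resolves to. For example, the finetuning configs in
# this repository select the Alpaca2k module roughly as follows (the values
# mirror those configs rather than the class defaults):
#
#     data:
#       class_path: litgpt.data.Alpaca2k
#       init_args:
#         mask_prompt: false
#         prompt_style: alpaca
#         ignore_index: -100
#         seed: 42
#         num_workers: 4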
2 | 3 | from litgpt.data.alpaca import Alpaca 4 | from litgpt.data.alpaca_2k import Alpaca2k 5 | from litgpt.data.alpaca_gpt4 import AlpacaGPT4 6 | from litgpt.data.base import DataModule, SFTDataset, get_sft_collate_fn 7 | from litgpt.data.deita import Deita 8 | from litgpt.data.flan import FLAN 9 | from litgpt.data.json_data import JSON 10 | from litgpt.data.lima import LIMA 11 | from litgpt.data.lit_data import LitData 12 | from litgpt.data.longform import LongForm 13 | from litgpt.data.microllama import MicroLlama 14 | from litgpt.data.openwebtext import OpenWebText 15 | from litgpt.data.text_files import TextFiles 16 | from litgpt.data.tinyllama import TinyLlama 17 | from litgpt.data.tinystories import TinyStories 18 | 19 | __all__ = [ 20 | "Alpaca", 21 | "Alpaca2k", 22 | "AlpacaGPT4", 23 | "Deita", 24 | "FLAN", 25 | "JSON", 26 | "LIMA", 27 | "LitData", 28 | "DataModule", 29 | "LongForm", 30 | "OpenWebText", 31 | "SFTDataset", 32 | "TextFiles", 33 | "TinyLlama", 34 | "TinyStories", 35 | "MicroLlama", 36 | "get_sft_collate_fn", 37 | ] 38 | -------------------------------------------------------------------------------- /litgpt/data/alpaca_2k.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | 7 | from litgpt.data.alpaca import Alpaca 8 | from litgpt.data.base import SFTDataset 9 | 10 | 11 | @dataclass 12 | class Alpaca2k(Alpaca): 13 | """Alpaca2k data module for supervised finetuning.""" 14 | 15 | val_split_fraction: float = 0.05 # to get exactly 100 validation samples 16 | """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" 17 | download_dir: Path = Path("./data/alpaca2k") 18 | """The directory in which the downloaded dataset gets saved.""" 19 | repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test") 20 | """The Hugging Face dataset repository ID to download the dataset from.""" 21 | file_name: str = field(repr=False, default="alpaca2k_data_cleaned_archive.json") 22 | """The name of the dataset file to download.""" 23 | 24 | def prepare_data(self) -> None: 25 | from datasets import load_dataset 26 | 27 | load_dataset(self.repo_id, cache_dir=self.download_dir) 28 | 29 | def setup(self, stage: str = "") -> None: 30 | from datasets import load_dataset 31 | 32 | dataset = load_dataset(self.repo_id, cache_dir=self.download_dir) 33 | 34 | train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed) 35 | train_data = train_validation_split["train"] 36 | test_data = train_validation_split["test"] 37 | 38 | self.train_dataset = SFTDataset( 39 | data=train_data, 40 | tokenizer=self.tokenizer, 41 | prompt_style=self.prompt_style, 42 | max_seq_length=self.max_seq_length, 43 | mask_prompt=self.mask_prompt, 44 | ignore_index=self.ignore_index, 45 | ) 46 | self.test_dataset = SFTDataset( 47 | data=test_data, 48 | tokenizer=self.tokenizer, 49 | prompt_style=self.prompt_style, 50 | max_seq_length=self.max_seq_length, 51 | mask_prompt=self.mask_prompt, 52 | ignore_index=self.ignore_index, 53 | ) 54 | -------------------------------------------------------------------------------- /litgpt/data/alpaca_gpt4.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
2 | 3 | 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | 7 | from litgpt.data.alpaca import Alpaca 8 | 9 | _URL = "https://raw.githubusercontent.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/main/data/alpaca_gpt4_data.json" 10 | 11 | 12 | @dataclass 13 | class AlpacaGPT4(Alpaca): 14 | """AlpacaGPT4 data module for supervised finetuning.""" 15 | 16 | val_split_fraction: float = 0.03847 # to get exactly 2000 test samples 17 | """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" 18 | download_dir: Path = Path("./data/alpacagpt4") 19 | """The directory in which the downloaded dataset gets saved.""" 20 | file_url: str = field(repr=False, default=_URL) 21 | """The URL from which to download the dataset.""" 22 | file_name: str = field(repr=False, default="alpacagpt4_data_cleaned_archive.json") 23 | """The name of the dataset file to download.""" 24 | -------------------------------------------------------------------------------- /litgpt/data/lit_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import os 3 | from dataclasses import dataclass, field 4 | from pathlib import Path 5 | from typing import Optional, Tuple, Union 6 | 7 | from torch.utils.data import DataLoader 8 | 9 | from litgpt.data import DataModule 10 | from litgpt.tokenizer import Tokenizer 11 | 12 | 13 | @dataclass 14 | class LitData(DataModule): 15 | """Loads data using LitData's StreamingDataset given a path to a folder of preprocessed data (chunks).""" 16 | 17 | data_path: Union[str, Path] = Path("data/") 18 | """The path to the data directory containing the preprocessed chunks for the streaming dataset. 19 | The path can also be a remote path (e.g., s3://). See also ``split_names`` if this path contains subfolders 20 | for training and validation splits.""" 21 | split_names: Optional[Tuple[str, str]] = None 22 | """Optional tuple for names of subfolders for training and validation under ``data_path``.
If not provided, 23 | all data under data_path will be used for training, and the validation dataloader will be identical to the 24 | train dataloader.""" 25 | seed: int = 42 26 | """The random seed for shuffling the dataset.""" 27 | num_workers: int = 8 28 | """How many DataLoader processes to use for loading.""" 29 | 30 | batch_size: int = field(init=False, repr=False, default=1) 31 | seq_length: int = field(init=False, repr=False, default=2048) 32 | 33 | def __post_init__(self) -> None: 34 | super().__init__() 35 | if self.split_names is not None and len(self.split_names) != 2: 36 | raise ValueError("If provided `split_names` must be a tuple of two strings, for example: ('train', 'val').") 37 | 38 | def connect( 39 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 40 | ) -> None: 41 | self.batch_size = batch_size 42 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 43 | 44 | def train_dataloader(self) -> DataLoader: 45 | input_dir = os.path.join(self.data_path, self.split_names[0]) if self.split_names else str(self.data_path) 46 | return self._dataloader(input_dir=input_dir, train=True) 47 | 48 | def val_dataloader(self) -> DataLoader: 49 | input_dir = os.path.join(self.data_path, self.split_names[1]) if self.split_names else str(self.data_path) 50 | return self._dataloader(input_dir=input_dir, train=False) 51 | 52 | def _dataloader(self, input_dir: str, train: bool): 53 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 54 | 55 | dataset = StreamingDataset( 56 | input_dir=input_dir, 57 | item_loader=TokensLoader(block_size=self.seq_length), 58 | shuffle=train, 59 | seed=self.seed, 60 | ) 61 | dataloader = StreamingDataLoader( 62 | dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 63 | ) 64 | return dataloader 65 | -------------------------------------------------------------------------------- /litgpt/data/longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | 8 | import torch 9 | from torch.utils.data import DataLoader 10 | 11 | from litgpt.data import DataModule, SFTDataset, get_sft_collate_fn 12 | from litgpt.data.alpaca import download_if_missing 13 | from litgpt.prompts import PromptStyle 14 | from litgpt.tokenizer import Tokenizer 15 | 16 | _URL = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset" 17 | 18 | 19 | @dataclass 20 | class LongForm(DataModule): 21 | """LongForm data module for supervised finetuning.""" 22 | 23 | mask_prompt: bool = False 24 | """Whether to mask the prompt section from the label (with ``ignore_index``).""" 25 | prompt_style: Union[str, PromptStyle] = "longform" 26 | """The style to apply to instruction prompts. 
See `litgpt.prompts` for a list of available styles.""" 27 | ignore_index: int = -100 28 | """The index to use for elements to be ignored in the label.""" 29 | seed: int = 42 30 | """The random seed for shuffling the dataset.""" 31 | num_workers: int = 4 32 | """How many DataLoader processes to use for loading.""" 33 | download_dir: Path = Path("./data/longform") 34 | """The directory in which the downloaded dataset gets saved.""" 35 | 36 | tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False) 37 | batch_size: int = field(default=1, init=False, repr=False) 38 | max_seq_length: int = field(default=-1, init=False, repr=False) 39 | train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) 40 | test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) 41 | 42 | def __post_init__(self) -> None: 43 | super().__init__() 44 | if isinstance(self.prompt_style, str): 45 | self.prompt_style = PromptStyle.from_name(self.prompt_style) 46 | 47 | def connect( 48 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 49 | ) -> None: 50 | self.tokenizer = tokenizer 51 | self.batch_size = batch_size 52 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 53 | 54 | def prepare_data(self) -> None: 55 | self.download_dir.mkdir(parents=True, exist_ok=True) 56 | download_if_missing(self.download_dir / "train.json", f"{_URL}/train.json") 57 | download_if_missing(self.download_dir / "val.json", f"{_URL}/val.json") 58 | 59 | def train_dataloader(self): 60 | return self._dataloader("train") 61 | 62 | def val_dataloader(self): 63 | return self._dataloader("val") 64 | 65 | def _dataloader(self, split: str) -> DataLoader: 66 | with open(self.download_dir / f"{split}.json", encoding="utf-8") as file: 67 | data = json.load(file) 68 | 69 | dataset = SFTDataset( 70 | data=data, 71 | tokenizer=self.tokenizer, 72 | prompt_style=self.prompt_style, 73 | max_seq_length=self.max_seq_length, 74 | mask_prompt=self.mask_prompt, 75 | ignore_index=self.ignore_index, 76 | transform=_transform, 77 | ) 78 | return DataLoader( 79 | dataset=dataset, 80 | batch_size=self.batch_size, 81 | shuffle=(split == "train"), 82 | generator=torch.Generator().manual_seed(self.seed), 83 | num_workers=self.num_workers, 84 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index), 85 | ) 86 | 87 | 88 | def _transform(item: dict) -> dict: 89 | item["instruction"] = item.pop("input") 90 | return item 91 | -------------------------------------------------------------------------------- /litgpt/data/microllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | from litgpt.data.tinyllama import TinyLlama 7 | 8 | 9 | @dataclass 10 | class MicroLlama(TinyLlama): 11 | """The MicroLlama data module is composed of only SlimPajama data.""" 12 | 13 | def __init__(self, data_path: Union[str, Path] = Path("data/"), seed: int = 42, num_workers: int = 8): 14 | super().__init__(data_path=data_path, seed=seed, num_workers=num_workers, use_starcoder=False) 15 | -------------------------------------------------------------------------------- /litgpt/data/prepare_slimpajama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import os 5 | import time 6 | from pathlib import Path 7 | 8 | from litgpt.data.prepare_starcoder import DataChunkRecipe 9 | from litgpt.tokenizer import Tokenizer 10 | from litgpt.utils import CLI, extend_checkpoint_dir 11 | 12 | 13 | class SlimPajamaDataRecipe(DataChunkRecipe): 14 | is_generator = True 15 | 16 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 17 | super().__init__(chunk_size) 18 | self.tokenizer = tokenizer 19 | 20 | def prepare_structure(self, input_dir): 21 | files = Path(input_dir).rglob("*.zst") 22 | return [str(file) for file in files] 23 | 24 | def prepare_item(self, filepath): 25 | import zstandard as zstd 26 | 27 | with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: 28 | for row in f: 29 | text = json.loads(row)["text"] 30 | if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": 31 | continue # exclude the GitHub data since it overlaps with starcoder 32 | text_ids = self.tokenizer.encode(string=text, bos=False, eos=True) 33 | yield text_ids 34 | 35 | 36 | def prepare( 37 | input_dir: Path = Path("data/SlimPajama-627B/train"), 38 | output_dir: Path = Path("data/slimpajama/train"), 39 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 40 | chunk_size: int = (2049 * 16384), 41 | fast_dev_run: bool = False, 42 | ) -> None: 43 | from litdata.processing.data_processor import DataProcessor 44 | 45 | tokenizer_path = extend_checkpoint_dir(tokenizer_path) 46 | tokenizer = Tokenizer(tokenizer_path) 47 | data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 48 | data_processor = DataProcessor( 49 | input_dir=str(input_dir), 50 | output_dir=str(output_dir), 51 | fast_dev_run=fast_dev_run, 52 | num_workers=os.cpu_count(), 53 | num_downloaders=1, 54 | ) 55 | 56 | start_time = time.time() 57 | data_processor.run(data_recipe) 58 | elapsed_time = time.time() - start_time 59 | print(f"Time taken: {elapsed_time:.2f} seconds") 60 | 61 | 62 | if __name__ == "__main__": 63 | CLI(prepare) 64 | -------------------------------------------------------------------------------- /litgpt/data/prepare_starcoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | import time 5 | import traceback 6 | from pathlib import Path 7 | 8 | from lightning_utilities.core.imports import RequirementCache 9 | 10 | from litgpt.tokenizer import Tokenizer 11 | from litgpt.utils import CLI, extend_checkpoint_dir 12 | 13 | _LITDATA_AVAILABLE = RequirementCache("litdata") 14 | if _LITDATA_AVAILABLE: 15 | from litdata.processing.data_processor import DataChunkRecipe 16 | else: 17 | DataChunkRecipe = object 18 | 19 | 20 | class StarcoderDataRecipe(DataChunkRecipe): 21 | is_generator = True 22 | 23 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 24 | super().__init__(chunk_size) 25 | self.tokenizer = tokenizer 26 | 27 | def prepare_structure(self, input_dir): 28 | files = Path(input_dir).rglob("*.parquet") 29 | return [str(file) for file in files] 30 | 31 | def prepare_item(self, item_metadata): 32 | import pyarrow.parquet as pq 33 | 34 | filepath = item_metadata 35 | start = time.time() 36 | 37 | try: 38 | parquet_file = pq.ParquetFile(filepath) 39 | # reduce RAM usage 40 | for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): 41 | for text in batch.to_pandas()["content"]: 42 | yield self.tokenizer.encode(text, bos=False, eos=True) 43 | 44 | except Exception: 45 | print(traceback.format_exc()) 46 | print(f"Error reading {filepath}") 47 | return 48 | 49 | parquet_file.close() 50 | end = time.time() 51 | print(f"Took {end - start:.2f} seconds total", filepath) 52 | 53 | 54 | def prepare( 55 | input_dir: Path = Path("data/starcoderdata"), 56 | output_dir: Path = Path("data/starcoder"), 57 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 58 | chunk_size: int = (2049 * 8192), 59 | fast_dev_run: bool = False, 60 | ) -> None: 61 | from litdata.processing.data_processor import DataProcessor 62 | 63 | tokenizer_path = extend_checkpoint_dir(tokenizer_path) 64 | tokenizer = Tokenizer(tokenizer_path) 65 | data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 66 | data_processor = DataProcessor( 67 | input_dir=str(input_dir), 68 | output_dir=str(output_dir), 69 | fast_dev_run=fast_dev_run, 70 | num_workers=os.cpu_count(), 71 | num_downloaders=1, 72 | ) 73 | 74 | start_time = time.time() 75 | data_processor.run(data_recipe) 76 | elapsed_time = time.time() - start_time 77 | print(f"Time taken: {elapsed_time:.2f} seconds") 78 | 79 | 80 | if __name__ == "__main__": 81 | CLI(prepare) 82 | -------------------------------------------------------------------------------- /litgpt/data/tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | 6 | from torch.utils.data import DataLoader 7 | 8 | from litgpt.data import DataModule 9 | from litgpt.tokenizer import Tokenizer 10 | 11 | 12 | @dataclass 13 | class TinyLlama(DataModule): 14 | """The TinyLlama data module is composed of a mix of SlimPajama and Starcoder data. 15 | 16 | Provides training and validation streaming dataloaders that return batches of tokens. 17 | """ 18 | 19 | data_path: Union[str, Path] = Path("data/") 20 | """The path to the data directory, containing two folders 'slimpajama' and 'starcoder' 21 | which are the output of the preprocessing step done in advance. See the `tutorial/pretrain_tinyllama.md` 22 | for instructions. 
The path can also be a remote path (e.g., s3://).""" 23 | seed: int = 42 24 | """The random seed for shuffling the dataset.""" 25 | num_workers: int = 8 26 | """How many DataLoader processes to use for loading.""" 27 | use_starcoder: bool = True 28 | """Toggle for using Starcoder data.""" 29 | 30 | batch_size: int = field(init=False, repr=False, default=1) 31 | seq_length: int = field(init=False, repr=False, default=2048) 32 | 33 | def __post_init__(self): 34 | super().__init__() 35 | # Could be a remote path (s3://) or a local path 36 | self.slimpajama_train = str(self.data_path).rstrip("/") + "/slimpajama/train" 37 | self.slimpajama_val = str(self.data_path).rstrip("/") + "/slimpajama/val" 38 | self.required_paths = [self.slimpajama_train, self.slimpajama_val] 39 | 40 | if self.use_starcoder: 41 | self.starcoder_train = str(self.data_path).rstrip("/") + "/starcoder" 42 | self.required_paths += [self.starcoder_train] 43 | 44 | def connect( 45 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 46 | ) -> None: 47 | self.batch_size = batch_size 48 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 49 | 50 | def prepare_data(self) -> None: 51 | for path in self.required_paths: 52 | if not path.startswith("s3://") and not Path(path).is_dir(): 53 | raise FileNotFoundError( 54 | "The data path for TinyLlama is expected to be the directory containing these subdirectories:" 55 | f" `slimpajama/train`, `slimpajama/val`, `starcoder`. The directory {path} does not exist." 56 | " Set it via `--data.data_path=...`" 57 | ) 58 | 59 | def train_dataloader(self) -> DataLoader: 60 | from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader 61 | 62 | slim_train_data = StreamingDataset( 63 | input_dir=self.slimpajama_train, 64 | item_loader=TokensLoader(block_size=self.seq_length), 65 | shuffle=True, 66 | drop_last=True, 67 | ) 68 | train_data = slim_train_data 69 | 70 | if self.use_starcoder: 71 | train_datasets = [ 72 | slim_train_data, 73 | StreamingDataset( 74 | input_dir=self.starcoder_train, 75 | item_loader=TokensLoader(block_size=self.seq_length), 76 | shuffle=True, 77 | drop_last=True, 78 | ), 79 | ] 80 | 81 | # Mix SlimPajama data and Starcoder data with these proportions: 82 | weights = (0.693584, 0.306416) 83 | train_data = CombinedStreamingDataset( 84 | datasets=train_datasets, seed=self.seed, weights=weights, iterate_over_all=False 85 | ) 86 | 87 | train_dataloader = StreamingDataLoader( 88 | train_data, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 89 | ) 90 | return train_dataloader 91 | 92 | def val_dataloader(self) -> DataLoader: 93 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 94 | 95 | val_dataset = StreamingDataset( 96 | input_dir=self.slimpajama_val, 97 | item_loader=TokensLoader(block_size=self.seq_length), 98 | shuffle=True, 99 | ) 100 | val_dataloader = StreamingDataLoader( 101 | val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 102 | ) 103 | return val_dataloader 104 | -------------------------------------------------------------------------------- /litgpt/deploy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/deploy/__init__.py 
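A minimal usage sketch of the TinyLlama streaming data module listed above (this snippet is not part of the repository): the `data/pretrain` path, batch size, and sequence length are illustrative placeholders, and running it assumes the optional `litdata` dependency is installed and that the SlimPajama/Starcoder chunks were already produced with `litgpt/data/prepare_slimpajama.py` and `litgpt/data/prepare_starcoder.py`.

from pathlib import Path

from litgpt.data import TinyLlama

# Placeholder directory; it must contain the preprocessed litdata chunks under
# slimpajama/train, slimpajama/val and starcoder (see the prepare_* scripts above).
data = TinyLlama(data_path=Path("data/pretrain"), seed=42, num_workers=8)

# connect() is normally called by litgpt.pretrain; seq_length becomes
# max_seq_length + 1 because the labels are the inputs shifted by one token.
data.connect(batch_size=4, max_seq_length=2048)

data.prepare_data()  # only checks that the required directories exist
data.setup()

# The training stream mixes SlimPajama and Starcoder with weights ~0.69 / ~0.31;
# validation streams the SlimPajama validation split only.
train_loader = data.train_dataloader()
val_loader = data.val_dataloader()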
-------------------------------------------------------------------------------- /litgpt/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/finetune/__init__.py -------------------------------------------------------------------------------- /litgpt/generate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/generate/__init__.py -------------------------------------------------------------------------------- /litgpt/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/scripts/__init__.py -------------------------------------------------------------------------------- /litgpt/scripts/convert_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from pathlib import Path 4 | from pprint import pprint 5 | 6 | import torch 7 | 8 | from litgpt.utils import copy_config_files, extend_checkpoint_dir, incremental_save 9 | 10 | 11 | @torch.inference_mode() 12 | def convert_pretrained_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: 13 | """Convert a checkpoint after pretraining. 14 | 15 | The pretrained checkpoint contains optimizer states and several other metadata that are not needed after training 16 | is finished. This script will export the state-dict of the model and place it in the chosen output folder, 17 | which then can be loaded by other scripts for inference, evaluation, etc. 18 | 19 | Args: 20 | checkpoint_dir: Path to a checkpoint directory produced by ``litgpt.pretrain``. 21 | output_dir: The output folder where the converted state-dict file and config files will be saved to. 22 | """ 23 | checkpoint_dir = extend_checkpoint_dir(checkpoint_dir) 24 | pprint(locals()) 25 | 26 | if output_dir.is_dir() and output_dir.glob("*"): 27 | raise FileExistsError( 28 | f"The output folder exists and is not empty: {str(output_dir)}." 29 | " Please delete it first or choose a different name." 
30 | ) 31 | 32 | output_dir.mkdir(parents=True) 33 | checkpoint_file = checkpoint_dir / "lit_model.pth" 34 | output_checkpoint_file = output_dir / "lit_model.pth" 35 | 36 | # TODO: Consolidate sharded checkpoint if applicable 37 | # Extract the model state dict and save to output folder 38 | with incremental_save(output_checkpoint_file) as saver: 39 | print("Processing", checkpoint_file) 40 | full_checkpoint = torch.load(str(checkpoint_file), mmap=True) 41 | loaded_state_dict = full_checkpoint["model"] 42 | converted_state_dict = {} 43 | for param_name, param in loaded_state_dict.items(): 44 | saver.store_early(param) 45 | # remove prefix for compiled model (if any) 46 | param_name = param_name.replace("_orig_mod.", "") 47 | converted_state_dict[param_name] = param 48 | print(f"Saving converted checkpoint to {str(output_checkpoint_file)}.") 49 | saver.save(converted_state_dict) 50 | 51 | copy_config_files(checkpoint_dir, output_dir) 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | 4 | requires = [ 5 | "setuptools>=68.2.2", 6 | "wheel>=0.41.2", 7 | ] 8 | 9 | [project] 10 | name = "litgpt" 11 | version = "0.5.9.dev1" 12 | description = "Hackable implementation of state-of-the-art open-source LLMs" 13 | readme = "README.md" 14 | license = { file = "LICENSE" } 15 | 16 | authors = [ 17 | { name = "Lightning AI", email = "contact@lightning.ai" }, 18 | ] 19 | classifiers = [ 20 | "Programming Language :: Python :: 3 :: Only", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "Programming Language :: Python :: 3.13", 26 | ] 27 | dependencies = [ 28 | # download models: 29 | "huggingface-hub>=0.23.5", 30 | "jsonargparse[signatures]>=4.30.1,<=4.32.1; python_version<='3.9'", # 4.33 does not seem to be compatible with Python 3.9 31 | "jsonargparse[signatures]>=4.37; python_version>'3.9'", # required to work with python3.12+ 32 | "lightning>=2.5", 33 | "numpy<2", # for older Torch versions 34 | "psutil==7", 35 | "safetensors>=0.4.3", 36 | # tokenization in most models: 37 | "tokenizers>=0.15.2", 38 | "torch>=2.5", 39 | # convert_hf_checkpoint 40 | "tqdm>=4.66", 41 | ] 42 | 43 | optional-dependencies.compiler = [ 44 | # compilaton: 45 | "lightning-thunder>=0.2.0.dev20250119; python_version>='3.10' and sys_platform=='linux'", 46 | ] 47 | optional-dependencies.extra = [ 48 | "bitsandbytes>=0.42,<0.43; sys_platform=='darwin'", 49 | # quantization: 50 | "bitsandbytes>=0.45.2,<0.45.5; sys_platform=='linux' or sys_platform=='win32'", 51 | # litgpt.evaluate: 52 | "datasets>=2.18", 53 | # download: 54 | "huggingface-hub[hf-transfer]>=0.21", 55 | "litdata==0.2.45", 56 | # litgpt.deploy: 57 | "litserve>0.2", 58 | "lm-eval>=0.4.2", 59 | # litgpt.data.prepare_starcoder.py: 60 | "pandas>=1.9", 61 | "pyarrow>=15.0.2", 62 | # litgpt.data: 63 | "requests>=2.31", 64 | # llama-based models: 65 | "sentencepiece>=0.2", 66 | # litgpt.pretrain: 67 | "tensorboard>=2.14", 68 | "torchmetrics>=1.3.1", 69 | "transformers>=4.51.3,<4.52", 70 | # litdata, only on non-Windows: 71 | "uvloop>=0.2; sys_platform!='win32'", 72 | # litgpt.data.prepare_slimpajama.py: 73 | "zstandard>=0.22", 74 | ] 75 | optional-dependencies.test = [ 76 | "einops>=0.7", 77 | "protobuf>=4.23.4", 78 | "pytest>=8.1.1", 79 | 
"pytest-benchmark>=5.1", 80 | "pytest-dependency>=0.6", 81 | "pytest-rerunfailures>=14", 82 | "pytest-timeout>=2.3.1", 83 | ] 84 | urls.documentation = "https://github.com/lightning-AI/litgpt/tutorials" 85 | urls.homepage = "https://github.com/lightning-AI/litgpt" 86 | scripts.litgpt = "litgpt.__main__:main" 87 | 88 | [tool.setuptools.packages.find] 89 | include = [ 90 | "litgpt", 91 | "litgpt.*", 92 | ] 93 | exclude = [ ] 94 | 95 | [tool.setuptools.package-data] 96 | litgpt = [ 97 | "LICENSE", 98 | "README.md", 99 | ] 100 | 101 | [tool.ruff] 102 | target-version = "py38" 103 | line-length = 120 104 | exclude = [ 105 | "build", 106 | "dist", 107 | "docs", 108 | ] 109 | 110 | lint.select = [ 111 | "E", 112 | "F", # see: https://pypi.org/project/pyflakes 113 | "I", # implementation for isort 114 | "UP", # see: https://docs.astral.sh/ruff/rules/#pyupgrade-up 115 | "W", # see: https://pypi.org/project/pycodestyle 116 | ] 117 | #extend-select = [ 118 | # "C4", # see: https://pypi.org/project/flake8-comprehensions 119 | # "PT", # see: https://pypi.org/project/flake8-pytest-style 120 | # "RET", # see: https://pypi.org/project/flake8-return 121 | # "SIM", # see: https://pypi.org/project/flake8-simplify 122 | #] 123 | lint.ignore = [ 124 | "E501", # Line too long 125 | "E731", # Do not assign a lambda expression, use a def 126 | "E741", # todo: Ambiguous variable name 127 | "F841", # todo: Local variable is assigned to but never used 128 | ] 129 | # Use Google-style docstrings. 130 | lint.pydocstyle.convention = "google" 131 | 132 | [tool.codespell] 133 | #skip = '*.py' 134 | quiet-level = 3 135 | ignore-words-list = """ 136 | tral, \ 137 | Rockerfeller 138 | """ 139 | 140 | [tool.pytest.ini_options] 141 | addopts = [ 142 | "--strict-markers", 143 | #"--doctest-modules", 144 | "--color=yes", 145 | "--disable-pytest-warnings", 146 | ] 147 | -------------------------------------------------------------------------------- /tests/convert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tests/convert/__init__.py -------------------------------------------------------------------------------- /tests/convert/test_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | 5 | import torch 6 | 7 | from litgpt.scripts.convert_pretrained_checkpoint import convert_pretrained_checkpoint 8 | 9 | 10 | def test_convert_pretrained_checkpoint(tmp_path, fake_checkpoint_dir): 11 | # Pretend we made a checkpoint from pretraining 12 | pretrained_checkpoint = { 13 | "model": {"some.module.weight": torch.rand(2, 2), "_orig_mod.some.other.module.weight": torch.rand(2, 2)}, 14 | "the_optimizer": "optimizer_state", 15 | "other": 1, 16 | } 17 | torch.save(pretrained_checkpoint, fake_checkpoint_dir / "lit_model.pth") 18 | 19 | convert_pretrained_checkpoint(checkpoint_dir=fake_checkpoint_dir, output_dir=(tmp_path / "converted")) 20 | 21 | assert set(os.listdir(tmp_path / "converted")) == { 22 | "lit_model.pth", 23 | "model_config.yaml", 24 | "tokenizer_config.json", 25 | "tokenizer.json", 26 | } 27 | converted_checkpoint = torch.load(tmp_path / "converted" / "lit_model.pth") 28 | assert list(converted_checkpoint.keys()) == ["some.module.weight", "some.other.module.weight"] 29 | -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tests/data/__init__.py -------------------------------------------------------------------------------- /tests/data/test_alpaca.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from litgpt.data import Alpaca 3 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 4 | 5 | 6 | def test_alpaca(mock_tokenizer, alpaca_path): 7 | alpaca = Alpaca(val_split_fraction=0.5, download_dir=alpaca_path.parent, file_name=alpaca_path.name, num_workers=0) 8 | assert isinstance(alpaca.prompt_style, AlpacaPromptStyle) 9 | alpaca.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 10 | alpaca.prepare_data() 11 | alpaca.setup() 12 | 13 | train_dataloader = alpaca.train_dataloader() 14 | val_dataloader = alpaca.val_dataloader() 15 | 16 | assert len(train_dataloader) == 6 17 | assert len(val_dataloader) == 6 18 | 19 | train_batch = next(iter(train_dataloader)) 20 | val_batch = next(iter(val_dataloader)) 21 | 22 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels", "token_counts"} 23 | for key in ["input_ids", "labels"]: 24 | assert train_batch[key].shape == (2, 10), f"Unexpected shape for train_batch[{key}]" 25 | assert val_batch[key].shape == (2, 10), f"Unexpected shape for val_batch[{key}]" 26 | 27 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 28 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 29 | 30 | # has attributes from super class `LightningDataModule` 31 | assert alpaca.prepare_data_per_node 32 | -------------------------------------------------------------------------------- /tests/data/test_base.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | from typing import Optional 4 | 5 | import pytest 6 | import torch 7 | 8 | from litgpt.data.base import SFTDataset, get_sft_collate_fn 9 | from litgpt.prompts import PromptStyle 10 | 11 | 12 | @pytest.mark.parametrize("mask_prompt", [True, False]) 13 | @pytest.mark.parametrize("ignore_index", [-1, -100]) 14 | @pytest.mark.parametrize("max_seq_length", [1000, 5, -1]) 15 | def test_sft_dataset(max_seq_length, ignore_index, mask_prompt, mock_tokenizer): 16 | class Style(PromptStyle): 17 | def apply(self, prompt: str, *, sys_prompt: Optional[str] = None, **kwargs) -> str: 18 | return f"In: {prompt} Out:" 19 | 20 | i = ignore_index 21 | data = [{"instruction": "Foo", "output": "Bar"}, {"instruction": "Boo", "output": "Ahh"}] 22 | 23 | dataset = SFTDataset( 24 | data=data, 25 | tokenizer=mock_tokenizer, 26 | prompt_style=Style(), 27 | mask_prompt=mask_prompt, 28 | ignore_index=ignore_index, 29 | max_seq_length=max_seq_length, 30 | ) 31 | assert len(dataset) == len(data) 32 | 33 | expected_input_ids = torch.tensor([73, 110, 58, 32, 70, 111, 111, 32, 79, 117, 116, 58, 66, 97, 114, 1]) 34 | # If prompt is not masked, labels == input_ids 35 | expected_labels = ( 36 | torch.tensor([i, i, i, i, i, i, i, i, i, i, i, i, 66, 97, 114, 1]) if mask_prompt else expected_input_ids 37 | ) 38 | 39 | if max_seq_length == -1: 40 | assert torch.equal(dataset[0]["input_ids"], expected_input_ids) 41 | assert torch.equal(dataset[0]["labels"], expected_labels) 42 | else: 43 | assert torch.equal(dataset[0]["input_ids"], expected_input_ids[:max_seq_length]) 44 | assert torch.equal(dataset[0]["labels"], expected_labels[:max_seq_length]) 45 | 46 | 47 | @pytest.mark.parametrize("ignore_index", [-1, -100]) 48 | @pytest.mark.parametrize("pad_id", [0, 100]) 49 | def test_sft_collate_fn_padding(pad_id, ignore_index): 50 | collate = get_sft_collate_fn(pad_id=pad_id, ignore_index=ignore_index) 51 | samples = [ 52 | { 53 | "input_ids": torch.tensor([1, 2, 3]), 54 | "labels": torch.tensor([10, 20, 30]), 55 | "token_counts": {"raw": 3, "raw_plus_prompt_template": 25}, 56 | }, 57 | { 58 | "input_ids": torch.tensor([4, 5, 6, 7, 8]), 59 | "labels": torch.tensor([40, 50, 60, 70, 80]), 60 | "token_counts": {"raw": 5, "raw_plus_prompt_template": 27}, 61 | }, 62 | ] 63 | expected = { 64 | "input_ids": torch.tensor([[1, 2, 3, pad_id, pad_id], [4, 5, 6, 7, 8]]), 65 | "labels": torch.tensor([[10, 20, 30, ignore_index, ignore_index], [40, 50, 60, 70, 80]]), 66 | "token_counts": {"raw": torch.tensor([[3], [5]]), "raw_plus_prompt_template": torch.tensor([[25], [27]])}, 67 | } 68 | batch = collate(samples) 69 | assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) 70 | for key in ("raw", "raw_plus_prompt_template"): 71 | assert torch.equal(batch["token_counts"][key], expected["token_counts"][key]), f"Token count mismatch for {key}" 72 | 73 | 74 | def test_sft_collate_fn_truncation(): 75 | collate = get_sft_collate_fn(max_seq_length=2) 76 | samples = [ 77 | { 78 | "input_ids": torch.tensor([1, 2, 3]), 79 | "labels": torch.tensor([10, 20, 30]), 80 | "token_counts": {"raw": 3, "raw_plus_prompt_template": 25}, 81 | }, 82 | { 83 | "input_ids": torch.tensor([4, 5, 6, 7, 8]), 84 | "labels": torch.tensor([40, 50, 60, 70, 80]), 85 | "token_counts": {"raw": 5, "raw_plus_prompt_template": 27}, 86 | }, 87 | ] 88 | expected = { 89 | "input_ids": torch.tensor([[1, 2], [4, 5]]), 90 | "labels": torch.tensor([[10, 20], [40, 50]]), 91 | "token_counts": {"raw": torch.tensor([[3], [5]]), "raw_plus_prompt_template": 
torch.tensor([[25], [27]])}, 92 | } 93 | batch = collate(samples) 94 | assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) 95 | for key in ("raw", "raw_plus_prompt_template"): 96 | assert torch.equal(batch["token_counts"][key], expected["token_counts"][key]), f"Token count mismatch for {key}" 97 | -------------------------------------------------------------------------------- /tests/data/test_deita.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from unittest import mock 3 | 4 | from litgpt.data import Deita, SFTDataset 5 | from litgpt.data.deita import format_dataset 6 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 7 | 8 | 9 | def test_format_dataset(): 10 | data = [ 11 | { 12 | "prompt": "prompt1", 13 | "prompt_id": "1", 14 | "messages": [ 15 | {"content": "question1", "role": "user"}, 16 | {"content": "response1", "role": "assistant"}, 17 | {"content": "question2", "role": "user"}, 18 | {"content": "response2", "role": "assistant"}, 19 | ], 20 | }, 21 | { 22 | "prompt": "prompt2", 23 | "prompt_id": "2", 24 | "messages": [ 25 | {"content": "question3", "role": "user"}, 26 | {"content": "response3", "role": "assistant"}, 27 | {"content": "question4", "role": "user"}, 28 | {"content": "response4", "role": "assistant"}, 29 | ], 30 | }, 31 | ] 32 | 33 | assert format_dataset(data, include_multi_turn_conversations=False) == [ 34 | {"instruction": "question1", "output": "response1", "input": ""}, 35 | {"instruction": "question3", "output": "response3", "input": ""}, 36 | ] 37 | assert format_dataset(data, include_multi_turn_conversations=True) == [ 38 | {"instruction": "question1", "output": "response1", "input": ""}, 39 | {"instruction": "question2", "output": "response2", "input": ""}, 40 | {"instruction": "question3", "output": "response3", "input": ""}, 41 | {"instruction": "question4", "output": "response4", "input": ""}, 42 | ] 43 | 44 | 45 | @mock.patch("litgpt.data.deita.format_dataset") 46 | @mock.patch("datasets.load_dataset") 47 | def test_deita(_, format_dataset_mock, mock_tokenizer, tmp_path): 48 | format_dataset_mock.return_value = [ 49 | {"instruction": "inst1", "output": "out1"}, 50 | {"instruction": "inst2", "output": "out2"}, 51 | {"instruction": "inst3", "output": "out3"}, 52 | ] 53 | 54 | deita = Deita(num_workers=0, download_dir=tmp_path) 55 | assert isinstance(deita.prompt_style, AlpacaPromptStyle) 56 | deita.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 57 | deita.prepare_data() 58 | deita.setup() 59 | 60 | train_dataloader = deita.train_dataloader() 61 | assert isinstance(train_dataloader.dataset, SFTDataset) 62 | assert len(train_dataloader) == 2 63 | 64 | val_dataloader = deita.val_dataloader() 65 | assert isinstance(val_dataloader.dataset, SFTDataset) 66 | assert len(val_dataloader) == 2 67 | 68 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 69 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 70 | 71 | # has attributes from super class `LightningDataModule` 72 | assert deita.prepare_data_per_node 73 | -------------------------------------------------------------------------------- /tests/data/test_lit_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | import sys 3 | from unittest import mock 4 | from unittest.mock import ANY 5 | 6 | import pytest 7 | 8 | from litgpt.data import LitData 9 | 10 | 11 | @pytest.mark.skipif(sys.platform == "win32", reason="Needs to implement platform agnostic path/url joining") 12 | @mock.patch("litgpt.data.lit_data.LitData._dataloader") 13 | def test_input_dir_and_splits(dl_mock, tmp_path): 14 | with pytest.raises(ValueError, match="If provided `split_names` must be a tuple of two strings"): 15 | LitData(data_path=tmp_path, split_names=("train",)) 16 | 17 | # local dir, no splits 18 | data = LitData(data_path=tmp_path) 19 | data.train_dataloader() 20 | dl_mock.assert_called_with(input_dir=str(tmp_path), train=True) 21 | data.val_dataloader() 22 | dl_mock.assert_called_with(input_dir=str(tmp_path), train=False) 23 | 24 | # local dir, splits 25 | data = LitData(data_path=tmp_path, split_names=("train", "val")) 26 | data.train_dataloader() 27 | dl_mock.assert_called_with(input_dir=str(tmp_path / "train"), train=True) 28 | data.val_dataloader() 29 | dl_mock.assert_called_with(input_dir=str(tmp_path / "val"), train=False) 30 | 31 | # remote dir, splits 32 | data = LitData(data_path="s3://mydataset/data", split_names=("train", "val")) 33 | data.train_dataloader() 34 | dl_mock.assert_called_with(input_dir="s3://mydataset/data/train", train=True) 35 | data.val_dataloader() 36 | dl_mock.assert_called_with(input_dir="s3://mydataset/data/val", train=False) 37 | 38 | 39 | @pytest.mark.skipif(sys.platform == "win32", reason="Needs to implement platform agnostic path/url joining") 40 | @mock.patch("litdata.streaming.StreamingDataset") 41 | @mock.patch("litdata.streaming.StreamingDataLoader") 42 | def test_dataset_args(streaming_dataloader_mock, streaming_dataset_mock, tmp_path): 43 | data = LitData(data_path=tmp_path, seed=1000) 44 | data.train_dataloader() 45 | streaming_dataset_mock.assert_called_with( 46 | input_dir=str(tmp_path), 47 | item_loader=ANY, 48 | shuffle=True, 49 | seed=1000, 50 | ) 51 | streaming_dataloader_mock.assert_called_with( 52 | streaming_dataset_mock(), 53 | batch_size=1, 54 | pin_memory=True, 55 | num_workers=8, 56 | drop_last=True, 57 | ) 58 | -------------------------------------------------------------------------------- /tests/data/test_longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | from litgpt.data import LongForm 3 | from litgpt.prompts import Longform as LongFormPromptStyle 4 | 5 | 6 | def test_longform(mock_tokenizer, longform_path): 7 | longform = LongForm(download_dir=longform_path, num_workers=0) 8 | assert isinstance(longform.prompt_style, LongFormPromptStyle) 9 | longform.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 10 | longform.prepare_data() 11 | longform.setup() 12 | 13 | train_dataloader = longform.train_dataloader() 14 | val_dataloader = longform.val_dataloader() 15 | 16 | assert len(train_dataloader) == 9 17 | assert len(val_dataloader) == 5 18 | 19 | train_batch = next(iter(train_dataloader)) 20 | val_batch = next(iter(val_dataloader)) 21 | 22 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels", "token_counts"} 23 | for key in ["input_ids", "labels"]: 24 | assert train_batch[key].shape == (2, 10), f"Unexpected shape for train_batch[{key}]" 25 | assert val_batch[key].shape == (2, 10), f"Unexpected shape for val_batch[{key}]" 26 | 27 | assert isinstance(train_dataloader.dataset.prompt_style, LongFormPromptStyle) 28 | assert isinstance(val_dataloader.dataset.prompt_style, LongFormPromptStyle) 29 | 30 | # has attributes from super class `LightningDataModule` 31 | assert longform.prepare_data_per_node 32 | -------------------------------------------------------------------------------- /tests/data/test_openwebtext.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import sys 3 | from unittest import mock 4 | from unittest.mock import ANY, call 5 | 6 | import pytest 7 | from litdata.streaming import StreamingDataLoader, StreamingDataset 8 | from torch.utils.data import DataLoader 9 | 10 | from litgpt.data import OpenWebText 11 | 12 | 13 | @pytest.mark.skipif(sys.platform == "win32", reason="Not in the mood to add Windows support right now.") 14 | @mock.patch("litdata.optimize") 15 | @mock.patch("litdata.streaming.dataset.subsample_streaming_dataset", return_value=([], [])) 16 | @mock.patch("datasets.load_dataset") 17 | def test_openwebtext(_, __, optimize_mock, tmp_path, mock_tokenizer): 18 | data = OpenWebText(data_path=(tmp_path / "openwebtext")) 19 | assert data.seq_length == 2048 20 | assert data.batch_size == 1 21 | 22 | data.connect(tokenizer=mock_tokenizer, batch_size=2, max_seq_length=1024) 23 | assert data.seq_length == 1025 24 | assert data.batch_size == 2 25 | 26 | # Data does not exist, preprocess it 27 | data.prepare_data() 28 | optimize_mock.assert_has_calls( 29 | [ 30 | call( 31 | fn=ANY, 32 | num_workers=ANY, 33 | inputs=[], 34 | output_dir=str(tmp_path / "openwebtext" / "train"), 35 | chunk_bytes="200MB", 36 | ), 37 | call( 38 | fn=ANY, 39 | num_workers=ANY, 40 | inputs=[], 41 | output_dir=str(tmp_path / "openwebtext" / "val"), 42 | chunk_bytes="200MB", 43 | ), 44 | ] 45 | ) 46 | optimize_mock.reset_mock() 47 | 48 | # Data exists, already preprocessed 49 | (tmp_path / "openwebtext" / "train").mkdir(parents=True) 50 | (tmp_path / "openwebtext" / "val").mkdir(parents=True) 51 | data.prepare_data() 52 | optimize_mock.assert_not_called() 53 | 54 | data.setup() 55 | 56 | train_dataloader = data.train_dataloader() 57 | assert isinstance(train_dataloader, StreamingDataLoader) 58 | assert isinstance(train_dataloader.dataset, StreamingDataset) 59 | 60 | val_dataloader = data.val_dataloader() 61 | assert isinstance(val_dataloader, DataLoader) 62 | assert isinstance(val_dataloader.dataset, 
StreamingDataset) 63 | 64 | # has attributes from super class `LightningDataModule` 65 | assert data.prepare_data_per_node 66 | -------------------------------------------------------------------------------- /tests/data/test_textfiles.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | from litdata import TokensLoader, optimize 5 | from torch.utils._pytree import tree_map 6 | 7 | from litgpt.data.text_files import TextFiles 8 | 9 | 10 | class Tokenizer: 11 | bos_id = 0 12 | 13 | def encode(self, text, bos, eos): 14 | assert bos 15 | assert not eos 16 | return [self.bos_id] + [ord(c) for c in text] 17 | 18 | 19 | def tokenize(data): 20 | for story in data: 21 | yield torch.tensor(story) 22 | 23 | 24 | def fake_chunk(path, data): 25 | optimize( 26 | fn=tokenize, 27 | inputs=[data] * len(data), 28 | output_dir=str(path), 29 | num_workers=1, 30 | chunk_bytes="200MB", 31 | item_loader=TokensLoader(), 32 | ) 33 | 34 | 35 | def test_textfiles_datamodule(tmp_path): 36 | from litgpt.data.text_files import TextFiles 37 | 38 | data_dir = tmp_path / "textfiles" 39 | datamodule = TextFiles(train_data_path=data_dir, num_workers=1) 40 | datamodule.connect(max_seq_length=2, tokenizer=Tokenizer()) 41 | 42 | # simulate `datamodule.prepare_data` 43 | train_data_dir = data_dir / "train" 44 | train_data_dir.mkdir(parents=True) 45 | fake_chunk(train_data_dir, [[12], [0, 23, 15, 63, 0], [73, 5, 0, 1, 1999, 0, 13]]) 46 | datamodule.setup() 47 | 48 | tr_dataloader = datamodule.train_dataloader() 49 | tr_dataloader.shuffle = False 50 | 51 | actual = tree_map(torch.Tensor.tolist, list(tr_dataloader)) 52 | 53 | # there is 1 sample per index in the data (13) 54 | assert actual == [ 55 | [[73, 5, 0]], 56 | [[12, 0, 23]], 57 | [[5, 0, 1]], 58 | [[0, 73, 5]], 59 | [[1999, 0, 13]], 60 | [[0, 1, 1999]], 61 | [[1, 1999, 0]], 62 | [[0, 23, 15]], 63 | [[13, 12, 0]], 64 | [[63, 0, 73]], 65 | [[23, 15, 63]], 66 | [[15, 63, 0]], 67 | [[0, 13, 12]], 68 | ] 69 | 70 | 71 | class MockTokenizer: 72 | bos_id = 0 73 | eos_id = 1 74 | use_bos = True 75 | 76 | def encode(self, text, bos=True, eos=False, device=None, max_length=-1): 77 | # Simple: map each character to its ordinal + 2 78 | tokens = [ord(c) + 2 for c in text] 79 | if bos: 80 | tokens = [self.bos_id] + tokens 81 | if eos: 82 | tokens.append(self.eos_id) 83 | if max_length > 0: 84 | tokens = tokens[:max_length] 85 | return torch.tensor(tokens, dtype=torch.long, device=device) 86 | 87 | def decode(self, tensor): 88 | ids = tensor.tolist() if tensor.ndim > 0 else [tensor.item()] 89 | chars = [] 90 | for tid in ids: 91 | if tid == self.bos_id: 92 | chars.append("") 93 | elif tid == self.eos_id: 94 | chars.append("") 95 | else: 96 | chars.append(chr(tid - 2)) 97 | return "".join(chars) 98 | 99 | def decode_stream(self, token_stream, device=None): 100 | for token in token_stream: 101 | yield self.decode(token) 102 | 103 | @property 104 | def vocab_size(self): 105 | return 130 106 | 107 | 108 | def test_textfiles_token_loader(tmp_path): 109 | # Create the directory for text files 110 | data_dir = tmp_path / "textfiles" 111 | data_dir.mkdir(parents=True, exist_ok=True) 112 | 113 | # Write sample training data to the directory 114 | sample_texts = ["hello world", "foo bar", "lorem ipsum"] 115 | for i, text in enumerate(sample_texts): 116 | (data_dir / f"{i}.txt").write_text(text) 117 | 118 | datamodule = TextFiles(train_data_path=data_dir, num_workers=1) 119 | datamodule.connect(max_seq_length=2, 
tokenizer=MockTokenizer()) 120 | datamodule.prepare_data() 121 | 122 | # ensure training set uses tokens loader 123 | index_json = data_dir / "train" / "index.json" 124 | assert index_json.exists() 125 | meta = json.loads(index_json.read_text()) 126 | assert meta["config"]["item_loader"] == "TokensLoader" 127 | 128 | # ensure validation set uses tokens loader 129 | index_json = data_dir / "val" / "index.json" 130 | assert index_json.exists() 131 | meta = json.loads(index_json.read_text()) 132 | assert meta["config"]["item_loader"] == "TokensLoader" 133 | -------------------------------------------------------------------------------- /tests/data/test_tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from unittest import mock 3 | 4 | import pytest 5 | from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset 6 | from torch.utils.data import DataLoader 7 | 8 | from litgpt.data import TinyLlama 9 | 10 | 11 | @mock.patch("litdata.streaming.dataset.subsample_streaming_dataset", return_value=([], [])) 12 | def test_tinyllama(_, tmp_path): 13 | data = TinyLlama(data_path=(tmp_path / "data")) 14 | assert data.seq_length == 2048 15 | assert data.batch_size == 1 16 | 17 | data.connect(batch_size=2, max_seq_length=1024) 18 | assert data.seq_length == 1025 19 | assert data.batch_size == 2 20 | 21 | with pytest.raises(FileNotFoundError, match="The directory .*data/slimpajama/train does not exist"): 22 | data.prepare_data() 23 | 24 | (tmp_path / "data" / "slimpajama" / "train").mkdir(parents=True) 25 | (tmp_path / "data" / "slimpajama" / "val").mkdir(parents=True) 26 | (tmp_path / "data" / "starcoder").mkdir(parents=True) 27 | 28 | data.prepare_data() 29 | data.setup() 30 | 31 | train_dataloader = data.train_dataloader() 32 | assert isinstance(train_dataloader, StreamingDataLoader) 33 | assert isinstance(train_dataloader.dataset, CombinedStreamingDataset) 34 | 35 | val_dataloader = data.val_dataloader() 36 | assert isinstance(val_dataloader, DataLoader) 37 | assert isinstance(val_dataloader.dataset, StreamingDataset) 38 | 39 | # has attributes from super class `LightningDataModule` 40 | assert data.prepare_data_per_node 41 | -------------------------------------------------------------------------------- /tests/data/test_tinystories.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | import torch 5 | from litdata import optimize 6 | from litdata.streaming import StreamingDataset, TokensLoader 7 | from torch.utils._pytree import tree_map 8 | 9 | 10 | def tokenize(data): 11 | for story in data: 12 | yield torch.tensor(story) 13 | 14 | 15 | def fake_chunk(path, data): 16 | optimize( 17 | fn=tokenize, 18 | inputs=[data] * len(data), 19 | output_dir=str(path), 20 | num_workers=1, 21 | chunk_bytes="200MB", 22 | item_loader=TokensLoader(), 23 | ) 24 | 25 | 26 | @pytest.mark.parametrize( 27 | ("max_seq_len", "expected"), 28 | [ 29 | (2, [[0, 23, 15], [63, 0, 73], [5, 0, 1], [1999, 0, 13]]), 30 | (5, [[0, 23, 15, 63, 0, 73], [5, 0, 1, 1999, 0, 13]]), 31 | (6, [[0, 23, 15, 63, 0, 73, 5]]), 32 | (7, [[0, 23, 15, 63, 0, 73, 5, 0]]), 33 | ], 34 | ) 35 | def test_pretok_dataset(tmp_path, max_seq_len, expected): 36 | fake_data = [0, 23, 15, 63, 0, 73, 5, 0, 1, 1999, 0, 13] 37 | assert len(fake_data) == 12 38 | fake_chunk(tmp_path, [fake_data]) 39 | 40 | dataset = 
StreamingDataset( 41 | input_dir=str(tmp_path), item_loader=TokensLoader(block_size=max_seq_len + 1), shuffle=False, drop_last=False 42 | ) 43 | actual = tree_map(torch.Tensor.tolist, list(dataset)) 44 | assert actual == expected 45 | 46 | 47 | def test_tokenize(tmp_path, monkeypatch): 48 | from litgpt.data.tinystories import tokenize 49 | 50 | story1, story2 = "foo bar", " fun " 51 | data = [{"story": story1}, {"story": story2}] 52 | shard_path = tmp_path / "data.json" 53 | with open(shard_path, "w", encoding="utf-8") as f: 54 | json.dump(data, f) 55 | 56 | class Tokenizer: 57 | bos_id = 0 58 | 59 | def encode(self, text, bos, eos): 60 | assert bos 61 | assert not eos 62 | return [self.bos_id] + [ord(c) for c in text] 63 | 64 | monkeypatch.setenv("DATA_OPTIMIZER_GLOBAL_RANK", "0") 65 | monkeypatch.setenv("DATA_OPTIMIZER_NUM_WORKERS", "1") 66 | data = tokenize(str(shard_path), Tokenizer()) 67 | assert list(data) == [[0, 102, 111, 111, 32, 98, 97, 114], [0, 102, 117, 110]] 68 | 69 | 70 | def test_tinystories_datamodule(tmp_path): 71 | from litgpt.data.tinystories import TinyStories 72 | 73 | data_dir = tmp_path / "tinystories" 74 | 75 | datamodule = TinyStories(data_dir, seed=42, num_workers=1) 76 | datamodule.connect(max_seq_length=2) 77 | 78 | # simulate `datamodule.prepare_data` 79 | train_data_dir = data_dir / "train" 80 | train_data_dir.mkdir(parents=True) 81 | fake_chunk(train_data_dir, [[12], [0, 23, 15, 63, 0], [73, 5, 0, 1, 1999, 0, 13]]) 82 | 83 | datamodule.setup() 84 | 85 | tr_dataloader = datamodule.train_dataloader() 86 | tr_dataloader.shuffle = False 87 | 88 | actual = tree_map(torch.Tensor.tolist, list(tr_dataloader)) 89 | 90 | # there is 1 sample per index in the data (13) 91 | assert actual == [ 92 | [[73, 5, 0]], 93 | [[12, 0, 23]], 94 | [[5, 0, 1]], 95 | [[0, 73, 5]], 96 | [[1999, 0, 13]], 97 | [[0, 1, 1999]], 98 | [[1, 1999, 0]], 99 | [[0, 23, 15]], 100 | [[13, 12, 0]], 101 | [[63, 0, 73]], 102 | [[23, 15, 63]], 103 | [[15, 63, 0]], 104 | [[0, 13, 12]], 105 | ] 106 | -------------------------------------------------------------------------------- /tests/ext_thunder/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # support running without installing as a package, adding extensions to the Python path 5 | wd = Path(__file__).parent.parent.parent.resolve() 6 | if wd.is_dir(): 7 | sys.path.append(str(wd)) 8 | else: 9 | import warnings 10 | 11 | warnings.warn(f"Could not find extensions directory at {wd}") 12 | -------------------------------------------------------------------------------- /tests/ext_thunder/test_thunder_networks.py: -------------------------------------------------------------------------------- 1 | """Run thunder tests as part of LitGPT CI""" 2 | 3 | from litgpt.utils import _THUNDER_AVAILABLE 4 | 5 | if _THUNDER_AVAILABLE: 6 | from thunder.tests.test_networks import * # noqa: F403 7 | else: 8 | print("Skipping test_thunder_networks.py (thunder not available)") 9 | -------------------------------------------------------------------------------- /tests/ext_thunder/test_thunder_pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import redirect_stdout 3 | from io import StringIO 4 | from unittest.mock import Mock 5 | 6 | import torch 7 | from torch.utils.data import DataLoader 8 | 9 | from litgpt import Config 10 | from litgpt.args import EvalArgs, TrainArgs 11 | from litgpt.utils import 
_THUNDER_AVAILABLE, _RunIf 12 | 13 | if _THUNDER_AVAILABLE: 14 | import extensions.thunder.pretrain as thunder_pretrain 15 | 16 | 17 | @_RunIf(min_cuda_gpus=1, thunder=True) 18 | def test_pretrain_thunder(tmp_path, monkeypatch): 19 | model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 20 | 21 | dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) 22 | dataloader = DataLoader(dataset) 23 | monkeypatch.setattr(thunder_pretrain, "get_dataloaders", Mock(return_value=(dataloader, dataloader))) 24 | monkeypatch.setattr(thunder_pretrain, "save_hyperparameters", Mock()) 25 | 26 | out_dir = tmp_path / "out" 27 | stdout = StringIO() 28 | with redirect_stdout(stdout): 29 | thunder_pretrain.setup( 30 | devices=1, 31 | model_config=model_config, 32 | out_dir=out_dir, 33 | train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), 34 | eval=EvalArgs(interval=1, max_iters=1), 35 | optimizer="AdamW", 36 | ) 37 | 38 | out_dir_contents = set(os.listdir(out_dir)) 39 | checkpoint_dirs = {"step-00000001", "step-00000002", "step-00000003", "step-00000004"} 40 | assert checkpoint_dirs.issubset(out_dir_contents) 41 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 42 | for checkpoint_dir in checkpoint_dirs: 43 | # the `tokenizer_dir` is None by default, so only 'lit_model.pth' shows here 44 | assert set(os.listdir(out_dir / checkpoint_dir)) == {"lit_model.pth", "model_config.yaml"} 45 | 46 | assert (out_dir / "logs" / "tensorboard" / "version_0").is_dir() 47 | 48 | logs = stdout.getvalue() 49 | assert logs.count("(step)") == 4 50 | assert logs.count("val loss") == 4 51 | assert "Total parameters: 1,888" in logs 52 | -------------------------------------------------------------------------------- /tests/generate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tests/generate/__init__.py -------------------------------------------------------------------------------- /tests/generate/test_adapter.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | import re 5 | import subprocess 6 | import sys 7 | from contextlib import redirect_stderr, redirect_stdout 8 | from io import StringIO 9 | from unittest.mock import ANY, Mock, call 10 | 11 | import pytest 12 | import torch 13 | import yaml 14 | 15 | skip_in_ci_on_macos = pytest.mark.skipif( 16 | sys.platform == "darwin" and os.getenv("GITHUB_ACTIONS") == "true", 17 | reason="Skipped on macOS in CI environment because CI machine does not have enough memory to run this test.", 18 | ) 19 | 20 | 21 | @skip_in_ci_on_macos 22 | @pytest.mark.parametrize("version", ("v1", "v2")) 23 | def test_main(fake_checkpoint_dir, monkeypatch, version, tensor_like): 24 | if version == "v1": 25 | import litgpt.generate.adapter as generate 26 | else: 27 | import litgpt.generate.adapter_v2 as generate 28 | 29 | config_path = fake_checkpoint_dir / "model_config.yaml" 30 | config = {"block_size": 128, "vocab_size": 50, "n_layer": 2, "n_head": 4, "n_embd": 8, "rotary_percentage": 1} 31 | config_path.write_text(yaml.dump(config)) 32 | 33 | monkeypatch.setattr(generate, "lazy_load", Mock()) 34 | monkeypatch.setattr(generate.GPT, "load_state_dict", Mock()) 35 | tokenizer_mock = Mock() 36 | tokenizer_mock.return_value.encode.return_value = torch.tensor([[1, 2, 3]]) 37 | tokenizer_mock.return_value.decode.return_value = "### Response:foo bar baz" 38 | monkeypatch.setattr(generate, "Tokenizer", tokenizer_mock) 39 | generate_mock = Mock() 40 | generate_mock.return_value = torch.tensor([[3, 2, 1]]) 41 | monkeypatch.setattr(generate, "generate", generate_mock) 42 | 43 | num_samples = 1 44 | out, err = StringIO(), StringIO() 45 | with redirect_stdout(out), redirect_stderr(err): 46 | generate.main(temperature=2.0, top_k=2, top_p=0.9, checkpoint_dir=fake_checkpoint_dir) 47 | 48 | assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples 49 | assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) 50 | assert ( 51 | generate_mock.mock_calls 52 | == [call(ANY, tensor_like, 101, temperature=2.0, top_k=2, top_p=0.9, eos_id=ANY)] * num_samples 53 | ) 54 | 55 | expected_output = "foo bar baz\n" * num_samples 56 | # Allow for the config to be printed before the expected repeated strings. 
57 | pattern = rf".*^{re.escape(expected_output.strip())}$.*" 58 | assert re.match(pattern, out.getvalue().strip(), re.DOTALL | re.MULTILINE) 59 | 60 | err_value = err.getvalue() 61 | expected_parts = [ 62 | "'padded_vocab_size': 512", 63 | "'n_layer': 2", 64 | "'n_head': 4", 65 | "'head_size': 2", 66 | "'n_embd': 8", 67 | ] 68 | assert all(part in err_value for part in expected_parts) 69 | 70 | 71 | @pytest.mark.parametrize("version", ("", "_v2")) 72 | def test_cli(version): 73 | args = ["litgpt", f"generate_adapter{version}", "-h"] 74 | output = subprocess.check_output(args) 75 | output = str(output.decode()) 76 | assert "For models finetuned with" in output 77 | -------------------------------------------------------------------------------- /tests/generate/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | def find_forward_hooks(module): 5 | mapping = defaultdict(list) 6 | for name, submodule in module.named_modules(): 7 | for hook in submodule._forward_pre_hooks.values(): 8 | hook_data = ("forward_pre_hook", hook.func.__name__, hook.args, hook.keywords) 9 | mapping[name].append(hook_data) 10 | for hook in submodule._forward_hooks.values(): 11 | hook_data = ("forward_hook", hook.func.__name__, hook.args, hook.keywords) 12 | mapping[name].append(hook_data) 13 | return dict(mapping) 14 | -------------------------------------------------------------------------------- /tests/test_args.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import pytest 3 | 4 | from litgpt.args import TrainArgs 5 | 6 | 7 | def test_compute_warmup_iters(): 8 | # warmup disabled 9 | train = TrainArgs(lr_warmup_steps=0, lr_warmup_fraction=0) 10 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=1000, train_dataloader=range(10)) == 0 11 | 12 | # lr_warmup_steps and lr_warmup_fraction both are not allowed 13 | with pytest.raises(ValueError, match="Can't provide both `--train.lr_warmup_fraction`"): 14 | TrainArgs(lr_warmup_steps=1, lr_warmup_fraction=0.2) 15 | 16 | # lr_warmup_fraction invalid range 17 | with pytest.raises(ValueError, match=" must be between 0 and 1"): 18 | TrainArgs(lr_warmup_steps=0, lr_warmup_fraction=1.1) 19 | 20 | # lr_warmup_steps 21 | train = TrainArgs(global_batch_size=1, micro_batch_size=1, lr_warmup_steps=100, lr_warmup_fraction=0) 22 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=1000, train_dataloader=range(10)) == 100 23 | # lr_warmup_steps multiplied by accumulation factor 24 | train.global_batch_size = 4 25 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=1000, train_dataloader=range(10)) == 400 26 | assert train.warmup_iters(devices=2, num_nodes=1, max_iters=1000, train_dataloader=range(10)) == 200 27 | # lr_warmup_steps truncated by max iters 28 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=120, train_dataloader=range(10)) == 120 29 | 30 | # lr_warmup_fraction 31 | train = TrainArgs(global_batch_size=1, micro_batch_size=1, lr_warmup_steps=0, lr_warmup_fraction=0.3) 32 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=1000, train_dataloader=range(100)) == 30 33 | # lr_warmup_fraction truncated by max iters 34 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=20, train_dataloader=range(100)) == 20 35 | # lr_warmup_fraction rounds up 36 | assert train.warmup_iters(devices=1, num_nodes=1, 
max_iters=1000, train_dataloader=range(5)) == 2 37 | -------------------------------------------------------------------------------- /tests/test_ci.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE 4 | 5 | from litgpt.utils import _RunIf 6 | 7 | 8 | @_RunIf(min_cuda_gpus=1) 9 | def test_gpu_ci_installs_bitsandbytes(): 10 | assert _BITSANDBYTES_AVAILABLE, str(_BITSANDBYTES_AVAILABLE) 11 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from contextlib import redirect_stdout 3 | from io import StringIO 4 | from unittest import mock 5 | 6 | import pytest 7 | from packaging.version import Version 8 | 9 | from litgpt.__main__ import main 10 | 11 | 12 | def test_cli(): 13 | out = StringIO() 14 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "-h"]): 15 | main() 16 | out = out.getvalue() 17 | assert "usage: litgpt" in out 18 | assert ( 19 | "{download,chat,finetune,finetune_lora,finetune_full,finetune_adapter,finetune_adapter_v2," 20 | "pretrain,generate,generate_full,generate_adapter,generate_adapter_v2,generate_sequentially," 21 | "generate_speculatively,generate_tp,convert_to_litgpt,convert_from_litgpt,convert_pretrained_checkpoint," 22 | "merge_lora,evaluate,serve}" in out 23 | ) 24 | assert ( 25 | """Available subcommands: 26 | download Download weights or tokenizer data from the Hugging 27 | Face Hub. 28 | chat Chat with a model.""" 29 | in out 30 | ) 31 | assert """evaluate Evaluate a model with the LM Evaluation Harness.""" in out 32 | assert """serve Serve a LitGPT model using LitServe.""" in out 33 | out = StringIO() 34 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "finetune_lora", "-h"]): 35 | main() 36 | out = out.getvalue() 37 | assert ( 38 | """--lora_alpha LORA_ALPHA 39 | The LoRA alpha. (type: int, default: 16)""" 40 | in out 41 | ) 42 | 43 | if Version(f"{sys.version_info.major}.{sys.version_info.minor}") < Version("3.9"): 44 | # python 3.8 prints `Union[int, null]` instead of `Optional[int]` 45 | return 46 | 47 | out = StringIO() 48 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "pretrain", "-h"]): 49 | main() 50 | out = out.getvalue() 51 | print(out) 52 | assert ( 53 | """--train.max_tokens MAX_TOKENS 54 | Total number of tokens to train on (type: 55 | Optional[int], default: 3000000000000)""" 56 | in out 57 | ) 58 | 59 | 60 | def test_rewrite_finetune_command(): 61 | out1 = StringIO() 62 | with pytest.raises(SystemExit), redirect_stdout(out1), mock.patch("sys.argv", ["litgpt", "fineune", "-h"]): 63 | main() 64 | out2 = StringIO() 65 | with pytest.raises(SystemExit), redirect_stdout(out2), mock.patch("sys.argv", ["litgpt", "fineune_lora", "-h"]): 66 | main() 67 | assert out1.getvalue() == out2.getvalue() 68 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import pytest 4 | import yaml 5 | 6 | import litgpt.config as config_module 7 | from litgpt import Config 8 | from litgpt.config import find_multiple 9 | 10 | 11 | def test_config(): 12 | config = Config() 13 | assert config.name == "" 14 | assert config.block_size == 4096 15 | 16 | config = Config(block_size=2048) 17 | assert config.block_size == 2048 18 | 19 | config = Config.from_name("pythia-14m") 20 | assert config.block_size == 512 21 | 22 | config = Config.from_name("pythia-14m", block_size=4096) 23 | assert config.block_size == 4096 24 | 25 | config = Config(hf_config={"name": "pythia-14m"}) 26 | assert config.name == "pythia-14m" 27 | 28 | 29 | def test_from_hf_name(): 30 | # by short-hand name 31 | config0 = Config.from_name("tiny-llama-1.1b") 32 | # or by huggingface hub repo name 33 | config1 = Config.from_name("TinyLlama-1.1B-intermediate-step-1431k-3T") 34 | assert config0 is not None 35 | assert config1 is not None 36 | assert config0 == config1 37 | 38 | 39 | def test_nonexisting_name(): 40 | with pytest.raises(ValueError, match="'invalid-model-name' is not a supported config name"): 41 | Config.from_name("invalid-model-name") 42 | 43 | 44 | @pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs]) 45 | def test_short_and_hf_names_are_equal_unless_on_purpose(config): 46 | # by short-hand name 47 | config0 = Config.from_name(config["name"]) 48 | # or by huggingface hub repo name 49 | config1 = Config.from_name(config["hf_config"]["name"]) 50 | assert config0.name == config1.name 51 | 52 | 53 | def test_from_hf_name_with_org_string(): 54 | # Test case 1: valid input 55 | config0 = Config.from_name("tiny-llama-1.1b") 56 | config1 = Config.from_name("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T") 57 | assert config0 is not None 58 | assert config1 is not None 59 | assert config0 == config1 60 | 61 | # Test case 2: invalid input - org not found 62 | with pytest.raises( 63 | ValueError, match="'UnknownOrg/TinyLlama-1.1B-intermediate-step-1431k-3T' is not a supported config name" 64 | ): 65 | Config.from_name("UnknownOrg/TinyLlama-1.1B-intermediate-step-1431k-3T") 66 | 67 | # Test case 3: invalid input - name not found 68 | with pytest.raises(ValueError, match="'TinyLlama/TinyLlama-XYZ' is not a supported config name"): 69 | Config.from_name("TinyLlama/TinyLlama-XYZ") 70 | 71 | 72 | def test_from_checkpoint(tmp_path): 73 | # 1. Neither `lit_config.py` nor matching config exists. 74 | with pytest.raises(FileNotFoundError, match="neither 'model_config.yaml' nor matching config exists"): 75 | Config.from_checkpoint(tmp_path / "non_existing_checkpoint") 76 | 77 | # 2. If `lit_config.py` doesn't exists, but there is a matching config in `litgpt/config.py`. 78 | config = Config.from_checkpoint(tmp_path / "pythia-14m") 79 | assert config.name == "pythia-14m" 80 | assert config.block_size == 512 81 | assert config.n_layer == 6 82 | 83 | # 3. If only `lit_config.py` exists. 84 | config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2} 85 | with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file: 86 | yaml.dump(config_data, file) 87 | config = Config.from_checkpoint(tmp_path) 88 | assert config.name == "pythia-14m" 89 | assert config.block_size == 24 90 | assert config.n_layer == 2 91 | 92 | # 4. 
Both `lit_config.py` and a matching config exist, but `lit_config.py` supersedes matching config 93 | (tmp_path / "pythia-14m").mkdir() 94 | with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file: 95 | yaml.dump(config_data, file) 96 | config = Config.from_checkpoint(tmp_path / "pythia-14m") 97 | assert config.name == "pythia-14m" 98 | assert config.block_size == 24 99 | assert config.n_layer == 2 100 | 101 | 102 | @pytest.mark.parametrize("head_size", [None, 128]) 103 | def test_head_size(head_size): 104 | config = Config(head_size) 105 | 106 | assert config.head_size == head_size or config.n_embd // config.n_head 107 | 108 | 109 | def test_find_multiple(): 110 | assert find_multiple(17, 5) == 20 111 | assert find_multiple(30, 7) == 35 112 | assert find_multiple(10, 2) == 10 113 | assert find_multiple(5, 10) == 10 114 | assert find_multiple(50254, 128) == 50304 115 | assert find_multiple(50254, 256) == 50432 116 | assert find_multiple(50254, 512) == 50688 117 | -------------------------------------------------------------------------------- /tests/test_config_hub.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | from pathlib import Path 4 | from unittest import mock 5 | from unittest.mock import Mock 6 | 7 | import pytest 8 | from lightning.fabric.plugins import Precision 9 | 10 | from litgpt import Config 11 | from litgpt.utils import CLI 12 | 13 | fixed_pairs = [ 14 | ("litgpt/pretrain.py", "pretrain/debug.yaml"), 15 | ("litgpt/pretrain.py", "pretrain/tinyllama.yaml"), 16 | ("litgpt/pretrain.py", "pretrain/tinystories.yaml"), 17 | ( 18 | "litgpt/pretrain.py", 19 | "https://raw.githubusercontent.com/Lightning-AI/litgpt/4d55ab6d0aa404f0da0d03a80a8801ed60e07e83/config_hub/pretrain/tinystories.yaml", # TODO: Update with path from main after merge 20 | ), 21 | ] 22 | 23 | config_hub_path = Path(__file__).parent.parent / "config_hub" / "finetune" 24 | model_pairs = [] 25 | 26 | for model_dir in config_hub_path.iterdir(): 27 | if model_dir.is_dir(): 28 | model_name = model_dir.name 29 | for yaml_file in model_dir.glob("*.yaml"): 30 | config_name = yaml_file.stem 31 | python_file = "litgpt/finetune/full.py" if config_name == "full" else "litgpt/finetune/lora.py" 32 | relative_yaml_path = yaml_file.relative_to(config_hub_path.parent) 33 | model_pairs.append((python_file, str(relative_yaml_path))) 34 | 35 | all_pairs = fixed_pairs + model_pairs 36 | 37 | 38 | @pytest.mark.parametrize(("script_file", "config_file"), all_pairs) 39 | def test_config_help(script_file, config_file, monkeypatch): 40 | """Test that configs validate against the signature in the scripts.""" 41 | script_file = Path(__file__).parent.parent / script_file 42 | assert script_file.is_file() 43 | if "http" not in str(config_file): 44 | config_file = Path(__file__).parent.parent / "config_hub" / config_file 45 | assert config_file.is_file() 46 | 47 | spec = importlib.util.spec_from_file_location(str(script_file.parent.name), script_file) 48 | module = importlib.util.module_from_spec(spec) 49 | spec.loader.exec_module(module) 50 | 51 | monkeypatch.setattr(module, "main", Mock()) 52 | monkeypatch.setattr(module, "Tokenizer", Mock()) 53 | monkeypatch.setattr(module, "BitsandbytesPrecision", Mock(return_value=Precision()), raising=False) 54 | monkeypatch.setattr(module, "Config", Mock(return_value=Config.from_name("pythia-14m"))) 55 | monkeypatch.setattr(module, "check_valid_checkpoint_dir", Mock(), raising=False) 56 | 
57 | try: 58 | with mock.patch("sys.argv", [script_file.name, "--config", str(config_file), "--devices", "1"]): 59 | CLI(module.setup) 60 | module.main.assert_called_once() 61 | except FileNotFoundError: 62 | pass 63 | # FileNotFound occurs here because we have not downloaded the model weights referenced in the config files 64 | # which is ok because here we just want to validate the config file itself. 65 | -------------------------------------------------------------------------------- /tests/test_distributed.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from lightning import Fabric 4 | 5 | from litgpt.utils import _RunIf 6 | 7 | 8 | @_RunIf(min_cuda_gpus=2, standalone=True) 9 | @pytest.mark.parametrize("strategy", ["ddp", "fsdp"]) 10 | def test_no_backward_sync(strategy): 11 | fabric = Fabric(devices=2, accelerator="cuda", strategy=strategy) 12 | fabric.launch() 13 | 14 | # account for sharding in the case of FSDP 15 | out_features = 1 if "ddp" in strategy else fabric.world_size 16 | 17 | model = torch.nn.Linear(1, out_features, bias=False, device=fabric.device) 18 | x = torch.randn(1, 1, device=fabric.device) 19 | model = fabric.setup(model) 20 | 21 | # 6 iters, 3 grad accumulation iters 22 | for i, enabled in enumerate((True, True, False, True, True, False), 1): 23 | x = torch.tensor([i * (fabric.local_rank + 1)], device=fabric.device, dtype=torch.float32) 24 | 25 | with fabric.no_backward_sync(model, enabled): 26 | y = model(x) 27 | fabric.backward(y.sum()) 28 | if not enabled: 29 | # Math for the first 3 iters 30 | # 31 | # DistributedDataParallel 32 | # (1*1+2*1+3*1 + 1*2+2*2+3*2) / 2 = 9 33 | # ^^^^^^^^^^^ ^^^^^^^^^^^ ^^^ 34 | # rank0 rank1 allreduce 35 | # 36 | # thunder.distributed.ddp 37 | # ((1*1+2*1) + (1*2+2*2)) / 2 + (3*1 + 3*2) / 2 = 9 38 | # ^^^^^^^ ^^^^^^^ ^^^ ^^^ ^^^ ^^^ 39 | # rank0 rank1 allreduce1 rank0 rank1 allreduce2 40 | assert model.weight.grad.shape.numel() == 1, model.weight.grad.shape 41 | assert model.weight.grad.item() == (9.0 if i == 3 else 22.5) 42 | model.weight.grad = None 43 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import subprocess 4 | from contextlib import redirect_stdout 5 | from dataclasses import asdict 6 | from io import StringIO 7 | from unittest import mock 8 | 9 | import pytest 10 | import torch 11 | import yaml 12 | 13 | import litgpt.eval.evaluate as module 14 | from litgpt import GPT, Config 15 | from litgpt.scripts.download import download_from_hub 16 | 17 | 18 | @pytest.mark.flaky(reruns=3) 19 | def test_evaluate_script(tmp_path): 20 | ours_config = Config.from_name("pythia-14m") 21 | download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) 22 | checkpoint_dir = tmp_path / "EleutherAI" / "pythia-14m" 23 | ours_model = GPT(ours_config) 24 | torch.save(ours_model.state_dict(), checkpoint_dir / "lit_model.pth") 25 | with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: 26 | yaml.dump(asdict(ours_config), fp) 27 | 28 | stdout = StringIO() 29 | with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): 30 | with pytest.raises(ValueError) as excinfo: 31 | module.convert_and_evaluate( 32 | checkpoint_dir, 33 | out_dir=tmp_path / "out_dir", 34 | device=None, 35 | dtype=torch.float32, 36 | limit=5, 37 | tasks="logiqa", 38 | batch_size=0, # Test for non-positive integer 39 | ) 40 | assert "batch_size must be a positive integer, 'auto', or in the format 'auto:N'." in str(excinfo.value) 41 | 42 | with pytest.raises(ValueError) as excinfo: 43 | module.convert_and_evaluate( 44 | checkpoint_dir, 45 | out_dir=tmp_path / "out_dir", 46 | device=None, 47 | dtype=torch.float32, 48 | limit=5, 49 | tasks="logiqa", 50 | batch_size="invalid", # Test for invalid string 51 | ) 52 | assert "batch_size must be a positive integer, 'auto', or in the format 'auto:N'." in str(excinfo.value) 53 | 54 | stdout = StringIO() 55 | with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): 56 | module.convert_and_evaluate( 57 | checkpoint_dir, 58 | out_dir=tmp_path / "out_dir", 59 | device=None, 60 | dtype=torch.float32, 61 | limit=5, 62 | tasks="logiqa", 63 | batch_size=1, # Valid case 64 | ) 65 | stdout = stdout.getvalue() 66 | assert (tmp_path / "out_dir" / "results.json").is_file() 67 | assert "logiqa" in stdout 68 | assert "Metric" in stdout 69 | assert "Loading checkpoint shards" not in stdout 70 | 71 | 72 | def test_cli(): 73 | args = ["litgpt", "evaluate", "-h"] 74 | output = subprocess.check_output(args) 75 | output = str(output.decode()) 76 | assert "Evaluate a model with the LM Evaluation Harness" in output 77 | -------------------------------------------------------------------------------- /tests/test_full.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | from contextlib import redirect_stdout 5 | from io import StringIO 6 | from unittest import mock 7 | from unittest.mock import Mock 8 | 9 | import torch 10 | import yaml 11 | 12 | import litgpt.finetune.full as module 13 | from litgpt.args import EvalArgs, TrainArgs 14 | from litgpt.data import Alpaca 15 | 16 | 17 | @mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) 18 | def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path): 19 | model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 20 | (fake_checkpoint_dir / "model_config.yaml").write_text(yaml.dump(model_config)) 21 | monkeypatch.setattr(module, "load_checkpoint", Mock()) 22 | 23 | tokenizer_mock = Mock() 24 | tokenizer_mock.return_value = tokenizer_mock 25 | tokenizer_mock.encode = lambda *_, **__: torch.tensor([3, 2, 1]) 26 | monkeypatch.setattr(module, "Tokenizer", tokenizer_mock) 27 | 28 | out_dir = tmp_path / "out" 29 | setup_args = (fake_checkpoint_dir,) 30 | setup_kwargs = dict( 31 | data=Alpaca(download_dir=alpaca_path.parent, file_name=alpaca_path.name, val_split_fraction=0.5, num_workers=0), 32 | out_dir=out_dir, 33 | precision="32-true", 34 | train=TrainArgs(global_batch_size=1, save_interval=2, epochs=1, max_steps=6, micro_batch_size=1), 35 | eval=EvalArgs(interval=2, max_iters=2, max_new_tokens=1), 36 | ) 37 | stdout = StringIO() 38 | with redirect_stdout(stdout), mock.patch("sys.argv", ["full.py", str(fake_checkpoint_dir)]): 39 | module.setup(*setup_args, **setup_kwargs) 40 | 41 | out_dir_contents = set(os.listdir(out_dir)) 42 | checkpoint_dirs = {"step-000002", "step-000004", "step-000006", "final"} 43 | assert checkpoint_dirs.issubset(out_dir_contents) 44 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 45 | for checkpoint_dir in checkpoint_dirs: 46 | assert set(os.listdir(out_dir / checkpoint_dir)) == { 47 | "lit_model.pth", 48 | "model_config.yaml", 49 | "tokenizer_config.json", 50 | "tokenizer.json", 51 | "hyperparameters.yaml", 52 | "prompt_style.yaml", 53 | } 54 | assert (out_dir / "logs" / "csv" / "version_0" / "metrics.csv").is_file() 55 | 56 | logs = stdout.getvalue() 57 | assert logs.count("(step)") == 6 58 | assert logs.count("val loss") == 4 # 3 validations + 1 final validation 59 | assert logs.count("Final evaluation") == 1 60 | assert "of trainable parameters: 1,888" in logs 61 | 62 | # Resume training and do 2 steps more 63 | setup_kwargs["train"].max_steps = 8 64 | setup_kwargs["resume"] = True 65 | stdout = StringIO() 66 | with redirect_stdout(stdout), mock.patch("sys.argv", ["full.py", str(fake_checkpoint_dir)]): 67 | module.setup(*setup_args, **setup_kwargs) 68 | logs = stdout.getvalue() 69 | assert f"Resuming training from {out_dir / 'step-000006' / 'lit_model.pth'}" in logs 70 | assert logs.count("(step)") == 2 71 | assert out_dir / "step-000008" in set(out_dir.iterdir()) 72 | -------------------------------------------------------------------------------- /tests/test_merge_lora.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | import shutil 5 | from contextlib import redirect_stdout 6 | from io import StringIO 7 | from pathlib import Path 8 | from unittest import mock 9 | 10 | import pytest 11 | import torch 12 | import yaml 13 | 14 | from litgpt.lora import GPT as LoRAGPT 15 | from litgpt.lora import lora_filter 16 | from litgpt.model import GPT 17 | from litgpt.scripts.merge_lora import load_lora_metadata, merge_lora 18 | 19 | 20 | @mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) 21 | @pytest.mark.parametrize( 22 | ("pretrained_dtype", "lora_dtype"), [(None, None), (torch.float16, torch.float32), (torch.float16, torch.bfloat16)] 23 | ) 24 | def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype): 25 | pretrained_checkpoint_dir = tmp_path / "pretrained" 26 | lora_checkpoint_dir = tmp_path / "lora" 27 | shutil.copytree(fake_checkpoint_dir, pretrained_checkpoint_dir) 28 | shutil.copytree(fake_checkpoint_dir, lora_checkpoint_dir) 29 | (lora_checkpoint_dir / "lit_model.pth").unlink() # should not already exist 30 | shutil.rmtree(tmp_path / "checkpoints") 31 | 32 | # Create a fake pretrained checkpoint 33 | config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16) 34 | with open(pretrained_checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: 35 | yaml.dump(config, fp) 36 | base_model = GPT.from_name("pythia-14m", **config).to(dtype=pretrained_dtype) 37 | state_dict = base_model.state_dict() 38 | assert len(state_dict) == 40 39 | torch.save(state_dict, pretrained_checkpoint_dir / "lit_model.pth") 40 | 41 | # Create a fake LoRA checkpoint 42 | lora_kwargs = dict(lora_r=8, lora_alpha=16, lora_dropout=0.05, lora_query=True, lora_value=True) 43 | lora_model = LoRAGPT.from_name("pythia-14m", **config, **lora_kwargs).to(dtype=lora_dtype) 44 | state_dict = {k: v for k, v in lora_model.state_dict().items() if lora_filter(k, v)} 45 | assert len(state_dict) == 6 46 | torch.save(state_dict, lora_checkpoint_dir / "lit_model.pth.lora") 47 | hparams = dict(checkpoint_dir=str(pretrained_checkpoint_dir), **lora_kwargs) 48 | with open(lora_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file: 49 | yaml.dump(hparams, file) 50 | shutil.copyfile(pretrained_checkpoint_dir / "model_config.yaml", lora_checkpoint_dir / "model_config.yaml") 51 | 52 | assert set(os.listdir(tmp_path)) == {"lora", "pretrained"} 53 | merge_lora(lora_checkpoint_dir) 54 | assert set(os.listdir(tmp_path)) == {"lora", "pretrained"} 55 | assert set(os.listdir(lora_checkpoint_dir)) == { 56 | "model_config.yaml", 57 | "lit_model.pth", 58 | "lit_model.pth.lora", 59 | "tokenizer.json", 60 | "tokenizer_config.json", 61 | "hyperparameters.yaml", 62 | } 63 | 64 | # Assert that the merged weights can be loaded back into the base model 65 | merged = torch.load(lora_checkpoint_dir / "lit_model.pth") 66 | keys = base_model.load_state_dict(merged, strict=True) 67 | assert not keys.missing_keys 68 | assert not keys.unexpected_keys 69 | 70 | # Attempt to merge again 71 | stdout = StringIO() 72 | with redirect_stdout(stdout): 73 | merge_lora(lora_checkpoint_dir) 74 | assert "LoRA weights have already been merged" in stdout.getvalue() 75 | 76 | 77 | def test_load_lora_metadata(fake_checkpoint_dir): 78 | assert not (fake_checkpoint_dir / "hyperparameters.yaml").is_file() 79 | with pytest.raises(FileNotFoundError, match="missing a `hyperparameters.yaml` file"): 80 | load_lora_metadata(fake_checkpoint_dir) 81 | 82 | hparams = dict(precision="bf16-mixed", 
checkpoint_dir="checkpoints/meta-llama/Llama-2-7b", lora_r=8, lora_alpha=16) 83 | with open(fake_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file: 84 | yaml.dump(hparams, file) 85 | 86 | lora_args, pretrained_dir, precision = load_lora_metadata(fake_checkpoint_dir) 87 | assert lora_args == dict(lora_r=8, lora_alpha=16) 88 | assert pretrained_dir == Path("checkpoints/meta-llama/Llama-2-7b") 89 | assert precision == "bf16-mixed" 90 | -------------------------------------------------------------------------------- /tutorials/convert_hf_checkpoint.md: -------------------------------------------------------------------------------- 1 | # Converting Hugging Face Transformers to LitGPT weights 2 | 3 | By default, the `litgpt download` command converts the downloaded HF checkpoint files into a LitGPT compatible format after downloading. For example, 4 | 5 | ```bash 6 | litgpt download EleutherAI/pythia-14m 7 | ``` 8 | 9 | creates the following files: 10 | 11 | ``` 12 | checkpoints/ 13 | └── EleutherAI/ 14 | └── pythia-14m/ 15 | ├── config.json 16 | ├── generation_config.json 17 | ├── model_config.yaml # LitGPT specific file 18 | ├── lit_model.pth # LitGPT specific file 19 | ├── pytorch_model.bin 20 | ├── tokenizer.json 21 | └── tokenizer_config.json 22 | ``` 23 | 24 | 25 | 26 | To disable the automatic conversion, which is useful for development and debugging purposes, you can run the `litgpt download` with the `--convert_checkpoint false` flag. This will only download the checkpoint files but do not convert them for use in LitGPT: 27 | 28 | ```bash 29 | rm -rf checkpoints/EleutherAI/pythia-14m 30 | 31 | litgpt download EleutherAI/pythia-14m \ 32 | --convert_checkpoint false 33 | 34 | ls checkpoints/EleutherAI/pythia-14m 35 | ``` 36 | 37 | ``` 38 | checkpoints/ 39 | └── EleutherAI/ 40 | └── pythia-14m/ 41 | ├── config.json 42 | ├── generation_config.json 43 | ├── pytorch_model.bin 44 | ├── tokenizer.json 45 | └── tokenizer_config.json 46 | ``` 47 | 48 | The required files `model_config.yaml` and `lit_model.pth` files can then be manually generated via the `litgpt/scripts/convert_hf_checkpoint.py` script: 49 | 50 | ```bash 51 | litgpt convert_to_litgpt checkpoints/EleutherAI/pythia-14m 52 | ``` 53 | -------------------------------------------------------------------------------- /tutorials/deploy.md: -------------------------------------------------------------------------------- 1 | # Serve and Deploy LLMs 2 | 3 | This document shows how you can serve a LitGPT for deployment. 4 | 5 | 6 |   7 | ## Serve an LLM with LitServe 8 | 9 | This section illustrates how we can set up an inference server for a phi-2 LLM using `litgpt serve` that is minimal and highly scalable. 10 | 11 | 12 |   13 | ### Step 1: Start the inference server 14 | 15 | 16 | ```bash 17 | # 1) Download a pretrained model (alternatively, use your own finetuned model) 18 | litgpt download microsoft/phi-2 19 | 20 | # 2) Start the server 21 | litgpt serve microsoft/phi-2 22 | ``` 23 | 24 | > [!TIP] 25 | > Use `litgpt serve --help` to display additional options, including the port, devices, LLM temperature setting, and more. 26 | 27 | 28 |   29 | ### Step 2: Query the inference server 30 | 31 | You can now send requests to the inference server you started in step 2. 
For example, in a new Python session, we can send requests to the inference server as follows: 32 | 33 | 34 | ```python 35 | import requests, json 36 | 37 | response = requests.post( 38 | "http://127.0.0.1:8000/predict", 39 | json={"prompt": "Fix typos in the following sentence: Example input"} 40 | ) 41 | 42 | print(response.json()["output"]) 43 | ``` 44 | 45 | Executing the code above prints the following output: 46 | 47 | ``` 48 | Example input. 49 | ``` 50 | 51 |   52 | ### Optional: Use the streaming mode 53 | 54 | The 2-step procedure described above returns the complete response all at once. If you want to stream the response on a token-by-token basis, start the server with the streaming option enabled: 55 | 56 | ```bash 57 | litgpt serve microsoft/phi-2 --stream true 58 | ``` 59 | 60 | Then, use the following updated code to query the inference server: 61 | 62 | ```python 63 | import requests, json 64 | 65 | response = requests.post( 66 | "http://127.0.0.1:8000/predict", 67 | json={"prompt": "Fix typos in the following sentence: Example input"}, 68 | stream=True 69 | ) 70 | 71 | # stream the response 72 | for line in response.iter_lines(decode_unicode=True): 73 | if line: 74 | print(json.loads(line)["output"], end="") 75 | ``` 76 | 77 | ``` 78 | Sure, here is the corrected sentence: 79 | 80 | Example input 81 | ``` 82 | 83 |   84 | ## Serve an LLM UI with Chainlit 85 | 86 | If you are interested in developing a simple ChatGPT-like UI prototype, see the Chainlit tutorial in the following Studio: 87 | 88 | 89 | Open In Studio 90 | 91 | -------------------------------------------------------------------------------- /tutorials/developer-docs/README.md: -------------------------------------------------------------------------------- 1 | LitGPT developer documentation files. 2 | -------------------------------------------------------------------------------- /tutorials/developer-docs/python-api.md: -------------------------------------------------------------------------------- 1 | # LitGPT High-level Python API 2 | 3 | This is a work-in-progress draft for a high-level LitGPT Python API. 4 | 5 |   6 | ## Model loading & saving 7 | 8 | The `LLM.load` command loads an `llm` object, which contains both the model object (a PyTorch module) and a preprocessor. 9 | 10 | ```python 11 | from litgpt import LLM 12 | 13 | llm = LLM.load( 14 | model="url | local_path", 15 | # high-level user only needs to care about those: 16 | memory_reduction="none | medium | strong", 17 | # advanced options for technical users: 18 | source="hf | local | other", 19 | quantize="bnb.nf4", 20 | precision="bf16-true", 21 | device="auto | cuda | cpu", 22 | ) 23 | ``` 24 | 25 | Here, 26 | 27 | - `llm.model` contains the PyTorch Module 28 | - and `llm.preprocessor.tokenizer` contains the tokenizer 29 | 30 | The `llm.save` command saves the model weights, tokenizer, and configuration information. 31 | 32 | 33 | ```python 34 | llm.save(checkpoint_dir, format="lightning | ollama | hf") 35 | ``` 36 | 37 | 38 |   39 | ## Inference / Chat 40 | 41 | ``` 42 | response = llm.generate( 43 | prompt="What do Llamas eat?", 44 | temperature=0.1, 45 | top_p=0.8, 46 | ... 47 | ) 48 | ``` 49 | 50 | 51 |   52 | ## Dataset 53 | 54 | The `llm.download_dataset` command downloads a dataset, and the `llm.prepare_dataset` command prepares it for training. 55 | 56 | ``` 57 | llm.download_dataset( 58 | URL, 59 | ... 60 | ) 61 | ``` 62 | 63 | ``` 64 | dataset = llm.prepare_dataset( 65 | path, 66 | task="pretrain | instruction_finetune", 67 | test_portion=0.1, 68 | ... 
69 | ) 70 | ``` 71 | 72 |   73 | ## Training 74 | 75 | 76 | ```python 77 | llm.instruction_finetune( 78 | config=None, 79 | dataset=dataset, 80 | max_iter=10, 81 | method="full | lora | adapter | adapter_v2" 82 | ) 83 | ``` 84 | 85 | ```python 86 | llm.pretrain(config=None, dataset=dataset, max_iter=10, ...) 87 | ``` 88 | 89 |   90 | ## Serving 91 | 92 | 93 | ```python 94 | llm.serve(port=8000) 95 | ``` 96 | 97 | Then in another Python session: 98 | 99 | ```python 100 | import requests, json 101 | 102 | response = requests.post( 103 | "http://127.0.0.1:8000/predict", 104 | json={"prompt": "Fix typos in the following sentence: Example input"} 105 | ) 106 | 107 | print(response.json()["output"]) 108 | ``` 109 | -------------------------------------------------------------------------------- /tutorials/examples/ptl-trainer/README.md: -------------------------------------------------------------------------------- 1 | ## Minimal PyTorch Lightning Trainer Example 2 | 3 | 4 | 5 | The script in this folder provides minimal examples showing how to train a LitGPT model using LitGPT's `GPT` class with the [PyTorch Lightning](https://github.com/Lightning-AI/pytorch-lightning) Trainer. 6 | 7 | You can run the scripts as follows: 8 | 9 |   10 | ## Small 160M model: 11 | 12 | ```bash 13 | # Download the Pythia model 14 | litgpt download EleutherAI/pythia-160m 15 | 16 | python litgpt_ptl_small.py 17 | ``` 18 | 19 |   20 | ## Medium-sized 8B model: 21 | 22 | ```bash 23 | # Download the Llama 3.1 model 24 | litgpt download meta-llama/Meta-Llama-3.1-8B --access_token hf_... 25 | 26 | python litgpt_ptl_medium.py 27 | ``` 28 | -------------------------------------------------------------------------------- /tutorials/examples/ptl-trainer/litgpt_ptl_medium.py: -------------------------------------------------------------------------------- 1 | import lightning as L 2 | import torch 3 | 4 | import litgpt 5 | from litgpt.data import Alpaca2k 6 | from litgpt.lora import GPT, merge_lora_weights 7 | 8 | 9 | class LitLLM(L.LightningModule): 10 | def __init__(self): 11 | super().__init__() 12 | self.model = GPT.from_name( 13 | name="Llama-3.1-8B", 14 | lora_r=32, 15 | lora_alpha=16, 16 | lora_dropout=0.05, 17 | lora_key=False, 18 | lora_value=True, 19 | ) 20 | litgpt.lora.mark_only_lora_as_trainable(self.model) 21 | 22 | def on_train_start(self): 23 | state_dict = torch.load("checkpoints/meta-llama/Meta-Llama-3.1-8B/lit_model.pth", mmap=True) 24 | self.model.load_state_dict(state_dict, strict=False) 25 | 26 | def training_step(self, batch): 27 | input_ids, targets = batch["input_ids"], batch["labels"] 28 | logits = self.model(input_ids) 29 | loss = litgpt.utils.chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:]) 30 | self.log("train_loss", loss, prog_bar=True) 31 | return loss 32 | 33 | def configure_optimizers(self): 34 | warmup_steps = 10 35 | optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.0002, weight_decay=0.0, betas=(0.9, 0.95)) 36 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) 37 | return [optimizer], [scheduler] 38 | 39 | 40 | if __name__ == "__main__": 41 | data = Alpaca2k() 42 | tokenizer = litgpt.Tokenizer("checkpoints/meta-llama/Meta-Llama-3.1-8B") 43 | data.connect(tokenizer, batch_size=1, max_seq_length=512) 44 | 45 | trainer = L.Trainer( 46 | devices=1, 47 | max_epochs=2, 48 | accumulate_grad_batches=8, 49 | precision="bf16-true", 50 | ) 51 | with trainer.init_module(empty_init=True): 52 | model = LitLLM() 53 | 54 | trainer.fit(model, 
data) 55 | 56 | # Save final checkpoint 57 | merge_lora_weights(model.model) 58 | trainer.save_checkpoint("checkpoints/finetuned.ckpt", weights_only=True) 59 | -------------------------------------------------------------------------------- /tutorials/finetune.md: -------------------------------------------------------------------------------- 1 | # Finetuning 2 | 3 | We provide simple finetuning commands (`litgpt finetune_*`) that instruction-finetune a pretrained model on datasets such as [Alpaca](https://github.com/tatsu-lab/stanford_alpaca), [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm), and others. For more information on the supported instruction datasets and how to prepare your own custom datasets, please see the [tutorials/prepare_dataset](prepare_dataset.md) tutorial. 4 | 5 | LitGPT currently supports the following finetuning methods: 6 | 7 | ```bash 8 | litgpt finetune_full 9 | litgpt finetune_lora 10 | litgpt finetune_adapter 11 | litgpt finetune_adapter_v2 12 | ``` 13 | 14 |   15 | > [!TIP] 16 | > To install all required dependencies before finetuning, first run `pip install "litgpt[all]"`. 17 |   18 | 19 | 20 | The following section provides more details about these methods, including links to additional resources. 21 | 22 | 23 |   24 | ## LitGPT finetuning commands 25 | 26 | The section below provides additional information on the available methods and links to further resources. 27 | 28 |   29 | ### Full finetuning 30 | 31 | ```bash 32 | litgpt finetune_full 33 | ``` 34 | 35 | This method trains all model weight parameters and is the most memory-intensive finetuning technique in LitGPT. 36 | 37 | **More information and resources:** 38 | 39 | - the LitGPT [tutorials/finetune_full](finetune_full.md) tutorial 40 | 41 | 42 |   43 | ### LoRA and QLoRA finetuning 44 | 45 | ```bash 46 | litgpt finetune_lora stabilityai/stablelm-base-alpha-3b 47 | ``` 48 | 49 | LoRA and QLoRA are parameter-efficient finetuning techniques that only require updating a small number of parameters, which makes them a more memory-efficient alternative to full finetuning. 50 | 51 | **More information and resources:** 52 | 53 | - the LitGPT [tutorials/finetune_lora](finetune_lora.md) tutorial 54 | - the LoRA paper ([Hu et al. 2021](https://arxiv.org/abs/2106.09685)) 55 | - the conceptual tutorial [Parameter-Efficient LLM Finetuning With Low-Rank Adaptation (LoRA)](https://lightning.ai/pages/community/tutorial/lora-llm/) 56 | 57 | 58 |   59 | ### Adapter finetuning 60 | 61 | ```bash 62 | litgpt finetune_adapter stabilityai/stablelm-base-alpha-3b 63 | ``` 64 | 65 | or 66 | 67 | ```bash 68 | litgpt finetune_adapter_v2 stabilityai/stablelm-base-alpha-3b 69 | ``` 70 | 71 | Similar to LoRA, adapter finetuning is a parameter-efficient finetuning technique that only requires training a small subset of weight parameters, making this finetuning method more memory-efficient than full-parameter finetuning. 72 | 73 | **More information and resources:** 74 | 75 | - the LitGPT [tutorials/finetune_adapter](finetune_adapter.md) tutorial 76 | - the Llama-Adapter ([Zhang et al. 2023](https://arxiv.org/abs/2303.16199)) and Llama-Adapter v2 ([Gao et al. 
2023](https://arxiv.org/abs/2304.15010)) papers that originally introduced these methods 77 | - the conceptual tutorial [Understanding Parameter-Efficient Finetuning of Large Language Models: From Prefix Tuning to LLaMA-Adapters](https://lightning.ai/pages/community/article/understanding-llama-adapters/) 78 | -------------------------------------------------------------------------------- /tutorials/finetune_full.md: -------------------------------------------------------------------------------- 1 | # Finetuning the whole model 2 | 3 | If you are interested in parameter-efficient finetuning, check out [finetune_adapter.md](finetune_adapter.md). In contrast to parameter-efficient finetuning, this "full" approach finetunes all model parameters, which is substantially more expensive. It is mainly recommended as a baseline for comparison studies. 4 | 5 | ## Preparation 6 | 7 | The steps here only need to be done once: 8 | 9 | 1. Follow the instructions in the [README](../README.md) to install the dependencies. 10 | 2. Download and convert the weights following our [guide](download_model_weights.md). 11 | 12 | LitGPT provides common datasets for finetuning, such as Alpaca, LIMA, Dolly, and more. 13 | You can optionally [prepare your own dataset](#tune-on-your-dataset). 14 | For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial. 15 | 16 | ## Running the finetuning 17 | 18 | ```bash 19 | litgpt finetune_full tiiuae/falcon-7b \ 20 | --data Alpaca 21 | ``` 22 | 23 | Finetuning the falcon-7b model requires at least 8 GPUs with ~40 GB memory each. 24 | 25 | You can speed up training by passing the `devices` argument to the script to utilize more GPUs if available. 26 | Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently. 27 | 28 | This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name: 29 | 30 | ```bash 31 | litgpt finetune_full tiiuae/falcon-7b \ 32 | --data Alpaca \ 33 | --out_dir out/full/my-model-finetuned 34 | ``` 35 | 36 | If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument. 37 | For instance, to fine-tune on MPS (the GPU on modern Macs), you can run 38 | 39 | ```bash 40 | litgpt finetune_full tiiuae/falcon-7b \ 41 | --data Alpaca \ 42 | --out_dir out/full/my-model-finetuned \ 43 | --precision 32-true 44 | ``` 45 | 46 | Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. 47 | 48 | ## Test the model 49 | 50 | You can test the finetuned model with your own instructions by running: 51 | 52 | ```bash 53 | litgpt generate tiiuae/falcon-7b \ 54 | --prompt "Recommend a movie to watch on the weekend." \ 55 | --finetuned_path out/full/my-model-finetuned/lit_model_finetuned.pth 56 | ``` 57 | 58 | Output: 59 | 60 | ```text 61 | A good movie to watch on the weekend would be The Lion King, since it's a classic family film that everyone can enjoy... 62 | ``` 63 | 64 | If your GPU supports `bfloat16`, the script will automatically use it. 65 | 66 | ## Tune on your dataset 67 | 68 | You can easily train on your own instruction dataset saved in JSON format. 69 | 70 | 1. Create a JSON file in which each row holds one instruction-response pair. 71 | A row has an entry for 'instruction' and 'output', and optionally 'input'. 
Note that currently, the 'input' field is only used in the Alpaca chat template. If you are using the Alpaca template, 'input' can be the empty string if the instruction doesn't require a context. 72 | Below is an example json file: 73 | 74 | ```text 75 | [ 76 | { 77 | "instruction": "Arrange the given numbers in ascending order.", 78 | "input": "2, 4, 0, 8, 3", // Optional: only used in Alpaca chat template 79 | "output": "0, 2, 3, 4, 8" 80 | }, 81 | ... 82 | ] 83 | ``` 84 | 85 | 2. Run `litgpt finetune` by passing in the location of your data (and optionally other parameters): 86 | 87 | ```bash 88 | litgpt finetune tiiuae/falcon-7b \ 89 | --data JSON \ 90 | --data.json_path data/mydata.json \ 91 | --out_dir data/mydata-finetuned 92 | ``` 93 | -------------------------------------------------------------------------------- /tutorials/full_finetune_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is meant to be the simplest possible starting point for full finetuning a GPT model using lightning fabric with code (not CLI). 3 | 4 | - no checkpoints 5 | - no out dir 6 | - no precision 7 | - no resume 8 | - no train/eval args (or any args in general) 9 | - no logger (only to terminal) 10 | - no grad accumulation 11 | and no other fancy stuff. 12 | 13 | To add all the above stuff, you can slowly add them in yourself by looking at the code in litgpt/finetune/full.py or the docs for litgpt/fabric. 14 | """ 15 | 16 | import os 17 | 18 | import lightning as L 19 | import torch 20 | import torch.nn as nn 21 | 22 | from litgpt.data import Alpaca 23 | from litgpt.model import GPT, Config 24 | from litgpt.tokenizer import Tokenizer 25 | from litgpt.utils import num_parameters 26 | 27 | # training params/args 28 | SEED = 1337 29 | MODEL_NAME = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" # try also "stabilityai/stablelm-base-alpha-3b"! 
30 | BATCH_SIZE = 4 31 | LR_WARMUP_STEPS = 100 32 | MAX_STEPS = 601 33 | 34 | 35 | def validate(model, val_dataloader): 36 | model.eval() 37 | loss = 0 38 | with torch.no_grad(): 39 | for batch in val_dataloader: 40 | input_ids, targets = batch["input_ids"], batch["labels"] 41 | logits = model(input_ids) 42 | logits = logits.reshape(-1, logits.size(-1)) 43 | targets = targets.reshape(-1) 44 | loss += nn.functional.cross_entropy(logits[..., :-1, :], targets[..., 1:]) 45 | fabric.print(f"Validation loss: {loss / len(val_dataloader)}") 46 | 47 | 48 | def train(fabric, model, optimizer, scheduler, train_dataloader, val_dataloader): 49 | for iter_num, batch in enumerate(train_dataloader): 50 | input_ids, targets = batch["input_ids"], batch["labels"] 51 | 52 | # get model preds (logits) 53 | logits = model(input_ids) 54 | logits = logits.reshape(-1, logits.size(-1)) 55 | 56 | # get loss 57 | targets = targets.reshape(-1) 58 | loss = nn.functional.cross_entropy(logits[..., :-1, :], targets[..., 1:]) 59 | 60 | # update weights 61 | fabric.backward(loss) 62 | optimizer.step() 63 | optimizer.zero_grad() 64 | scheduler.step() 65 | 66 | # print train loss every 100 steps 67 | if iter_num % 100 == 0 or iter_num == 0: 68 | fabric.print(f"Train iter {iter_num} - loss {loss}") 69 | 70 | # validate every 300 steps 71 | if iter_num % 300 == 0 or iter_num == 0: 72 | validate(model, val_dataloader) 73 | model.train() 74 | iter_num += 1 75 | 76 | if iter_num >= MAX_STEPS: 77 | break 78 | 79 | 80 | def main(fabric): 81 | fabric.seed_everything(SEED) 82 | 83 | # setup data, make tokenizer and make dataloaders 84 | data = Alpaca() 85 | tokenizer = Tokenizer(checkpoint_dir=f"checkpoints/{MODEL_NAME}") 86 | data.connect(tokenizer=tokenizer, batch_size=BATCH_SIZE, max_seq_length=1024) 87 | data.setup() 88 | train_dataloader = data.train_dataloader() 89 | val_dataloader = data.val_dataloader() 90 | train_dataloader, val_dataloader = fabric.setup_dataloaders(train_dataloader, val_dataloader) 91 | 92 | # print how many steps in an epoch 93 | fabric.print(f"Steps in an epoch: {len(train_dataloader)}") 94 | 95 | # setup model 96 | config = Config.from_file(f"checkpoints/{MODEL_NAME}/model_config.yaml") 97 | model = GPT(config) 98 | fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") 99 | model = fabric.setup(model) 100 | 101 | # setup optimizer 102 | optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, weight_decay=0.02, betas=(0.9, 0.95)) 103 | optimizer = fabric.setup_optimizers(optimizer) 104 | 105 | # setup lr scheduler 106 | scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / LR_WARMUP_STEPS) 107 | scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(MAX_STEPS - LR_WARMUP_STEPS)) 108 | scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[LR_WARMUP_STEPS]) 109 | 110 | # Start training!!! 111 | train(fabric, model, optimizer, scheduler, train_dataloader, val_dataloader) 112 | 113 | 114 | if __name__ == "__main__": 115 | # check that the model exists (downloaded to ./checkpoints/) 116 | if not os.path.exists(f"checkpoints/{MODEL_NAME}"): 117 | print(f"Model {MODEL_NAME} not found. 
Please download it using `litgpt download --repo {MODEL_NAME}`") 118 | exit() 119 | 120 | ### Setup and launch 121 | fabric = L.Fabric(devices="auto", strategy="auto") 122 | fabric.launch(main) 123 | -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/commands.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/commands.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/finetune.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/finetune.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/instruction-1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/instruction-1.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/instruction-2.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/instruction-2.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/pretrain.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/pretrain.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/usage.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/usage.webp -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca-2k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/alpaca-2k.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/alpaca.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca_libre.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/alpaca_libre.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpacagpt4.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/alpacagpt4.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/deita-multiturn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/deita-multiturn.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/deita.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/deita.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/dolly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/dolly.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/lima.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/lima.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/longform.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/longform.jpg -------------------------------------------------------------------------------- /tutorials/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: LitGPT Tutorials 2 | 3 | plugins: 4 | - pagetree 5 | 6 | theme: 7 | name: material 8 | --------------------------------------------------------------------------------
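The `tutorials/mkdocs.yml` shown above only declares the site name, the `pagetree` plugin, and the Material theme. As a rough sketch (not part of the repository), the tutorials site could be previewed locally along these lines; the exact PyPI package name providing the `pagetree` plugin is an assumption here:

```bash
# Install MkDocs and the Material theme referenced by the config above.
pip install mkdocs mkdocs-material

# The package providing the `pagetree` plugin is assumed; the actual name may differ.
pip install mkdocs-pagetree-plugin

# Serve the tutorials site locally using the config file shown above.
mkdocs serve --config-file tutorials/mkdocs.yml
```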