├── .azure
└── gpu-test.yml
├── .devcontainer
├── Dockerfile
└── devcontainer.json
├── .github
├── CODEOWNERS
├── ISSUE_TEMPLATE
│ ├── ask-a-question.md
│ ├── bug-report.yaml
│ └── feature-request.md
└── workflows
│ ├── check-links.yml
│ ├── cpu-tests.yml
│ ├── mkdocs-deploy.yml
│ └── publish-pkg.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CITATION.cff
├── LICENSE
├── README.md
├── config_hub
├── finetune
│ ├── README.md
│ ├── falcon-7b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── gemma-2b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── gemma-7b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── gemma2-2b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── gemma2-9b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-2-7b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-3-8b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-3.1-8b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-3.2-1B
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── llama-3.2-3B
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── mistral-7b-v0.2
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── mistral-7b
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── phi-2
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── phi-3
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ ├── stablelm-base-alpha-3b
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
│ └── tiny-llama
│ │ ├── full.yaml
│ │ ├── lora.yaml
│ │ └── qlora.yaml
└── pretrain
│ ├── debug.yaml
│ ├── microllama.yaml
│ ├── tinyllama.yaml
│ └── tinystories.yaml
├── extensions
├── thunder
│ ├── README.md
│ ├── __init__.py
│ ├── pretrain.py
│ ├── strategies
│ │ ├── __init__.py
│ │ ├── thunder_ddp.py
│ │ └── thunder_fsdp.py
│ └── unsloth
│ │ ├── __init__.py
│ │ ├── executor.py
│ │ └── kernels
│ │ ├── __init__.py
│ │ ├── cross_entropy_loss.py
│ │ ├── rope_embedding.py
│ │ ├── swiglu.py
│ │ └── utils.py
└── xla
│ ├── README.md
│ ├── __init__
│ ├── finetune
│ ├── __init__
│ └── adapter.py
│ ├── generate
│ ├── __init__
│ ├── adapter.py
│ └── base.py
│ ├── scripts
│ ├── __init__
│ └── prepare_alpaca.py
│ └── utils.py
├── litgpt
├── __init__.py
├── __main__.py
├── adapter.py
├── adapter_v2.py
├── api.py
├── args.py
├── chat
│ ├── __init__.py
│ └── base.py
├── config.py
├── data
│ ├── __init__.py
│ ├── alpaca.py
│ ├── alpaca_2k.py
│ ├── alpaca_gpt4.py
│ ├── base.py
│ ├── deita.py
│ ├── flan.py
│ ├── json_data.py
│ ├── lima.py
│ ├── lit_data.py
│ ├── longform.py
│ ├── microllama.py
│ ├── openwebtext.py
│ ├── prepare_slimpajama.py
│ ├── prepare_starcoder.py
│ ├── text_files.py
│ ├── tinyllama.py
│ └── tinystories.py
├── deploy
│ ├── __init__.py
│ └── serve.py
├── eval
│ └── evaluate.py
├── finetune
│ ├── __init__.py
│ ├── adapter.py
│ ├── adapter_v2.py
│ ├── full.py
│ └── lora.py
├── generate
│ ├── __init__.py
│ ├── adapter.py
│ ├── adapter_v2.py
│ ├── base.py
│ ├── full.py
│ ├── sequentially.py
│ ├── speculative_decoding.py
│ └── tp.py
├── lora.py
├── model.py
├── pretrain.py
├── prompts.py
├── scripts
│ ├── __init__.py
│ ├── convert_hf_checkpoint.py
│ ├── convert_lit_checkpoint.py
│ ├── convert_pretrained_checkpoint.py
│ ├── download.py
│ └── merge_lora.py
├── tokenizer.py
└── utils.py
├── pyproject.toml
├── tests
├── conftest.py
├── convert
│ ├── __init__.py
│ ├── test_hf_checkpoint.py
│ ├── test_lit_checkpoint.py
│ └── test_pretrained_checkpoint.py
├── data
│ ├── __init__.py
│ ├── _fixtures
│ │ ├── alpaca.json
│ │ ├── dolly.json
│ │ ├── longform_train.json
│ │ └── longform_val.json
│ ├── test_alpaca.py
│ ├── test_base.py
│ ├── test_deita.py
│ ├── test_json.py
│ ├── test_lit_data.py
│ ├── test_longform.py
│ ├── test_openwebtext.py
│ ├── test_textfiles.py
│ ├── test_tinyllama.py
│ └── test_tinystories.py
├── ext_thunder
│ ├── __init__.py
│ ├── test_thunder_distributed.py
│ ├── test_thunder_networks.py
│ ├── test_thunder_pretrain.py
│ └── test_unsloth_executor.py
├── generate
│ ├── __init__.py
│ ├── test_adapter.py
│ ├── test_main.py
│ ├── test_sequentially.py
│ ├── test_tp.py
│ └── utils.py
├── test_adapter.py
├── test_adapter_v2.py
├── test_api.py
├── test_args.py
├── test_batch.py
├── test_chat.py
├── test_ci.py
├── test_cli.py
├── test_config.py
├── test_config_hub.py
├── test_distributed.py
├── test_evaluate.py
├── test_full.py
├── test_generate_speculatively.py
├── test_lora.py
├── test_merge_lora.py
├── test_model.py
├── test_pretrain.py
├── test_prompts.py
├── test_readme.py
├── test_rope.py
├── test_serve.py
├── test_tokenizer.py
├── test_trainer_support.py
└── test_utils.py
└── tutorials
├── 0_to_litgpt.md
├── convert_hf_checkpoint.md
├── convert_lit_models.md
├── deploy.md
├── developer-docs
├── README.md
├── adding-models.md
└── python-api.md
├── download_model_weights.md
├── evaluation.md
├── examples
└── ptl-trainer
│ ├── README.md
│ ├── litgpt_ptl_medium.py
│ └── litgpt_ptl_small.py
├── finetune.md
├── finetune_adapter.md
├── finetune_full.md
├── finetune_lora.md
├── full_finetune_example.py
├── images
├── 0_to_litgpt
│ ├── commands.webp
│ ├── finetune.webp
│ ├── instruction-1.webp
│ ├── instruction-2.webp
│ ├── pretrain.webp
│ └── usage.webp
└── prepare_dataset
│ ├── alpaca-2k.jpg
│ ├── alpaca.jpg
│ ├── alpaca_libre.jpg
│ ├── alpacagpt4.jpg
│ ├── deita-multiturn.jpg
│ ├── deita.jpg
│ ├── dolly.jpg
│ ├── lima.jpg
│ └── longform.jpg
├── inference.md
├── mkdocs.yml
├── oom.md
├── prepare_dataset.md
├── pretrain.md
├── pretrain_tinyllama.md
├── python-api.md
├── quantize.md
└── resource-tables.md
/.azure/gpu-test.yml: -------------------------------------------------------------------------------- 1 | name: GPU tests 2 | 3 | trigger: 4 | branches: 5 | include: 6 | - "main" 7 | - "wip" 8 | 9 | pr: 10 | branches: 11 | include: 12 | - "main" 13 | - "wip" 14 | 15 | jobs: 16 | - job: testing 17 | strategy: 18 | matrix: 19 | "ordinary": 20 | #image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.7-cuda12.6.3" 21 | dependency: "" 22 | "w. 
Thunder": 23 | #image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.7-cuda12.6.3" 24 | dependency: "compiler" 25 | variables: 26 | DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) 27 | PL_RUN_CUDA_TESTS: "1" 28 | TRANSFORMERS_CACHE: "/var/tmp/hf/transformers" 29 | HF_HOME: "/var/tmp/hf/home" 30 | HF_HUB_CACHE: "/var/tmp/hf/hub" 31 | CI: "true" 32 | PYTHON_VERSION: "3.10" 33 | CUDA_VERSION: "12.6.3" 34 | TORCH_VERSION: "2.7.0" 35 | CUDNN_FRONTEND_VERSION: "1.10.0" 36 | container: 37 | # image: "pytorchlightning/pytorch_lightning:base-cuda-py$(PYTHON_VERSION)-torch$(TORCH_VERSION)-cuda$(CUDA_VERSION)" 38 | # pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.5.0-py3.10-pt_main-dev 39 | image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND_VERSION)-py$(PYTHON_VERSION)-pt_$(TORCH_VERSION)-dev" 40 | options: "--gpus=all --shm-size=8gb -v /var/tmp:/var/tmp" 41 | workspace: 42 | clean: all 43 | pool: "lit-rtx-3090" 44 | timeoutInMinutes: "35" 45 | cancelTimeoutInMinutes: "2" 46 | steps: 47 | - bash: | 48 | echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" 49 | displayName: "set env. vars" 50 | 51 | - bash: | 52 | echo $(DEVICES) 53 | echo $CUDA_VISIBLE_DEVICES 54 | whereis nvidia 55 | nvidia-smi 56 | which python && which pip 57 | python --version 58 | pip --version 59 | pip list 60 | displayName: "Image info & NVIDIA" 61 | 62 | - script: | 63 | pip install --upgrade pip 64 | pip install '.[extra,test]' cffi -U 65 | displayName: "Install package & dependencies" 66 | 67 | - script: | 68 | set -e 69 | pip uninstall -y torchvision torchaudio 70 | pip install '.[compiler]' 71 | python -c "from thunder.executors import nvfuser_available ; assert nvfuser_available(), 'nvFuser is missing!'" 72 | python -c "from thunder.executors.triton_utils import triton_version ; assert triton_version() is not None, 'triton is missing!'" 73 | condition: eq(variables['dependency'], 'compiler') 74 | displayName: "Install `compiler` [nvFuser & Thunder]" 75 | 76 | - bash: | 77 | set -e 78 | pip list 79 | python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" 80 | python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$(TORCH_VERSION)', f'PyTorch: installed {ver} but expected $(TORCH_VERSION)'" 81 | displayName: "Env details" 82 | 83 | - bash: pytest -v 84 | displayName: "All tests" 85 | #condition: eq(variables['dependency'], 'compiler') 86 | timeoutInMinutes: "15" 87 | 88 | - bash: | 89 | wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh 90 | bash run_standalone_tests.sh "tests" 91 | displayName: "Standalone tests" 92 | env: 93 | PL_RUN_STANDALONE_TESTS: "1" 94 | # NUM_PARALLEL_TESTS: "10" 95 | timeoutInMinutes: "10" 96 | 97 | - bash: | 98 | pip uninstall -y lightning-thunder 99 | # install thunder from source, so that, thunder.tests will be available 100 | pip install -U "lightning-thunder[test] @ git+https://github.com/Lightning-AI/lightning-thunder.git" 101 | displayName: "Re-install Thunder [main branch]" 102 | condition: eq(variables['dependency'], 'compiler') 103 | 104 | - bash: | 105 | # without env var, it filters out all tests 106 | PL_RUN_CUDA_TESTS=0 pytest tests/ext_thunder/test_thunder_networks.py -v 107 | displayName: "Extra tests for Thunder [main branch]" 108 | condition: eq(variables['dependency'], 'compiler') 109 | env: 110 | TORCHDYNAMO_VERBOSE: "1" 111 | timeoutInMinutes: 
"10" 112 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/devcontainers/images/blob/main/src/python/.devcontainer/Dockerfile 2 | 3 | # [Choice] Python version (use -bookworm or -bullseye variants on local arm64/Apple Silicon): 3, 3.12, 3.11, 3.10, 3.9, 3.8, 3-bookworm, 3.12-bookworm, 3.11-bookworm, 3.10-bookworm, 3.9-bookworm, 3.8-bookworm, 3-bullseye, 3.12-bullseye, 3.11-bullseye, 3.10-bullseye, 3.9-bullseye, 3.8-bullseye, 3-buster, 3.12-buster, 3.11-buster, 3.10-buster, 3.9-buster, 3.8-buster 4 | ARG VARIANT=3-bookworm 5 | FROM mcr.microsoft.com/devcontainers/python:1-${VARIANT} 6 | 7 | # Temporary: Upgrade python packages due to https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-40897 8 | # They are installed by the base image (python) which does not have the patch. 9 | RUN python3 -m pip install --upgrade pip setuptools 10 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.194.0/containers/python-3 3 | { 4 | "name": "Python 3 (litgpt)", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "context": "..", 8 | "args": { 9 | "VARIANT": "3.11-bookworm" 10 | } 11 | }, 12 | "runArgs": [ 13 | // Enable GPU passthrough, requires WSL2 on Windows 14 | //"--gpus=all", 15 | // One of the following options is required for torch multiprocessing 16 | //"--ipc=host", 17 | //"--shm-size=4gb", 18 | ], 19 | // Features to add to the dev container. More info: https://containers.dev/features. 20 | "features": { 21 | "ghcr.io/devcontainers/features/git:1": {}, 22 | "ghcr.io/devcontainers/features/git-lfs:1": {}, 23 | //"ghcr.io/devcontainers/features/nvidia-cuda:1": {}, 24 | "ghcr.io/devcontainers-extra/features/actionlint:1": {}, 25 | "ghcr.io/devcontainers-extra/features/pre-commit:2": {}, 26 | "ghcr.io/dhoeric/features/act:1": {}, 27 | "ghcr.io/devcontainers/features/docker-in-docker:2": { 28 | "version": "latest", 29 | "moby": true 30 | } 31 | }, 32 | // Set *default* container specific settings.json values on container create. 
33 | "customizations": { 34 | "vscode": { 35 | "settings": { 36 | "editor.tabSize": 4, 37 | "editor.renderWhitespace": "all", 38 | "editor.formatOnSave": true, 39 | "editor.rulers": [120], 40 | "files.exclude": { 41 | "**/__pycache__": true 42 | }, 43 | "python.pythonPath": "/usr/local/bin/python", 44 | "python.defaultInterpreterPath": "/usr/local/bin/python", 45 | "python.languageServer": "Pylance", 46 | "python.analysis.autoImportCompletions": true, 47 | "python.analysis.completeFunctionParens": true, 48 | "python.analysis.autoSearchPaths": true, 49 | "python.testing.pytestArgs": ["tests"], 50 | "python.testing.unittestEnabled": false, 51 | "python.testing.pytestEnabled": true, 52 | "code-eol.highlightNonDefault": true, 53 | "code-eol.highlightExtraWhitespace": true, 54 | "autoDocstring.docstringFormat": "google-notypes", 55 | "autoDocstring.guessTypes": true, 56 | "autoDocstring.generateDocstringOnEnter": true, 57 | "autoDocstring.startOnNewLine": true, 58 | "telemetry.telemetryLevel": "off", 59 | "[python]": { 60 | "editor.formatOnSave": true, 61 | "editor.defaultFormatter": "charliermarsh.ruff", 62 | "editor.codeActionsOnSave": { 63 | "source.organizeImports": "always", 64 | "source.fixAll": "always" 65 | } 66 | } 67 | }, 68 | // Add the IDs of extensions you want installed when the container is created. 69 | "extensions": [ 70 | "ms-python.python", 71 | "ms-python.vscode-pylance", 72 | "ms-toolsai.jupyter", 73 | "GitHub.copilot", 74 | "GitHub.copilot-chat", 75 | "github.vscode-github-actions", 76 | "SanjulaGanepola.github-local-actions", 77 | "charliermarsh.ruff", 78 | "esbenp.prettier-vscode", 79 | "ms-vscode.test-adapter-converter", 80 | "njqdev.vscode-python-typehint", 81 | "KevinRose.vsc-python-indent", 82 | "medo64.render-crlf", 83 | "shardulm94.trailing-spaces", 84 | "nhoizey.gremlins", 85 | "wayou.vscode-todo-highlight", 86 | "Gruntfuggly.todo-tree", 87 | "njpwerner.autodocstring", 88 | "rodolphebarbanneau.python-docstring-highlighter", 89 | "mechatroner.rainbow-csv", 90 | "uctakeoff.vscode-counter", 91 | "bierner.github-markdown-preview", 92 | "yahyabatulu.vscode-markdown-alert", 93 | "ms-vscode-remote.vscode-remote-extensionpack", 94 | "ms-azuretools.vscode-docker", 95 | "redhat.vscode-yaml" 96 | ] 97 | } 98 | }, 99 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 100 | // "forwardPorts": [], 101 | // Use 'postCreateCommand' to run commands after the container is created. 102 | "postCreateCommand": "pre-commit install && pip install '.[extra,compiler,test]' -U", 103 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 104 | "remoteUser": "vscode" 105 | } 106 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @lantiga @t-vi @borda 2 | /README.md @williamfalcon @lantiga 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ask-a-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Ask a Question 3 | about: Ask and answer questions related to LitGPT 4 | title: '' 5 | labels: question 6 | 7 | --- 8 | 9 | Please describe your question here. 
10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Report errors related to LitGPT 3 | title: "Description" 4 | labels: bug 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thank you for taking the time to report an issue. Please fill out the details below to help us resolve it. 10 | 11 | - type: textarea 12 | id: bug_description 13 | attributes: 14 | label: Bug description 15 | description: A description of the issue. 16 | placeholder: | 17 | Please provide a description of what the bug or issue is. 18 | validations: 19 | required: true 20 | 21 | - type: dropdown 22 | id: operating_system 23 | attributes: 24 | label: What operating system are you using? 25 | description: If applicable, please select the operating system where you experienced this issue. 26 | options: 27 | - "Unknown" 28 | - "macOS" 29 | - "Linux" 30 | - "Windows" 31 | validations: 32 | required: true 33 | 34 | - type: textarea 35 | id: version 36 | attributes: 37 | label: LitGPT Version 38 | description: | 39 | Please provide details about your LitGPT version by running the following code in your terminal: 40 | ``` 41 | pip show litgpt | grep Version: 42 | ``` 43 | You can simply copy and paste the outputs below. 44 | value: | 45 | ``` 46 | 47 | 48 | 49 | ``` 50 | validations: 51 | required: false 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Suggest a Feature 3 | about: Propose a new feature or enhancement 4 | title: '' 5 | labels: enhancement 6 | 7 | --- 8 | 9 | Please describe the feature or enhancement along with the intended usecase. 
10 | -------------------------------------------------------------------------------- /.github/workflows/check-links.yml: -------------------------------------------------------------------------------- 1 | name: Check hyperlinks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.10" 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install "mistune<3.1" # a newer version is incompatible with nbconvert 27 | pip install pytest pytest-check-links 28 | 29 | - name: Check links 30 | run: | 31 | pytest --check-links README.md --check-links-ignore "http*" 32 | pytest --check-links tutorials --check-links-ignore "http*" 33 | -------------------------------------------------------------------------------- /.github/workflows/mkdocs-deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy MkDocs 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-24.04 13 | steps: 14 | # Step 1: Checkout the repository 15 | - uses: actions/checkout@v4 16 | 17 | # Step 2: Set up Python 18 | - uses: actions/setup-python@v5 19 | with: 20 | python-version: "3.x" 21 | cache: "pip" 22 | 23 | # Step 3: Install MkDocs and dependencies 24 | - run: pip install mkdocs mkdocs-material mkdocs-pagetree-plugin 25 | # Step 4: Deploy to GitHub Pages 26 | - run: | 27 | mkdir -p gh-pages/docs 28 | cp -r tutorials/* gh-pages/docs 29 | cd gh-pages 30 | mv docs/mkdocs.yml mkdocs.yml 31 | echo "{{ pagetree }}" > docs/index.md 32 | mkdocs gh-deploy --force 33 | -------------------------------------------------------------------------------- /.github/workflows/publish-pkg.yml: -------------------------------------------------------------------------------- 1 | # To create a release, create a tag and push it to GitHub: 2 | #git tag -a "v0.0.1-beta" -m "beta version testing" 3 | #git push --tags 4 | # https://dev.to/iamtekson/publish-package-to-pypi-and-release-new-version-using-github-actions-108k 5 | name: Publish LitGPT to PyPI 6 | 7 | on: 8 | push: 9 | tags: 10 | - "v*" 11 | jobs: 12 | build-n-publish: 13 | name: Build and publish to PyPI 14 | runs-on: ubuntu-latest 15 | environment: 16 | name: pypi 17 | url: https://pypi.org/p/litgpt 18 | permissions: 19 | id-token: write 20 | 21 | steps: 22 | - name: Checkout source 23 | uses: actions/checkout@v3 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: "3.x" 29 | cache: "pip" 30 | 31 | - name: Build source and wheel distributions 32 | run: | 33 | python -m pip install --upgrade build twine 34 | pip install importlib_metadata==7.2.1 35 | python -m build 36 | twine check --strict dist/* 37 | - name: Publish distribution to PyPI 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | with: 40 | user: __token__ 41 | password: ${{ secrets.PYPI_API_TOKEN }} 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | __pycache__ 3 | .idea 4 | .DS_Store 5 | *.egg-info 6 | build 7 | dist 8 | .venv 9 | .vscode 10 | 11 | # data 12 | data 13 | datasets 14 | !litgpt/data 15 | !tests/data 16 
| checkpoints 17 | out 18 | wandb 19 | events.out.tfevents* 20 | 21 | # test artifacts from tests/test_readme.py 22 | **/custom_finetuning_dataset.json 23 | client.py 24 | **/custom_texts/ 25 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright The Lightning team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | default_language_version: 16 | python: python3 17 | 18 | ci: 19 | autofix_prs: true 20 | autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions" 21 | autoupdate_schedule: quarterly 22 | # submodules: true 23 | 24 | repos: 25 | - repo: https://github.com/pre-commit/pre-commit-hooks 26 | rev: v5.0.0 27 | hooks: 28 | - id: end-of-file-fixer 29 | - id: trailing-whitespace 30 | exclude: README.md 31 | - id: check-yaml 32 | - id: check-toml 33 | #- id: check-docstring-first 34 | #- id: check-executables-have-shebangs 35 | - id: check-case-conflict 36 | - id: check-added-large-files 37 | args: ["--maxkb=250", "--enforce-all"] 38 | - id: detect-private-key 39 | 40 | - repo: https://github.com/codespell-project/codespell 41 | rev: v2.4.1 42 | hooks: 43 | - id: codespell 44 | additional_dependencies: [tomli] 45 | args: ["--write-changes"] 46 | exclude: pyproject.toml 47 | 48 | #- repo: https://github.com/crate-ci/typos 49 | # rev: dictgen-v0.3.1 50 | # hooks: 51 | # - id: typos 52 | # args: [] # empty to do not write fixes 53 | # exclude: pyproject.toml 54 | 55 | #- repo: https://github.com/executablebooks/mdformat 56 | # rev: 0.7.21 57 | # hooks: 58 | # - id: mdformat 59 | # args: ["--number"] 60 | # additional_dependencies: 61 | # - mdformat-gfm 62 | # - mdformat-black 63 | # - mdformat_frontmatter 64 | 65 | - repo: https://github.com/pre-commit/mirrors-prettier 66 | rev: v3.1.0 67 | hooks: 68 | - id: prettier 69 | files: \.(json|yml|yaml|toml) 70 | # https://prettier.io/docs/en/options.html#print-width 71 | args: ["--print-width=140"] 72 | 73 | - repo: https://github.com/astral-sh/ruff-pre-commit 74 | rev: v0.11.4 75 | hooks: 76 | - id: ruff 77 | args: ["--fix"] 78 | - id: ruff-format 79 | - id: ruff 80 | 81 | - repo: https://github.com/tox-dev/pyproject-fmt 82 | rev: v2.5.1 83 | hooks: 84 | - id: pyproject-fmt 85 | additional_dependencies: [tox] 86 | - repo: https://github.com/abravalheri/validate-pyproject 87 | rev: v0.24.1 88 | hooks: 89 | - id: validate-pyproject 90 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, you can cite it as shown below." 3 | title: "LitGPT" 4 | abstract: "20+ high-performance LLMs with recipes to pretrain, finetune and deploy at scale." 
5 | date-released: 2023-03-22 6 | authors: 7 | - name: "The Lightning AI team" 8 | license: "Apache-2.0" 9 | url: "https://github.com/Lightning-AI/litgpt" 10 | -------------------------------------------------------------------------------- /config_hub/finetune/falcon-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/tiiuae/falcon-7b 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/lora-falcon-7b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 11 | quantize: 12 | 13 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 14 | devices: 1 15 | 16 | # How many nodes to use. (type: int, default: 1) 17 | num_nodes: 1 18 | 19 | # The LoRA rank. (type: int, default: 8) 20 | lora_r: 32 21 | 22 | # The LoRA alpha. (type: int, default: 16) 23 | lora_alpha: 16 24 | 25 | # The LoRA dropout value. (type: float, default: 0.05) 26 | lora_dropout: 0.05 27 | 28 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 29 | lora_query: true 30 | 31 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 32 | lora_key: false 33 | 34 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 35 | lora_value: true 36 | 37 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 38 | lora_projection: false 39 | 40 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 41 | lora_mlp: false 42 | 43 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 44 | lora_head: false 45 | 46 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 47 | data: 48 | class_path: litgpt.data.Alpaca2k 49 | init_args: 50 | mask_prompt: false 51 | prompt_style: alpaca 52 | ignore_index: -100 53 | seed: 42 54 | num_workers: 4 55 | 56 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 57 | train: 58 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 59 | save_interval: 200 60 | 61 | # Number of iterations between logging calls (type: int, default: 1) 62 | log_interval: 1 63 | 64 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 65 | global_batch_size: 8 66 | 67 | # Number of samples per data-parallel rank (type: int, default: 4) 68 | micro_batch_size: 1 69 | 70 | # Number of iterations with learning rate warmup active (type: int, default: 100) 71 | lr_warmup_steps: 10 72 | 73 | # Number of epochs to train on (type: Optional[int], default: 5) 74 | epochs: 4 75 | 76 | # Total number of tokens to train on (type: Optional[int], default: null) 77 | max_tokens: 78 | 79 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 80 | max_steps: 81 | 82 | # Limits the length of samples. 
Off by default (type: Optional[int], default: null) 83 | max_seq_length: 512 84 | 85 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 86 | tie_embeddings: 87 | 88 | # (type: Optional[float], default: null) 89 | max_norm: 90 | 91 | # (type: float, default: 6e-05) 92 | min_lr: 6.0e-05 93 | 94 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 95 | eval: 96 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 97 | interval: 100 98 | 99 | # Number of tokens to generate (type: Optional[int], default: 100) 100 | max_new_tokens: 100 101 | 102 | # Number of iterations (type: int, default: 100) 103 | max_iters: 100 104 | 105 | # Whether to evaluate on the validation set at the beginning of the training 106 | initial_validation: false 107 | 108 | # Whether to evaluate on the validation set at the end the training 109 | final_validation: true 110 | 111 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 112 | logger_name: csv 113 | 114 | # The random seed to use for reproducibility. (type: int, default: 1337) 115 | seed: 1337 116 | 117 | # Optimizer-related arguments 118 | optimizer: 119 | class_path: torch.optim.AdamW 120 | 121 | init_args: 122 | # (type: float, default: 0.001) 123 | lr: 0.0002 124 | 125 | # (type: float, default: 0.01) 126 | weight_decay: 0.0 127 | 128 | # (type: tuple, default: (0.9,0.999)) 129 | betas: 130 | - 0.9 131 | - 0.95 132 | -------------------------------------------------------------------------------- /config_hub/finetune/gemma-2b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/google/gemma-2b 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/full-gemma-2b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 11 | devices: 4 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 17 | data: 18 | class_path: litgpt.data.Alpaca2k 19 | init_args: 20 | mask_prompt: false 21 | val_split_fraction: 0.03847 22 | prompt_style: alpaca 23 | ignore_index: -100 24 | seed: 42 25 | num_workers: 4 26 | 27 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 28 | train: 29 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 30 | save_interval: 800 31 | 32 | # Number of iterations between logging calls (type: int, default: 1) 33 | log_interval: 1 34 | 35 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 36 | global_batch_size: 16 37 | 38 | # Number of samples per data-parallel rank (type: int, default: 4) 39 | micro_batch_size: 1 40 | 41 | # Number of iterations with learning rate warmup active (type: int, default: 100) 42 | lr_warmup_steps: 100 43 | 44 | # Number of epochs to train on (type: Optional[int], default: 5) 45 | epochs: 1 46 | 47 | # Total number of tokens to train on (type: Optional[int], default: null) 48 | max_tokens: 49 | 50 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 51 | max_steps: 50 52 | 53 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 54 | max_seq_length: 512 55 | 56 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 57 | tie_embeddings: 58 | 59 | # (type: Optional[float], default: null) 60 | max_norm: 61 | 62 | # (type: float, default: 6e-05) 63 | min_lr: 6.0e-05 64 | 65 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 66 | eval: 67 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 68 | interval: 25 69 | 70 | # Number of tokens to generate (type: Optional[int], default: 100) 71 | max_new_tokens: 100 72 | 73 | # Number of iterations (type: int, default: 100) 74 | max_iters: 100 75 | 76 | # Whether to evaluate on the validation set at the beginning of the training 77 | initial_validation: false 78 | 79 | # Whether to evaluate on the validation set at the end the training 80 | final_validation: true 81 | 82 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 83 | logger_name: csv 84 | 85 | # The random seed to use for reproducibility. (type: int, default: 1337) 86 | seed: 1337 87 | 88 | # Optimizer-related arguments 89 | optimizer: 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | # (type: float, default: 0.001) 94 | lr: 0.0002 95 | 96 | # (type: float, default: 0.01) 97 | weight_decay: 0.0 98 | 99 | # (type: tuple, default: (0.9,0.999)) 100 | betas: 101 | - 0.9 102 | - 0.95 103 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-2-7b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama2-7b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 4 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. 
An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. (type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.0 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-2-7b/lora.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf 3 | 4 | # Directory in which to save checkpoints and logs. 
(type: , default: out/lora) 5 | out_dir: out/finetune/lora-llama2-7b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 11 | quantize: 12 | 13 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 14 | devices: 1 15 | 16 | # How many nodes to use. (type: int, default: 1) 17 | num_nodes: 1 18 | 19 | # The LoRA rank. (type: int, default: 8) 20 | lora_r: 32 21 | 22 | # The LoRA alpha. (type: int, default: 16) 23 | lora_alpha: 16 24 | 25 | # The LoRA dropout value. (type: float, default: 0.05) 26 | lora_dropout: 0.05 27 | 28 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 29 | lora_query: true 30 | 31 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 32 | lora_key: false 33 | 34 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 35 | lora_value: true 36 | 37 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 38 | lora_projection: false 39 | 40 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 41 | lora_mlp: false 42 | 43 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 44 | lora_head: false 45 | 46 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 47 | data: 48 | class_path: litgpt.data.Alpaca2k 49 | init_args: 50 | mask_prompt: false 51 | prompt_style: alpaca 52 | ignore_index: -100 53 | seed: 42 54 | num_workers: 4 55 | 56 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 57 | train: 58 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 59 | save_interval: 200 60 | 61 | # Number of iterations between logging calls (type: int, default: 1) 62 | log_interval: 1 63 | 64 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 65 | global_batch_size: 8 66 | 67 | # Number of samples per data-parallel rank (type: int, default: 4) 68 | micro_batch_size: 2 69 | 70 | # Number of iterations with learning rate warmup active (type: int, default: 100) 71 | lr_warmup_steps: 10 72 | 73 | # Number of epochs to train on (type: Optional[int], default: 5) 74 | epochs: 4 75 | 76 | # Total number of tokens to train on (type: Optional[int], default: null) 77 | max_tokens: 78 | 79 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 80 | max_steps: 81 | 82 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 83 | max_seq_length: 512 84 | 85 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 86 | tie_embeddings: 87 | 88 | # (type: Optional[float], default: null) 89 | max_norm: 90 | 91 | # (type: float, default: 6e-05) 92 | min_lr: 6.0e-05 93 | 94 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 95 | eval: 96 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 97 | interval: 100 98 | 99 | # Number of tokens to generate (type: Optional[int], default: 100) 100 | max_new_tokens: 100 101 | 102 | # Number of iterations (type: int, default: 100) 103 | max_iters: 100 104 | 105 | # Whether to evaluate on the validation set at the beginning of the training 106 | initial_validation: false 107 | 108 | # Whether to evaluate on the validation set at the end the training 109 | final_validation: true 110 | 111 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 112 | logger_name: csv 113 | 114 | # The random seed to use for reproducibility. (type: int, default: 1337) 115 | seed: 1337 116 | 117 | # Optimizer-related arguments 118 | optimizer: 119 | class_path: torch.optim.AdamW 120 | 121 | init_args: 122 | # (type: float, default: 0.001) 123 | lr: 0.0002 124 | 125 | # (type: float, default: 0.01) 126 | weight_decay: 0.0 127 | 128 | # (type: tuple, default: (0.9,0.999)) 129 | betas: 130 | - 0.9 131 | - 0.95 132 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3-8b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3-8B 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama-3-8b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 4 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. (type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3.1-8b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Meta-Llama-3.1-8B 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama-3.1-8b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 4 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. 
An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. (type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3.2-1B/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Llama-3.2-1B 3 | 4 | # Directory in which to save checkpoints and logs. 
(type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama-3.2-1B 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 1 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | # resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. 
(type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/llama-3.2-3B/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/meta-llama/Llama-3.2-3B 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-llama-3.2-3B 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 1 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 17 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 18 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 19 | # (type: Union[bool, Literal["auto"], Path], default: False) 20 | # resume: false 21 | 22 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 23 | data: 24 | class_path: litgpt.data.Alpaca2k 25 | init_args: 26 | mask_prompt: false 27 | prompt_style: alpaca 28 | ignore_index: -100 29 | seed: 42 30 | num_workers: 4 31 | 32 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 33 | train: 34 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 35 | save_interval: 200 36 | 37 | # Number of iterations between logging calls (type: int, default: 1) 38 | log_interval: 1 39 | 40 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 41 | global_batch_size: 64 42 | 43 | # Number of samples per data-parallel rank (type: int, default: 1) 44 | micro_batch_size: 4 45 | 46 | # Number of iterations with learning rate warmup active (type: int, default: 100) 47 | lr_warmup_steps: 25 48 | 49 | # Number of epochs to train on (type: Optional[int], default: 5) 50 | epochs: 1 51 | 52 | # Total number of tokens to train on (type: Optional[int], default: null) 53 | max_tokens: 54 | 55 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 56 | max_steps: 57 | 58 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 59 | max_seq_length: 512 60 | 61 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 62 | tie_embeddings: 63 | 64 | # (type: Optional[float], default: null) 65 | max_norm: 66 | 67 | # (type: float, default: 6e-05) 68 | min_lr: 6.0e-05 69 | 70 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 71 | eval: 72 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 73 | interval: 25 74 | 75 | # Number of tokens to generate (type: Optional[int], default: 100) 76 | max_new_tokens: 100 77 | 78 | # Number of iterations (type: int, default: 100) 79 | max_iters: 100 80 | 81 | # Whether to evaluate on the validation set at the beginning of the training 82 | initial_validation: false 83 | 84 | # Whether to evaluate on the validation set at the end the training 85 | final_validation: true 86 | 87 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 88 | logger_name: csv 89 | 90 | # The random seed to use for reproducibility. (type: int, default: 1337) 91 | seed: 1337 92 | 93 | # Optimizer-related arguments 94 | optimizer: 95 | class_path: torch.optim.AdamW 96 | 97 | init_args: 98 | # (type: float, default: 0.001) 99 | lr: 0.0002 100 | 101 | # (type: float, default: 0.01) 102 | weight_decay: 0.1 103 | 104 | # (type: tuple, default: (0.9,0.999)) 105 | betas: 106 | - 0.9 107 | - 0.95 108 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-2/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/microsoft/phi-2 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-phi-2 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 2 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 17 | data: 18 | class_path: litgpt.data.Alpaca2k 19 | init_args: 20 | mask_prompt: false 21 | prompt_style: alpaca 22 | ignore_index: -100 23 | seed: 42 24 | num_workers: 4 25 | 26 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 27 | train: 28 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 29 | save_interval: 200 30 | 31 | # Number of iterations between logging calls (type: int, default: 1) 32 | log_interval: 1 33 | 34 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 35 | global_batch_size: 8 36 | 37 | # Number of samples per data-parallel rank (type: int, default: 1) 38 | micro_batch_size: 4 39 | 40 | # Number of iterations with learning rate warmup active (type: int, default: 100) 41 | lr_warmup_steps: 200 42 | 43 | # Number of epochs to train on (type: Optional[int], default: 5) 44 | epochs: 1 45 | 46 | # Total number of tokens to train on (type: Optional[int], default: null) 47 | max_tokens: 48 | 49 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 50 | max_steps: 100 51 | 52 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 53 | max_seq_length: 512 54 | 55 | # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) 56 | tie_embeddings: 57 | 58 | # (type: Optional[float], default: null) 59 | max_norm: 60 | 61 | # (type: float, default: 6e-05) 62 | min_lr: 6.0e-05 63 | 64 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 65 | eval: 66 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 67 | interval: 25 68 | 69 | # Number of tokens to generate (type: Optional[int], default: 100) 70 | max_new_tokens: 100 71 | 72 | # Number of iterations (type: int, default: 100) 73 | max_iters: 100 74 | 75 | # Whether to evaluate on the validation set at the beginning of the training 76 | initial_validation: false 77 | 78 | # Whether to evaluate on the validation set at the end the training 79 | final_validation: true 80 | 81 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 82 | logger_name: csv 83 | 84 | # The random seed to use for reproducibility. (type: int, default: 1337) 85 | seed: 1337 86 | 87 | # Optimizer-related arguments 88 | optimizer: 89 | class_path: torch.optim.AdamW 90 | 91 | init_args: 92 | # (type: float, default: 0.001) 93 | lr: 0.0002 94 | 95 | # (type: float, default: 0.01) 96 | weight_decay: 0.1 97 | 98 | # (type: tuple, default: (0.9,0.999)) 99 | betas: 100 | - 0.9 101 | - 0.95 102 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/finetune/full) 5 | out_dir: out/finetune/full-phi-3 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use (type: Union[int, str], default: 1) 11 | devices: 1 12 | 13 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 14 | data: 15 | class_path: litgpt.data.Alpaca2k 16 | init_args: 17 | mask_prompt: false 18 | prompt_style: alpaca 19 | ignore_index: -100 20 | seed: 42 21 | num_workers: 4 22 | 23 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 24 | train: 25 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 26 | save_interval: 200 27 | 28 | # Number of iterations between logging calls (type: int, default: 1) 29 | log_interval: 1 30 | 31 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64) 32 | global_batch_size: 8 33 | 34 | # Number of samples per data-parallel rank (type: int, default: 1) 35 | micro_batch_size: 4 36 | 37 | # Number of iterations with learning rate warmup active (type: int, default: 100) 38 | lr_warmup_steps: 200 39 | 40 | # Number of epochs to train on (type: Optional[int], default: 5) 41 | epochs: 1 42 | 43 | # Total number of tokens to train on (type: Optional[int], default: null) 44 | max_tokens: 45 | 46 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 47 | max_steps: 48 | 49 | # Limits the length of samples. 
Off by default (type: Optional[int], default: null) 50 | max_seq_length: 512 51 | 52 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 53 | tie_embeddings: 54 | 55 | # (type: Optional[float], default: null) 56 | max_norm: 57 | 58 | # (type: float, default: 6e-05) 59 | min_lr: 6.0e-05 60 | 61 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 62 | eval: 63 | # Number of optimizer steps between evaluation calls (type: int, default: 600) 64 | interval: 25 65 | 66 | # Number of tokens to generate (type: Optional[int], default: 100) 67 | max_new_tokens: 100 68 | 69 | # Number of iterations (type: int, default: 100) 70 | max_iters: 100 71 | 72 | # Whether to evaluate on the validation set at the beginning of the training 73 | initial_validation: false 74 | 75 | # Whether to evaluate on the validation set at the end the training 76 | final_validation: true 77 | 78 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 79 | logger_name: csv 80 | 81 | # The random seed to use for reproducibility. (type: int, default: 1337) 82 | seed: 1337 83 | 84 | # Optimizer-related arguments 85 | optimizer: 86 | class_path: torch.optim.AdamW 87 | 88 | init_args: 89 | # (type: float, default: 0.001) 90 | lr: 0.0002 91 | 92 | # (type: float, default: 0.01) 93 | weight_decay: 0.1 94 | 95 | # (type: tuple, default: (0.9,0.999)) 96 | betas: 97 | - 0.9 98 | - 0.95 99 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/lora.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/lora-phi-3 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 11 | quantize: 12 | 13 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 14 | devices: 1 15 | 16 | # The LoRA rank. (type: int, default: 8) 17 | lora_r: 8 18 | 19 | # The LoRA alpha. (type: int, default: 16) 20 | lora_alpha: 16 21 | 22 | # The LoRA dropout value. (type: float, default: 0.05) 23 | lora_dropout: 0.05 24 | 25 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 26 | lora_query: true 27 | 28 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 29 | lora_key: true 30 | 31 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 32 | lora_value: true 33 | 34 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 35 | lora_projection: true 36 | 37 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 38 | lora_mlp: true 39 | 40 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 41 | lora_head: true 42 | 43 | # Data-related arguments. 
If not provided, the default is ``litgpt.data.Alpaca``. 44 | data: 45 | class_path: litgpt.data.Alpaca2k 46 | init_args: 47 | mask_prompt: false 48 | val_split_fraction: 0.03847 49 | prompt_style: alpaca 50 | ignore_index: -100 51 | seed: 42 52 | num_workers: 4 53 | 54 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 55 | train: 56 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 57 | save_interval: 800 58 | 59 | # Number of iterations between logging calls (type: int, default: 1) 60 | log_interval: 1 61 | 62 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 63 | global_batch_size: 8 64 | 65 | # Number of samples per data-parallel rank (type: int, default: 4) 66 | micro_batch_size: 4 67 | 68 | # Number of iterations with learning rate warmup active (type: int, default: 100) 69 | lr_warmup_steps: 10 70 | 71 | # Number of epochs to train on (type: Optional[int], default: 5) 72 | epochs: 1 73 | 74 | # Total number of tokens to train on (type: Optional[int], default: null) 75 | max_tokens: 76 | 77 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 78 | max_steps: 79 | 80 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 81 | max_seq_length: 512 82 | 83 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 84 | tie_embeddings: 85 | 86 | # (type: Optional[float], default: null) 87 | max_norm: 88 | 89 | # (type: float, default: 6e-05) 90 | min_lr: 6.0e-05 91 | 92 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 93 | eval: 94 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 95 | interval: 100 96 | 97 | # Number of tokens to generate (type: Optional[int], default: 100) 98 | max_new_tokens: 100 99 | 100 | # Number of iterations (type: int, default: 100) 101 | max_iters: 100 102 | 103 | # Whether to evaluate on the validation set at the beginning of the training 104 | initial_validation: false 105 | 106 | # Whether to evaluate on the validation set at the end the training 107 | final_validation: true 108 | 109 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 110 | logger_name: csv 111 | 112 | # The random seed to use for reproducibility. (type: int, default: 1337) 113 | seed: 1337 114 | 115 | # Optimizer-related arguments 116 | optimizer: 117 | class_path: torch.optim.AdamW 118 | 119 | init_args: 120 | # (type: float, default: 0.001) 121 | lr: 0.0002 122 | 123 | # (type: float, default: 0.01) 124 | weight_decay: 0.0 125 | 126 | # (type: tuple, default: (0.9,0.999)) 127 | betas: 128 | - 0.9 129 | - 0.95 130 | -------------------------------------------------------------------------------- /config_hub/finetune/phi-3/qlora.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/qlora-phi-3 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". 
(type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null) 11 | quantize: bnb.nf4 12 | 13 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 14 | devices: 1 15 | 16 | # The LoRA rank. (type: int, default: 8) 17 | lora_r: 8 18 | 19 | # The LoRA alpha. (type: int, default: 16) 20 | lora_alpha: 16 21 | 22 | # The LoRA dropout value. (type: float, default: 0.05) 23 | lora_dropout: 0.05 24 | 25 | # Whether to apply LoRA to the query weights in attention. (type: bool, default: True) 26 | lora_query: true 27 | 28 | # Whether to apply LoRA to the key weights in attention. (type: bool, default: False) 29 | lora_key: true 30 | 31 | # Whether to apply LoRA to the value weights in attention. (type: bool, default: True) 32 | lora_value: true 33 | 34 | # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False) 35 | lora_projection: true 36 | 37 | # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False) 38 | lora_mlp: true 39 | 40 | # Whether to apply LoRA to output head in GPT. (type: bool, default: False) 41 | lora_head: true 42 | 43 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 44 | data: 45 | class_path: litgpt.data.Alpaca2k 46 | init_args: 47 | mask_prompt: false 48 | val_split_fraction: 0.03847 49 | prompt_style: alpaca 50 | ignore_index: -100 51 | seed: 42 52 | num_workers: 4 53 | 54 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 55 | train: 56 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 57 | save_interval: 800 58 | 59 | # Number of iterations between logging calls (type: int, default: 1) 60 | log_interval: 1 61 | 62 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 63 | global_batch_size: 8 64 | 65 | # Number of samples per data-parallel rank (type: int, default: 4) 66 | micro_batch_size: 4 67 | 68 | # Number of iterations with learning rate warmup active (type: int, default: 100) 69 | lr_warmup_steps: 10 70 | 71 | # Number of epochs to train on (type: Optional[int], default: 5) 72 | epochs: 1 73 | 74 | # Total number of tokens to train on (type: Optional[int], default: null) 75 | max_tokens: 76 | 77 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 78 | max_steps: 79 | 80 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 81 | max_seq_length: 512 82 | 83 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 84 | tie_embeddings: 85 | 86 | # (type: Optional[float], default: null) 87 | max_norm: 88 | 89 | # (type: float, default: 6e-05) 90 | min_lr: 6.0e-05 91 | 92 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 93 | eval: 94 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 95 | interval: 100 96 | 97 | # Number of tokens to generate (type: Optional[int], default: 100) 98 | max_new_tokens: 100 99 | 100 | # Number of iterations (type: int, default: 100) 101 | max_iters: 100 102 | 103 | # Whether to evaluate on the validation set at the beginning of the training 104 | initial_validation: false 105 | 106 | # Whether to evaluate on the validation set at the end the training 107 | final_validation: true 108 | 109 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 110 | logger_name: csv 111 | 112 | # The random seed to use for reproducibility. (type: int, default: 1337) 113 | seed: 1337 114 | 115 | # Optimizer-related arguments 116 | optimizer: 117 | class_path: torch.optim.AdamW 118 | 119 | init_args: 120 | # (type: float, default: 0.001) 121 | lr: 0.0002 122 | 123 | # (type: float, default: 0.01) 124 | weight_decay: 0.0 125 | 126 | # (type: tuple, default: (0.9,0.999)) 127 | betas: 128 | - 0.9 129 | - 0.95 130 | -------------------------------------------------------------------------------- /config_hub/finetune/stablelm-base-alpha-3b/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/stabilityai/stablelm-base-alpha-3b 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/full-stablelm-base-alpha-3b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 11 | devices: 2 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 17 | data: 18 | class_path: litgpt.data.Alpaca2k 19 | init_args: 20 | mask_prompt: false 21 | val_split_fraction: 0.03847 22 | prompt_style: alpaca 23 | ignore_index: -100 24 | seed: 42 25 | num_workers: 4 26 | 27 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 28 | train: 29 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 30 | save_interval: 800 31 | 32 | # Number of iterations between logging calls (type: int, default: 1) 33 | log_interval: 1 34 | 35 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 36 | global_batch_size: 8 37 | 38 | # Number of samples per data-parallel rank (type: int, default: 4) 39 | micro_batch_size: 1 40 | 41 | # Number of iterations with learning rate warmup active (type: int, default: 100) 42 | lr_warmup_steps: 1000 43 | 44 | # Number of epochs to train on (type: Optional[int], default: 5) 45 | epochs: 1 46 | 47 | # Total number of tokens to train on (type: Optional[int], default: null) 48 | max_tokens: 49 | 50 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 51 | max_steps: 52 | 53 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 54 | max_seq_length: 512 55 | 56 | # Whether to tie the embedding weights with the language modeling head weights. 
(type: Optional[bool], default: null) 57 | tie_embeddings: 58 | 59 | # (type: Optional[float], default: null) 60 | max_norm: 61 | 62 | # (type: float, default: 6e-05) 63 | min_lr: 6.0e-05 64 | 65 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 66 | eval: 67 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 68 | interval: 25 69 | 70 | # Number of tokens to generate (type: Optional[int], default: 100) 71 | max_new_tokens: 100 72 | 73 | # Number of iterations (type: int, default: 100) 74 | max_iters: 100 75 | 76 | # Whether to evaluate on the validation set at the beginning of the training 77 | initial_validation: false 78 | 79 | # Whether to evaluate on the validation set at the end the training 80 | final_validation: true 81 | 82 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 83 | logger_name: csv 84 | 85 | # The random seed to use for reproducibility. (type: int, default: 1337) 86 | seed: 1337 87 | 88 | # Optimizer-related arguments 89 | optimizer: 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | # (type: float, default: 0.001) 94 | lr: 0.0002 95 | 96 | # (type: float, default: 0.01) 97 | weight_decay: 0.1 98 | 99 | # (type: tuple, default: (0.9,0.999)) 100 | betas: 101 | - 0.9 102 | - 0.95 103 | -------------------------------------------------------------------------------- /config_hub/finetune/tiny-llama/full.yaml: -------------------------------------------------------------------------------- 1 | # The path to the base model's checkpoint directory to load for finetuning. (type: , default: checkpoints/stabilityai/stablelm-base-alpha-3b) 2 | checkpoint_dir: checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T 3 | 4 | # Directory in which to save checkpoints and logs. (type: , default: out/lora) 5 | out_dir: out/finetune/full-tiny-llama-1.1b 6 | 7 | # The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 8 | precision: bf16-true 9 | 10 | # How many devices/GPUs to use. (type: Union[int, str], default: 1) 11 | devices: 1 12 | 13 | # How many nodes to use. (type: int, default: 1) 14 | num_nodes: 1 15 | 16 | # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``. 17 | data: 18 | class_path: litgpt.data.Alpaca2k 19 | init_args: 20 | mask_prompt: false 21 | val_split_fraction: 0.03847 22 | prompt_style: alpaca 23 | ignore_index: -100 24 | seed: 42 25 | num_workers: 4 26 | 27 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 28 | train: 29 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 30 | save_interval: 800 31 | 32 | # Number of iterations between logging calls (type: int, default: 1) 33 | log_interval: 1 34 | 35 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128) 36 | global_batch_size: 32 37 | 38 | # Number of samples per data-parallel rank (type: int, default: 4) 39 | micro_batch_size: 4 40 | 41 | # Number of iterations with learning rate warmup active (type: int, default: 100) 42 | lr_warmup_steps: 1000 43 | 44 | # Number of epochs to train on (type: Optional[int], default: 5) 45 | epochs: 1 46 | 47 | # Total number of tokens to train on (type: Optional[int], default: null) 48 | max_tokens: 49 | 50 | # Limits the number of optimizer steps to run. 
(type: Optional[int], default: null) 51 | max_steps: 52 | 53 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 54 | max_seq_length: 512 55 | 56 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null) 57 | tie_embeddings: 58 | 59 | # (type: Optional[float], default: null) 60 | max_norm: 61 | 62 | # (type: float, default: 6e-05) 63 | min_lr: 6.0e-05 64 | 65 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 66 | eval: 67 | # Number of optimizer steps between evaluation calls (type: int, default: 100) 68 | interval: 25 69 | 70 | # Number of tokens to generate (type: Optional[int], default: 100) 71 | max_new_tokens: 100 72 | 73 | # Number of iterations (type: int, default: 100) 74 | max_iters: 100 75 | 76 | # Whether to evaluate on the validation set at the beginning of the training 77 | initial_validation: false 78 | 79 | # Whether to evaluate on the validation set at the end the training 80 | final_validation: true 81 | 82 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv) 83 | logger_name: csv 84 | 85 | # The random seed to use for reproducibility. (type: int, default: 1337) 86 | seed: 1337 87 | 88 | # Optimizer-related arguments 89 | optimizer: 90 | class_path: torch.optim.AdamW 91 | 92 | init_args: 93 | # (type: float, default: 0.001) 94 | lr: 0.0002 95 | 96 | # (type: float, default: 0.01) 97 | weight_decay: 0.0 98 | 99 | # (type: tuple, default: (0.9,0.999)) 100 | betas: 101 | - 0.9 102 | - 0.95 103 | -------------------------------------------------------------------------------- /config_hub/pretrain/debug.yaml: -------------------------------------------------------------------------------- 1 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 2 | # ``model_config``. (type: Optional[str], default: null) 3 | model_name: pythia-14m 4 | 5 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 6 | # ``model_config``. (type: Optional[Config], default: null) 7 | model_config: 8 | 9 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 10 | # /teamspace/jobs//share. (type: , default: out/pretrain) 11 | out_dir: out/pretrain/debug 12 | 13 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 14 | precision: bf16-mixed 15 | 16 | # Optional path to a checkpoint directory to initialize the model from. 17 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 18 | initial_checkpoint_dir: 19 | 20 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 21 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 22 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 23 | # (type: Union[bool, Literal["auto"], Path], default: False) 24 | resume: false 25 | 26 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 27 | data: TinyStories 28 | 29 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 30 | train: 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 1000 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512) 38 | global_batch_size: 125 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 5 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 44 | lr_warmup_steps: 100 45 | 46 | # Number of epochs to train on (type: Optional[int], default: null) 47 | epochs: 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 50 | max_tokens: 100000000 51 | 52 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 53 | max_steps: 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: 1.0) 62 | max_norm: 1.0 63 | 64 | # (type: float, default: 4e-05) 65 | min_lr: 6e-5 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 70 | interval: 1000 71 | 72 | # Number of tokens to generate (type: Optional[int], default: null) 73 | max_new_tokens: 74 | 75 | # Number of iterations (type: int, default: 100) 76 | max_iters: 100 77 | 78 | # Whether to evaluate on the validation set at the beginning of the training 79 | initial_validation: false 80 | 81 | # Whether to evaluate on the validation set at the end the training 82 | final_validation: false 83 | 84 | # Optimizer-related arguments 85 | optimizer: 86 | class_path: torch.optim.AdamW 87 | 88 | init_args: 89 | # (type: float, default: 0.001) 90 | lr: 6e-4 91 | 92 | # (type: float, default: 0.01) 93 | weight_decay: 0.1 94 | 95 | # (type: tuple, default: (0.9,0.999)) 96 | betas: 97 | - 0.9 98 | - 0.95 99 | 100 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 101 | devices: auto 102 | 103 | # How many nodes to use. (type: int, default: 1) 104 | num_nodes: 1 105 | 106 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 107 | # module require this. (type: Optional[Path], default: null) 108 | tokenizer_dir: checkpoints/EleutherAI/pythia-14m 109 | 110 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 111 | logger_name: tensorboard 112 | 113 | # The random seed to use for reproducibility. (type: int, default: 42) 114 | seed: 42 115 | -------------------------------------------------------------------------------- /config_hub/pretrain/microllama.yaml: -------------------------------------------------------------------------------- 1 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 2 | # ``model_config``. (type: Optional[str], default: null) 3 | model_name: micro-llama-300M 4 | 5 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 6 | # ``model_config``. 
(type: Optional[Config], default: null) 7 | model_config: 8 | 9 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 10 | # /teamspace/jobs//share. (type: , default: out/pretrain) 11 | out_dir: out/pretrain/micro-llama 12 | 13 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 14 | precision: bf16-mixed 15 | 16 | # Optional path to a checkpoint directory to initialize the model from. 17 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 18 | initial_checkpoint_dir: 19 | 20 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 21 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 22 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 23 | # (type: Union[bool, Literal["auto"], Path], default: False) 24 | resume: false 25 | 26 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 27 | data: MicroLlama 28 | 29 | # Training-related arguments. See ``litgpt.args.TrainArgs`` for details 30 | train: 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 1000 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 48) 38 | # Scale this number according to the number of GPU and memory size per GPU 39 | # For example, we used 48 for 4 x 24G 4090 40 | global_batch_size: 48 41 | 42 | # Number of samples per data-parallel rank (type: int, default: 12) 43 | # Scale this number according to the memory size per GPU 44 | # For example, we used 12 for 24G 4090 45 | micro_batch_size: 12 46 | 47 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 48 | lr_warmup_steps: 2000 49 | 50 | # Number of epochs to train on (type: Optional[int], default: null) 51 | epochs: 52 | 53 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 54 | max_tokens: 3000000000000 55 | 56 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 57 | max_steps: 58 | 59 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 60 | max_seq_length: 2048 61 | 62 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 63 | tie_embeddings: 64 | 65 | # (type: Optional[float], default: 1.0) 66 | max_norm: 1.0 67 | 68 | # (type: float, default: 4e-05) 69 | min_lr: 4.0e-05 70 | 71 | # Evaluation-related arguments. 
See ``litgpt.args.EvalArgs`` for details 72 | eval: 73 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 74 | interval: 1000 75 | 76 | # Number of tokens to generate (type: Optional[int], default: null) 77 | max_new_tokens: 78 | 79 | # Number of iterations (type: int, default: 100) 80 | max_iters: 100 81 | 82 | # Whether to evaluate on the validation set at the beginning of the training 83 | initial_validation: false 84 | 85 | # Optimizer-related arguments 86 | optimizer: 87 | class_path: torch.optim.AdamW 88 | 89 | init_args: 90 | # (type: float, default: 0.001) 91 | lr: 4e-4 92 | 93 | # (type: float, default: 0.01) 94 | weight_decay: 0.1 95 | 96 | # (type: tuple, default: (0.9,0.999)) 97 | betas: 98 | - 0.9 99 | - 0.95 100 | 101 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 102 | devices: auto 103 | 104 | # How many nodes to use. (type: int, default: 1) 105 | num_nodes: 1 106 | 107 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 108 | # module require this. (type: Optional[Path], default: null) 109 | tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf 110 | 111 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 112 | logger_name: tensorboard 113 | 114 | # The random seed to use for reproducibility. (type: int, default: 42) 115 | seed: 42 116 | -------------------------------------------------------------------------------- /config_hub/pretrain/tinyllama.yaml: -------------------------------------------------------------------------------- 1 | # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with 2 | # ``model_config``. (type: Optional[str], default: null) 3 | model_name: tiny-llama-1.1b 4 | 5 | # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with 6 | # ``model_config``. (type: Optional[Config], default: null) 7 | model_config: 8 | 9 | # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in 10 | # /teamspace/jobs//share. (type: , default: out/pretrain) 11 | out_dir: out/pretrain/tiny-llama 12 | 13 | # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null) 14 | precision: bf16-mixed 15 | 16 | # Optional path to a checkpoint directory to initialize the model from. 17 | # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null) 18 | initial_checkpoint_dir: 19 | 20 | # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume 21 | # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing 22 | # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists. 23 | # (type: Union[bool, Literal["auto"], Path], default: False) 24 | resume: false 25 | 26 | # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``. 27 | data: TinyLlama 28 | 29 | # Training-related arguments. 
See ``litgpt.args.TrainArgs`` for details 30 | train: 31 | # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000) 32 | save_interval: 1000 33 | 34 | # Number of iterations between logging calls (type: int, default: 1) 35 | log_interval: 1 36 | 37 | # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512) 38 | global_batch_size: 512 39 | 40 | # Number of samples per data-parallel rank (type: int, default: 4) 41 | micro_batch_size: 4 42 | 43 | # Number of iterations with learning rate warmup active (type: int, default: 2000) 44 | lr_warmup_steps: 2000 45 | 46 | # Number of epochs to train on (type: Optional[int], default: null) 47 | epochs: 48 | 49 | # Total number of tokens to train on (type: Optional[int], default: 3000000000000) 50 | max_tokens: 3000000000000 51 | 52 | # Limits the number of optimizer steps to run. (type: Optional[int], default: null) 53 | max_steps: 54 | 55 | # Limits the length of samples. Off by default (type: Optional[int], default: null) 56 | max_seq_length: 2048 57 | 58 | # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False) 59 | tie_embeddings: 60 | 61 | # (type: Optional[float], default: 1.0) 62 | max_norm: 1.0 63 | 64 | # (type: float, default: 4e-05) 65 | min_lr: 4.0e-05 66 | 67 | # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details 68 | eval: 69 | # Number of optimizer steps between evaluation calls (type: int, default: 1000) 70 | interval: 1000 71 | 72 | # Number of tokens to generate (type: Optional[int], default: null) 73 | max_new_tokens: 74 | 75 | # Number of iterations (type: int, default: 100) 76 | max_iters: 100 77 | 78 | # Whether to evaluate on the validation set at the beginning of the training 79 | initial_validation: false 80 | 81 | # Whether to evaluate on the validation set at the end the training 82 | final_validation: false 83 | 84 | # Optimizer-related arguments 85 | optimizer: 86 | class_path: torch.optim.AdamW 87 | 88 | init_args: 89 | # (type: float, default: 0.001) 90 | lr: 4e-4 91 | 92 | # (type: float, default: 0.01) 93 | weight_decay: 0.1 94 | 95 | # (type: tuple, default: (0.9,0.999)) 96 | betas: 97 | - 0.9 98 | - 0.95 99 | 100 | # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto) 101 | devices: auto 102 | 103 | # How many nodes to use. (type: int, default: 1) 104 | num_nodes: 1 105 | 106 | # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data 107 | # module require this. (type: Optional[Path], default: null) 108 | tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf 109 | 110 | # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard) 111 | logger_name: tensorboard 112 | 113 | # The random seed to use for reproducibility. 
(type: int, default: 42) 114 | seed: 42 115 | -------------------------------------------------------------------------------- /extensions/thunder/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # support running without installing as a package, adding extensions to the Python path 5 | wd = Path(__file__).parent.parent.resolve() 6 | sys.path.append(str(wd)) 7 | -------------------------------------------------------------------------------- /extensions/thunder/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | from .thunder_ddp import ThunderDDPStrategy # noqa: F401 2 | from .thunder_fsdp import ThunderFSDPStrategy # noqa: F401 3 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/extensions/thunder/unsloth/__init__.py -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy_loss import _cross_entropy_backward_impl, _cross_entropy_forward_impl # noqa: F401 2 | from .rope_embedding import ROPE_GROUP_SIZE, _rope_embedding_backward_impl, _rope_embedding_forward_impl # noqa: F401 3 | from .swiglu import swiglu_DWf_DW_dfg_kernel, swiglu_fg_kernel # noqa: F401 4 | from .utils import calculate_settings # noqa: F401 5 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/swiglu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
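# The kernels in this file implement the SwiGLU activation used in LLaMA-style
# MLPs. For a gate projection ``e`` and an up projection ``g`` of the same shape:
#
#     f = e * sigmoid(e)    # SiLU/"swish", computed in float32 inside the kernel
#     h = f * g             # forward output produced by ``swiglu_fg_kernel``
#
# The backward kernel recomputes ``sigmoid(e)`` and overwrites its ``DW``, ``e``
# and ``g`` buffers in place with ``h``, ``df`` and ``de`` so that no extra
# memory is allocated. A rough usage sketch, assuming Triton and a CUDA device
# are available (the shapes are only an example):
#
#     e = torch.randn(1, 16, 128, device="cuda", dtype=torch.bfloat16)
#     g = torch.randn(1, 16, 128, device="cuda", dtype=torch.bfloat16)
#     h = swiglu_fg_kernel(e, g)   # same shape and dtype as ``e``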
14 | 15 | import torch 16 | 17 | from litgpt.utils import _TRITON_AVAILABLE 18 | 19 | if _TRITON_AVAILABLE: 20 | import triton 21 | import triton.language as tl 22 | 23 | 24 | @triton.jit 25 | def _fg_kernel( 26 | e, 27 | g, 28 | h, 29 | n_elements, 30 | BLOCK_SIZE: tl.constexpr, 31 | ): 32 | block_idx = tl.program_id(0) 33 | offsets = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 34 | mask = offsets < n_elements 35 | 36 | e_row = tl.load(e + offsets, mask=mask, other=0).to(tl.float32) 37 | g_row = tl.load(g + offsets, mask=mask, other=0) # .to(tl.float32) 38 | 39 | # f = e * sigmoid(e) 40 | f_row = e_row * tl.sigmoid(e_row) # e_row / (1 + tl.exp(-e_row)) 41 | f_row = f_row.to(g_row.dtype) # Exact copy from HF 42 | # h = f * g 43 | h_row = f_row * g_row 44 | 45 | # Store h 46 | tl.store(h + offsets, h_row, mask=mask) 47 | 48 | 49 | pass 50 | 51 | 52 | def swiglu_fg_kernel(e, g): 53 | batch, seq_len, hd = e.shape 54 | n_elements = e.numel() 55 | h = torch.empty((batch, seq_len, hd), dtype=e.dtype, device="cuda") 56 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) 57 | _fg_kernel[grid]( 58 | e, 59 | g, 60 | h, 61 | n_elements, 62 | BLOCK_SIZE=1024, 63 | ) 64 | return h 65 | 66 | 67 | pass 68 | 69 | 70 | @triton.jit 71 | def _DWf_DW_dfg_kernel( 72 | DW, 73 | e, 74 | g, 75 | n_elements, 76 | BLOCK_SIZE: tl.constexpr, 77 | ): 78 | """ 79 | e = e.float() 80 | se = 1.0 / (1.0 + torch.exp(-e)) 81 | f = (se * e).to(dtype) 82 | h = f * g 83 | df = DW * f 84 | dg = DW * g 85 | de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype) 86 | """ 87 | block_idx = tl.program_id(0) 88 | offsets = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) 89 | mask = offsets < n_elements 90 | 91 | DW_row = tl.load(DW + offsets, mask=mask, other=0) # .to(tl.float32) 92 | e_row = tl.load(e + offsets, mask=mask, other=0).to(tl.float32) 93 | g_row = tl.load(g + offsets, mask=mask, other=0) # .to(tl.float32) 94 | 95 | # e = e.float() 96 | # se = 1.0 / (1.0 + torch.exp(-e)) 97 | se_row = tl.sigmoid(e_row) # 1.0 / (1.0 + tl.exp(-e_row)) 98 | # f = (se * e).to(dtype) 99 | f_row = se_row * e_row 100 | f_row = f_row.to(DW_row.dtype) 101 | # h = f * g 102 | h_row = f_row * g_row 103 | # df = DW * f 104 | df_row = DW_row * f_row 105 | # dg = DW * g 106 | dg_row = DW_row * g_row 107 | # de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype) 108 | de_row = dg_row.to(tl.float32) * se_row * (1.0 + e_row * (1.0 - se_row)) 109 | de_row = de_row.to(DW_row.dtype) 110 | 111 | # Store derivatives in buffers 112 | tl.store(DW + offsets, h_row, mask=mask) # h = f * g 113 | tl.store(e + offsets, df_row, mask=mask) # df = DW * f 114 | tl.store(g + offsets, de_row, mask=mask) # de 115 | 116 | 117 | pass 118 | 119 | 120 | def swiglu_DWf_DW_dfg_kernel(DW, e, g): 121 | batch_seq_len, hd = e.shape 122 | n_elements = e.numel() 123 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) 124 | _DWf_DW_dfg_kernel[grid]( 125 | DW, 126 | e, 127 | g, 128 | n_elements, 129 | BLOCK_SIZE=1024, 130 | ) 131 | return DW, e, g 132 | 133 | 134 | pass 135 | -------------------------------------------------------------------------------- /extensions/thunder/unsloth/kernels/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from litgpt.utils import _TRITON_AVAILABLE 17 | 18 | if _TRITON_AVAILABLE: 19 | import triton 20 | 21 | MAX_FUSED_SIZE = 65536 # 2**16 22 | next_power_of_2 = triton.next_power_of_2 23 | 24 | 25 | def calculate_settings(n): 26 | BLOCK_SIZE = next_power_of_2(n) 27 | if BLOCK_SIZE > MAX_FUSED_SIZE: 28 | raise RuntimeError( 29 | f"Cannot launch Triton kernel since n = {n} exceeds the maximum CUDA blocksize = {MAX_FUSED_SIZE}." 30 | ) 31 | num_warps = 4 32 | if BLOCK_SIZE >= 32768: 33 | num_warps = 32 34 | elif BLOCK_SIZE >= 8192: 35 | num_warps = 16 36 | elif BLOCK_SIZE >= 2048: 37 | num_warps = 8 38 | return BLOCK_SIZE, num_warps 39 | 40 | 41 | pass 42 | -------------------------------------------------------------------------------- /extensions/xla/__init__: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # support running without installing as a package, adding extensions to the Python path 5 | wd = Path(__file__).parent.parent.resolve() 6 | sys.path.append(str(wd)) 7 | -------------------------------------------------------------------------------- /extensions/xla/finetune/__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/extensions/xla/finetune/__init__ -------------------------------------------------------------------------------- /extensions/xla/generate/__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/extensions/xla/generate/__init__ -------------------------------------------------------------------------------- /extensions/xla/scripts/__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/extensions/xla/scripts/__init__ -------------------------------------------------------------------------------- /litgpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
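# This module re-exports the user-facing entry points of the package (``LLM``,
# ``GPT``, ``Config``, ``PromptStyle`` and ``Tokenizer``; see ``__all__`` below).
# A minimal sketch of the high-level Python API, assuming the referenced
# checkpoint can be downloaded or is already present under ``checkpoints/``:
#
#     from litgpt import LLM
#
#     llm = LLM.load("microsoft/phi-2")
#     print(llm.generate("What do llamas eat?"))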
2 | 3 | import logging 4 | import re 5 | 6 | from litgpt.api import LLM 7 | from litgpt.config import Config 8 | from litgpt.model import GPT # needs to be imported before config 9 | from litgpt.prompts import PromptStyle 10 | from litgpt.tokenizer import Tokenizer 11 | 12 | # Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 13 | pattern = re.compile(".*Profiler function .* will be ignored") 14 | logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) 15 | 16 | # Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint 17 | logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True 18 | logging.getLogger("torch.distributed.fsdp._debug_utils").disabled = True 19 | 20 | __all__ = ["LLM", "GPT", "Config", "PromptStyle", "Tokenizer"] 21 | -------------------------------------------------------------------------------- /litgpt/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import warnings 4 | 5 | import torch 6 | from jsonargparse import CLI, set_config_read_mode, set_docstring_parse_options 7 | 8 | from litgpt.chat.base import main as chat_fn 9 | from litgpt.deploy.serve import run_server as serve_fn 10 | from litgpt.eval.evaluate import convert_and_evaluate as evaluate_fn 11 | from litgpt.finetune.adapter import setup as finetune_adapter_fn 12 | from litgpt.finetune.adapter_v2 import setup as finetune_adapter_v2_fn 13 | from litgpt.finetune.full import setup as finetune_full_fn 14 | from litgpt.finetune.lora import setup as finetune_lora_fn 15 | from litgpt.generate.adapter import main as generate_adapter_fn 16 | from litgpt.generate.adapter_v2 import main as generate_adapter_v2_fn 17 | from litgpt.generate.base import main as generate_base_fn 18 | from litgpt.generate.full import main as generate_full_fn 19 | from litgpt.generate.sequentially import main as generate_sequentially_fn 20 | from litgpt.generate.speculative_decoding import main as generate_speculatively_fn 21 | from litgpt.generate.tp import main as generate_tp_fn 22 | from litgpt.pretrain import setup as pretrain_fn 23 | from litgpt.scripts.convert_hf_checkpoint import convert_hf_checkpoint as convert_hf_checkpoint_fn 24 | from litgpt.scripts.convert_lit_checkpoint import convert_lit_checkpoint as convert_lit_checkpoint_fn 25 | from litgpt.scripts.convert_pretrained_checkpoint import ( 26 | convert_pretrained_checkpoint as convert_pretrained_checkpoint_fn, 27 | ) 28 | from litgpt.scripts.download import download_from_hub as download_fn 29 | from litgpt.scripts.merge_lora import merge_lora as merge_lora_fn 30 | 31 | 32 | def main() -> None: 33 | parser_data = { 34 | "download": download_fn, 35 | "chat": chat_fn, 36 | "finetune": finetune_lora_fn, 37 | "finetune_lora": finetune_lora_fn, 38 | "finetune_full": finetune_full_fn, 39 | "finetune_adapter": finetune_adapter_fn, 40 | "finetune_adapter_v2": finetune_adapter_v2_fn, 41 | "pretrain": pretrain_fn, 42 | "generate": generate_base_fn, 43 | "generate_full": generate_full_fn, 44 | "generate_adapter": generate_adapter_fn, 45 | "generate_adapter_v2": generate_adapter_v2_fn, 46 | "generate_sequentially": generate_sequentially_fn, 47 | "generate_speculatively": generate_speculatively_fn, 48 | "generate_tp": generate_tp_fn, 49 | "convert_to_litgpt": convert_hf_checkpoint_fn, 50 | "convert_from_litgpt": 
convert_lit_checkpoint_fn, 51 | "convert_pretrained_checkpoint": convert_pretrained_checkpoint_fn, 52 | "merge_lora": merge_lora_fn, 53 | "evaluate": evaluate_fn, 54 | "serve": serve_fn, 55 | } 56 | 57 | set_docstring_parse_options(attribute_docstrings=True) 58 | set_config_read_mode(urls_enabled=True) 59 | 60 | # PyTorch bug that raises a false-positive warning 61 | # More info: https://github.com/Lightning-AI/litgpt/issues/1561 62 | warning_message = r"The epoch parameter in `scheduler.step\(\)` was not necessary and is being deprecated.*" 63 | 64 | warnings.filterwarnings( 65 | action="ignore", message=warning_message, category=UserWarning, module=r".*torch\.optim\.lr_scheduler.*" 66 | ) 67 | 68 | torch.set_float32_matmul_precision("high") 69 | CLI(parser_data) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /litgpt/args.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import math 3 | import warnings 4 | from dataclasses import dataclass 5 | from typing import Optional, Union 6 | 7 | 8 | @dataclass 9 | class TrainArgs: 10 | """Training-related arguments""" 11 | 12 | save_interval: Optional[int] = 1000 13 | """Number of optimizer steps between saving checkpoints""" 14 | log_interval: int = 1 15 | """Number of iterations between logging calls""" 16 | global_batch_size: int = 64 17 | """Number of samples between optimizer steps across data-parallel ranks""" 18 | micro_batch_size: int = 4 19 | """Number of samples per data-parallel rank""" 20 | lr_warmup_steps: Optional[int] = 100 21 | """Number of iterations with learning rate warmup active""" 22 | lr_warmup_fraction: Optional[float] = None 23 | """The fraction of an epoch to use for learning rate warmup""" 24 | epochs: Optional[int] = None 25 | """Number of epochs to train on""" 26 | # TODO: `pretrain` is the only script using `max_tokens` explicitly. replace it with epoch_size*epochs? 27 | max_tokens: Optional[int] = None 28 | """Total number of tokens to train on""" 29 | max_steps: Optional[int] = None 30 | """Limits the number of optimizer steps to run""" 31 | max_seq_length: Optional[int] = None 32 | """Limits the length of samples""" 33 | tie_embeddings: Optional[bool] = None 34 | """Whether to tie the embedding weights with the language modeling head weights""" 35 | 36 | # Optimization args 37 | max_norm: Optional[float] = None 38 | min_lr: float = 6e-5 39 | 40 | def __post_init__(self) -> None: 41 | if self.lr_warmup_fraction and self.lr_warmup_steps: 42 | raise ValueError( 43 | "Can't provide both `--train.lr_warmup_fraction` and `--train.lr_warmup_steps`. Choose one." 44 | ) 45 | if self.lr_warmup_fraction and not (0 <= self.lr_warmup_fraction <= 1): 46 | raise ValueError("`--train.lr_warmup_fraction` must be between 0 and 1.") 47 | 48 | if self.lr_warmup_steps and self.max_steps and (self.lr_warmup_steps >= self.max_steps): 49 | warnings.warn( 50 | "`--train.lr_warmup_steps` should be less than `--train.max_steps`." 
51 | f" Got {self.lr_warmup_steps} lr_warmup_steps and {self.max_steps} max_steps.", 52 | UserWarning, 53 | ) 54 | 55 | def gradient_accumulation_iters(self, devices: int, num_nodes: int = 1) -> int: 56 | """Number of iterations between gradient synchronizations""" 57 | gradient_accumulation_iters = self.batch_size(devices, num_nodes) // self.micro_batch_size 58 | assert gradient_accumulation_iters > 0 59 | return gradient_accumulation_iters 60 | 61 | def batch_size(self, devices: int, num_nodes: int = 1) -> int: 62 | """Number of samples between optimizer steps per data-parallel rank""" 63 | batch_size = self.global_batch_size // (devices * num_nodes) 64 | assert batch_size > 0 65 | return batch_size 66 | 67 | def warmup_iters(self, devices: int, num_nodes: int, max_iters: int, train_dataloader) -> int: 68 | """Number of iterations to warm up the learning rate.""" 69 | if self.lr_warmup_fraction: 70 | return min(max_iters, math.ceil(self.lr_warmup_fraction * len(train_dataloader))) 71 | if self.lr_warmup_steps: 72 | return min(max_iters, self.lr_warmup_steps * self.gradient_accumulation_iters(devices, num_nodes)) 73 | return 0 74 | 75 | 76 | @dataclass 77 | class EvalArgs: 78 | """Evaluation-related arguments""" 79 | 80 | interval: int = 600 81 | """Number of optimizer steps between evaluation calls""" 82 | max_new_tokens: Optional[int] = None 83 | """Number of tokens to generate""" 84 | max_iters: int = 100 85 | """Number of iterations""" 86 | initial_validation: bool = False 87 | """Whether to evaluate on the validation set at the beginning of the training""" 88 | final_validation: bool = True 89 | """Whether to evaluate on the validation set at the end of the training""" 90 | evaluate_example: Union[str, int] = "first" 91 | """How to pick an example instruction to evaluate periodically during training. 92 | Can be "first", "random", or an integer index to pick a specific example.""" 93 | 94 | 95 | @dataclass 96 | class LogArgs: 97 | """Logging-related arguments""" 98 | 99 | project: Optional[str] = None 100 | """Project name""" 101 | run: Optional[str] = None 102 | """Run name""" 103 | group: Optional[str] = None 104 | """Group name""" 105 | -------------------------------------------------------------------------------- /litgpt/chat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/chat/__init__.py -------------------------------------------------------------------------------- /litgpt/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
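# The data modules imported below are the classes that the ``data:`` section of
# the config_hub YAML files resolves to. For example, the finetuning configs in
# this repository select the Alpaca2k module roughly as follows (the values
# mirror those configs rather than the class defaults):
#
#     data:
#       class_path: litgpt.data.Alpaca2k
#       init_args:
#         mask_prompt: false
#         prompt_style: alpaca
#         ignore_index: -100
#         seed: 42
#         num_workers: 4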
2 | 3 | from litgpt.data.alpaca import Alpaca 4 | from litgpt.data.alpaca_2k import Alpaca2k 5 | from litgpt.data.alpaca_gpt4 import AlpacaGPT4 6 | from litgpt.data.base import DataModule, SFTDataset, get_sft_collate_fn 7 | from litgpt.data.deita import Deita 8 | from litgpt.data.flan import FLAN 9 | from litgpt.data.json_data import JSON 10 | from litgpt.data.lima import LIMA 11 | from litgpt.data.lit_data import LitData 12 | from litgpt.data.longform import LongForm 13 | from litgpt.data.microllama import MicroLlama 14 | from litgpt.data.openwebtext import OpenWebText 15 | from litgpt.data.text_files import TextFiles 16 | from litgpt.data.tinyllama import TinyLlama 17 | from litgpt.data.tinystories import TinyStories 18 | 19 | __all__ = [ 20 | "Alpaca", 21 | "Alpaca2k", 22 | "AlpacaGPT4", 23 | "Deita", 24 | "FLAN", 25 | "JSON", 26 | "LIMA", 27 | "LitData", 28 | "DataModule", 29 | "LongForm", 30 | "OpenWebText", 31 | "SFTDataset", 32 | "TextFiles", 33 | "TinyLlama", 34 | "TinyStories", 35 | "MicroLlama", 36 | "get_sft_collate_fn", 37 | ] 38 | -------------------------------------------------------------------------------- /litgpt/data/alpaca_2k.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | 7 | from litgpt.data.alpaca import Alpaca 8 | from litgpt.data.base import SFTDataset 9 | 10 | 11 | @dataclass 12 | class Alpaca2k(Alpaca): 13 | """Alpaca2k data module for supervised finetuning.""" 14 | 15 | val_split_fraction: float = 0.05 # to get exactly 100 validation samples 16 | """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" 17 | download_dir: Path = Path("./data/alpaca2k") 18 | """The directory in which the downloaded dataset gets saved.""" 19 | repo_id: str = field(repr=False, default="mhenrichsen/alpaca_2k_test") 20 | """The Hugging Face dataset repository ID to download the dataset from.""" 21 | file_name: str = field(repr=False, default="alpaca2k_data_cleaned_archive.json") 22 | """The name of the dataset file to download.""" 23 | 24 | def prepare_data(self) -> None: 25 | from datasets import load_dataset 26 | 27 | load_dataset(self.repo_id, cache_dir=self.download_dir) 28 | 29 | def setup(self, stage: str = "") -> None: 30 | from datasets import load_dataset 31 | 32 | dataset = load_dataset(self.repo_id, cache_dir=self.download_dir) 33 | 34 | train_validation_split = dataset["train"].train_test_split(test_size=self.val_split_fraction, seed=self.seed) 35 | train_data = train_validation_split["train"] 36 | test_data = train_validation_split["test"] 37 | 38 | self.train_dataset = SFTDataset( 39 | data=train_data, 40 | tokenizer=self.tokenizer, 41 | prompt_style=self.prompt_style, 42 | max_seq_length=self.max_seq_length, 43 | mask_prompt=self.mask_prompt, 44 | ignore_index=self.ignore_index, 45 | ) 46 | self.test_dataset = SFTDataset( 47 | data=test_data, 48 | tokenizer=self.tokenizer, 49 | prompt_style=self.prompt_style, 50 | max_seq_length=self.max_seq_length, 51 | mask_prompt=self.mask_prompt, 52 | ignore_index=self.ignore_index, 53 | ) 54 | -------------------------------------------------------------------------------- /litgpt/data/alpaca_gpt4.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
2 | 3 | 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | 7 | from litgpt.data.alpaca import Alpaca 8 | 9 | _URL = "https://raw.githubusercontent.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/main/data/alpaca_gpt4_data.json" 10 | 11 | 12 | @dataclass 13 | class AlpacaGPT4(Alpaca): 14 | """AlpacaGPT4 data module for supervised finetuning.""" 15 | 16 | val_split_fraction: float = 0.03847 # to get exactly 2000 test samples 17 | """The fraction of the dataset to use for the validation dataset. The rest is used for training.""" 18 | download_dir: Path = Path("./data/alpacagpt4") 19 | """The directory in which the downloaded dataset gets saved.""" 20 | file_url: str = field(repr=False, default=_URL) 21 | """The URL from which to download the dataset.""" 22 | file_name: str = field(repr=False, default="alpacagpt4_data_cleaned_archive.json") 23 | """The name of the dataset file to download.""" 24 | -------------------------------------------------------------------------------- /litgpt/data/lit_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import os 3 | from dataclasses import dataclass, field 4 | from pathlib import Path 5 | from typing import Optional, Tuple, Union 6 | 7 | from torch.utils.data import DataLoader 8 | 9 | from litgpt.data import DataModule 10 | from litgpt.tokenizer import Tokenizer 11 | 12 | 13 | @dataclass 14 | class LitData(DataModule): 15 | """Loads data using LitData's StreamingDataset given a path to a folder of preprocessed data (chunks).""" 16 | 17 | data_path: Union[str, Path] = Path("data/") 18 | """The path to the data directory containing the preprocessed chunks for the streaming dataset. 19 | The path can also be a remote path (e.g., s3://). See also ``split_names`` if this path contains subfolders 20 | for training and validation splits.""" 21 | split_names: Optional[Tuple[str, str]] = None 22 | """Optional tuple for names of subfolders for training and validation under ``data_path``.
If not provided, 23 | all data under data_path will be used for training, and the validation dataloader will be identical to the 24 | train dataloader.""" 25 | seed: int = 42 26 | """The random seed for shuffling the dataset.""" 27 | num_workers: int = 8 28 | """How many DataLoader processes to use for loading.""" 29 | 30 | batch_size: int = field(init=False, repr=False, default=1) 31 | seq_length: int = field(init=False, repr=False, default=2048) 32 | 33 | def __post_init__(self) -> None: 34 | super().__init__() 35 | if self.split_names is not None and len(self.split_names) != 2: 36 | raise ValueError("If provided `split_names` must be a tuple of two strings, for example: ('train', 'val').") 37 | 38 | def connect( 39 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 40 | ) -> None: 41 | self.batch_size = batch_size 42 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 43 | 44 | def train_dataloader(self) -> DataLoader: 45 | input_dir = os.path.join(self.data_path, self.split_names[0]) if self.split_names else str(self.data_path) 46 | return self._dataloader(input_dir=input_dir, train=True) 47 | 48 | def val_dataloader(self) -> DataLoader: 49 | input_dir = os.path.join(self.data_path, self.split_names[1]) if self.split_names else str(self.data_path) 50 | return self._dataloader(input_dir=input_dir, train=False) 51 | 52 | def _dataloader(self, input_dir: str, train: bool): 53 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 54 | 55 | dataset = StreamingDataset( 56 | input_dir=input_dir, 57 | item_loader=TokensLoader(block_size=self.seq_length), 58 | shuffle=train, 59 | seed=self.seed, 60 | ) 61 | dataloader = StreamingDataLoader( 62 | dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 63 | ) 64 | return dataloader 65 | -------------------------------------------------------------------------------- /litgpt/data/longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | from dataclasses import dataclass, field 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | 8 | import torch 9 | from torch.utils.data import DataLoader 10 | 11 | from litgpt.data import DataModule, SFTDataset, get_sft_collate_fn 12 | from litgpt.data.alpaca import download_if_missing 13 | from litgpt.prompts import PromptStyle 14 | from litgpt.tokenizer import Tokenizer 15 | 16 | _URL = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset" 17 | 18 | 19 | @dataclass 20 | class LongForm(DataModule): 21 | """LongForm data module for supervised finetuning.""" 22 | 23 | mask_prompt: bool = False 24 | """Whether to mask the prompt section from the label (with ``ignore_index``).""" 25 | prompt_style: Union[str, PromptStyle] = "longform" 26 | """The style to apply to instruction prompts. 
See `litgpt.prompts` for a list of available styles.""" 27 | ignore_index: int = -100 28 | """The index to use for elements to be ignored in the label.""" 29 | seed: int = 42 30 | """The random seed for shuffling the dataset.""" 31 | num_workers: int = 4 32 | """How many DataLoader processes to use for loading.""" 33 | download_dir: Path = Path("./data/longform") 34 | """The directory in which the downloaded dataset gets saved.""" 35 | 36 | tokenizer: Optional[Tokenizer] = field(default=None, init=False, repr=False) 37 | batch_size: int = field(default=1, init=False, repr=False) 38 | max_seq_length: int = field(default=-1, init=False, repr=False) 39 | train_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) 40 | test_dataset: Optional[SFTDataset] = field(default=None, init=False, repr=False) 41 | 42 | def __post_init__(self) -> None: 43 | super().__init__() 44 | if isinstance(self.prompt_style, str): 45 | self.prompt_style = PromptStyle.from_name(self.prompt_style) 46 | 47 | def connect( 48 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 49 | ) -> None: 50 | self.tokenizer = tokenizer 51 | self.batch_size = batch_size 52 | self.max_seq_length = -1 if max_seq_length is None else max_seq_length 53 | 54 | def prepare_data(self) -> None: 55 | self.download_dir.mkdir(parents=True, exist_ok=True) 56 | download_if_missing(self.download_dir / "train.json", f"{_URL}/train.json") 57 | download_if_missing(self.download_dir / "val.json", f"{_URL}/val.json") 58 | 59 | def train_dataloader(self): 60 | return self._dataloader("train") 61 | 62 | def val_dataloader(self): 63 | return self._dataloader("val") 64 | 65 | def _dataloader(self, split: str) -> DataLoader: 66 | with open(self.download_dir / f"{split}.json", encoding="utf-8") as file: 67 | data = json.load(file) 68 | 69 | dataset = SFTDataset( 70 | data=data, 71 | tokenizer=self.tokenizer, 72 | prompt_style=self.prompt_style, 73 | max_seq_length=self.max_seq_length, 74 | mask_prompt=self.mask_prompt, 75 | ignore_index=self.ignore_index, 76 | transform=_transform, 77 | ) 78 | return DataLoader( 79 | dataset=dataset, 80 | batch_size=self.batch_size, 81 | shuffle=(split == "train"), 82 | generator=torch.Generator().manual_seed(self.seed), 83 | num_workers=self.num_workers, 84 | collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index), 85 | ) 86 | 87 | 88 | def _transform(item: dict) -> dict: 89 | item["instruction"] = item.pop("input") 90 | return item 91 | -------------------------------------------------------------------------------- /litgpt/data/microllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | from litgpt.data.tinyllama import TinyLlama 7 | 8 | 9 | @dataclass 10 | class MicroLlama(TinyLlama): 11 | """The MicroLlama data module is composed of only SlimPajama data.""" 12 | 13 | def __init__(self, data_path: Union[str, Path] = Path("data/"), seed: int = 42, num_workers: int = 8): 14 | super().__init__(data_path=data_path, seed=seed, num_workers=num_workers, use_starcoder=False) 15 | -------------------------------------------------------------------------------- /litgpt/data/prepare_slimpajama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | import json 4 | import os 5 | import time 6 | from pathlib import Path 7 | 8 | from litgpt.data.prepare_starcoder import DataChunkRecipe 9 | from litgpt.tokenizer import Tokenizer 10 | from litgpt.utils import CLI, extend_checkpoint_dir 11 | 12 | 13 | class SlimPajamaDataRecipe(DataChunkRecipe): 14 | is_generator = True 15 | 16 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 17 | super().__init__(chunk_size) 18 | self.tokenizer = tokenizer 19 | 20 | def prepare_structure(self, input_dir): 21 | files = Path(input_dir).rglob("*.zst") 22 | return [str(file) for file in files] 23 | 24 | def prepare_item(self, filepath): 25 | import zstandard as zstd 26 | 27 | with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: 28 | for row in f: 29 | text = json.loads(row)["text"] 30 | if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": 31 | continue # exclude the GitHub data since it overlaps with starcoder 32 | text_ids = self.tokenizer.encode(string=text, bos=False, eos=True) 33 | yield text_ids 34 | 35 | 36 | def prepare( 37 | input_dir: Path = Path("data/SlimPajama-627B/train"), 38 | output_dir: Path = Path("data/slimpajama/train"), 39 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 40 | chunk_size: int = (2049 * 16384), 41 | fast_dev_run: bool = False, 42 | ) -> None: 43 | from litdata.processing.data_processor import DataProcessor 44 | 45 | tokenizer_path = extend_checkpoint_dir(tokenizer_path) 46 | tokenizer = Tokenizer(tokenizer_path) 47 | data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 48 | data_processor = DataProcessor( 49 | input_dir=str(input_dir), 50 | output_dir=str(output_dir), 51 | fast_dev_run=fast_dev_run, 52 | num_workers=os.cpu_count(), 53 | num_downloaders=1, 54 | ) 55 | 56 | start_time = time.time() 57 | data_processor.run(data_recipe) 58 | elapsed_time = time.time() - start_time 59 | print(f"Time taken: {elapsed_time:.2f} seconds") 60 | 61 | 62 | if __name__ == "__main__": 63 | CLI(prepare) 64 | -------------------------------------------------------------------------------- /litgpt/data/prepare_starcoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | import time 5 | import traceback 6 | from pathlib import Path 7 | 8 | from lightning_utilities.core.imports import RequirementCache 9 | 10 | from litgpt.tokenizer import Tokenizer 11 | from litgpt.utils import CLI, extend_checkpoint_dir 12 | 13 | _LITDATA_AVAILABLE = RequirementCache("litdata") 14 | if _LITDATA_AVAILABLE: 15 | from litdata.processing.data_processor import DataChunkRecipe 16 | else: 17 | DataChunkRecipe = object 18 | 19 | 20 | class StarcoderDataRecipe(DataChunkRecipe): 21 | is_generator = True 22 | 23 | def __init__(self, tokenizer: Tokenizer, chunk_size: int): 24 | super().__init__(chunk_size) 25 | self.tokenizer = tokenizer 26 | 27 | def prepare_structure(self, input_dir): 28 | files = Path(input_dir).rglob("*.parquet") 29 | return [str(file) for file in files] 30 | 31 | def prepare_item(self, item_metadata): 32 | import pyarrow.parquet as pq 33 | 34 | filepath = item_metadata 35 | start = time.time() 36 | 37 | try: 38 | parquet_file = pq.ParquetFile(filepath) 39 | # reduce RAM usage 40 | for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): 41 | for text in batch.to_pandas()["content"]: 42 | yield self.tokenizer.encode(text, bos=False, eos=True) 43 | 44 | except Exception: 45 | print(traceback.format_exc()) 46 | print(f"Error reading {filepath}") 47 | return 48 | 49 | parquet_file.close() 50 | end = time.time() 51 | print(f"Took {end - start:.2f} seconds total", filepath) 52 | 53 | 54 | def prepare( 55 | input_dir: Path = Path("data/starcoderdata"), 56 | output_dir: Path = Path("data/starcoder"), 57 | tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), 58 | chunk_size: int = (2049 * 8192), 59 | fast_dev_run: bool = False, 60 | ) -> None: 61 | from litdata.processing.data_processor import DataProcessor 62 | 63 | tokenizer_path = extend_checkpoint_dir(tokenizer_path) 64 | tokenizer = Tokenizer(tokenizer_path) 65 | data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) 66 | data_processor = DataProcessor( 67 | input_dir=str(input_dir), 68 | output_dir=str(output_dir), 69 | fast_dev_run=fast_dev_run, 70 | num_workers=os.cpu_count(), 71 | num_downloaders=1, 72 | ) 73 | 74 | start_time = time.time() 75 | data_processor.run(data_recipe) 76 | elapsed_time = time.time() - start_time 77 | print(f"Time taken: {elapsed_time:.2f} seconds") 78 | 79 | 80 | if __name__ == "__main__": 81 | CLI(prepare) 82 | -------------------------------------------------------------------------------- /litgpt/data/tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | 6 | from torch.utils.data import DataLoader 7 | 8 | from litgpt.data import DataModule 9 | from litgpt.tokenizer import Tokenizer 10 | 11 | 12 | @dataclass 13 | class TinyLlama(DataModule): 14 | """The TinyLlama data module is composed of a mix of SlimPajama and Starcoder data. 15 | 16 | Provides training and validation streaming dataloaders that return batches of tokens. 17 | """ 18 | 19 | data_path: Union[str, Path] = Path("data/") 20 | """The path to the data directory, containing two folders 'slimpajama' and 'starcoder' 21 | which are the output of the preprocessing step done in advance. See the `tutorial/pretrain_tinyllama.md` 22 | for instructions. 
The path can also be a remote path (e.g., s3://).""" 23 | seed: int = 42 24 | """The random seed for shuffling the dataset.""" 25 | num_workers: int = 8 26 | """How many DataLoader processes to use for loading.""" 27 | use_starcoder: bool = True 28 | """Toggle for using Starcoder data.""" 29 | 30 | batch_size: int = field(init=False, repr=False, default=1) 31 | seq_length: int = field(init=False, repr=False, default=2048) 32 | 33 | def __post_init__(self): 34 | super().__init__() 35 | # Could be a remote path (s3://) or a local path 36 | self.slimpajama_train = str(self.data_path).rstrip("/") + "/slimpajama/train" 37 | self.slimpajama_val = str(self.data_path).rstrip("/") + "/slimpajama/val" 38 | self.required_paths = [self.slimpajama_train, self.slimpajama_val] 39 | 40 | if self.use_starcoder: 41 | self.starcoder_train = str(self.data_path).rstrip("/") + "/starcoder" 42 | self.required_paths += [self.starcoder_train] 43 | 44 | def connect( 45 | self, tokenizer: Optional[Tokenizer] = None, batch_size: int = 1, max_seq_length: Optional[int] = None 46 | ) -> None: 47 | self.batch_size = batch_size 48 | self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well 49 | 50 | def prepare_data(self) -> None: 51 | for path in self.required_paths: 52 | if not path.startswith("s3://") and not Path(path).is_dir(): 53 | raise FileNotFoundError( 54 | "The data path for TinyLlama is expected to be the directory containing these subdirectories:" 55 | f" `slimpajama/train`, `slimpajama/val`, `starcoder`. The directory {path} does not exist." 56 | " Set it via `--data.data_path=...`" 57 | ) 58 | 59 | def train_dataloader(self) -> DataLoader: 60 | from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader 61 | 62 | slim_train_data = StreamingDataset( 63 | input_dir=self.slimpajama_train, 64 | item_loader=TokensLoader(block_size=self.seq_length), 65 | shuffle=True, 66 | drop_last=True, 67 | ) 68 | train_data = slim_train_data 69 | 70 | if self.use_starcoder: 71 | train_datasets = [ 72 | slim_train_data, 73 | StreamingDataset( 74 | input_dir=self.starcoder_train, 75 | item_loader=TokensLoader(block_size=self.seq_length), 76 | shuffle=True, 77 | drop_last=True, 78 | ), 79 | ] 80 | 81 | # Mix SlimPajama data and Starcoder data with these proportions: 82 | weights = (0.693584, 0.306416) 83 | train_data = CombinedStreamingDataset( 84 | datasets=train_datasets, seed=self.seed, weights=weights, iterate_over_all=False 85 | ) 86 | 87 | train_dataloader = StreamingDataLoader( 88 | train_data, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 89 | ) 90 | return train_dataloader 91 | 92 | def val_dataloader(self) -> DataLoader: 93 | from litdata.streaming import StreamingDataLoader, StreamingDataset, TokensLoader 94 | 95 | val_dataset = StreamingDataset( 96 | input_dir=self.slimpajama_val, 97 | item_loader=TokensLoader(block_size=self.seq_length), 98 | shuffle=True, 99 | ) 100 | val_dataloader = StreamingDataLoader( 101 | val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True 102 | ) 103 | return val_dataloader 104 | -------------------------------------------------------------------------------- /litgpt/deploy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/deploy/__init__.py 
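A minimal usage sketch of the TinyLlama streaming data module listed above (this snippet is not part of the repository): the `data/pretrain` path, batch size, and sequence length are illustrative placeholders, and running it assumes the optional `litdata` dependency is installed and that the SlimPajama/Starcoder chunks were already produced with `litgpt/data/prepare_slimpajama.py` and `litgpt/data/prepare_starcoder.py`.

from pathlib import Path

from litgpt.data import TinyLlama

# Placeholder directory; it must contain the preprocessed litdata chunks under
# slimpajama/train, slimpajama/val and starcoder (see the prepare_* scripts above).
data = TinyLlama(data_path=Path("data/pretrain"), seed=42, num_workers=8)

# connect() is normally called by litgpt.pretrain; seq_length becomes
# max_seq_length + 1 because the labels are the inputs shifted by one token.
data.connect(batch_size=4, max_seq_length=2048)

data.prepare_data()  # only checks that the required directories exist
data.setup()

# The training stream mixes SlimPajama and Starcoder with weights ~0.69 / ~0.31;
# validation streams the SlimPajama validation split only.
train_loader = data.train_dataloader()
val_loader = data.val_dataloader()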
-------------------------------------------------------------------------------- /litgpt/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/finetune/__init__.py -------------------------------------------------------------------------------- /litgpt/generate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/generate/__init__.py -------------------------------------------------------------------------------- /litgpt/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/litgpt/scripts/__init__.py -------------------------------------------------------------------------------- /litgpt/scripts/convert_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from pathlib import Path 4 | from pprint import pprint 5 | 6 | import torch 7 | 8 | from litgpt.utils import copy_config_files, extend_checkpoint_dir, incremental_save 9 | 10 | 11 | @torch.inference_mode() 12 | def convert_pretrained_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: 13 | """Convert a checkpoint after pretraining. 14 | 15 | The pretrained checkpoint contains optimizer states and several other metadata that are not needed after training 16 | is finished. This script will export the state-dict of the model and place it in the chosen output folder, 17 | which then can be loaded by other scripts for inference, evaluation, etc. 18 | 19 | Args: 20 | checkpoint_dir: Path to a checkpoint directory produced by ``litgpt.pretrain``. 21 | output_dir: The output folder where the converted state-dict file and config files will be saved to. 22 | """ 23 | checkpoint_dir = extend_checkpoint_dir(checkpoint_dir) 24 | pprint(locals()) 25 | 26 | if output_dir.is_dir() and output_dir.glob("*"): 27 | raise FileExistsError( 28 | f"The output folder exists and is not empty: {str(output_dir)}." 29 | " Please delete it first or choose a different name." 
30 | ) 31 | 32 | output_dir.mkdir(parents=True) 33 | checkpoint_file = checkpoint_dir / "lit_model.pth" 34 | output_checkpoint_file = output_dir / "lit_model.pth" 35 | 36 | # TODO: Consolidate sharded checkpoint if applicable 37 | # Extract the model state dict and save to output folder 38 | with incremental_save(output_checkpoint_file) as saver: 39 | print("Processing", checkpoint_file) 40 | full_checkpoint = torch.load(str(checkpoint_file), mmap=True) 41 | loaded_state_dict = full_checkpoint["model"] 42 | converted_state_dict = {} 43 | for param_name, param in loaded_state_dict.items(): 44 | saver.store_early(param) 45 | # remove prefix for compiled model (if any) 46 | param_name = param_name.replace("_orig_mod.", "") 47 | converted_state_dict[param_name] = param 48 | print(f"Saving converted checkpoint to {str(output_checkpoint_file)}.") 49 | saver.save(converted_state_dict) 50 | 51 | copy_config_files(checkpoint_dir, output_dir) 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | 4 | requires = [ 5 | "setuptools>=68.2.2", 6 | "wheel>=0.41.2", 7 | ] 8 | 9 | [project] 10 | name = "litgpt" 11 | version = "0.5.9.dev1" 12 | description = "Hackable implementation of state-of-the-art open-source LLMs" 13 | readme = "README.md" 14 | license = { file = "LICENSE" } 15 | 16 | authors = [ 17 | { name = "Lightning AI", email = "contact@lightning.ai" }, 18 | ] 19 | classifiers = [ 20 | "Programming Language :: Python :: 3 :: Only", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "Programming Language :: Python :: 3.13", 26 | ] 27 | dependencies = [ 28 | # download models: 29 | "huggingface-hub>=0.23.5", 30 | "jsonargparse[signatures]>=4.30.1,<=4.32.1; python_version<='3.9'", # 4.33 does not seem to be compatible with Python 3.9 31 | "jsonargparse[signatures]>=4.37; python_version>'3.9'", # required to work with python3.12+ 32 | "lightning>=2.5", 33 | "numpy<2", # for older Torch versions 34 | "psutil==7", 35 | "safetensors>=0.4.3", 36 | # tokenization in most models: 37 | "tokenizers>=0.15.2", 38 | "torch>=2.5", 39 | # convert_hf_checkpoint 40 | "tqdm>=4.66", 41 | ] 42 | 43 | optional-dependencies.compiler = [ 44 | # compilaton: 45 | "lightning-thunder>=0.2.0.dev20250119; python_version>='3.10' and sys_platform=='linux'", 46 | ] 47 | optional-dependencies.extra = [ 48 | "bitsandbytes>=0.42,<0.43; sys_platform=='darwin'", 49 | # quantization: 50 | "bitsandbytes>=0.45.2,<0.45.5; sys_platform=='linux' or sys_platform=='win32'", 51 | # litgpt.evaluate: 52 | "datasets>=2.18", 53 | # download: 54 | "huggingface-hub[hf-transfer]>=0.21", 55 | "litdata==0.2.45", 56 | # litgpt.deploy: 57 | "litserve>0.2", 58 | "lm-eval>=0.4.2", 59 | # litgpt.data.prepare_starcoder.py: 60 | "pandas>=1.9", 61 | "pyarrow>=15.0.2", 62 | # litgpt.data: 63 | "requests>=2.31", 64 | # llama-based models: 65 | "sentencepiece>=0.2", 66 | # litgpt.pretrain: 67 | "tensorboard>=2.14", 68 | "torchmetrics>=1.3.1", 69 | "transformers>=4.51.3,<4.52", 70 | # litdata, only on non-Windows: 71 | "uvloop>=0.2; sys_platform!='win32'", 72 | # litgpt.data.prepare_slimpajama.py: 73 | "zstandard>=0.22", 74 | ] 75 | optional-dependencies.test = [ 76 | "einops>=0.7", 77 | "protobuf>=4.23.4", 78 | "pytest>=8.1.1", 79 | 
"pytest-benchmark>=5.1", 80 | "pytest-dependency>=0.6", 81 | "pytest-rerunfailures>=14", 82 | "pytest-timeout>=2.3.1", 83 | ] 84 | urls.documentation = "https://github.com/lightning-AI/litgpt/tutorials" 85 | urls.homepage = "https://github.com/lightning-AI/litgpt" 86 | scripts.litgpt = "litgpt.__main__:main" 87 | 88 | [tool.setuptools.packages.find] 89 | include = [ 90 | "litgpt", 91 | "litgpt.*", 92 | ] 93 | exclude = [ ] 94 | 95 | [tool.setuptools.package-data] 96 | litgpt = [ 97 | "LICENSE", 98 | "README.md", 99 | ] 100 | 101 | [tool.ruff] 102 | target-version = "py38" 103 | line-length = 120 104 | exclude = [ 105 | "build", 106 | "dist", 107 | "docs", 108 | ] 109 | 110 | lint.select = [ 111 | "E", 112 | "F", # see: https://pypi.org/project/pyflakes 113 | "I", # implementation for isort 114 | "UP", # see: https://docs.astral.sh/ruff/rules/#pyupgrade-up 115 | "W", # see: https://pypi.org/project/pycodestyle 116 | ] 117 | #extend-select = [ 118 | # "C4", # see: https://pypi.org/project/flake8-comprehensions 119 | # "PT", # see: https://pypi.org/project/flake8-pytest-style 120 | # "RET", # see: https://pypi.org/project/flake8-return 121 | # "SIM", # see: https://pypi.org/project/flake8-simplify 122 | #] 123 | lint.ignore = [ 124 | "E501", # Line too long 125 | "E731", # Do not assign a lambda expression, use a def 126 | "E741", # todo: Ambiguous variable name 127 | "F841", # todo: Local variable is assigned to but never used 128 | ] 129 | # Use Google-style docstrings. 130 | lint.pydocstyle.convention = "google" 131 | 132 | [tool.codespell] 133 | #skip = '*.py' 134 | quiet-level = 3 135 | ignore-words-list = """ 136 | tral, \ 137 | Rockerfeller 138 | """ 139 | 140 | [tool.pytest.ini_options] 141 | addopts = [ 142 | "--strict-markers", 143 | #"--doctest-modules", 144 | "--color=yes", 145 | "--disable-pytest-warnings", 146 | ] 147 | -------------------------------------------------------------------------------- /tests/convert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tests/convert/__init__.py -------------------------------------------------------------------------------- /tests/convert/test_pretrained_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | 5 | import torch 6 | 7 | from litgpt.scripts.convert_pretrained_checkpoint import convert_pretrained_checkpoint 8 | 9 | 10 | def test_convert_pretrained_checkpoint(tmp_path, fake_checkpoint_dir): 11 | # Pretend we made a checkpoint from pretraining 12 | pretrained_checkpoint = { 13 | "model": {"some.module.weight": torch.rand(2, 2), "_orig_mod.some.other.module.weight": torch.rand(2, 2)}, 14 | "the_optimizer": "optimizer_state", 15 | "other": 1, 16 | } 17 | torch.save(pretrained_checkpoint, fake_checkpoint_dir / "lit_model.pth") 18 | 19 | convert_pretrained_checkpoint(checkpoint_dir=fake_checkpoint_dir, output_dir=(tmp_path / "converted")) 20 | 21 | assert set(os.listdir(tmp_path / "converted")) == { 22 | "lit_model.pth", 23 | "model_config.yaml", 24 | "tokenizer_config.json", 25 | "tokenizer.json", 26 | } 27 | converted_checkpoint = torch.load(tmp_path / "converted" / "lit_model.pth") 28 | assert list(converted_checkpoint.keys()) == ["some.module.weight", "some.other.module.weight"] 29 | -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tests/data/__init__.py -------------------------------------------------------------------------------- /tests/data/test_alpaca.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from litgpt.data import Alpaca 3 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 4 | 5 | 6 | def test_alpaca(mock_tokenizer, alpaca_path): 7 | alpaca = Alpaca(val_split_fraction=0.5, download_dir=alpaca_path.parent, file_name=alpaca_path.name, num_workers=0) 8 | assert isinstance(alpaca.prompt_style, AlpacaPromptStyle) 9 | alpaca.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 10 | alpaca.prepare_data() 11 | alpaca.setup() 12 | 13 | train_dataloader = alpaca.train_dataloader() 14 | val_dataloader = alpaca.val_dataloader() 15 | 16 | assert len(train_dataloader) == 6 17 | assert len(val_dataloader) == 6 18 | 19 | train_batch = next(iter(train_dataloader)) 20 | val_batch = next(iter(val_dataloader)) 21 | 22 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels", "token_counts"} 23 | for key in ["input_ids", "labels"]: 24 | assert train_batch[key].shape == (2, 10), f"Unexpected shape for train_batch[{key}]" 25 | assert val_batch[key].shape == (2, 10), f"Unexpected shape for val_batch[{key}]" 26 | 27 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 28 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 29 | 30 | # has attributes from super class `LightningDataModule` 31 | assert alpaca.prepare_data_per_node 32 | -------------------------------------------------------------------------------- /tests/data/test_base.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | from typing import Optional 4 | 5 | import pytest 6 | import torch 7 | 8 | from litgpt.data.base import SFTDataset, get_sft_collate_fn 9 | from litgpt.prompts import PromptStyle 10 | 11 | 12 | @pytest.mark.parametrize("mask_prompt", [True, False]) 13 | @pytest.mark.parametrize("ignore_index", [-1, -100]) 14 | @pytest.mark.parametrize("max_seq_length", [1000, 5, -1]) 15 | def test_sft_dataset(max_seq_length, ignore_index, mask_prompt, mock_tokenizer): 16 | class Style(PromptStyle): 17 | def apply(self, prompt: str, *, sys_prompt: Optional[str] = None, **kwargs) -> str: 18 | return f"In: {prompt} Out:" 19 | 20 | i = ignore_index 21 | data = [{"instruction": "Foo", "output": "Bar"}, {"instruction": "Boo", "output": "Ahh"}] 22 | 23 | dataset = SFTDataset( 24 | data=data, 25 | tokenizer=mock_tokenizer, 26 | prompt_style=Style(), 27 | mask_prompt=mask_prompt, 28 | ignore_index=ignore_index, 29 | max_seq_length=max_seq_length, 30 | ) 31 | assert len(dataset) == len(data) 32 | 33 | expected_input_ids = torch.tensor([73, 110, 58, 32, 70, 111, 111, 32, 79, 117, 116, 58, 66, 97, 114, 1]) 34 | # If prompt is not masked, labels == input_ids 35 | expected_labels = ( 36 | torch.tensor([i, i, i, i, i, i, i, i, i, i, i, i, 66, 97, 114, 1]) if mask_prompt else expected_input_ids 37 | ) 38 | 39 | if max_seq_length == -1: 40 | assert torch.equal(dataset[0]["input_ids"], expected_input_ids) 41 | assert torch.equal(dataset[0]["labels"], expected_labels) 42 | else: 43 | assert torch.equal(dataset[0]["input_ids"], expected_input_ids[:max_seq_length]) 44 | assert torch.equal(dataset[0]["labels"], expected_labels[:max_seq_length]) 45 | 46 | 47 | @pytest.mark.parametrize("ignore_index", [-1, -100]) 48 | @pytest.mark.parametrize("pad_id", [0, 100]) 49 | def test_sft_collate_fn_padding(pad_id, ignore_index): 50 | collate = get_sft_collate_fn(pad_id=pad_id, ignore_index=ignore_index) 51 | samples = [ 52 | { 53 | "input_ids": torch.tensor([1, 2, 3]), 54 | "labels": torch.tensor([10, 20, 30]), 55 | "token_counts": {"raw": 3, "raw_plus_prompt_template": 25}, 56 | }, 57 | { 58 | "input_ids": torch.tensor([4, 5, 6, 7, 8]), 59 | "labels": torch.tensor([40, 50, 60, 70, 80]), 60 | "token_counts": {"raw": 5, "raw_plus_prompt_template": 27}, 61 | }, 62 | ] 63 | expected = { 64 | "input_ids": torch.tensor([[1, 2, 3, pad_id, pad_id], [4, 5, 6, 7, 8]]), 65 | "labels": torch.tensor([[10, 20, 30, ignore_index, ignore_index], [40, 50, 60, 70, 80]]), 66 | "token_counts": {"raw": torch.tensor([[3], [5]]), "raw_plus_prompt_template": torch.tensor([[25], [27]])}, 67 | } 68 | batch = collate(samples) 69 | assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) 70 | for key in ("raw", "raw_plus_prompt_template"): 71 | assert torch.equal(batch["token_counts"][key], expected["token_counts"][key]), f"Token count mismatch for {key}" 72 | 73 | 74 | def test_sft_collate_fn_truncation(): 75 | collate = get_sft_collate_fn(max_seq_length=2) 76 | samples = [ 77 | { 78 | "input_ids": torch.tensor([1, 2, 3]), 79 | "labels": torch.tensor([10, 20, 30]), 80 | "token_counts": {"raw": 3, "raw_plus_prompt_template": 25}, 81 | }, 82 | { 83 | "input_ids": torch.tensor([4, 5, 6, 7, 8]), 84 | "labels": torch.tensor([40, 50, 60, 70, 80]), 85 | "token_counts": {"raw": 5, "raw_plus_prompt_template": 27}, 86 | }, 87 | ] 88 | expected = { 89 | "input_ids": torch.tensor([[1, 2], [4, 5]]), 90 | "labels": torch.tensor([[10, 20], [40, 50]]), 91 | "token_counts": {"raw": torch.tensor([[3], [5]]), "raw_plus_prompt_template": 
torch.tensor([[25], [27]])}, 92 | } 93 | batch = collate(samples) 94 | assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) 95 | for key in ("raw", "raw_plus_prompt_template"): 96 | assert torch.equal(batch["token_counts"][key], expected["token_counts"][key]), f"Token count mismatch for {key}" 97 | -------------------------------------------------------------------------------- /tests/data/test_deita.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from unittest import mock 3 | 4 | from litgpt.data import Deita, SFTDataset 5 | from litgpt.data.deita import format_dataset 6 | from litgpt.prompts import Alpaca as AlpacaPromptStyle 7 | 8 | 9 | def test_format_dataset(): 10 | data = [ 11 | { 12 | "prompt": "prompt1", 13 | "prompt_id": "1", 14 | "messages": [ 15 | {"content": "question1", "role": "user"}, 16 | {"content": "response1", "role": "assistant"}, 17 | {"content": "question2", "role": "user"}, 18 | {"content": "response2", "role": "assistant"}, 19 | ], 20 | }, 21 | { 22 | "prompt": "prompt2", 23 | "prompt_id": "2", 24 | "messages": [ 25 | {"content": "question3", "role": "user"}, 26 | {"content": "response3", "role": "assistant"}, 27 | {"content": "question4", "role": "user"}, 28 | {"content": "response4", "role": "assistant"}, 29 | ], 30 | }, 31 | ] 32 | 33 | assert format_dataset(data, include_multi_turn_conversations=False) == [ 34 | {"instruction": "question1", "output": "response1", "input": ""}, 35 | {"instruction": "question3", "output": "response3", "input": ""}, 36 | ] 37 | assert format_dataset(data, include_multi_turn_conversations=True) == [ 38 | {"instruction": "question1", "output": "response1", "input": ""}, 39 | {"instruction": "question2", "output": "response2", "input": ""}, 40 | {"instruction": "question3", "output": "response3", "input": ""}, 41 | {"instruction": "question4", "output": "response4", "input": ""}, 42 | ] 43 | 44 | 45 | @mock.patch("litgpt.data.deita.format_dataset") 46 | @mock.patch("datasets.load_dataset") 47 | def test_deita(_, format_dataset_mock, mock_tokenizer, tmp_path): 48 | format_dataset_mock.return_value = [ 49 | {"instruction": "inst1", "output": "out1"}, 50 | {"instruction": "inst2", "output": "out2"}, 51 | {"instruction": "inst3", "output": "out3"}, 52 | ] 53 | 54 | deita = Deita(num_workers=0, download_dir=tmp_path) 55 | assert isinstance(deita.prompt_style, AlpacaPromptStyle) 56 | deita.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 57 | deita.prepare_data() 58 | deita.setup() 59 | 60 | train_dataloader = deita.train_dataloader() 61 | assert isinstance(train_dataloader.dataset, SFTDataset) 62 | assert len(train_dataloader) == 2 63 | 64 | val_dataloader = deita.val_dataloader() 65 | assert isinstance(val_dataloader.dataset, SFTDataset) 66 | assert len(val_dataloader) == 2 67 | 68 | assert isinstance(train_dataloader.dataset.prompt_style, AlpacaPromptStyle) 69 | assert isinstance(val_dataloader.dataset.prompt_style, AlpacaPromptStyle) 70 | 71 | # has attributes from super class `LightningDataModule` 72 | assert deita.prepare_data_per_node 73 | -------------------------------------------------------------------------------- /tests/data/test_lit_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | import sys 3 | from unittest import mock 4 | from unittest.mock import ANY 5 | 6 | import pytest 7 | 8 | from litgpt.data import LitData 9 | 10 | 11 | @pytest.mark.skipif(sys.platform == "win32", reason="Needs to implement platform agnostic path/url joining") 12 | @mock.patch("litgpt.data.lit_data.LitData._dataloader") 13 | def test_input_dir_and_splits(dl_mock, tmp_path): 14 | with pytest.raises(ValueError, match="If provided `split_names` must be a tuple of two strings"): 15 | LitData(data_path=tmp_path, split_names=("train",)) 16 | 17 | # local dir, no splits 18 | data = LitData(data_path=tmp_path) 19 | data.train_dataloader() 20 | dl_mock.assert_called_with(input_dir=str(tmp_path), train=True) 21 | data.val_dataloader() 22 | dl_mock.assert_called_with(input_dir=str(tmp_path), train=False) 23 | 24 | # local dir, splits 25 | data = LitData(data_path=tmp_path, split_names=("train", "val")) 26 | data.train_dataloader() 27 | dl_mock.assert_called_with(input_dir=str(tmp_path / "train"), train=True) 28 | data.val_dataloader() 29 | dl_mock.assert_called_with(input_dir=str(tmp_path / "val"), train=False) 30 | 31 | # remote dir, splits 32 | data = LitData(data_path="s3://mydataset/data", split_names=("train", "val")) 33 | data.train_dataloader() 34 | dl_mock.assert_called_with(input_dir="s3://mydataset/data/train", train=True) 35 | data.val_dataloader() 36 | dl_mock.assert_called_with(input_dir="s3://mydataset/data/val", train=False) 37 | 38 | 39 | @pytest.mark.skipif(sys.platform == "win32", reason="Needs to implement platform agnostic path/url joining") 40 | @mock.patch("litdata.streaming.StreamingDataset") 41 | @mock.patch("litdata.streaming.StreamingDataLoader") 42 | def test_dataset_args(streaming_dataloader_mock, streaming_dataset_mock, tmp_path): 43 | data = LitData(data_path=tmp_path, seed=1000) 44 | data.train_dataloader() 45 | streaming_dataset_mock.assert_called_with( 46 | input_dir=str(tmp_path), 47 | item_loader=ANY, 48 | shuffle=True, 49 | seed=1000, 50 | ) 51 | streaming_dataloader_mock.assert_called_with( 52 | streaming_dataset_mock(), 53 | batch_size=1, 54 | pin_memory=True, 55 | num_workers=8, 56 | drop_last=True, 57 | ) 58 | -------------------------------------------------------------------------------- /tests/data/test_longform.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | from litgpt.data import LongForm 3 | from litgpt.prompts import Longform as LongFormPromptStyle 4 | 5 | 6 | def test_longform(mock_tokenizer, longform_path): 7 | longform = LongForm(download_dir=longform_path, num_workers=0) 8 | assert isinstance(longform.prompt_style, LongFormPromptStyle) 9 | longform.connect(mock_tokenizer, batch_size=2, max_seq_length=10) 10 | longform.prepare_data() 11 | longform.setup() 12 | 13 | train_dataloader = longform.train_dataloader() 14 | val_dataloader = longform.val_dataloader() 15 | 16 | assert len(train_dataloader) == 9 17 | assert len(val_dataloader) == 5 18 | 19 | train_batch = next(iter(train_dataloader)) 20 | val_batch = next(iter(val_dataloader)) 21 | 22 | assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels", "token_counts"} 23 | for key in ["input_ids", "labels"]: 24 | assert train_batch[key].shape == (2, 10), f"Unexpected shape for train_batch[{key}]" 25 | assert val_batch[key].shape == (2, 10), f"Unexpected shape for val_batch[{key}]" 26 | 27 | assert isinstance(train_dataloader.dataset.prompt_style, LongFormPromptStyle) 28 | assert isinstance(val_dataloader.dataset.prompt_style, LongFormPromptStyle) 29 | 30 | # has attributes from super class `LightningDataModule` 31 | assert longform.prepare_data_per_node 32 | -------------------------------------------------------------------------------- /tests/data/test_openwebtext.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import sys 3 | from unittest import mock 4 | from unittest.mock import ANY, call 5 | 6 | import pytest 7 | from litdata.streaming import StreamingDataLoader, StreamingDataset 8 | from torch.utils.data import DataLoader 9 | 10 | from litgpt.data import OpenWebText 11 | 12 | 13 | @pytest.mark.skipif(sys.platform == "win32", reason="Not in the mood to add Windows support right now.") 14 | @mock.patch("litdata.optimize") 15 | @mock.patch("litdata.streaming.dataset.subsample_streaming_dataset", return_value=([], [])) 16 | @mock.patch("datasets.load_dataset") 17 | def test_openwebtext(_, __, optimize_mock, tmp_path, mock_tokenizer): 18 | data = OpenWebText(data_path=(tmp_path / "openwebtext")) 19 | assert data.seq_length == 2048 20 | assert data.batch_size == 1 21 | 22 | data.connect(tokenizer=mock_tokenizer, batch_size=2, max_seq_length=1024) 23 | assert data.seq_length == 1025 24 | assert data.batch_size == 2 25 | 26 | # Data does not exist, preprocess it 27 | data.prepare_data() 28 | optimize_mock.assert_has_calls( 29 | [ 30 | call( 31 | fn=ANY, 32 | num_workers=ANY, 33 | inputs=[], 34 | output_dir=str(tmp_path / "openwebtext" / "train"), 35 | chunk_bytes="200MB", 36 | ), 37 | call( 38 | fn=ANY, 39 | num_workers=ANY, 40 | inputs=[], 41 | output_dir=str(tmp_path / "openwebtext" / "val"), 42 | chunk_bytes="200MB", 43 | ), 44 | ] 45 | ) 46 | optimize_mock.reset_mock() 47 | 48 | # Data exists, already preprocessed 49 | (tmp_path / "openwebtext" / "train").mkdir(parents=True) 50 | (tmp_path / "openwebtext" / "val").mkdir(parents=True) 51 | data.prepare_data() 52 | optimize_mock.assert_not_called() 53 | 54 | data.setup() 55 | 56 | train_dataloader = data.train_dataloader() 57 | assert isinstance(train_dataloader, StreamingDataLoader) 58 | assert isinstance(train_dataloader.dataset, StreamingDataset) 59 | 60 | val_dataloader = data.val_dataloader() 61 | assert isinstance(val_dataloader, DataLoader) 62 | assert isinstance(val_dataloader.dataset, 
StreamingDataset) 63 | 64 | # has attributes from super class `LightningDataModule` 65 | assert data.prepare_data_per_node 66 | -------------------------------------------------------------------------------- /tests/data/test_textfiles.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | from litdata import TokensLoader, optimize 5 | from torch.utils._pytree import tree_map 6 | 7 | from litgpt.data.text_files import TextFiles 8 | 9 | 10 | class Tokenizer: 11 | bos_id = 0 12 | 13 | def encode(self, text, bos, eos): 14 | assert bos 15 | assert not eos 16 | return [self.bos_id] + [ord(c) for c in text] 17 | 18 | 19 | def tokenize(data): 20 | for story in data: 21 | yield torch.tensor(story) 22 | 23 | 24 | def fake_chunk(path, data): 25 | optimize( 26 | fn=tokenize, 27 | inputs=[data] * len(data), 28 | output_dir=str(path), 29 | num_workers=1, 30 | chunk_bytes="200MB", 31 | item_loader=TokensLoader(), 32 | ) 33 | 34 | 35 | def test_textfiles_datamodule(tmp_path): 36 | from litgpt.data.text_files import TextFiles 37 | 38 | data_dir = tmp_path / "textfiles" 39 | datamodule = TextFiles(train_data_path=data_dir, num_workers=1) 40 | datamodule.connect(max_seq_length=2, tokenizer=Tokenizer()) 41 | 42 | # simulate `datamodule.prepare_data` 43 | train_data_dir = data_dir / "train" 44 | train_data_dir.mkdir(parents=True) 45 | fake_chunk(train_data_dir, [[12], [0, 23, 15, 63, 0], [73, 5, 0, 1, 1999, 0, 13]]) 46 | datamodule.setup() 47 | 48 | tr_dataloader = datamodule.train_dataloader() 49 | tr_dataloader.shuffle = False 50 | 51 | actual = tree_map(torch.Tensor.tolist, list(tr_dataloader)) 52 | 53 | # there is 1 sample per index in the data (13) 54 | assert actual == [ 55 | [[73, 5, 0]], 56 | [[12, 0, 23]], 57 | [[5, 0, 1]], 58 | [[0, 73, 5]], 59 | [[1999, 0, 13]], 60 | [[0, 1, 1999]], 61 | [[1, 1999, 0]], 62 | [[0, 23, 15]], 63 | [[13, 12, 0]], 64 | [[63, 0, 73]], 65 | [[23, 15, 63]], 66 | [[15, 63, 0]], 67 | [[0, 13, 12]], 68 | ] 69 | 70 | 71 | class MockTokenizer: 72 | bos_id = 0 73 | eos_id = 1 74 | use_bos = True 75 | 76 | def encode(self, text, bos=True, eos=False, device=None, max_length=-1): 77 | # Simple: map each character to its ordinal + 2 78 | tokens = [ord(c) + 2 for c in text] 79 | if bos: 80 | tokens = [self.bos_id] + tokens 81 | if eos: 82 | tokens.append(self.eos_id) 83 | if max_length > 0: 84 | tokens = tokens[:max_length] 85 | return torch.tensor(tokens, dtype=torch.long, device=device) 86 | 87 | def decode(self, tensor): 88 | ids = tensor.tolist() if tensor.ndim > 0 else [tensor.item()] 89 | chars = [] 90 | for tid in ids: 91 | if tid == self.bos_id: 92 | chars.append("") 93 | elif tid == self.eos_id: 94 | chars.append("") 95 | else: 96 | chars.append(chr(tid - 2)) 97 | return "".join(chars) 98 | 99 | def decode_stream(self, token_stream, device=None): 100 | for token in token_stream: 101 | yield self.decode(token) 102 | 103 | @property 104 | def vocab_size(self): 105 | return 130 106 | 107 | 108 | def test_textfiles_token_loader(tmp_path): 109 | # Create the directory for text files 110 | data_dir = tmp_path / "textfiles" 111 | data_dir.mkdir(parents=True, exist_ok=True) 112 | 113 | # Write sample training data to the directory 114 | sample_texts = ["hello world", "foo bar", "lorem ipsum"] 115 | for i, text in enumerate(sample_texts): 116 | (data_dir / f"{i}.txt").write_text(text) 117 | 118 | datamodule = TextFiles(train_data_path=data_dir, num_workers=1) 119 | datamodule.connect(max_seq_length=2, 
tokenizer=MockTokenizer()) 120 | datamodule.prepare_data() 121 | 122 | # ensure training set uses tokens loader 123 | index_json = data_dir / "train" / "index.json" 124 | assert index_json.exists() 125 | meta = json.loads(index_json.read_text()) 126 | assert meta["config"]["item_loader"] == "TokensLoader" 127 | 128 | # ensure validation set uses tokens loader 129 | index_json = data_dir / "val" / "index.json" 130 | assert index_json.exists() 131 | meta = json.loads(index_json.read_text()) 132 | assert meta["config"]["item_loader"] == "TokensLoader" 133 | -------------------------------------------------------------------------------- /tests/data/test_tinyllama.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | from unittest import mock 3 | 4 | import pytest 5 | from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset 6 | from torch.utils.data import DataLoader 7 | 8 | from litgpt.data import TinyLlama 9 | 10 | 11 | @mock.patch("litdata.streaming.dataset.subsample_streaming_dataset", return_value=([], [])) 12 | def test_tinyllama(_, tmp_path): 13 | data = TinyLlama(data_path=(tmp_path / "data")) 14 | assert data.seq_length == 2048 15 | assert data.batch_size == 1 16 | 17 | data.connect(batch_size=2, max_seq_length=1024) 18 | assert data.seq_length == 1025 19 | assert data.batch_size == 2 20 | 21 | with pytest.raises(FileNotFoundError, match="The directory .*data/slimpajama/train does not exist"): 22 | data.prepare_data() 23 | 24 | (tmp_path / "data" / "slimpajama" / "train").mkdir(parents=True) 25 | (tmp_path / "data" / "slimpajama" / "val").mkdir(parents=True) 26 | (tmp_path / "data" / "starcoder").mkdir(parents=True) 27 | 28 | data.prepare_data() 29 | data.setup() 30 | 31 | train_dataloader = data.train_dataloader() 32 | assert isinstance(train_dataloader, StreamingDataLoader) 33 | assert isinstance(train_dataloader.dataset, CombinedStreamingDataset) 34 | 35 | val_dataloader = data.val_dataloader() 36 | assert isinstance(val_dataloader, DataLoader) 37 | assert isinstance(val_dataloader.dataset, StreamingDataset) 38 | 39 | # has attributes from super class `LightningDataModule` 40 | assert data.prepare_data_per_node 41 | -------------------------------------------------------------------------------- /tests/data/test_tinystories.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | import torch 5 | from litdata import optimize 6 | from litdata.streaming import StreamingDataset, TokensLoader 7 | from torch.utils._pytree import tree_map 8 | 9 | 10 | def tokenize(data): 11 | for story in data: 12 | yield torch.tensor(story) 13 | 14 | 15 | def fake_chunk(path, data): 16 | optimize( 17 | fn=tokenize, 18 | inputs=[data] * len(data), 19 | output_dir=str(path), 20 | num_workers=1, 21 | chunk_bytes="200MB", 22 | item_loader=TokensLoader(), 23 | ) 24 | 25 | 26 | @pytest.mark.parametrize( 27 | ("max_seq_len", "expected"), 28 | [ 29 | (2, [[0, 23, 15], [63, 0, 73], [5, 0, 1], [1999, 0, 13]]), 30 | (5, [[0, 23, 15, 63, 0, 73], [5, 0, 1, 1999, 0, 13]]), 31 | (6, [[0, 23, 15, 63, 0, 73, 5]]), 32 | (7, [[0, 23, 15, 63, 0, 73, 5, 0]]), 33 | ], 34 | ) 35 | def test_pretok_dataset(tmp_path, max_seq_len, expected): 36 | fake_data = [0, 23, 15, 63, 0, 73, 5, 0, 1, 1999, 0, 13] 37 | assert len(fake_data) == 12 38 | fake_chunk(tmp_path, [fake_data]) 39 | 40 | dataset = 
StreamingDataset( 41 | input_dir=str(tmp_path), item_loader=TokensLoader(block_size=max_seq_len + 1), shuffle=False, drop_last=False 42 | ) 43 | actual = tree_map(torch.Tensor.tolist, list(dataset)) 44 | assert actual == expected 45 | 46 | 47 | def test_tokenize(tmp_path, monkeypatch): 48 | from litgpt.data.tinystories import tokenize 49 | 50 | story1, story2 = "foo bar", " fun " 51 | data = [{"story": story1}, {"story": story2}] 52 | shard_path = tmp_path / "data.json" 53 | with open(shard_path, "w", encoding="utf-8") as f: 54 | json.dump(data, f) 55 | 56 | class Tokenizer: 57 | bos_id = 0 58 | 59 | def encode(self, text, bos, eos): 60 | assert bos 61 | assert not eos 62 | return [self.bos_id] + [ord(c) for c in text] 63 | 64 | monkeypatch.setenv("DATA_OPTIMIZER_GLOBAL_RANK", "0") 65 | monkeypatch.setenv("DATA_OPTIMIZER_NUM_WORKERS", "1") 66 | data = tokenize(str(shard_path), Tokenizer()) 67 | assert list(data) == [[0, 102, 111, 111, 32, 98, 97, 114], [0, 102, 117, 110]] 68 | 69 | 70 | def test_tinystories_datamodule(tmp_path): 71 | from litgpt.data.tinystories import TinyStories 72 | 73 | data_dir = tmp_path / "tinystories" 74 | 75 | datamodule = TinyStories(data_dir, seed=42, num_workers=1) 76 | datamodule.connect(max_seq_length=2) 77 | 78 | # simulate `datamodule.prepare_data` 79 | train_data_dir = data_dir / "train" 80 | train_data_dir.mkdir(parents=True) 81 | fake_chunk(train_data_dir, [[12], [0, 23, 15, 63, 0], [73, 5, 0, 1, 1999, 0, 13]]) 82 | 83 | datamodule.setup() 84 | 85 | tr_dataloader = datamodule.train_dataloader() 86 | tr_dataloader.shuffle = False 87 | 88 | actual = tree_map(torch.Tensor.tolist, list(tr_dataloader)) 89 | 90 | # there is 1 sample per index in the data (13) 91 | assert actual == [ 92 | [[73, 5, 0]], 93 | [[12, 0, 23]], 94 | [[5, 0, 1]], 95 | [[0, 73, 5]], 96 | [[1999, 0, 13]], 97 | [[0, 1, 1999]], 98 | [[1, 1999, 0]], 99 | [[0, 23, 15]], 100 | [[13, 12, 0]], 101 | [[63, 0, 73]], 102 | [[23, 15, 63]], 103 | [[15, 63, 0]], 104 | [[0, 13, 12]], 105 | ] 106 | -------------------------------------------------------------------------------- /tests/ext_thunder/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # support running without installing as a package, adding extensions to the Python path 5 | wd = Path(__file__).parent.parent.parent.resolve() 6 | if wd.is_dir(): 7 | sys.path.append(str(wd)) 8 | else: 9 | import warnings 10 | 11 | warnings.warn(f"Could not find extensions directory at {wd}") 12 | -------------------------------------------------------------------------------- /tests/ext_thunder/test_thunder_networks.py: -------------------------------------------------------------------------------- 1 | """Run thunder tests as part of LitGPT CI""" 2 | 3 | from litgpt.utils import _THUNDER_AVAILABLE 4 | 5 | if _THUNDER_AVAILABLE: 6 | from thunder.tests.test_networks import * # noqa: F403 7 | else: 8 | print("Skipping test_thunder_networks.py (thunder not available)") 9 | -------------------------------------------------------------------------------- /tests/ext_thunder/test_thunder_pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import redirect_stdout 3 | from io import StringIO 4 | from unittest.mock import Mock 5 | 6 | import torch 7 | from torch.utils.data import DataLoader 8 | 9 | from litgpt import Config 10 | from litgpt.args import EvalArgs, TrainArgs 11 | from litgpt.utils import 
_THUNDER_AVAILABLE, _RunIf 12 | 13 | if _THUNDER_AVAILABLE: 14 | import extensions.thunder.pretrain as thunder_pretrain 15 | 16 | 17 | @_RunIf(min_cuda_gpus=1, thunder=True) 18 | def test_pretrain_thunder(tmp_path, monkeypatch): 19 | model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 20 | 21 | dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) 22 | dataloader = DataLoader(dataset) 23 | monkeypatch.setattr(thunder_pretrain, "get_dataloaders", Mock(return_value=(dataloader, dataloader))) 24 | monkeypatch.setattr(thunder_pretrain, "save_hyperparameters", Mock()) 25 | 26 | out_dir = tmp_path / "out" 27 | stdout = StringIO() 28 | with redirect_stdout(stdout): 29 | thunder_pretrain.setup( 30 | devices=1, 31 | model_config=model_config, 32 | out_dir=out_dir, 33 | train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), 34 | eval=EvalArgs(interval=1, max_iters=1), 35 | optimizer="AdamW", 36 | ) 37 | 38 | out_dir_contents = set(os.listdir(out_dir)) 39 | checkpoint_dirs = {"step-00000001", "step-00000002", "step-00000003", "step-00000004"} 40 | assert checkpoint_dirs.issubset(out_dir_contents) 41 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 42 | for checkpoint_dir in checkpoint_dirs: 43 | # the `tokenizer_dir` is None by default, so only 'lit_model.pth' shows here 44 | assert set(os.listdir(out_dir / checkpoint_dir)) == {"lit_model.pth", "model_config.yaml"} 45 | 46 | assert (out_dir / "logs" / "tensorboard" / "version_0").is_dir() 47 | 48 | logs = stdout.getvalue() 49 | assert logs.count("(step)") == 4 50 | assert logs.count("val loss") == 4 51 | assert "Total parameters: 1,888" in logs 52 | -------------------------------------------------------------------------------- /tests/generate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tests/generate/__init__.py -------------------------------------------------------------------------------- /tests/generate/test_adapter.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | import re 5 | import subprocess 6 | import sys 7 | from contextlib import redirect_stderr, redirect_stdout 8 | from io import StringIO 9 | from unittest.mock import ANY, Mock, call 10 | 11 | import pytest 12 | import torch 13 | import yaml 14 | 15 | skip_in_ci_on_macos = pytest.mark.skipif( 16 | sys.platform == "darwin" and os.getenv("GITHUB_ACTIONS") == "true", 17 | reason="Skipped on macOS in CI environment because CI machine does not have enough memory to run this test.", 18 | ) 19 | 20 | 21 | @skip_in_ci_on_macos 22 | @pytest.mark.parametrize("version", ("v1", "v2")) 23 | def test_main(fake_checkpoint_dir, monkeypatch, version, tensor_like): 24 | if version == "v1": 25 | import litgpt.generate.adapter as generate 26 | else: 27 | import litgpt.generate.adapter_v2 as generate 28 | 29 | config_path = fake_checkpoint_dir / "model_config.yaml" 30 | config = {"block_size": 128, "vocab_size": 50, "n_layer": 2, "n_head": 4, "n_embd": 8, "rotary_percentage": 1} 31 | config_path.write_text(yaml.dump(config)) 32 | 33 | monkeypatch.setattr(generate, "lazy_load", Mock()) 34 | monkeypatch.setattr(generate.GPT, "load_state_dict", Mock()) 35 | tokenizer_mock = Mock() 36 | tokenizer_mock.return_value.encode.return_value = torch.tensor([[1, 2, 3]]) 37 | tokenizer_mock.return_value.decode.return_value = "### Response:foo bar baz" 38 | monkeypatch.setattr(generate, "Tokenizer", tokenizer_mock) 39 | generate_mock = Mock() 40 | generate_mock.return_value = torch.tensor([[3, 2, 1]]) 41 | monkeypatch.setattr(generate, "generate", generate_mock) 42 | 43 | num_samples = 1 44 | out, err = StringIO(), StringIO() 45 | with redirect_stdout(out), redirect_stderr(err): 46 | generate.main(temperature=2.0, top_k=2, top_p=0.9, checkpoint_dir=fake_checkpoint_dir) 47 | 48 | assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples 49 | assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) 50 | assert ( 51 | generate_mock.mock_calls 52 | == [call(ANY, tensor_like, 101, temperature=2.0, top_k=2, top_p=0.9, eos_id=ANY)] * num_samples 53 | ) 54 | 55 | expected_output = "foo bar baz\n" * num_samples 56 | # Allow for the config to be printed before the expected repeated strings. 
57 | pattern = rf".*^{re.escape(expected_output.strip())}$.*" 58 | assert re.match(pattern, out.getvalue().strip(), re.DOTALL | re.MULTILINE) 59 | 60 | err_value = err.getvalue() 61 | expected_parts = [ 62 | "'padded_vocab_size': 512", 63 | "'n_layer': 2", 64 | "'n_head': 4", 65 | "'head_size': 2", 66 | "'n_embd': 8", 67 | ] 68 | assert all(part in err_value for part in expected_parts) 69 | 70 | 71 | @pytest.mark.parametrize("version", ("", "_v2")) 72 | def test_cli(version): 73 | args = ["litgpt", f"generate_adapter{version}", "-h"] 74 | output = subprocess.check_output(args) 75 | output = str(output.decode()) 76 | assert "For models finetuned with" in output 77 | -------------------------------------------------------------------------------- /tests/generate/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | def find_forward_hooks(module): 5 | mapping = defaultdict(list) 6 | for name, submodule in module.named_modules(): 7 | for hook in submodule._forward_pre_hooks.values(): 8 | hook_data = ("forward_pre_hook", hook.func.__name__, hook.args, hook.keywords) 9 | mapping[name].append(hook_data) 10 | for hook in submodule._forward_hooks.values(): 11 | hook_data = ("forward_hook", hook.func.__name__, hook.args, hook.keywords) 12 | mapping[name].append(hook_data) 13 | return dict(mapping) 14 | -------------------------------------------------------------------------------- /tests/test_args.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | import pytest 3 | 4 | from litgpt.args import TrainArgs 5 | 6 | 7 | def test_compute_warmup_iters(): 8 | # warmup disabled 9 | train = TrainArgs(lr_warmup_steps=0, lr_warmup_fraction=0) 10 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=1000, train_dataloader=range(10)) == 0 11 | 12 | # lr_warmup_steps and lr_warmup_fraction both are not allowed 13 | with pytest.raises(ValueError, match="Can't provide both `--train.lr_warmup_fraction`"): 14 | TrainArgs(lr_warmup_steps=1, lr_warmup_fraction=0.2) 15 | 16 | # lr_warmup_fraction invalid range 17 | with pytest.raises(ValueError, match=" must be between 0 and 1"): 18 | TrainArgs(lr_warmup_steps=0, lr_warmup_fraction=1.1) 19 | 20 | # lr_warmup_steps 21 | train = TrainArgs(global_batch_size=1, micro_batch_size=1, lr_warmup_steps=100, lr_warmup_fraction=0) 22 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=1000, train_dataloader=range(10)) == 100 23 | # lr_warmup_steps multiplied by accumulation factor 24 | train.global_batch_size = 4 25 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=1000, train_dataloader=range(10)) == 400 26 | assert train.warmup_iters(devices=2, num_nodes=1, max_iters=1000, train_dataloader=range(10)) == 200 27 | # lr_warmup_steps truncated by max iters 28 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=120, train_dataloader=range(10)) == 120 29 | 30 | # lr_warmup_fraction 31 | train = TrainArgs(global_batch_size=1, micro_batch_size=1, lr_warmup_steps=0, lr_warmup_fraction=0.3) 32 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=1000, train_dataloader=range(100)) == 30 33 | # lr_warmup_fraction truncated by max iters 34 | assert train.warmup_iters(devices=1, num_nodes=1, max_iters=20, train_dataloader=range(100)) == 20 35 | # lr_warmup_fraction rounds up 36 | assert train.warmup_iters(devices=1, num_nodes=1, 
max_iters=1000, train_dataloader=range(5)) == 2 37 | -------------------------------------------------------------------------------- /tests/test_ci.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 2 | 3 | from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE 4 | 5 | from litgpt.utils import _RunIf 6 | 7 | 8 | @_RunIf(min_cuda_gpus=1) 9 | def test_gpu_ci_installs_bitsandbytes(): 10 | assert _BITSANDBYTES_AVAILABLE, str(_BITSANDBYTES_AVAILABLE) 11 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from contextlib import redirect_stdout 3 | from io import StringIO 4 | from unittest import mock 5 | 6 | import pytest 7 | from packaging.version import Version 8 | 9 | from litgpt.__main__ import main 10 | 11 | 12 | def test_cli(): 13 | out = StringIO() 14 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "-h"]): 15 | main() 16 | out = out.getvalue() 17 | assert "usage: litgpt" in out 18 | assert ( 19 | "{download,chat,finetune,finetune_lora,finetune_full,finetune_adapter,finetune_adapter_v2," 20 | "pretrain,generate,generate_full,generate_adapter,generate_adapter_v2,generate_sequentially," 21 | "generate_speculatively,generate_tp,convert_to_litgpt,convert_from_litgpt,convert_pretrained_checkpoint," 22 | "merge_lora,evaluate,serve}" in out 23 | ) 24 | assert ( 25 | """Available subcommands: 26 | download Download weights or tokenizer data from the Hugging 27 | Face Hub. 28 | chat Chat with a model.""" 29 | in out 30 | ) 31 | assert """evaluate Evaluate a model with the LM Evaluation Harness.""" in out 32 | assert """serve Serve a LitGPT model using LitServe.""" in out 33 | out = StringIO() 34 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "finetune_lora", "-h"]): 35 | main() 36 | out = out.getvalue() 37 | assert ( 38 | """--lora_alpha LORA_ALPHA 39 | The LoRA alpha. (type: int, default: 16)""" 40 | in out 41 | ) 42 | 43 | if Version(f"{sys.version_info.major}.{sys.version_info.minor}") < Version("3.9"): 44 | # python 3.8 prints `Union[int, null]` instead of `Optional[int]` 45 | return 46 | 47 | out = StringIO() 48 | with pytest.raises(SystemExit), redirect_stdout(out), mock.patch("sys.argv", ["litgpt", "pretrain", "-h"]): 49 | main() 50 | out = out.getvalue() 51 | print(out) 52 | assert ( 53 | """--train.max_tokens MAX_TOKENS 54 | Total number of tokens to train on (type: 55 | Optional[int], default: 3000000000000)""" 56 | in out 57 | ) 58 | 59 | 60 | def test_rewrite_finetune_command(): 61 | out1 = StringIO() 62 | with pytest.raises(SystemExit), redirect_stdout(out1), mock.patch("sys.argv", ["litgpt", "fineune", "-h"]): 63 | main() 64 | out2 = StringIO() 65 | with pytest.raises(SystemExit), redirect_stdout(out2), mock.patch("sys.argv", ["litgpt", "fineune_lora", "-h"]): 66 | main() 67 | assert out1.getvalue() == out2.getvalue() 68 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import pytest 4 | import yaml 5 | 6 | import litgpt.config as config_module 7 | from litgpt import Config 8 | from litgpt.config import find_multiple 9 | 10 | 11 | def test_config(): 12 | config = Config() 13 | assert config.name == "" 14 | assert config.block_size == 4096 15 | 16 | config = Config(block_size=2048) 17 | assert config.block_size == 2048 18 | 19 | config = Config.from_name("pythia-14m") 20 | assert config.block_size == 512 21 | 22 | config = Config.from_name("pythia-14m", block_size=4096) 23 | assert config.block_size == 4096 24 | 25 | config = Config(hf_config={"name": "pythia-14m"}) 26 | assert config.name == "pythia-14m" 27 | 28 | 29 | def test_from_hf_name(): 30 | # by short-hand name 31 | config0 = Config.from_name("tiny-llama-1.1b") 32 | # or by huggingface hub repo name 33 | config1 = Config.from_name("TinyLlama-1.1B-intermediate-step-1431k-3T") 34 | assert config0 is not None 35 | assert config1 is not None 36 | assert config0 == config1 37 | 38 | 39 | def test_nonexisting_name(): 40 | with pytest.raises(ValueError, match="'invalid-model-name' is not a supported config name"): 41 | Config.from_name("invalid-model-name") 42 | 43 | 44 | @pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs]) 45 | def test_short_and_hf_names_are_equal_unless_on_purpose(config): 46 | # by short-hand name 47 | config0 = Config.from_name(config["name"]) 48 | # or by huggingface hub repo name 49 | config1 = Config.from_name(config["hf_config"]["name"]) 50 | assert config0.name == config1.name 51 | 52 | 53 | def test_from_hf_name_with_org_string(): 54 | # Test case 1: valid input 55 | config0 = Config.from_name("tiny-llama-1.1b") 56 | config1 = Config.from_name("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T") 57 | assert config0 is not None 58 | assert config1 is not None 59 | assert config0 == config1 60 | 61 | # Test case 2: invalid input - org not found 62 | with pytest.raises( 63 | ValueError, match="'UnknownOrg/TinyLlama-1.1B-intermediate-step-1431k-3T' is not a supported config name" 64 | ): 65 | Config.from_name("UnknownOrg/TinyLlama-1.1B-intermediate-step-1431k-3T") 66 | 67 | # Test case 3: invalid input - name not found 68 | with pytest.raises(ValueError, match="'TinyLlama/TinyLlama-XYZ' is not a supported config name"): 69 | Config.from_name("TinyLlama/TinyLlama-XYZ") 70 | 71 | 72 | def test_from_checkpoint(tmp_path): 73 | # 1. Neither `lit_config.py` nor matching config exists. 74 | with pytest.raises(FileNotFoundError, match="neither 'model_config.yaml' nor matching config exists"): 75 | Config.from_checkpoint(tmp_path / "non_existing_checkpoint") 76 | 77 | # 2. If `lit_config.py` doesn't exists, but there is a matching config in `litgpt/config.py`. 78 | config = Config.from_checkpoint(tmp_path / "pythia-14m") 79 | assert config.name == "pythia-14m" 80 | assert config.block_size == 512 81 | assert config.n_layer == 6 82 | 83 | # 3. If only `lit_config.py` exists. 84 | config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2} 85 | with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file: 86 | yaml.dump(config_data, file) 87 | config = Config.from_checkpoint(tmp_path) 88 | assert config.name == "pythia-14m" 89 | assert config.block_size == 24 90 | assert config.n_layer == 2 91 | 92 | # 4. 
Both `lit_config.py` and a matching config exist, but `lit_config.py` supersedes matching config 93 | (tmp_path / "pythia-14m").mkdir() 94 | with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file: 95 | yaml.dump(config_data, file) 96 | config = Config.from_checkpoint(tmp_path / "pythia-14m") 97 | assert config.name == "pythia-14m" 98 | assert config.block_size == 24 99 | assert config.n_layer == 2 100 | 101 | 102 | @pytest.mark.parametrize("head_size", [None, 128]) 103 | def test_head_size(head_size): 104 | config = Config(head_size) 105 | 106 | assert config.head_size == head_size or config.n_embd // config.n_head 107 | 108 | 109 | def test_find_multiple(): 110 | assert find_multiple(17, 5) == 20 111 | assert find_multiple(30, 7) == 35 112 | assert find_multiple(10, 2) == 10 113 | assert find_multiple(5, 10) == 10 114 | assert find_multiple(50254, 128) == 50304 115 | assert find_multiple(50254, 256) == 50432 116 | assert find_multiple(50254, 512) == 50688 117 | -------------------------------------------------------------------------------- /tests/test_config_hub.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import importlib.util 3 | from pathlib import Path 4 | from unittest import mock 5 | from unittest.mock import Mock 6 | 7 | import pytest 8 | from lightning.fabric.plugins import Precision 9 | 10 | from litgpt import Config 11 | from litgpt.utils import CLI 12 | 13 | fixed_pairs = [ 14 | ("litgpt/pretrain.py", "pretrain/debug.yaml"), 15 | ("litgpt/pretrain.py", "pretrain/tinyllama.yaml"), 16 | ("litgpt/pretrain.py", "pretrain/tinystories.yaml"), 17 | ( 18 | "litgpt/pretrain.py", 19 | "https://raw.githubusercontent.com/Lightning-AI/litgpt/4d55ab6d0aa404f0da0d03a80a8801ed60e07e83/config_hub/pretrain/tinystories.yaml", # TODO: Update with path from main after merge 20 | ), 21 | ] 22 | 23 | config_hub_path = Path(__file__).parent.parent / "config_hub" / "finetune" 24 | model_pairs = [] 25 | 26 | for model_dir in config_hub_path.iterdir(): 27 | if model_dir.is_dir(): 28 | model_name = model_dir.name 29 | for yaml_file in model_dir.glob("*.yaml"): 30 | config_name = yaml_file.stem 31 | python_file = "litgpt/finetune/full.py" if config_name == "full" else "litgpt/finetune/lora.py" 32 | relative_yaml_path = yaml_file.relative_to(config_hub_path.parent) 33 | model_pairs.append((python_file, str(relative_yaml_path))) 34 | 35 | all_pairs = fixed_pairs + model_pairs 36 | 37 | 38 | @pytest.mark.parametrize(("script_file", "config_file"), all_pairs) 39 | def test_config_help(script_file, config_file, monkeypatch): 40 | """Test that configs validate against the signature in the scripts.""" 41 | script_file = Path(__file__).parent.parent / script_file 42 | assert script_file.is_file() 43 | if "http" not in str(config_file): 44 | config_file = Path(__file__).parent.parent / "config_hub" / config_file 45 | assert config_file.is_file() 46 | 47 | spec = importlib.util.spec_from_file_location(str(script_file.parent.name), script_file) 48 | module = importlib.util.module_from_spec(spec) 49 | spec.loader.exec_module(module) 50 | 51 | monkeypatch.setattr(module, "main", Mock()) 52 | monkeypatch.setattr(module, "Tokenizer", Mock()) 53 | monkeypatch.setattr(module, "BitsandbytesPrecision", Mock(return_value=Precision()), raising=False) 54 | monkeypatch.setattr(module, "Config", Mock(return_value=Config.from_name("pythia-14m"))) 55 | monkeypatch.setattr(module, "check_valid_checkpoint_dir", Mock(), raising=False) 56 | 
57 | try: 58 | with mock.patch("sys.argv", [script_file.name, "--config", str(config_file), "--devices", "1"]): 59 | CLI(module.setup) 60 | module.main.assert_called_once() 61 | except FileNotFoundError: 62 | pass 63 | # FileNotFound occurs here because we have not downloaded the model weights referenced in the config files 64 | # which is ok because here we just want to validate the config file itself. 65 | -------------------------------------------------------------------------------- /tests/test_distributed.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from lightning import Fabric 4 | 5 | from litgpt.utils import _RunIf 6 | 7 | 8 | @_RunIf(min_cuda_gpus=2, standalone=True) 9 | @pytest.mark.parametrize("strategy", ["ddp", "fsdp"]) 10 | def test_no_backward_sync(strategy): 11 | fabric = Fabric(devices=2, accelerator="cuda", strategy=strategy) 12 | fabric.launch() 13 | 14 | # account for sharding in the case of FSDP 15 | out_features = 1 if "ddp" in strategy else fabric.world_size 16 | 17 | model = torch.nn.Linear(1, out_features, bias=False, device=fabric.device) 18 | x = torch.randn(1, 1, device=fabric.device) 19 | model = fabric.setup(model) 20 | 21 | # 6 iters, 3 grad accumulation iters 22 | for i, enabled in enumerate((True, True, False, True, True, False), 1): 23 | x = torch.tensor([i * (fabric.local_rank + 1)], device=fabric.device, dtype=torch.float32) 24 | 25 | with fabric.no_backward_sync(model, enabled): 26 | y = model(x) 27 | fabric.backward(y.sum()) 28 | if not enabled: 29 | # Math for the first 3 iters 30 | # 31 | # DistributedDataParallel 32 | # (1*1+2*1+3*1 + 1*2+2*2+3*2) / 2 = 9 33 | # ^^^^^^^^^^^ ^^^^^^^^^^^ ^^^ 34 | # rank0 rank1 allreduce 35 | # 36 | # thunder.distributed.ddp 37 | # ((1*1+2*1) + (1*2+2*2)) / 2 + (3*1 + 3*2) / 2 = 9 38 | # ^^^^^^^ ^^^^^^^ ^^^ ^^^ ^^^ ^^^ 39 | # rank0 rank1 allreduce1 rank0 rank1 allreduce2 40 | assert model.weight.grad.shape.numel() == 1, model.weight.grad.shape 41 | assert model.weight.grad.item() == (9.0 if i == 3 else 22.5) 42 | model.weight.grad = None 43 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import subprocess 4 | from contextlib import redirect_stdout 5 | from dataclasses import asdict 6 | from io import StringIO 7 | from unittest import mock 8 | 9 | import pytest 10 | import torch 11 | import yaml 12 | 13 | import litgpt.eval.evaluate as module 14 | from litgpt import GPT, Config 15 | from litgpt.scripts.download import download_from_hub 16 | 17 | 18 | @pytest.mark.flaky(reruns=3) 19 | def test_evaluate_script(tmp_path): 20 | ours_config = Config.from_name("pythia-14m") 21 | download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) 22 | checkpoint_dir = tmp_path / "EleutherAI" / "pythia-14m" 23 | ours_model = GPT(ours_config) 24 | torch.save(ours_model.state_dict(), checkpoint_dir / "lit_model.pth") 25 | with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: 26 | yaml.dump(asdict(ours_config), fp) 27 | 28 | stdout = StringIO() 29 | with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): 30 | with pytest.raises(ValueError) as excinfo: 31 | module.convert_and_evaluate( 32 | checkpoint_dir, 33 | out_dir=tmp_path / "out_dir", 34 | device=None, 35 | dtype=torch.float32, 36 | limit=5, 37 | tasks="logiqa", 38 | batch_size=0, # Test for non-positive integer 39 | ) 40 | assert "batch_size must be a positive integer, 'auto', or in the format 'auto:N'." in str(excinfo.value) 41 | 42 | with pytest.raises(ValueError) as excinfo: 43 | module.convert_and_evaluate( 44 | checkpoint_dir, 45 | out_dir=tmp_path / "out_dir", 46 | device=None, 47 | dtype=torch.float32, 48 | limit=5, 49 | tasks="logiqa", 50 | batch_size="invalid", # Test for invalid string 51 | ) 52 | assert "batch_size must be a positive integer, 'auto', or in the format 'auto:N'." in str(excinfo.value) 53 | 54 | stdout = StringIO() 55 | with redirect_stdout(stdout), mock.patch("sys.argv", ["eval/evaluate.py"]): 56 | module.convert_and_evaluate( 57 | checkpoint_dir, 58 | out_dir=tmp_path / "out_dir", 59 | device=None, 60 | dtype=torch.float32, 61 | limit=5, 62 | tasks="logiqa", 63 | batch_size=1, # Valid case 64 | ) 65 | stdout = stdout.getvalue() 66 | assert (tmp_path / "out_dir" / "results.json").is_file() 67 | assert "logiqa" in stdout 68 | assert "Metric" in stdout 69 | assert "Loading checkpoint shards" not in stdout 70 | 71 | 72 | def test_cli(): 73 | args = ["litgpt", "evaluate", "-h"] 74 | output = subprocess.check_output(args) 75 | output = str(output.decode()) 76 | assert "Evaluate a model with the LM Evaluation Harness" in output 77 | -------------------------------------------------------------------------------- /tests/test_full.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | from contextlib import redirect_stdout 5 | from io import StringIO 6 | from unittest import mock 7 | from unittest.mock import Mock 8 | 9 | import torch 10 | import yaml 11 | 12 | import litgpt.finetune.full as module 13 | from litgpt.args import EvalArgs, TrainArgs 14 | from litgpt.data import Alpaca 15 | 16 | 17 | @mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) 18 | def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch, alpaca_path): 19 | model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) 20 | (fake_checkpoint_dir / "model_config.yaml").write_text(yaml.dump(model_config)) 21 | monkeypatch.setattr(module, "load_checkpoint", Mock()) 22 | 23 | tokenizer_mock = Mock() 24 | tokenizer_mock.return_value = tokenizer_mock 25 | tokenizer_mock.encode = lambda *_, **__: torch.tensor([3, 2, 1]) 26 | monkeypatch.setattr(module, "Tokenizer", tokenizer_mock) 27 | 28 | out_dir = tmp_path / "out" 29 | setup_args = (fake_checkpoint_dir,) 30 | setup_kwargs = dict( 31 | data=Alpaca(download_dir=alpaca_path.parent, file_name=alpaca_path.name, val_split_fraction=0.5, num_workers=0), 32 | out_dir=out_dir, 33 | precision="32-true", 34 | train=TrainArgs(global_batch_size=1, save_interval=2, epochs=1, max_steps=6, micro_batch_size=1), 35 | eval=EvalArgs(interval=2, max_iters=2, max_new_tokens=1), 36 | ) 37 | stdout = StringIO() 38 | with redirect_stdout(stdout), mock.patch("sys.argv", ["full.py", str(fake_checkpoint_dir)]): 39 | module.setup(*setup_args, **setup_kwargs) 40 | 41 | out_dir_contents = set(os.listdir(out_dir)) 42 | checkpoint_dirs = {"step-000002", "step-000004", "step-000006", "final"} 43 | assert checkpoint_dirs.issubset(out_dir_contents) 44 | assert all((out_dir / p).is_dir() for p in checkpoint_dirs) 45 | for checkpoint_dir in checkpoint_dirs: 46 | assert set(os.listdir(out_dir / checkpoint_dir)) == { 47 | "lit_model.pth", 48 | "model_config.yaml", 49 | "tokenizer_config.json", 50 | "tokenizer.json", 51 | "hyperparameters.yaml", 52 | "prompt_style.yaml", 53 | } 54 | assert (out_dir / "logs" / "csv" / "version_0" / "metrics.csv").is_file() 55 | 56 | logs = stdout.getvalue() 57 | assert logs.count("(step)") == 6 58 | assert logs.count("val loss") == 4 # 3 validations + 1 final validation 59 | assert logs.count("Final evaluation") == 1 60 | assert "of trainable parameters: 1,888" in logs 61 | 62 | # Resume training and do 2 steps more 63 | setup_kwargs["train"].max_steps = 8 64 | setup_kwargs["resume"] = True 65 | stdout = StringIO() 66 | with redirect_stdout(stdout), mock.patch("sys.argv", ["full.py", str(fake_checkpoint_dir)]): 67 | module.setup(*setup_args, **setup_kwargs) 68 | logs = stdout.getvalue() 69 | assert f"Resuming training from {out_dir / 'step-000006' / 'lit_model.pth'}" in logs 70 | assert logs.count("(step)") == 2 71 | assert out_dir / "step-000008" in set(out_dir.iterdir()) 72 | -------------------------------------------------------------------------------- /tests/test_merge_lora.py: -------------------------------------------------------------------------------- 1 | # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
2 | 3 | import os 4 | import shutil 5 | from contextlib import redirect_stdout 6 | from io import StringIO 7 | from pathlib import Path 8 | from unittest import mock 9 | 10 | import pytest 11 | import torch 12 | import yaml 13 | 14 | from litgpt.lora import GPT as LoRAGPT 15 | from litgpt.lora import lora_filter 16 | from litgpt.model import GPT 17 | from litgpt.scripts.merge_lora import load_lora_metadata, merge_lora 18 | 19 | 20 | @mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) 21 | @pytest.mark.parametrize( 22 | ("pretrained_dtype", "lora_dtype"), [(None, None), (torch.float16, torch.float32), (torch.float16, torch.bfloat16)] 23 | ) 24 | def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype): 25 | pretrained_checkpoint_dir = tmp_path / "pretrained" 26 | lora_checkpoint_dir = tmp_path / "lora" 27 | shutil.copytree(fake_checkpoint_dir, pretrained_checkpoint_dir) 28 | shutil.copytree(fake_checkpoint_dir, lora_checkpoint_dir) 29 | (lora_checkpoint_dir / "lit_model.pth").unlink() # should not already exist 30 | shutil.rmtree(tmp_path / "checkpoints") 31 | 32 | # Create a fake pretrained checkpoint 33 | config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16) 34 | with open(pretrained_checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp: 35 | yaml.dump(config, fp) 36 | base_model = GPT.from_name("pythia-14m", **config).to(dtype=pretrained_dtype) 37 | state_dict = base_model.state_dict() 38 | assert len(state_dict) == 40 39 | torch.save(state_dict, pretrained_checkpoint_dir / "lit_model.pth") 40 | 41 | # Create a fake LoRA checkpoint 42 | lora_kwargs = dict(lora_r=8, lora_alpha=16, lora_dropout=0.05, lora_query=True, lora_value=True) 43 | lora_model = LoRAGPT.from_name("pythia-14m", **config, **lora_kwargs).to(dtype=lora_dtype) 44 | state_dict = {k: v for k, v in lora_model.state_dict().items() if lora_filter(k, v)} 45 | assert len(state_dict) == 6 46 | torch.save(state_dict, lora_checkpoint_dir / "lit_model.pth.lora") 47 | hparams = dict(checkpoint_dir=str(pretrained_checkpoint_dir), **lora_kwargs) 48 | with open(lora_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file: 49 | yaml.dump(hparams, file) 50 | shutil.copyfile(pretrained_checkpoint_dir / "model_config.yaml", lora_checkpoint_dir / "model_config.yaml") 51 | 52 | assert set(os.listdir(tmp_path)) == {"lora", "pretrained"} 53 | merge_lora(lora_checkpoint_dir) 54 | assert set(os.listdir(tmp_path)) == {"lora", "pretrained"} 55 | assert set(os.listdir(lora_checkpoint_dir)) == { 56 | "model_config.yaml", 57 | "lit_model.pth", 58 | "lit_model.pth.lora", 59 | "tokenizer.json", 60 | "tokenizer_config.json", 61 | "hyperparameters.yaml", 62 | } 63 | 64 | # Assert that the merged weights can be loaded back into the base model 65 | merged = torch.load(lora_checkpoint_dir / "lit_model.pth") 66 | keys = base_model.load_state_dict(merged, strict=True) 67 | assert not keys.missing_keys 68 | assert not keys.unexpected_keys 69 | 70 | # Attempt to merge again 71 | stdout = StringIO() 72 | with redirect_stdout(stdout): 73 | merge_lora(lora_checkpoint_dir) 74 | assert "LoRA weights have already been merged" in stdout.getvalue() 75 | 76 | 77 | def test_load_lora_metadata(fake_checkpoint_dir): 78 | assert not (fake_checkpoint_dir / "hyperparameters.yaml").is_file() 79 | with pytest.raises(FileNotFoundError, match="missing a `hyperparameters.yaml` file"): 80 | load_lora_metadata(fake_checkpoint_dir) 81 | 82 | hparams = dict(precision="bf16-mixed", 
checkpoint_dir="checkpoints/meta-llama/Llama-2-7b", lora_r=8, lora_alpha=16) 83 | with open(fake_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file: 84 | yaml.dump(hparams, file) 85 | 86 | lora_args, pretrained_dir, precision = load_lora_metadata(fake_checkpoint_dir) 87 | assert lora_args == dict(lora_r=8, lora_alpha=16) 88 | assert pretrained_dir == Path("checkpoints/meta-llama/Llama-2-7b") 89 | assert precision == "bf16-mixed" 90 | -------------------------------------------------------------------------------- /tutorials/convert_hf_checkpoint.md: -------------------------------------------------------------------------------- 1 | # Converting Hugging Face Transformers to LitGPT weights 2 | 3 | By default, the `litgpt download` command converts the downloaded HF checkpoint files into a LitGPT compatible format after downloading. For example, 4 | 5 | ```bash 6 | litgpt download EleutherAI/pythia-14m 7 | ``` 8 | 9 | creates the following files: 10 | 11 | ``` 12 | checkpoints/ 13 | └── EleutherAI/ 14 | └── pythia-14m/ 15 | ├── config.json 16 | ├── generation_config.json 17 | ├── model_config.yaml # LitGPT specific file 18 | ├── lit_model.pth # LitGPT specific file 19 | ├── pytorch_model.bin 20 | ├── tokenizer.json 21 | └── tokenizer_config.json 22 | ``` 23 | 24 | 25 | 26 | To disable the automatic conversion, which is useful for development and debugging purposes, you can run the `litgpt download` with the `--convert_checkpoint false` flag. This will only download the checkpoint files but do not convert them for use in LitGPT: 27 | 28 | ```bash 29 | rm -rf checkpoints/EleutherAI/pythia-14m 30 | 31 | litgpt download EleutherAI/pythia-14m \ 32 | --convert_checkpoint false 33 | 34 | ls checkpoints/EleutherAI/pythia-14m 35 | ``` 36 | 37 | ``` 38 | checkpoints/ 39 | └── EleutherAI/ 40 | └── pythia-14m/ 41 | ├── config.json 42 | ├── generation_config.json 43 | ├── pytorch_model.bin 44 | ├── tokenizer.json 45 | └── tokenizer_config.json 46 | ``` 47 | 48 | The required files `model_config.yaml` and `lit_model.pth` files can then be manually generated via the `litgpt/scripts/convert_hf_checkpoint.py` script: 49 | 50 | ```bash 51 | litgpt convert_to_litgpt checkpoints/EleutherAI/pythia-14m 52 | ``` 53 | -------------------------------------------------------------------------------- /tutorials/deploy.md: -------------------------------------------------------------------------------- 1 | # Serve and Deploy LLMs 2 | 3 | This document shows how you can serve a LitGPT for deployment. 4 | 5 | 6 |   7 | ## Serve an LLM with LitServe 8 | 9 | This section illustrates how we can set up an inference server for a phi-2 LLM using `litgpt serve` that is minimal and highly scalable. 10 | 11 | 12 |   13 | ### Step 1: Start the inference server 14 | 15 | 16 | ```bash 17 | # 1) Download a pretrained model (alternatively, use your own finetuned model) 18 | litgpt download microsoft/phi-2 19 | 20 | # 2) Start the server 21 | litgpt serve microsoft/phi-2 22 | ``` 23 | 24 | > [!TIP] 25 | > Use `litgpt serve --help` to display additional options, including the port, devices, LLM temperature setting, and more. 26 | 27 | 28 |   29 | ### Step 2: Query the inference server 30 | 31 | You can now send requests to the inference server you started in step 2. 
For example, in a new Python session, we can send requests to the inference server as follows: 32 | 33 | 34 | ```python 35 | import requests, json 36 | 37 | response = requests.post( 38 | "http://127.0.0.1:8000/predict", 39 | json={"prompt": "Fix typos in the following sentence: Example input"} 40 | ) 41 | 42 | print(response.json()["output"]) 43 | ``` 44 | 45 | Executing the code above prints the following output: 46 | 47 | ``` 48 | Example input. 49 | ``` 50 | 51 |   52 | ### Optional: Use the streaming mode 53 | 54 | The 2-step procedure described above returns the complete response all at once. If you want to stream the response on a token-by-token basis, start the server with the streaming option enabled: 55 | 56 | ```bash 57 | litgpt serve microsoft/phi-2 --stream true 58 | ``` 59 | 60 | Then, use the following updated code to query the inference server: 61 | 62 | ```python 63 | import requests, json 64 | 65 | response = requests.post( 66 | "http://127.0.0.1:8000/predict", 67 | json={"prompt": "Fix typos in the following sentence: Example input"}, 68 | stream=True 69 | ) 70 | 71 | # stream the response 72 | for line in response.iter_lines(decode_unicode=True): 73 | if line: 74 | print(json.loads(line)["output"], end="") 75 | ``` 76 | 77 | ``` 78 | Sure, here is the corrected sentence: 79 | 80 | Example input 81 | ``` 82 | 83 |   84 | ## Serve an LLM UI with Chainlit 85 | 86 | If you are interested in developing a simple ChatGPT-like UI prototype, see the Chainlit tutorial in the following Studio: 87 | 88 | 89 | Open In Studio 90 | 91 | -------------------------------------------------------------------------------- /tutorials/developer-docs/README.md: -------------------------------------------------------------------------------- 1 | LitGPT developer documentation files. 2 | -------------------------------------------------------------------------------- /tutorials/developer-docs/python-api.md: -------------------------------------------------------------------------------- 1 | # LitGPT High-level Python API 2 | 3 | This is a work-in-progress draft for a high-level LitGPT Python API. 4 | 5 |   6 | ## Model loading & saving 7 | 8 | The `LLM.load` command loads an `llm` object, which contains both the model object (a PyTorch module) and a preprocessor. 9 | 10 | ```python 11 | from litgpt import LLM 12 | 13 | llm = LLM.load( 14 | model="url | local_path", 15 | # high-level user only needs to care about those: 16 | memory_reduction="none | medium | strong", 17 | # advanced options for technical users: 18 | source="hf | local | other", 19 | quantize="bnb.nf4", 20 | precision="bf16-true", 21 | device="auto | cuda | cpu", 22 | ) 23 | ``` 24 | 25 | Here, 26 | 27 | - `llm.model` contains the PyTorch Module 28 | - and `llm.preprocessor.tokenizer` contains the tokenizer 29 | 30 | The `llm.save` command saves the model weights, tokenizer, and configuration information. 31 | 32 | 33 | ```python 34 | llm.save(checkpoint_dir, format="lightning | ollama | hf") 35 | ``` 36 | 37 | 38 |   39 | ## Inference / Chat 40 | 41 | ``` 42 | response = llm.generate( 43 | prompt="What do Llamas eat?", 44 | temperature=0.1, 45 | top_p=0.8, 46 | ... 47 | ) 48 | ``` 49 | 50 | 51 |   52 | ## Dataset 53 | 54 | The `llm.download_dataset` command downloads a dataset, and the `llm.prepare_dataset` command prepares it for training. 55 | 56 | ``` 57 | llm.download_dataset( 58 | URL, 59 | ... 60 | ) 61 | ``` 62 | 63 | ``` 64 | dataset = llm.prepare_dataset( 65 | path, 66 | task="pretrain | instruction_finetune", 67 | test_portion=0.1, 68 | ... 
69 | ) 70 | ``` 71 | 72 |   73 | ## Training 74 | 75 | 76 | ```python 77 | llm.instruction_finetune( 78 | config=None, 79 | dataset=dataset, 80 | max_iter=10, 81 | method="full | lora | adapter | adapter_v2" 82 | ) 83 | ``` 84 | 85 | ```python 86 | llm.pretrain(config=None, dataset=dataset, max_iter=10, ...) 87 | ``` 88 | 89 |   90 | ## Serving 91 | 92 | 93 | ```python 94 | llm.serve(port=8000) 95 | ``` 96 | 97 | Then in another Python session: 98 | 99 | ```python 100 | import requests, json 101 | 102 | response = requests.post( 103 | "http://127.0.0.1:8000/predict", 104 | json={"prompt": "Fix typos in the following sentence: Example input"} 105 | ) 106 | 107 | print(response.json()["output"]) 108 | ``` 109 | -------------------------------------------------------------------------------- /tutorials/examples/ptl-trainer/README.md: -------------------------------------------------------------------------------- 1 | ## Minimal PyTorch Lightning Trainer Example 2 | 3 | 4 | 5 | The script in this folder provides minimal examples showing how to train a LitGPT model using LitGPT's `GPT` class with the [PyTorch Lightning](https://github.com/Lightning-AI/pytorch-lightning) Trainer. 6 | 7 | You can run the scripts as follows: 8 | 9 |   10 | ## Small 160M model: 11 | 12 | ```bash 13 | # Download the Pythia model 14 | litgpt download EleutherAI/pythia-160m 15 | 16 | python litgpt_ptl_small.py 17 | ``` 18 | 19 |   20 | ## Medium-sized 8B model: 21 | 22 | ```bash 23 | # Download the Llama 3.1 model 24 | litgpt download meta-llama/Meta-Llama-3.1-8B --access_token hf_... 25 | 26 | python litgpt_ptl_medium.py 27 | ``` 28 | -------------------------------------------------------------------------------- /tutorials/examples/ptl-trainer/litgpt_ptl_medium.py: -------------------------------------------------------------------------------- 1 | import lightning as L 2 | import torch 3 | 4 | import litgpt 5 | from litgpt.data import Alpaca2k 6 | from litgpt.lora import GPT, merge_lora_weights 7 | 8 | 9 | class LitLLM(L.LightningModule): 10 | def __init__(self): 11 | super().__init__() 12 | self.model = GPT.from_name( 13 | name="Llama-3.1-8B", 14 | lora_r=32, 15 | lora_alpha=16, 16 | lora_dropout=0.05, 17 | lora_key=False, 18 | lora_value=True, 19 | ) 20 | litgpt.lora.mark_only_lora_as_trainable(self.model) 21 | 22 | def on_train_start(self): 23 | state_dict = torch.load("checkpoints/meta-llama/Meta-Llama-3.1-8B/lit_model.pth", mmap=True) 24 | self.model.load_state_dict(state_dict, strict=False) 25 | 26 | def training_step(self, batch): 27 | input_ids, targets = batch["input_ids"], batch["labels"] 28 | logits = self.model(input_ids) 29 | loss = litgpt.utils.chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:]) 30 | self.log("train_loss", loss, prog_bar=True) 31 | return loss 32 | 33 | def configure_optimizers(self): 34 | warmup_steps = 10 35 | optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.0002, weight_decay=0.0, betas=(0.9, 0.95)) 36 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) 37 | return [optimizer], [scheduler] 38 | 39 | 40 | if __name__ == "__main__": 41 | data = Alpaca2k() 42 | tokenizer = litgpt.Tokenizer("checkpoints/meta-llama/Meta-Llama-3.1-8B") 43 | data.connect(tokenizer, batch_size=1, max_seq_length=512) 44 | 45 | trainer = L.Trainer( 46 | devices=1, 47 | max_epochs=2, 48 | accumulate_grad_batches=8, 49 | precision="bf16-true", 50 | ) 51 | with trainer.init_module(empty_init=True): 52 | model = LitLLM() 53 | 54 | trainer.fit(model, 
data) 55 | 56 | # Save final checkpoint 57 | merge_lora_weights(model.model) 58 | trainer.save_checkpoint("checkpoints/finetuned.ckpt", weights_only=True) 59 | -------------------------------------------------------------------------------- /tutorials/finetune.md: -------------------------------------------------------------------------------- 1 | # Finetuning 2 | 3 | We provide simple finetuning commands (`litgpt finetune_*`) that instruction-finetune a pretrained model on datasets such as [Alpaca](https://github.com/tatsu-lab/stanford_alpaca), [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm), and others. For more information on the supported instruction datasets and how to prepare your own custom datasets, please see the [tutorials/prepare_dataset](prepare_dataset.md) tutorial. 4 | 5 | LitGPT currently supports the following finetuning methods: 6 | 7 | ```bash 8 | litgpt finetune_full 9 | litgpt finetune_lora 10 | litgpt finetune_adapter 11 | litgpt finetune_adapter_v2 12 | ``` 13 | 14 |   15 | > [!TIP] 16 | > To install all required dependencies before finetuning, first run `pip install "litgpt[all]"`. 17 |   18 | 19 | 20 | The following section provides more details about these methods, including links to additional resources. 21 | 22 | 23 |   24 | ## LitGPT finetuning commands 25 | 26 | The section below provides additional information on the available methods and links to further resources. 27 | 28 |   29 | ### Full finetuning 30 | 31 | ```bash 32 | litgpt finetune_full 33 | ``` 34 | 35 | This method trains all model weight parameters and is the most memory-intensive finetuning technique in LitGPT. 36 | 37 | **More information and resources:** 38 | 39 | - the LitGPT [tutorials/finetune_full](finetune_full.md) tutorial 40 | 41 | 42 |   43 | ### LoRA and QLoRA finetuning 44 | 45 | ```bash 46 | litgpt finetune_lora stabilityai/stablelm-base-alpha-3b 47 | ``` 48 | 49 | LoRA and QLoRA are parameter-efficient finetuning techniques that only require updating a small number of parameters, which makes them a more memory-efficient alternative to full finetuning. 50 | 51 | **More information and resources:** 52 | 53 | - the LitGPT [tutorials/finetune_lora](finetune_lora.md) tutorial 54 | - the LoRA paper ([Hu et al. 2021](https://arxiv.org/abs/2106.09685)) 55 | - the conceptual tutorial [Parameter-Efficient LLM Finetuning With Low-Rank Adaptation (LoRA)](https://lightning.ai/pages/community/tutorial/lora-llm/) 56 | 57 | 58 |   59 | ### Adapter finetuning 60 | 61 | ```bash 62 | litgpt finetune_adapter stabilityai/stablelm-base-alpha-3b 63 | ``` 64 | 65 | or 66 | 67 | ```bash 68 | litgpt finetune_adapter_v2 stabilityai/stablelm-base-alpha-3b 69 | ``` 70 | 71 | Similar to LoRA, adapter finetuning is a parameter-efficient finetuning technique that only requires training a small subset of weight parameters, making this finetuning method more memory-efficient than full-parameter finetuning. 72 | 73 | **More information and resources:** 74 | 75 | - the LitGPT [tutorials/finetune_adapter](finetune_adapter.md) tutorial 76 | - the Llama-Adapter ([Zhang et al. 2023](https://arxiv.org/abs/2303.16199)) and Llama-Adapter v2 ([Gao et al. 
2023](https://arxiv.org/abs/2304.15010)) papers that originally introduced these methods 77 | - the conceptual tutorial [Understanding Parameter-Efficient Finetuning of Large Language Models: From Prefix Tuning to LLaMA-Adapters](https://lightning.ai/pages/community/article/understanding-llama-adapters/) 78 | -------------------------------------------------------------------------------- /tutorials/finetune_full.md: -------------------------------------------------------------------------------- 1 | # Finetuning the whole model 2 | 3 | If you are interested in parameter-efficient finetuning, check out [finetune_adapter.md](finetune_adapter.md). In contrast to parameter-efficient finetuning, this "full" approach finetunes all model parameters, which is substantially more expensive. It is mainly recommended as a baseline for comparison studies. 4 | 5 | ## Preparation 6 | 7 | The steps here only need to be done once: 8 | 9 | 1. Follow the instructions in the [README](../README.md) to install the dependencies. 10 | 2. Download and convert the weights following our [guide](download_model_weights.md). 11 | 12 | LitGPT provides common datasets for finetuning, such as Alpaca, LIMA, Dolly, and more. 13 | You can optionally [prepare your own dataset](#tune-on-your-dataset). 14 | For more information about dataset preparation, also see the [prepare_dataset.md](./prepare_dataset.md) tutorial. 15 | 16 | ## Running the finetuning 17 | 18 | ```bash 19 | litgpt finetune_full tiiuae/falcon-7b \ 20 | --data Alpaca 21 | ``` 22 | 23 | Finetuning the falcon-7b model requires at least 8 GPUs with ~40 GB memory each. 24 | 25 | You can speed up training by passing the `devices` argument to the script to utilize more GPUs if available. 26 | Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently. 27 | 28 | This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name: 29 | 30 | ```bash 31 | litgpt finetune_full tiiuae/falcon-7b \ 32 | --data Alpaca \ 33 | --out_dir out/full/my-model-finetuned 34 | ``` 35 | 36 | If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument. 37 | For instance, to fine-tune on MPS (the GPU on modern Macs), you can run 38 | 39 | ```bash 40 | litgpt finetune_full tiiuae/falcon-7b \ 41 | --data Alpaca \ 42 | --out_dir out/full/my-model-finetuned \ 43 | --precision 32-true 44 | ``` 45 | 46 | Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. 47 | 48 | ## Test the model 49 | 50 | You can test the finetuned model with your own instructions by running: 51 | 52 | ```bash 53 | litgpt generate tiiuae/falcon-7b \ 54 | --prompt "Recommend a movie to watch on the weekend." \ 55 | --finetuned_path out/full/my-model-finetuned/lit_model_finetuned.pth 56 | ``` 57 | 58 | Output: 59 | 60 | ```text 61 | A good movie to watch on the weekend would be The Lion King, since it's a classic family film that everyone can enjoy... 62 | ``` 63 | 64 | If your GPU supports `bfloat16`, the script will automatically use it. 65 | 66 | ## Tune on your dataset 67 | 68 | You can easily train on your own instruction dataset saved in JSON format. 69 | 70 | 1. Create a JSON file in which each row holds one instruction-response pair. 71 | A row has an entry for 'instruction' and 'output', and optionally 'input'. 
Note that currently, the 'input' field is only used in the Alpaca chat template. If you are using the Alpaca template, 'input' can be the empty string if the instruction doesn't require a context. 72 | Below is an example json file: 73 | 74 | ```text 75 | [ 76 | { 77 | "instruction": "Arrange the given numbers in ascending order.", 78 | "input": "2, 4, 0, 8, 3", // Optional: only used in Alpaca chat template 79 | "output": "0, 2, 3, 4, 8" 80 | }, 81 | ... 82 | ] 83 | ``` 84 | 85 | 2. Run `litgpt finetune` by passing in the location of your data (and optionally other parameters): 86 | 87 | ```bash 88 | litgpt finetune tiiuae/falcon-7b \ 89 | --data JSON \ 90 | --data.json_path data/mydata.json \ 91 | --out_dir data/mydata-finetuned 92 | ``` 93 | -------------------------------------------------------------------------------- /tutorials/full_finetune_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is meant to be the simplest possible starting point for full finetuning a GPT model using lightning fabric with code (not CLI). 3 | 4 | - no checkpoints 5 | - no out dir 6 | - no precision 7 | - no resume 8 | - no train/eval args (or any args in general) 9 | - no logger (only to terminal) 10 | - no grad accumulation 11 | and no other fancy stuff. 12 | 13 | To add all the above stuff, you can slowly add them in yourself by looking at the code in litgpt/finetune/full.py or the docs for litgpt/fabric. 14 | """ 15 | 16 | import os 17 | 18 | import lightning as L 19 | import torch 20 | import torch.nn as nn 21 | 22 | from litgpt.data import Alpaca 23 | from litgpt.model import GPT, Config 24 | from litgpt.tokenizer import Tokenizer 25 | from litgpt.utils import num_parameters 26 | 27 | # training params/args 28 | SEED = 1337 29 | MODEL_NAME = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" # try also "stabilityai/stablelm-base-alpha-3b"! 
30 | BATCH_SIZE = 4 31 | LR_WARMUP_STEPS = 100 32 | MAX_STEPS = 601 33 | 34 | 35 | def validate(model, val_dataloader): 36 | model.eval() 37 | loss = 0 38 | with torch.no_grad(): 39 | for batch in val_dataloader: 40 | input_ids, targets = batch["input_ids"], batch["labels"] 41 | logits = model(input_ids) 42 | logits = logits.reshape(-1, logits.size(-1)) 43 | targets = targets.reshape(-1) 44 | loss += nn.functional.cross_entropy(logits[..., :-1, :], targets[..., 1:]) 45 | fabric.print(f"Validation loss: {loss / len(val_dataloader)}") 46 | 47 | 48 | def train(fabric, model, optimizer, scheduler, train_dataloader, val_dataloader): 49 | for iter_num, batch in enumerate(train_dataloader): 50 | input_ids, targets = batch["input_ids"], batch["labels"] 51 | 52 | # get model preds (logits) 53 | logits = model(input_ids) 54 | logits = logits.reshape(-1, logits.size(-1)) 55 | 56 | # get loss 57 | targets = targets.reshape(-1) 58 | loss = nn.functional.cross_entropy(logits[..., :-1, :], targets[..., 1:]) 59 | 60 | # update weights 61 | fabric.backward(loss) 62 | optimizer.step() 63 | optimizer.zero_grad() 64 | scheduler.step() 65 | 66 | # print train loss every 100 steps 67 | if iter_num % 100 == 0 or iter_num == 0: 68 | fabric.print(f"Train iter {iter_num} - loss {loss}") 69 | 70 | # validate every 300 steps 71 | if iter_num % 300 == 0 or iter_num == 0: 72 | validate(model, val_dataloader) 73 | model.train() 74 | iter_num += 1 75 | 76 | if iter_num >= MAX_STEPS: 77 | break 78 | 79 | 80 | def main(fabric): 81 | fabric.seed_everything(SEED) 82 | 83 | # setup data, make tokenizer and make dataloaders 84 | data = Alpaca() 85 | tokenizer = Tokenizer(checkpoint_dir=f"checkpoints/{MODEL_NAME}") 86 | data.connect(tokenizer=tokenizer, batch_size=BATCH_SIZE, max_seq_length=1024) 87 | data.setup() 88 | train_dataloader = data.train_dataloader() 89 | val_dataloader = data.val_dataloader() 90 | train_dataloader, val_dataloader = fabric.setup_dataloaders(train_dataloader, val_dataloader) 91 | 92 | # print how many steps in an epoch 93 | fabric.print(f"Steps in an epoch: {len(train_dataloader)}") 94 | 95 | # setup model 96 | config = Config.from_file(f"checkpoints/{MODEL_NAME}/model_config.yaml") 97 | model = GPT(config) 98 | fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") 99 | model = fabric.setup(model) 100 | 101 | # setup optimizer 102 | optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, weight_decay=0.02, betas=(0.9, 0.95)) 103 | optimizer = fabric.setup_optimizers(optimizer) 104 | 105 | # setup lr scheduler 106 | scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / LR_WARMUP_STEPS) 107 | scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(MAX_STEPS - LR_WARMUP_STEPS)) 108 | scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[LR_WARMUP_STEPS]) 109 | 110 | # Start training!!! 111 | train(fabric, model, optimizer, scheduler, train_dataloader, val_dataloader) 112 | 113 | 114 | if __name__ == "__main__": 115 | # check that the model exists (downloaded to ./checkpoints/) 116 | if not os.path.exists(f"checkpoints/{MODEL_NAME}"): 117 | print(f"Model {MODEL_NAME} not found. 
Please download it using `litgpt download --repo {MODEL_NAME}`") 118 | exit() 119 | 120 | ### Setup and launch 121 | fabric = L.Fabric(devices="auto", strategy="auto") 122 | fabric.launch(main) 123 | -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/commands.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/commands.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/finetune.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/finetune.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/instruction-1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/instruction-1.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/instruction-2.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/instruction-2.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/pretrain.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/pretrain.webp -------------------------------------------------------------------------------- /tutorials/images/0_to_litgpt/usage.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/0_to_litgpt/usage.webp -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca-2k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/alpaca-2k.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/alpaca.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpaca_libre.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/alpaca_libre.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/alpacagpt4.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/alpacagpt4.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/deita-multiturn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/deita-multiturn.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/deita.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/deita.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/dolly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/dolly.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/lima.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/lima.jpg -------------------------------------------------------------------------------- /tutorials/images/prepare_dataset/longform.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lightning-AI/litgpt/f99ca4ecb5f5a147259415357fc7f480caa38b22/tutorials/images/prepare_dataset/longform.jpg -------------------------------------------------------------------------------- /tutorials/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: LitGPT Tutorials 2 | 3 | plugins: 4 | - pagetree 5 | 6 | theme: 7 | name: material 8 | --------------------------------------------------------------------------------
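The `tutorials/mkdocs.yml` shown above only declares the site name, the `pagetree` plugin, and the Material theme. As a rough sketch (not part of the repository), the tutorials site could be previewed locally along these lines; the exact PyPI package name providing the `pagetree` plugin is an assumption here:

```bash
# Install MkDocs and the Material theme referenced by the config above.
pip install mkdocs mkdocs-material

# The package providing the `pagetree` plugin is assumed; the actual name may differ.
pip install mkdocs-pagetree-plugin

# Serve the tutorials site locally using the config file shown above.
mkdocs serve --config-file tutorials/mkdocs.yml
```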