├── .github ├── ISSUE_TEMPLATE │ ├── ask-a-question.md │ └── bug-report.yaml └── workflows │ ├── basic-tests-linux-uv.yml │ ├── basic-tests-macos-uv.yml │ ├── basic-tests-old-pytorch.yml │ ├── basic-tests-pip.yml │ ├── basic-tests-pixi.yml │ ├── basic-tests-pytorch-rc.yml │ ├── basic-tests-windows-uv-pip.yml │ ├── basic-tests-windows-uv-pip.yml.disabled │ ├── basic-tests-windows-uv.yml.disabled │ ├── check-links.yml │ ├── check-spelling-errors.yml │ └── pep8-linter.yml ├── .gitignore ├── CITATION.cff ├── LICENSE.txt ├── README.md ├── appendix-A ├── 01_main-chapter-code │ ├── DDP-script-torchrun.py │ ├── DDP-script.py │ ├── README.md │ ├── code-part1.ipynb │ ├── code-part2.ipynb │ └── exercise-solutions.ipynb ├── 02_setup-recommendations │ └── README.md └── README.md ├── appendix-D ├── 01_main-chapter-code │ ├── appendix-D.ipynb │ ├── previous_chapters.py │ └── the-verdict.txt └── README.md ├── appendix-E ├── 01_main-chapter-code │ ├── appendix-E.ipynb │ ├── gpt_download.py │ └── previous_chapters.py └── README.md ├── ch01 └── README.md ├── ch02 ├── 01_main-chapter-code │ ├── README.md │ ├── ch02.ipynb │ ├── dataloader.ipynb │ ├── exercise-solutions.ipynb │ └── the-verdict.txt ├── 02_bonus_bytepair-encoder │ ├── README.md │ ├── bpe_openai_gpt2.py │ ├── compare-bpe-tiktoken.ipynb │ ├── gpt2_model │ │ ├── encoder.json │ │ └── vocab.bpe │ └── requirements-extra.txt ├── 03_bonus_embedding-vs-matmul │ ├── README.md │ └── embeddings-and-linear-layers.ipynb ├── 04_bonus_dataloader-intuition │ ├── README.md │ └── dataloader-intuition.ipynb ├── 05_bpe-from-scratch │ ├── README.md │ ├── bpe-from-scratch.ipynb │ └── tests │ │ └── tests.py └── README.md ├── ch03 ├── 01_main-chapter-code │ ├── README.md │ ├── ch03.ipynb │ ├── exercise-solutions.ipynb │ ├── multihead-attention.ipynb │ └── small-text-sample.txt ├── 02_bonus_efficient-multihead-attention │ ├── README.md │ └── mha-implementations.ipynb ├── 03_understanding-buffers │ ├── README.md │ └── understanding-buffers.ipynb └── README.md ├── ch04 ├── 01_main-chapter-code │ ├── README.md │ ├── ch04.ipynb │ ├── exercise-solutions.ipynb │ ├── gpt.py │ ├── previous_chapters.py │ └── tests.py ├── 02_performance-analysis │ ├── README.md │ ├── flops-analysis.ipynb │ └── requirements-extra.txt └── README.md ├── ch05 ├── 01_main-chapter-code │ ├── README.md │ ├── ch05.ipynb │ ├── exercise-solutions.ipynb │ ├── gpt_download.py │ ├── gpt_generate.py │ ├── gpt_train.py │ ├── previous_chapters.py │ └── tests.py ├── 02_alternative_weight_loading │ ├── README.md │ ├── weight-loading-hf-safetensors.ipynb │ ├── weight-loading-hf-transformers.ipynb │ └── weight-loading-pytorch.ipynb ├── 03_bonus_pretraining_on_gutenberg │ ├── README.md │ ├── prepare_dataset.py │ ├── pretraining_simple.py │ └── tests.py ├── 04_learning_rate_schedulers │ └── README.md ├── 05_bonus_hparam_tuning │ ├── README.md │ ├── hparam_search.py │ └── the-verdict.txt ├── 06_user_interface │ ├── README.md │ ├── app_orig.py │ ├── app_own.py │ └── requirements-extra.txt ├── 07_gpt_to_llama │ ├── README.md │ ├── config.json │ ├── converting-gpt-to-llama2.ipynb │ ├── converting-llama2-to-llama3.ipynb │ ├── previous_chapters.py │ ├── requirements-extra.txt │ ├── standalone-llama32-mem-opt.ipynb │ ├── standalone-llama32.ipynb │ └── tests │ │ ├── test-requirements-extra.txt │ │ └── tests.py ├── 08_memory_efficient_weight_loading │ ├── README.md │ ├── memory-efficient-state-dict.ipynb │ └── previous_chapters.py ├── 09_extending-tokenizers │ ├── README.md │ └── extend-tiktoken.ipynb ├── 10_llm-training-speed │ 
├── 00_orig.py │ ├── 01_opt_single_gpu.py │ ├── 02_opt_multi_gpu_ddp.py │ └── README.md └── README.md ├── ch06 ├── 01_main-chapter-code │ ├── README.md │ ├── ch06.ipynb │ ├── exercise-solutions.ipynb │ ├── gpt_class_finetune.py │ ├── gpt_download.py │ ├── load-finetuned-model.ipynb │ ├── previous_chapters.py │ └── tests.py ├── 02_bonus_additional-experiments │ ├── README.md │ ├── additional_experiments.py │ ├── gpt_download.py │ └── previous_chapters.py ├── 03_bonus_imdb-classification │ ├── README.md │ ├── download_prepare_dataset.py │ ├── gpt_download.py │ ├── previous_chapters.py │ ├── requirements-extra.txt │ ├── sklearn-baseline.ipynb │ ├── train_bert_hf.py │ ├── train_bert_hf_spam.py │ ├── train_gpt.py │ └── train_sklearn_logreg.py ├── 04_user_interface │ ├── README.md │ ├── app.py │ └── requirements-extra.txt └── README.md ├── ch07 ├── 01_main-chapter-code │ ├── README.md │ ├── ch07.ipynb │ ├── exercise-solutions.ipynb │ ├── exercise_experiments.py │ ├── gpt_download.py │ ├── gpt_instruction_finetuning.py │ ├── instruction-data-with-response.json │ ├── instruction-data.json │ ├── load-finetuned-model.ipynb │ ├── ollama_evaluate.py │ ├── previous_chapters.py │ └── tests.py ├── 02_dataset-utilities │ ├── README.md │ ├── config.json │ ├── create-passive-voice-entries.ipynb │ ├── find-near-duplicates.py │ ├── instruction-examples-modified.json │ ├── instruction-examples.json │ └── requirements-extra.txt ├── 03_model-evaluation │ ├── README.md │ ├── config.json │ ├── eval-example-data.json │ ├── llm-instruction-eval-ollama.ipynb │ ├── llm-instruction-eval-openai.ipynb │ ├── requirements-extra.txt │ └── scores │ │ ├── correlation-analysis.ipynb │ │ ├── gpt4-model-1-response.json │ │ ├── gpt4-model-2-response.json │ │ ├── llama3-8b-model-1-response.json │ │ └── llama3-8b-model-2-response.json ├── 04_preference-tuning-with-dpo │ ├── README.md │ ├── create-preference-data-ollama.ipynb │ ├── dpo-from-scratch.ipynb │ ├── instruction-data-with-preference.json │ └── previous_chapters.py ├── 05_dataset-generation │ ├── README.md │ ├── config.json │ ├── instruction-data-llama3-7b.json │ ├── llama3-ollama.ipynb │ ├── reflection-gpt4.ipynb │ └── requirements-extra.txt ├── 06_user_interface │ ├── README.md │ ├── app.py │ └── requirements-extra.txt └── README.md ├── pixi.toml ├── pkg └── llms_from_scratch │ ├── README.md │ ├── __init__.py │ ├── appendix_a.py │ ├── appendix_d.py │ ├── appendix_e.py │ ├── ch02.py │ ├── ch03.py │ ├── ch04.py │ ├── ch05.py │ ├── ch06.py │ ├── ch07.py │ ├── llama3.py │ └── tests │ ├── test_appendix_a.py │ ├── test_appendix_d.py │ ├── test_appendix_e.py │ ├── test_ch02.py │ ├── test_ch03.py │ ├── test_ch04.py │ ├── test_ch05.py │ ├── test_ch06.py │ ├── test_ch07.py │ └── test_llama3.py ├── pyproject.toml ├── requirements.txt └── setup ├── .vscode └── extensions.json ├── 01_optional-python-setup-preferences ├── README.md ├── native-pixi.md └── native-uv.md ├── 02_installing-python-libraries ├── README.md ├── python_environment_check.ipynb ├── python_environment_check.py └── tests.py ├── 03_optional-docker-environment ├── .devcontainer │ ├── Dockerfile │ ├── README.md │ └── devcontainer.json └── README.md ├── 04_optional-aws-sagemaker-notebook ├── README.md └── cloudformation-template.yml └── README.md /.github/ISSUE_TEMPLATE/ask-a-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Ask a Question 3 | about: Ask questions related to the book 4 | title: '' 5 | labels: [question] 6 | assignees: rasbt 7 | 8 | --- 9 | 10 
| If you have a question that is not a bug, please consider asking it in this GitHub repository's [discussion forum](https://github.com/rasbt/LLMs-from-scratch/discussions). 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Report errors related to the book content or code 3 | title: "Description" 4 | labels: [bug] 5 | assignees: rasbt 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thank you for taking the time to report an issue. Please fill out the details below to help resolve it. 11 | 12 | - type: textarea 13 | id: bug_description 14 | attributes: 15 | label: Bug description 16 | description: A description of the issue. 17 | placeholder: | 18 | Please provide a description of what the bug or issue is. 19 | validations: 20 | required: true 21 | 22 | - type: dropdown 23 | id: operating_system 24 | attributes: 25 | label: What operating system are you using? 26 | description: If applicable, please select the operating system where you experienced this issue. 27 | options: 28 | - "Unknown" 29 | - "macOS" 30 | - "Linux" 31 | - "Windows" 32 | validations: 33 | required: false 34 | 35 | - type: dropdown 36 | id: compute_environment 37 | attributes: 38 | label: Where do you run your code? 39 | description: Please select the computing environment where you ran this code. 40 | options: 41 | - "Local (laptop, desktop)" 42 | - "Lightning AI Studio" 43 | - "Google Colab" 44 | - "Other cloud environment (AWS, Azure, GCP)" 45 | validations: 46 | required: false 47 | 48 | - type: textarea 49 | id: environment 50 | attributes: 51 | label: Environment 52 | description: | 53 | Please provide details about your Python environment via the environment collection script or notebook located at 54 | https://github.com/rasbt/LLMs-from-scratch/tree/main/setup/02_installing-python-libraries. 55 | For your convenience, you can download and run the script from your terminal as follows: 56 | 57 | ```bash 58 | curl --ssl-no-revoke -O https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/setup/02_installing-python-libraries/python_environment_check.py \ 59 | -O https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/requirements.txt 60 | 61 | python python_environment_check.py 62 | ``` 63 | 64 | The script will print your Python environment information in the following format: 65 | ```console 66 | [OK] Your Python version is 3.11.4 67 | [OK] torch 2.3.1 68 | [OK] jupyterlab 4.2.2 69 | [OK] tiktoken 0.7.0 70 | [OK] matplotlib 3.9.0 71 | [OK] numpy 1.26.4 72 | [OK] tensorflow 2.16.1 73 | [OK] tqdm 4.66.4 74 | [OK] pandas 2.2.2 75 | [OK] psutil 5.9.8 76 | ``` 77 | You can simply copy and paste the outputs of this script below.
78 | value: | 79 | ``` 80 | 81 | 82 | 83 | ``` 84 | validations: 85 | required: false 86 | -------------------------------------------------------------------------------- /.github/workflows/basic-tests-linux-uv.yml: -------------------------------------------------------------------------------- 1 | name: Code tests Linux 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**/*.py' 8 | - '**/*.ipynb' 9 | - '**/*.yaml' 10 | - '**/*.yml' 11 | - '**/*.sh' 12 | pull_request: 13 | branches: [ main ] 14 | paths: 15 | - '**/*.py' 16 | - '**/*.ipynb' 17 | - '**/*.yaml' 18 | - '**/*.yml' 19 | - '**/*.sh' 20 | workflow_dispatch: 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | uv-tests: 28 | name: Code tests (Linux) 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | 33 | - name: Set up Python (uv) 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: "3.13" 37 | 38 | - name: Install uv and dependencies 39 | shell: bash 40 | run: | 41 | curl -LsSf https://astral.sh/uv/install.sh | sh 42 | uv sync --dev --python=3.10 # tests for backwards compatibility 43 | uv pip install -r ch05/07_gpt_to_llama/tests/test-requirements-extra.txt 44 | uv add pytest-ruff nbval 45 | 46 | - name: Test Selected Python Scripts (uv) 47 | shell: bash 48 | run: | 49 | source .venv/bin/activate 50 | pytest --ruff setup/02_installing-python-libraries/tests.py 51 | pytest --ruff ch04/01_main-chapter-code/tests.py 52 | pytest --ruff ch05/01_main-chapter-code/tests.py 53 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 54 | pytest --ruff ch06/01_main-chapter-code/tests.py 55 | 56 | - name: Validate Selected Jupyter Notebooks (uv) 57 | shell: bash 58 | run: | 59 | source .venv/bin/activate 60 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 61 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 62 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb 63 | 64 | - name: Test Selected Bonus Materials 65 | shell: bash 66 | run: | 67 | source .venv/bin/activate 68 | pytest ch02/05_bpe-from-scratch/tests/tests.py 69 | 70 | - name: Test Package Code (llms_from_scratch) 71 | shell: bash 72 | run: | 73 | source .venv/bin/activate 74 | uv pip install transformers 75 | pytest pkg/llms_from_scratch/tests/ 76 | -------------------------------------------------------------------------------- /.github/workflows/basic-tests-macos-uv.yml: -------------------------------------------------------------------------------- 1 | name: Code tests macOS 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**/*.py' 8 | - '**/*.ipynb' 9 | - '**/*.yaml' 10 | - '**/*.yml' 11 | - '**/*.sh' 12 | pull_request: 13 | branches: [ main ] 14 | paths: 15 | - '**/*.py' 16 | - '**/*.ipynb' 17 | - '**/*.yaml' 18 | - '**/*.yml' 19 | - '**/*.sh' 20 | workflow_dispatch: 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | uv-tests: 28 | name: Code tests (macOS) 29 | runs-on: macos-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | 33 | - name: Set up Python (uv) 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: "3.13" 37 | 38 | - name: Install uv and dependencies 39 | shell: bash 40 | run: | 41 | curl -LsSf https://astral.sh/uv/install.sh | sh 42 | uv sync --dev --python=3.10 # tests for backwards compatibility 43 | uv pip install -r
ch05/07_gpt_to_llama/tests/test-requirements-extra.txt 44 | uv add pytest-ruff nbval 45 | 46 | - name: Test Selected Python Scripts (uv) 47 | shell: bash 48 | run: | 49 | source .venv/bin/activate 50 | pytest --ruff setup/02_installing-python-libraries/tests.py 51 | pytest --ruff ch04/01_main-chapter-code/tests.py 52 | pytest --ruff ch05/01_main-chapter-code/tests.py 53 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 54 | pytest --ruff ch06/01_main-chapter-code/tests.py 55 | 56 | - name: Validate Selected Jupyter Notebooks (uv) 57 | shell: bash 58 | run: | 59 | source .venv/bin/activate 60 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 61 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 62 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb 63 | -------------------------------------------------------------------------------- /.github/workflows/basic-tests-old-pytorch.yml: -------------------------------------------------------------------------------- 1 | name: Test PyTorch 2.3 and 2.6 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**/*.py' # Run workflow for changes in Python files 8 | - '**/*.ipynb' 9 | - '**/*.yaml' 10 | - '**/*.yml' 11 | - '**/*.sh' 12 | pull_request: 13 | branches: [ main ] 14 | paths: 15 | - '**/*.py' 16 | - '**/*.ipynb' 17 | - '**/*.yaml' 18 | - '**/*.yml' 19 | - '**/*.sh' 20 | 21 | jobs: 22 | test: 23 | runs-on: ubuntu-latest 24 | strategy: 25 | matrix: 26 | pytorch-version: [ 2.3.0, 2.6.0 ] 27 | 28 | steps: 29 | - uses: actions/checkout@v4 30 | 31 | - name: Set up Python 32 | uses: actions/setup-python@v5 33 | with: 34 | python-version: "3.13" 35 | 36 | - name: Install dependencies 37 | run: | 38 | curl -LsSf https://astral.sh/uv/install.sh | sh 39 | uv sync --dev --python=3.10 # tests for backwards compatibility 40 | uv pip install -r ch05/07_gpt_to_llama/tests/test-requirements-extra.txt 41 | uv add torch==${{ matrix.pytorch-version }} 42 | uv add pytest-ruff nbval 43 | 44 | - name: Test Selected Python Scripts 45 | run: | 46 | source .venv/bin/activate 47 | pytest --ruff setup/02_installing-python-libraries/tests.py 48 | pytest --ruff ch04/01_main-chapter-code/tests.py 49 | pytest --ruff ch05/01_main-chapter-code/tests.py 50 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 51 | pytest --ruff ch06/01_main-chapter-code/tests.py 52 | 53 | - name: Validate Selected Jupyter Notebooks 54 | run: | 55 | source .venv/bin/activate 56 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 57 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 58 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb 59 | -------------------------------------------------------------------------------- /.github/workflows/basic-tests-pip.yml: -------------------------------------------------------------------------------- 1 | name: Code tests (plain pip) 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**/*.py' 8 | - '**/*.ipynb' 9 | - '**/*.yaml' 10 | - '**/*.yml' 11 | - '**/*.sh' 12 | pull_request: 13 | branches: [ main ] 14 | paths: 15 | - '**/*.py' 16 | - '**/*.ipynb' 17 | - '**/*.yaml' 18 | - '**/*.yml' 19 | - '**/*.sh' 20 | workflow_dispatch: 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | pip-tests: 28 | name: Pip Tests (Ubuntu Only) 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | 33 | - name:
Set up Python 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: "3.10" # tests for backwards compatibility 37 | 38 | - name: Create Virtual Environment and Install Dependencies 39 | run: | 40 | python -m venv .venv 41 | source .venv/bin/activate 42 | pip install --upgrade pip 43 | pip install -r requirements.txt 44 | pip install -r ch05/07_gpt_to_llama/tests/test-requirements-extra.txt 45 | pip install pytest pytest-ruff nbval 46 | 47 | - name: Test Selected Python Scripts 48 | run: | 49 | source .venv/bin/activate 50 | pytest --ruff setup/02_installing-python-libraries/tests.py 51 | pytest --ruff ch04/01_main-chapter-code/tests.py 52 | pytest --ruff ch05/01_main-chapter-code/tests.py 53 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 54 | pytest --ruff ch06/01_main-chapter-code/tests.py 55 | 56 | - name: Validate Selected Jupyter Notebooks 57 | run: | 58 | source .venv/bin/activate 59 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 60 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 61 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb -------------------------------------------------------------------------------- /.github/workflows/basic-tests-pixi.yml: -------------------------------------------------------------------------------- 1 | name: Code tests (pixi) 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**/*.py' 8 | - '**/*.ipynb' 9 | - '**/*.yaml' 10 | - '**/*.yml' 11 | - '**/*.sh' 12 | pull_request: 13 | branches: [ main ] 14 | paths: 15 | - '**/*.py' 16 | - '**/*.ipynb' 17 | - '**/*.yaml' 18 | - '**/*.yml' 19 | - '**/*.sh' 20 | workflow_dispatch: 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | test: 28 | runs-on: ${{ matrix.os }} 29 | strategy: 30 | matrix: 31 | os: [ubuntu-latest, macos-latest, windows-latest] 32 | 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - name: Set up pixi (without caching) 37 | uses: prefix-dev/setup-pixi@v0.8.2 38 | with: 39 | environments: tests 40 | cache: false 41 | 42 | - name: List installed packages 43 | run: | 44 | pixi list --environment tests 45 | pixi run --environment tests pip install "huggingface-hub>=0.30.0,<1.0" 46 | 47 | - name: Test Selected Python Scripts 48 | shell: pixi run --environment tests bash -e {0} 49 | run: | 50 | pytest --ruff setup/02_installing-python-libraries/tests.py 51 | pytest --ruff ch04/01_main-chapter-code/tests.py 52 | pytest --ruff ch05/01_main-chapter-code/tests.py 53 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 54 | pytest --ruff ch06/01_main-chapter-code/tests.py 55 | 56 | - name: Validate Selected Jupyter Notebooks 57 | shell: pixi run --environment tests bash -e {0} 58 | run: | 59 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 60 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 61 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb 62 | -------------------------------------------------------------------------------- /.github/workflows/basic-tests-pytorch-rc.yml: -------------------------------------------------------------------------------- 1 | name: Test latest PyTorch nightly / release candidate 2 | on: 3 | push: 4 | branches: [ main ] 5 | paths: 6 | - '**/*.py' # Run workflow for changes in Python files 7 | - '**/*.ipynb' 8 | - '**/*.yaml' 9 | - '**/*.yml' 10 | - '**/*.sh' 11 | pull_request: 12 | branches: [ main ] 
13 | paths: 14 | - '**/*.py' 15 | - '**/*.ipynb' 16 | - '**/*.yaml' 17 | - '**/*.yml' 18 | - '**/*.sh' 19 | 20 | jobs: 21 | test: 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - name: Set up Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: "3.13" 31 | 32 | - name: Install dependencies 33 | run: | 34 | curl -LsSf https://astral.sh/uv/install.sh | sh 35 | uv sync --dev --python=3.10 # tests for backwards compatibility 36 | uv pip install -r ch05/07_gpt_to_llama/tests/test-requirements-extra.txt 37 | uv add pytest-ruff nbval 38 | uv pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu 39 | 40 | - name: Test Selected Python Scripts 41 | run: | 42 | source .venv/bin/activate 43 | pytest --ruff setup/02_installing-python-libraries/tests.py 44 | pytest --ruff ch04/01_main-chapter-code/tests.py 45 | pytest --ruff ch05/01_main-chapter-code/tests.py 46 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 47 | pytest --ruff ch06/01_main-chapter-code/tests.py 48 | 49 | - name: Validate Selected Jupyter Notebooks 50 | run: | 51 | source .venv/bin/activate 52 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 53 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 54 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb 55 | -------------------------------------------------------------------------------- /.github/workflows/basic-tests-windows-uv-pip.yml: -------------------------------------------------------------------------------- 1 | name: Code tests Windows (uv/pip) 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**/*.py' 8 | - '**/*.ipynb' 9 | - '**/*.yaml' 10 | - '**/*.yml' 11 | - '**/*.sh' 12 | pull_request: 13 | branches: [ main ] 14 | paths: 15 | - '**/*.py' 16 | - '**/*.ipynb' 17 | - '**/*.yaml' 18 | - '**/*.yml' 19 | - '**/*.sh' 20 | 21 | jobs: 22 | test: 23 | runs-on: windows-latest 24 | 25 | steps: 26 | - name: Checkout Code 27 | uses: actions/checkout@v4 28 | 29 | - name: Set up Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: '3.11' 33 | 34 | - name: Install dependencies 35 | shell: bash 36 | run: | 37 | export PATH="$HOME/.local/bin:$PATH" 38 | pip install --upgrade pip 39 | pip install uv 40 | uv venv --python=python3.11 41 | source .venv/Scripts/activate 42 | pip install -r requirements.txt # because of dependency issue on Windows when using `uv pip` 43 | pip install tensorflow-io-gcs-filesystem==0.31.0 # Explicit for Windows 44 | pip install -r ch05/07_gpt_to_llama/tests/test-requirements-extra.txt 45 | pip install pytest-ruff nbval 46 | 47 | - name: Run Python Tests 48 | shell: bash 49 | run: | 50 | source .venv/Scripts/activate 51 | pytest --ruff setup/02_installing-python-libraries/tests.py 52 | pytest --ruff ch04/01_main-chapter-code/tests.py 53 | pytest --ruff ch05/01_main-chapter-code/tests.py 54 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 55 | pytest --ruff ch06/01_main-chapter-code/tests.py 56 | 57 | - name: Run Jupyter Notebook Tests 58 | shell: bash 59 | run: | 60 | source .venv/Scripts/activate 61 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 62 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 63 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb -------------------------------------------------------------------------------- 
/.github/workflows/basic-tests-windows-uv-pip.yml.disabled: -------------------------------------------------------------------------------- 1 | name: Code tests Windows (uv/pip) 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**/*.py' 8 | - '**/*.ipynb' 9 | - '**/*.yaml' 10 | - '**/*.yml' 11 | - '**/*.sh' 12 | pull_request: 13 | branches: [ main ] 14 | paths: 15 | - '**/*.py' 16 | - '**/*.ipynb' 17 | - '**/*.yaml' 18 | - '**/*.yml' 19 | - '**/*.sh' 20 | 21 | jobs: 22 | test: 23 | runs-on: windows-latest 24 | 25 | steps: 26 | - name: Checkout Code 27 | uses: actions/checkout@v4 28 | 29 | - name: Set up Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: "3.13" 33 | 34 | - name: Install dependencies 35 | shell: pwsh 36 | run: | 37 | $env:Path = "C:\Users\runneradmin\.local\bin;$env:Path" 38 | python -m pip install --upgrade pip 39 | python -m pip install uv 40 | uv venv --python=python3.11 41 | . .\.venv\Scripts\Activate.ps1 42 | $env:UV_PIP_OPTS="--no-binary tensorflow-io-gcs-filesystem" 43 | uv pip install -r requirements.txt 44 | uv pip install -r ch05/07_gpt_to_llama/tests/test-requirements-extra.txt 45 | uv pip install pytest-ruff nbval 46 | uv pip install --force-reinstall matplotlib "numpy<2.1" 47 | 48 | - name: Run Python Tests 49 | shell: pwsh 50 | run: | 51 | $env:Path = "C:\Users\runneradmin\.local\bin;$env:Path" 52 | . .\.venv\Scripts\Activate.ps1 53 | pytest --ruff setup/02_installing-python-libraries/tests.py 54 | pytest --ruff ch04/01_main-chapter-code/tests.py 55 | pytest --ruff ch05/01_main-chapter-code/tests.py 56 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 57 | pytest --ruff ch06/01_main-chapter-code/tests.py 58 | 59 | - name: Run Jupyter Notebook Tests 60 | shell: pwsh 61 | run: | 62 | $env:Path = "C:\Users\runneradmin\.local\bin;$env:Path" 63 | . .\.venv\Scripts\Activate.ps1 64 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 65 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 66 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb 67 | -------------------------------------------------------------------------------- /.github/workflows/basic-tests-windows-uv.yml.disabled: -------------------------------------------------------------------------------- 1 | name: Code tests Windows (uv) 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | paths: 7 | - '**/*.py' 8 | - '**/*.ipynb' 9 | - '**/*.yaml' 10 | - '**/*.yml' 11 | - '**/*.sh' 12 | pull_request: 13 | branches: [ main ] 14 | paths: 15 | - '**/*.py' 16 | - '**/*.ipynb' 17 | - '**/*.yaml' 18 | - '**/*.yml' 19 | - '**/*.sh' 20 | 21 | jobs: 22 | test: 23 | runs-on: windows-latest 24 | 25 | steps: 26 | - name: Checkout Code 27 | uses: actions/checkout@v4 28 | 29 | - name: Set up Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: "3.13" 33 | 34 | - name: Install dependencies 35 | shell: pwsh 36 | run: | 37 | # Prepend local bin directory to PATH 38 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" 39 | $env:Path = "C:\Users\runneradmin\.local\bin;$env:Path" 40 | uv sync --dev --python=3.10 41 | $env:UV_PIP_OPTS="--no-binary tensorflow-io-gcs-filesystem" 42 | uv pip install -r requirements.txt 43 | uv pip install matplotlib # for some reason Windows requires this 44 | uv pip install -r ch05/07_gpt_to_llama/tests/test-requirements-extra.txt 45 | uv add pytest-ruff nbval 46 | 47 | - name: Run Python Tests 48 | shell: pwsh 49 | run: | 50 | . 
.\.venv\Scripts\Activate.ps1 51 | pytest --ruff setup/02_installing-python-libraries/tests.py 52 | pytest --ruff ch04/01_main-chapter-code/tests.py 53 | pytest --ruff ch05/01_main-chapter-code/tests.py 54 | pytest --ruff ch05/07_gpt_to_llama/tests/tests.py 55 | pytest --ruff ch06/01_main-chapter-code/tests.py 56 | 57 | - name: Run Jupyter Notebook Tests 58 | shell: pwsh 59 | run: | 60 | . .\.venv\Scripts\Activate.ps1 61 | pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb 62 | pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb 63 | pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb 64 | -------------------------------------------------------------------------------- /.github/workflows/check-links.yml: -------------------------------------------------------------------------------- 1 | name: Check hyperlinks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.10" 22 | 23 | - name: Install dependencies 24 | run: | 25 | curl -LsSf https://astral.sh/uv/install.sh | sh 26 | uv add pytest-ruff pytest-check-links 27 | 28 | - name: Check links 29 | run: | 30 | source .venv/bin/activate 31 | pytest --ruff --check-links ./ \ 32 | --check-links-ignore "https://platform.openai.com/*" \ 33 | --check-links-ignore "https://openai.com/*" \ 34 | --check-links-ignore "https://arena.lmsys.org" \ 35 | --check-links-ignore "https://unsloth.ai/blog/gradient" \ 36 | --check-links-ignore "https://www.reddit.com/r/*" \ 37 | --check-links-ignore "https://code.visualstudio.com/*" \ 38 | --check-links-ignore "https://arxiv.org/*" \ 39 | --check-links-ignore "https://ai.stanford.edu/~amaas/data/sentiment/" \ 40 | --check-links-ignore "https://x.com/*" \ 41 | --check-links-ignore "https://scholar.google.com/*" 42 | -------------------------------------------------------------------------------- /.github/workflows/check-spelling-errors.yml: -------------------------------------------------------------------------------- 1 | name: Spell Check 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | spellcheck: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.10" 22 | 23 | - name: Install codespell 24 | run: | 25 | curl -LsSf https://astral.sh/uv/install.sh | sh 26 | uv sync --dev --python=3.10 27 | uv add codespell 28 | 29 | - name: Run codespell 30 | run: | 31 | source .venv/bin/activate 32 | codespell -L "ocassion,occassion,ot,te,tje" **/*.{txt,md,py,ipynb} 33 | -------------------------------------------------------------------------------- /.github/workflows/pep8-linter.yml: -------------------------------------------------------------------------------- 1 | name: PEP8 Style checks 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | flake8: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.13" 18 | - name: Install ruff (a faster flake8 equivalent) 19 | run: | 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | uv sync --dev --python=3.10 22 | uv add
ruff 23 | 24 | - name: Run ruff with exceptions 25 | run: | 26 | source .venv/bin/activate 27 | ruff check . 28 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this book or its accompanying code, please cite it as follows." 3 | title: "Build A Large Language Model (From Scratch), Published by Manning, ISBN 978-1633437166" 4 | abstract: "This book provides a comprehensive, step-by-step guide to implementing a ChatGPT-like large language model from scratch in PyTorch." 5 | date-released: 2024-09-12 6 | authors: 7 | - family-names: "Raschka" 8 | given-names: "Sebastian" 9 | license: "Apache-2.0" 10 | url: "https://www.manning.com/books/build-a-large-language-model-from-scratch" 11 | repository-code: "https://github.com/rasbt/LLMs-from-scratch" 12 | keywords: 13 | - large language models 14 | - natural language processing 15 | - artificial intelligence 16 | - PyTorch 17 | - machine learning 18 | - deep learning 19 | -------------------------------------------------------------------------------- /appendix-A/01_main-chapter-code/README.md: -------------------------------------------------------------------------------- 1 | # Appendix A: Introduction to PyTorch 2 | 3 | ### Main Chapter Code 4 | 5 | - [code-part1.ipynb](code-part1.ipynb) contains all the section A.1 to A.8 code as it appears in the chapter 6 | - [code-part2.ipynb](code-part2.ipynb) contains all the section A.9 GPU code as it appears in the chapter 7 | - [DDP-script.py](DDP-script.py) contains the script to demonstrate multi-GPU usage (note that Jupyter Notebooks only support single GPUs, so this is a script, not a notebook). You can run it as `python DDP-script.py`. If your machine has more than 2 GPUs, run it as `CUDA_VISIBLE_DEVICES=0,1 python DDP-script.py`. 8 | - [exercise-solutions.ipynb](exercise-solutions.ipynb) contains the exercise solutions for this chapter 9 | 10 | ### Optional Code 11 | 12 | - [DDP-script-torchrun.py](DDP-script-torchrun.py) is an optional version of the `DDP-script.py` script that runs via the PyTorch `torchrun` command instead of spawning and managing multiple processes ourselves via `multiprocessing.spawn`. The `torchrun` command has the advantage of automatically handling distributed initialization, including multi-node coordination, which slightly simplifies the setup process. You can use this script via `torchrun --nproc_per_node=2 DDP-script-torchrun.py`. 13 | -------------------------------------------------------------------------------- /appendix-A/01_main-chapter-code/exercise-solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka\n", 8 | "\n", 9 | "Code repository: https://github.com/rasbt/LLMs-from-scratch
\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Exercise A.1" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "The [Python Setup Tips](../../setup/01_optional-python-setup-preferences/README.md) document in this repository contains additional recommendations and tips to set up your Python environment.\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Exercise A.2" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "The [Installing Libraries Used In This Book document](../../setup/02_installing-python-libraries/README.md) and [directory](../../setup/02_installing-python-libraries/) contains utilities to check whether your environment is set up correctly." 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Exercise A.3" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import torch\n", 64 | "\n", 65 | "class NeuralNetwork(torch.nn.Module):\n", 66 | " def __init__(self, num_inputs, num_outputs):\n", 67 | " super().__init__()\n", 68 | "\n", 69 | " self.layers = torch.nn.Sequential(\n", 70 | " \n", 71 | " # 1st hidden layer\n", 72 | " torch.nn.Linear(num_inputs, 30),\n", 73 | " torch.nn.ReLU(),\n", 74 | "\n", 75 | " # 2nd hidden layer\n", 76 | " torch.nn.Linear(30, 20),\n", 77 | " torch.nn.ReLU(),\n", 78 | "\n", 79 | " # output layer\n", 80 | " torch.nn.Linear(20, num_outputs),\n", 81 | " )\n", 82 | "\n", 83 | " def forward(self, x):\n", 84 | " logits = self.layers(x)\n", 85 | " return logits" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "Total number of trainable model parameters: 752\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "model = NeuralNetwork(2, 2)\n", 103 | "\n", 104 | "num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", 105 | "print(\"Total number of trainable model parameters:\", num_params)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## Exercise A.4" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 1, 118 | "metadata": { 119 | "id": "qGgnamiyLJxp" 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "import torch\n", 124 | "\n", 125 | "a = torch.rand(100, 200)\n", 126 | "b = torch.rand(200, 300)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 2, 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "CvGvIeVkLzXE", 137 | "outputId": "44d027be-0787-4348-9c06-4e559d94d0e1" 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "63.8 µs ± 8.7 µs per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "%timeit a @ b" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 3, 155 | "metadata": { 156 | "id": "OmRtZLa9L2ZG" 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "a, b = a.to(\"cuda\"), b.to(\"cuda\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 4, 166 | "metadata": { 167 | "colab": { 168 | "base_uri": "https://localhost:8080/" 169 | }, 170 | "id": "duLEhXDPL6k0", 171 | "outputId": "3486471d-fd62-446f-9855-2d01f41fd101" 172 | }, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "13.8 µs ± 425 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "%timeit a @ b" 184 | ] 185 | } 186 | ], 187 | "metadata": { 188 | "accelerator": "GPU", 189 | "colab": { 190 | "gpuType": "V100", 191 | "machine_shape": "hm", 192 | "provenance": [] 193 | }, 194 | "kernelspec": { 195 | "display_name": "Python 3 (ipykernel)", 196 | "language": "python", 197 | "name": "python3" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 3 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython3", 209 | "version": "3.10.6" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 4 214 | } 215 | -------------------------------------------------------------------------------- /appendix-A/02_setup-recommendations/README.md: -------------------------------------------------------------------------------- 1 | ## Python and Environment Setup Recommendations 2 | 3 | 4 | 5 | Please see the [README.md](../../setup/README.md) in the [setup](../../setup) directory for Python installation and setup recommendations. 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /appendix-A/README.md: -------------------------------------------------------------------------------- 1 | # Appendix A: Introduction to PyTorch 2 | 3 |   4 | ## Main Chapter Code 5 | 6 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code 7 | 8 |   9 | ## Bonus Materials 10 | 11 | - [02_setup-recommendations](02_setup-recommendations) contains Python installation and setup recommendations. -------------------------------------------------------------------------------- /appendix-D/README.md: -------------------------------------------------------------------------------- 1 | # Appendix D: Adding Bells and Whistles to the Training Loop 2 | 3 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code. -------------------------------------------------------------------------------- /appendix-E/README.md: -------------------------------------------------------------------------------- 1 | # Appendix E: Parameter-efficient Finetuning with LoRA 2 | 3 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code. -------------------------------------------------------------------------------- /ch01/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 1: Understanding Large Language Models 2 | 3 | 4 |   5 | ## Main Chapter Code 6 | 7 | There is no code in this chapter. 
8 | 9 | 10 |   11 | ## Bonus Materials 12 | 13 | In the video below, I share my personal approach to setting up a Python environment on my computer: 14 | 15 |
16 |
17 | 18 | [![Link to the video](https://img.youtube.com/vi/yAcWnfsZhzo/0.jpg)](https://www.youtube.com/watch?v=yAcWnfsZhzo) 19 | 20 |
21 |
22 | 23 | As an optional bonus, the following video tutorial provides an overview of the LLM development lifecycle covered in this book: 24 | 25 |
26 |
27 | 28 | [![Link to the video](https://img.youtube.com/vi/kPGTx4wcm_w/0.jpg)](https://www.youtube.com/watch?v=kPGTx4wcm_w) 29 | 30 | -------------------------------------------------------------------------------- /ch02/01_main-chapter-code/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2: Working with Text Data 2 | 3 | ### Main Chapter Code 4 | 5 | - [ch02.ipynb](ch02.ipynb) contains all the code as it appears in the chapter 6 | 7 | ### Optional Code 8 | 9 | - [dataloader.ipynb](dataloader.ipynb) is a minimal notebook with the main data loading pipeline implemented in this chapter 10 | -------------------------------------------------------------------------------- /ch02/01_main-chapter-code/dataloader.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6e2a4891-c257-4d6b-afb3-e8fef39d0437", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "\n", 10 | "\n", 16 | "\n", 19 | "\n", 20 | "
\n", 11 | "\n", 12 | "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", 13 | "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", 14 | "
\n", 15 | "
\n", 17 | "\n", 18 | "
\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "6f678e62-7bcb-4405-86ae-dce94f494303", 26 | "metadata": {}, 27 | "source": [ 28 | "# The Main Data Loading Pipeline Summarized" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "070000fc-a7b7-4c56-a2c0-a938d413a790", 34 | "metadata": {}, 35 | "source": [ 36 | "The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).\n", 37 | "\n", 38 | "This notebook contains the main takeaway, the data loading pipeline without the intermediate steps." 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "2b4e8f2d-cb81-41a3-8780-a70b382e18ae", 44 | "metadata": {}, 45 | "source": [ 46 | "Packages that are being used in this notebook:" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 1, 52 | "id": "c7ed6fbe-45ac-40ce-8ea5-4edb212565e1", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "torch version: 2.4.0\n", 60 | "tiktoken version: 0.7.0\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "# NBVAL_SKIP\n", 66 | "from importlib.metadata import version\n", 67 | "\n", 68 | "print(\"torch version:\", version(\"torch\"))\n", 69 | "print(\"tiktoken version:\", version(\"tiktoken\"))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "id": "0ed4b7db-3b47-4fd3-a4a6-5f4ed5dd166e", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import tiktoken\n", 80 | "import torch\n", 81 | "from torch.utils.data import Dataset, DataLoader\n", 82 | "\n", 83 | "\n", 84 | "class GPTDatasetV1(Dataset):\n", 85 | " def __init__(self, txt, tokenizer, max_length, stride):\n", 86 | " self.input_ids = []\n", 87 | " self.target_ids = []\n", 88 | "\n", 89 | " # Tokenize the entire text\n", 90 | " token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n", 91 | "\n", 92 | " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", 93 | " for i in range(0, len(token_ids) - max_length, stride):\n", 94 | " input_chunk = token_ids[i:i + max_length]\n", 95 | " target_chunk = token_ids[i + 1: i + max_length + 1]\n", 96 | " self.input_ids.append(torch.tensor(input_chunk))\n", 97 | " self.target_ids.append(torch.tensor(target_chunk))\n", 98 | "\n", 99 | " def __len__(self):\n", 100 | " return len(self.input_ids)\n", 101 | "\n", 102 | " def __getitem__(self, idx):\n", 103 | " return self.input_ids[idx], self.target_ids[idx]\n", 104 | "\n", 105 | "\n", 106 | "def create_dataloader_v1(txt, batch_size, max_length, stride,\n", 107 | " shuffle=True, drop_last=True, num_workers=0):\n", 108 | " # Initialize the tokenizer\n", 109 | " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", 110 | "\n", 111 | " # Create dataset\n", 112 | " dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n", 113 | "\n", 114 | " # Create dataloader\n", 115 | " dataloader = DataLoader(\n", 116 | " dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", 117 | "\n", 118 | " return dataloader\n", 119 | "\n", 120 | "\n", 121 | "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n", 122 | " raw_text = f.read()\n", 123 | "\n", 124 | "vocab_size = 50257\n", 125 | "output_dim = 256\n", 126 | "context_length = 1024\n", 127 | "\n", 128 | "\n", 129 | "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n", 130 | "pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)\n", 131 | "\n", 132 | "batch_size = 8\n", 133 | "max_length = 
4\n", 134 | "dataloader = create_dataloader_v1(\n", 135 | " raw_text,\n", 136 | " batch_size=batch_size,\n", 137 | " max_length=max_length,\n", 138 | " stride=max_length\n", 139 | ")" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 3, 145 | "id": "664397bc-6daa-4b88-90aa-e8fc1fbd5846", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "for batch in dataloader:\n", 150 | " x, y = batch\n", 151 | "\n", 152 | " token_embeddings = token_embedding_layer(x)\n", 153 | " pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n", 154 | "\n", 155 | " input_embeddings = token_embeddings + pos_embeddings\n", 156 | "\n", 157 | " break" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 4, 163 | "id": "d3664332-e6bb-447e-8b96-203aafde8b24", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "torch.Size([8, 4, 256])\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "print(input_embeddings.shape)" 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3 (ipykernel)", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.11.4" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 5 200 | } 201 | -------------------------------------------------------------------------------- /ch02/02_bonus_bytepair-encoder/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2: Working with Text Data 2 | 3 | 4 | 5 | - [compare-bpe-tiktoken.ipynb](compare-bpe-tiktoken.ipynb) benchmarks various byte pair encoding implementations 6 | - [bpe_openai_gpt2.py](bpe_openai_gpt2.py) is the original bytepair encoder code used by OpenAI 7 | 8 | -------------------------------------------------------------------------------- /ch02/02_bonus_bytepair-encoder/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | requests 2 | tqdm 3 | transformers>=4.33.2 4 | -------------------------------------------------------------------------------- /ch02/03_bonus_embedding-vs-matmul/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2: Working with Text Data 2 | 3 | - [embeddings-and-linear-layers.ipynb](embeddings-and-linear-layers.ipynb) contains optional (bonus) code to explain that embedding layers and fully connected layers applied to one-hot encoded vectors are equivalent. 4 | -------------------------------------------------------------------------------- /ch02/04_bonus_dataloader-intuition/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2: Working with Text Data 2 | 3 | - [dataloader-intuition.ipynb](dataloader-intuition.ipynb) contains optional (bonus) code to explain the data loader more intuitively with simple numbers rather than text. 
4 | -------------------------------------------------------------------------------- /ch02/05_bpe-from-scratch/README.md: -------------------------------------------------------------------------------- 1 | # Byte Pair Encoding (BPE) Tokenizer From Scratch 2 | 3 | - [bpe-from-scratch.ipynb](bpe-from-scratch.ipynb) contains optional (bonus) code that explains and shows how the BPE tokenizer works under the hood. 4 | -------------------------------------------------------------------------------- /ch02/05_bpe-from-scratch/tests/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import io 4 | import nbformat 5 | import types 6 | import pytest 7 | 8 | import tiktoken 9 | 10 | 11 | def import_definitions_from_notebook(fullname, names): 12 | """Loads function definitions from a Jupyter notebook file into a module.""" 13 | path = os.path.join(os.path.dirname(__file__), "..", fullname + ".ipynb") 14 | path = os.path.normpath(path) 15 | 16 | if not os.path.exists(path): 17 | raise FileNotFoundError(f"Notebook file not found at: {path}") 18 | 19 | with io.open(path, "r", encoding="utf-8") as f: 20 | nb = nbformat.read(f, as_version=4) 21 | 22 | mod = types.ModuleType(fullname) 23 | sys.modules[fullname] = mod 24 | 25 | # Execute all code cells to capture dependencies 26 | for cell in nb.cells: 27 | if cell.cell_type == "code": 28 | exec(cell.source, mod.__dict__) 29 | 30 | # Ensure required names are in module 31 | missing_names = [name for name in names if name not in mod.__dict__] 32 | if missing_names: 33 | raise ImportError(f"Missing definitions in notebook: {missing_names}") 34 | 35 | return mod 36 | 37 | 38 | @pytest.fixture(scope="module") 39 | def imported_module(): 40 | fullname = "bpe-from-scratch" 41 | names = ["BPETokenizerSimple", "download_file_if_absent"] 42 | return import_definitions_from_notebook(fullname, names) 43 | 44 | 45 | @pytest.fixture(scope="module") 46 | def gpt2_files(imported_module): 47 | """Fixture to handle downloading GPT-2 files.""" 48 | download_file_if_absent = getattr(imported_module, "download_file_if_absent", None) 49 | 50 | search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"] 51 | files_to_download = { 52 | "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe", 53 | "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json" 54 | } 55 | paths = {filename: download_file_if_absent(url, filename, search_directories) 56 | for url, filename in files_to_download.items()} 57 | 58 | return paths 59 | 60 | 61 | def test_tokenizer_training(imported_module, gpt2_files): 62 | BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None) 63 | download_file_if_absent = getattr(imported_module, "download_file_if_absent", None) 64 | 65 | tokenizer = BPETokenizerSimple() 66 | verdict_path = download_file_if_absent( 67 | url=( 68 | "https://raw.githubusercontent.com/rasbt/" 69 | "LLMs-from-scratch/main/ch02/01_main-chapter-code/" 70 | "the-verdict.txt" 71 | ), 72 | filename="the-verdict.txt", 73 | search_dirs="." 74 | ) 75 | 76 | with open(verdict_path, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/ 77 | text = f.read() 78 | 79 | tokenizer.train(text, vocab_size=1000, allowed_special={"<|endoftext|>"}) 80 | assert len(tokenizer.vocab) == 1000, "Tokenizer vocabulary size mismatch." 81 | assert len(tokenizer.bpe_merges) == 742, "Tokenizer BPE merges count mismatch." 
82 | 83 | input_text = "Jack embraced beauty through art and life." 84 | token_ids = tokenizer.encode(input_text) 85 | assert token_ids == [424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46], "Token IDs do not match expected output." 86 | 87 | assert tokenizer.decode(token_ids) == input_text, "Decoded text does not match the original input." 88 | 89 | tokenizer.save_vocab_and_merges(vocab_path="vocab.json", bpe_merges_path="bpe_merges.txt") 90 | tokenizer2 = BPETokenizerSimple() 91 | tokenizer2.load_vocab_and_merges(vocab_path="vocab.json", bpe_merges_path="bpe_merges.txt") 92 | assert tokenizer2.decode(token_ids) == input_text, "Decoded text mismatch after reloading tokenizer." 93 | 94 | 95 | def test_gpt2_tokenizer_openai_simple(imported_module, gpt2_files): 96 | BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None) 97 | 98 | tokenizer_gpt2 = BPETokenizerSimple() 99 | tokenizer_gpt2.load_vocab_and_merges_from_openai( 100 | vocab_path=gpt2_files["encoder.json"], bpe_merges_path=gpt2_files["vocab.bpe"] 101 | ) 102 | 103 | assert len(tokenizer_gpt2.vocab) == 50257, "GPT-2 tokenizer vocabulary size mismatch." 104 | 105 | input_text = "This is some text" 106 | token_ids = tokenizer_gpt2.encode(input_text) 107 | assert token_ids == [1212, 318, 617, 2420], "Tokenized output does not match expected GPT-2 encoding." 108 | 109 | 110 | def test_gpt2_tokenizer_openai_edgecases(imported_module, gpt2_files): 111 | BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None) 112 | 113 | tokenizer_gpt2 = BPETokenizerSimple() 114 | tokenizer_gpt2.load_vocab_and_merges_from_openai( 115 | vocab_path=gpt2_files["encoder.json"], bpe_merges_path=gpt2_files["vocab.bpe"] 116 | ) 117 | tik_tokenizer = tiktoken.get_encoding("gpt2") 118 | 119 | test_cases = [ 120 | ("Hello,", [15496, 11]), 121 | ("Implementations", [3546, 26908, 602]), 122 | ("asdf asdfasdf a!!, @aba 9asdf90asdfk", [292, 7568, 355, 7568, 292, 7568, 257, 3228, 11, 2488, 15498, 860, 292, 7568, 3829, 292, 7568, 74]), 123 | ("Hello, world. 
Is this-- a test?", [15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]) 124 | ] 125 | 126 | errors = [] 127 | 128 | for input_text, expected_tokens in test_cases: 129 | tik_tokens = tik_tokenizer.encode(input_text) 130 | gpt2_tokens = tokenizer_gpt2.encode(input_text) 131 | 132 | print(f"Text: {input_text}") 133 | print(f"Expected Tokens: {expected_tokens}") 134 | print(f"tiktoken Output: {tik_tokens}") 135 | print(f"BPETokenizerSimple Output: {gpt2_tokens}") 136 | print("-" * 40) 137 | 138 | if tik_tokens != expected_tokens: 139 | errors.append(f"Tiktokenized output does not match expected GPT-2 encoding for '{input_text}'.\n" 140 | f"Expected: {expected_tokens}, Got: {tik_tokens}") 141 | 142 | if gpt2_tokens != expected_tokens: 143 | errors.append(f"Tokenized output does not match expected GPT-2 encoding for '{input_text}'.\n" 144 | f"Expected: {expected_tokens}, Got: {gpt2_tokens}") 145 | 146 | if errors: 147 | pytest.fail("\n".join(errors)) 148 | -------------------------------------------------------------------------------- /ch02/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2: Working with Text Data 2 | 3 |   4 | ## Main Chapter Code 5 | 6 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code and exercise solutions 7 | 8 |   9 | ## Bonus Materials 10 | 11 | - [02_bonus_bytepair-encoder](02_bonus_bytepair-encoder) contains optional code to benchmark different byte pair encoder implementations 12 | 13 | - [03_bonus_embedding-vs-matmul](03_bonus_embedding-vs-matmul) contains optional (bonus) code to explain that embedding layers and fully connected layers applied to one-hot encoded vectors are equivalent. 14 | 15 | - [04_bonus_dataloader-intuition](04_bonus_dataloader-intuition) contains optional (bonus) code to explain the data loader more intuitively with simple numbers rather than text. 16 | 17 | - [05_bpe-from-scratch](05_bpe-from-scratch) contains (bonus) code that implements and trains a GPT-2 BPE tokenizer from scratch. 18 | 19 | 20 | 21 | 22 | 23 | In the video below, I provide a code-along session that covers some of the chapter contents as supplementary material. 24 | 25 |
26 |
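(The unit tests reproduced earlier in this listing also document the `BPETokenizerSimple` API from [05_bpe-from-scratch](05_bpe-from-scratch); a typical round trip, sketched from those tests, looks roughly like this — the import path is hypothetical, since the class is defined in the bonus notebook:)

```python
from bpe_from_scratch import BPETokenizerSimple  # hypothetical import; the class lives in the bonus notebook

tokenizer = BPETokenizerSimple()
tokenizer.load_vocab_and_merges(vocab_path="vocab.json", bpe_merges_path="bpe_merges.txt")

token_ids = tokenizer.encode("Jack embraced beauty through art and life.")
assert tokenizer.decode(token_ids) == "Jack embraced beauty through art and life."

# The same class can also load OpenAI's original GPT-2 vocabulary and merges files:
# tokenizer.load_vocab_and_merges_from_openai(vocab_path="encoder.json", bpe_merges_path="vocab.bpe")
```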
27 | 28 | [![Link to the video](https://img.youtube.com/vi/341Rb8fJxY0/0.jpg)](https://www.youtube.com/watch?v=341Rb8fJxY0) 29 | -------------------------------------------------------------------------------- /ch03/01_main-chapter-code/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 3: Coding Attention Mechanisms 2 | 3 | ### Main Chapter Code 4 | 5 | - [ch03.ipynb](ch03.ipynb) contains all the code as it appears in the chapter 6 | 7 | ### Optional Code 8 | 9 | - [multihead-attention.ipynb](multihead-attention.ipynb) is a minimal notebook with the main data loading pipeline and the multi-head attention implementation from this chapter 10 | -------------------------------------------------------------------------------- /ch03/01_main-chapter-code/small-text-sample.txt: -------------------------------------------------------------------------------- 1 | Once upon a time in a quiet village nestled among rolling hills and whispering forests, there lived a young girl named Elara. Elara was known for her boundless curiosity and her love for the stars. Every night, she would climb to the highest hill near her home to gaze at the glittering sky, dreaming of distant worlds and galaxies. 2 | 3 | In the heart of the village, there was an ancient library, tended by an old, wise librarian named Mr. Bramwell. This library was a treasure trove of books on every subject, but most importantly, it housed a collection of old star maps and celestial guides. Elara, fascinated by these books, spent countless hours with Mr. Bramwell, learning about constellations, planets, and the mysteries of the universe. 4 | 5 | One evening, while studying an old star map, Elara noticed a small, uncharted star that twinkled differently. She shared this discovery with Mr. Bramwell, who was equally intrigued. They decided to observe this star every night, noting its unique patterns and movements. This small, mysterious star, which they named "Elara's Star," became the center of their nightly adventures. 6 | 7 | As days turned into weeks, the villagers began to take notice of Elara's star. The uncharted star brought the community together, with people of all ages joining Elara and Mr. Bramwell on the hill each night to gaze at the sky. The nightly gatherings turned into a festival of stars, where stories were shared, friendships were formed, and the mysteries of the cosmos were contemplated. 8 | 9 | The story of Elara and her star spread far and wide, attracting astronomers and dreamers from distant lands. The once quiet village became a beacon of wonder, a place where the sky seemed a little closer and the stars a bit friendlier. Elara's curiosity had not only unveiled a hidden star but had also brought her community together, reminding everyone that sometimes, the most extraordinary discoveries are waiting just above us, in the starlit sky. -------------------------------------------------------------------------------- /ch03/02_bonus_efficient-multihead-attention/README.md: -------------------------------------------------------------------------------- 1 | # More Efficient Multi-Head Attention Implementations 2 | 3 | - [mha-implementations.ipynb](mha-implementations.ipynb) contains and compares different implementations of multi-head attention 4 | 5 | 6 | 7 | ### Summary 8 | 9 | The figures below summarize the performance benchmarks (lower is better).
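(If you want rough timings of your own before looking at the figure summaries, here is a minimal sketch — not the notebook's actual benchmark harness; shapes and repeat counts are arbitrary — comparing a manual scaled-dot-product attention against PyTorch's fused kernel, which requires PyTorch >= 2.0:)

```python
import time
import torch
import torch.nn.functional as F

b, h, t, d = 4, 8, 512, 64  # arbitrary batch size, heads, tokens, head dimension
q = k = v = torch.randn(b, h, t, d)

def manual_attention(q, k, v):
    scores = q @ k.transpose(-2, -1) / d**0.5
    return torch.softmax(scores, dim=-1) @ v

def avg_seconds(fn, n=10):
    start = time.perf_counter()
    for _ in range(n):
        fn(q, k, v)  # note: on a GPU, add torch.cuda.synchronize() for accurate timings
    return (time.perf_counter() - start) / n

print(f"manual: {avg_seconds(manual_attention):.4f} s")
print(f"fused:  {avg_seconds(F.scaled_dot_product_attention):.4f} s")
```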
10 | 11 | 12 |   13 | #### Forward pass only 14 | 15 | 16 | 17 |   18 | #### Forward and backward pass 19 | 20 | 21 | 22 |   23 | #### Forward and backward pass after compilation 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /ch03/03_understanding-buffers/README.md: -------------------------------------------------------------------------------- 1 | # Understanding PyTorch Buffers 2 | 3 | - [understanding-buffers.ipynb](understanding-buffers.ipynb) explains the idea behind PyTorch buffers, which are used to implement the causal attention mechanism in chapter 3 4 | 5 | 6 |
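(In short — as a minimal sketch rather than the notebook's exact code — a buffer is a non-trainable tensor registered on a module so that it moves with `.to(device)` and is saved in the `state_dict`, which is exactly what the causal attention mask needs:)

```python
import torch
import torch.nn as nn

class CausalMask(nn.Module):
    def __init__(self, context_length):
        super().__init__()
        # A buffer travels with the module (.to(device), state_dict),
        # but it is not a parameter and receives no gradient updates
        self.register_buffer(
            "mask", torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

module = CausalMask(context_length=4)
print("mask" in module.state_dict())  # True: buffers are saved and loaded
print(list(module.parameters()))      # []: the mask is not trainable
# module.to("cuda") would move module.mask along automatically
```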
7 | Below is a hands-on video tutorial I recorded to explain the code: 8 | 9 |
10 |
11 | 12 | [![Link to the video](https://img.youtube.com/vi/PetlIokI9Ao/0.jpg)](https://www.youtube.com/watch?v=PetlIokI9Ao) 13 | 14 | -------------------------------------------------------------------------------- /ch03/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 3: Coding Attention Mechanisms 2 | 3 |   4 | ## Main Chapter Code 5 | 6 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code. 7 | 8 |   9 | ## Bonus Materials 10 | 11 | - [02_bonus_efficient-multihead-attention](02_bonus_efficient-multihead-attention) implements and compares different implementation variants of multi-head attention 12 | - [03_understanding-buffers](03_understanding-buffers) explains the idea behind PyTorch buffers, which are used to implement the causal attention mechanism in chapter 3 13 | 14 | 15 | 16 | In the video below, I provide a code-along session that covers some of the chapter contents as supplementary material. 17 | 18 |
19 |
20 | 21 | [![Link to the video](https://img.youtube.com/vi/-Ll8DtpNtvk/0.jpg)](https://www.youtube.com/watch?v=-Ll8DtpNtvk) 22 | -------------------------------------------------------------------------------- /ch04/01_main-chapter-code/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 4: Implementing a GPT Model from Scratch To Generate Text 2 | 3 | ### Main Chapter Code 4 | 5 | - [ch04.ipynb](ch04.ipynb) contains all the code as it appears in the chapter 6 | - [previous_chapters.py](previous_chapters.py) is a Python module that contains the `MultiHeadAttention` module from the previous chapter, which we import in [ch04.ipynb](ch04.ipynb) to create the GPT model 7 | 8 | ### Optional Code 9 | 10 | - [gpt.py](gpt.py) is a standalone Python script file with the code that we implemented thus far, including the GPT model we coded in this chapter 11 | 12 | -------------------------------------------------------------------------------- /ch04/01_main-chapter-code/previous_chapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | import tiktoken 7 | import torch 8 | import torch.nn as nn 9 | from torch.utils.data import Dataset, DataLoader 10 | 11 | 12 | class GPTDatasetV1(Dataset): 13 | def __init__(self, txt, tokenizer, max_length, stride): 14 | self.input_ids = [] 15 | self.target_ids = [] 16 | 17 | # Tokenize the entire text 18 | token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"}) 19 | 20 | # Use a sliding window to chunk the book into overlapping sequences of max_length 21 | for i in range(0, len(token_ids) - max_length, stride): 22 | input_chunk = token_ids[i:i + max_length] 23 | target_chunk = token_ids[i + 1: i + max_length + 1] 24 | self.input_ids.append(torch.tensor(input_chunk)) 25 | self.target_ids.append(torch.tensor(target_chunk)) 26 | 27 | def __len__(self): 28 | return len(self.input_ids) 29 | 30 | def __getitem__(self, idx): 31 | return self.input_ids[idx], self.target_ids[idx] 32 | 33 | 34 | def create_dataloader_v1(txt, batch_size=4, max_length=256, 35 | stride=128, shuffle=True, drop_last=True, num_workers=0): 36 | # Initialize the tokenizer 37 | tokenizer = tiktoken.get_encoding("gpt2") 38 | 39 | # Create dataset 40 | dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) 41 | 42 | # Create dataloader 43 | dataloader = DataLoader( 44 | dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers) 45 | 46 | return dataloader 47 | 48 | 49 | class MultiHeadAttention(nn.Module): 50 | def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): 51 | super().__init__() 52 | assert d_out % num_heads == 0, "d_out must be divisible by num_heads" 53 | 54 | self.d_out = d_out 55 | self.num_heads = num_heads 56 | self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim 57 | 58 | self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) 59 | self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) 60 | self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) 61 | self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs 62 | self.dropout = nn.Dropout(dropout) 63 | self.register_buffer('mask', 
torch.triu(torch.ones(context_length, context_length), diagonal=1)) 64 | 65 | def forward(self, x): 66 | b, num_tokens, d_in = x.shape 67 | 68 | keys = self.W_key(x) # Shape: (b, num_tokens, d_out) 69 | queries = self.W_query(x) 70 | values = self.W_value(x) 71 | 72 | # We implicitly split the matrix by adding a `num_heads` dimension 73 | # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) 74 | keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) 75 | values = values.view(b, num_tokens, self.num_heads, self.head_dim) 76 | queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) 77 | 78 | # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) 79 | keys = keys.transpose(1, 2) 80 | queries = queries.transpose(1, 2) 81 | values = values.transpose(1, 2) 82 | 83 | # Compute scaled dot-product attention (aka self-attention) with a causal mask 84 | attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head 85 | 86 | # Original mask truncated to the number of tokens and converted to boolean 87 | mask_bool = self.mask.bool()[:num_tokens, :num_tokens] 88 | 89 | # Use the mask to fill attention scores 90 | attn_scores.masked_fill_(mask_bool, -torch.inf) 91 | 92 | attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) 93 | attn_weights = self.dropout(attn_weights) 94 | 95 | # Shape: (b, num_tokens, num_heads, head_dim) 96 | context_vec = (attn_weights @ values).transpose(1, 2) 97 | 98 | # Combine heads, where self.d_out = self.num_heads * self.head_dim 99 | context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out) 100 | context_vec = self.out_proj(context_vec) # optional projection 101 | 102 | return context_vec 103 | -------------------------------------------------------------------------------- /ch04/01_main-chapter-code/tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | # File for internal use (unit tests) 7 | 8 | from gpt import main 9 | 10 | expected = """ 11 | ================================================== 12 | IN 13 | ================================================== 14 | 15 | Input text: Hello, I am 16 | Encoded input text: [15496, 11, 314, 716] 17 | encoded_tensor.shape: torch.Size([1, 4]) 18 | 19 | 20 | ================================================== 21 | OUT 22 | ================================================== 23 | 24 | Output: tensor([[15496, 11, 314, 716, 27018, 24086, 47843, 30961, 42348, 7267, 25 | 49706, 43231, 47062, 34657]]) 26 | Output length: 14 27 | Output text: Hello, I am Featureiman Byeswickattribute argue logger Normandy Compton analogous 28 | """ 29 | 30 | 31 | def test_main(capsys): 32 | main() 33 | captured = capsys.readouterr() 34 | 35 | # Normalize line endings and strip trailing whitespace from each line 36 | normalized_expected = '\n'.join(line.rstrip() for line in expected.splitlines()) 37 | normalized_output = '\n'.join(line.rstrip() for line in captured.out.splitlines()) 38 | 39 | # Compare normalized strings 40 | assert normalized_output == normalized_expected 41 | -------------------------------------------------------------------------------- /ch04/02_performance-analysis/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 4: Implementing a GPT Model from Scratch To Generate Text 2 | 3 | - [flops-analysis.ipynb](flops-analysis.ipynb) analyzes the floating point operations (FLOPs) of the GPT model(s) implemented in the main chapter. 4 | - `requirements-extra.txt` includes additional Python libraries that need to be installed (via `pip install -r requirements-extra.txt`). -------------------------------------------------------------------------------- /ch04/02_performance-analysis/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | thop -------------------------------------------------------------------------------- /ch04/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 4: Implementing a GPT Model from Scratch to Generate Text 2 | 3 |   4 | ## Main Chapter Code 5 | 6 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code. 7 | 8 |   9 | ## Bonus Materials 10 | 11 | - [02_performance-analysis](02_performance-analysis) contains optional code analyzing the performance of the GPT model(s) implemented in the main chapter 12 | - [ch05/07_gpt_to_llama](../ch05/07_gpt_to_llama) contains a step-by-step guide for converting a GPT architecture implementation to Llama 3.2 and loading pretrained weights from Meta AI (it might be interesting to look at alternative architectures after completing chapter 4, but you can also save that for after reading chapter 5) 13 | 14 | 15 | 16 | In the video below, I provide a code-along session that covers some of the chapter contents as supplementary material. 17 | 18 |
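(As a rough illustration of how the `thop` package listed in the performance-analysis folder's `requirements-extra.txt` above can be used for such an analysis — a sketch with an arbitrary toy model, not the notebook's exact code:)

```python
import torch
from thop import profile  # pip install thop

# Toy stand-in for a transformer block's feed-forward module
model = torch.nn.Sequential(
    torch.nn.Linear(768, 3072),
    torch.nn.GELU(),
    torch.nn.Linear(3072, 768),
)
x = torch.randn(1, 768)

# thop counts multiply-accumulate operations (MACs); FLOPs are roughly 2 x MACs
macs, params = profile(model, inputs=(x,))
print(f"MACs: {macs:,.0f}, parameters: {params:,.0f}")
```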
19 |
20 | 21 | [![Link to the video](https://img.youtube.com/vi/YSAkgEarBGE/0.jpg)](https://www.youtube.com/watch?v=YSAkgEarBGE) 22 | -------------------------------------------------------------------------------- /ch05/01_main-chapter-code/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 5: Pretraining on Unlabeled Data 2 | 3 | ### Main Chapter Code 4 | 5 | - [ch05.ipynb](ch05.ipynb) contains all the code as it appears in the chapter 6 | - [previous_chapters.py](previous_chapters.py) is a Python module that contains the `MultiHeadAttention` module and `GPTModel` class from the previous chapters, which we import in [ch05.ipynb](ch05.ipynb) to pretrain the GPT model 7 | - [gpt_download.py](gpt_download.py) contains the utility functions for downloading the pretrained GPT model weights 8 | - [exercise-solutions.ipynb](exercise-solutions.ipynb) contains the exercise solutions for this chapter 9 | 10 | ### Optional Code 11 | 12 | - [gpt_train.py](gpt_train.py) is a standalone Python script file with the code that we implemented in [ch05.ipynb](ch05.ipynb) to train the GPT model (you can think of it as a code file summarizing this chapter) 13 | - [gpt_generate.py](gpt_generate.py) is a standalone Python script file with the code that we implemented in [ch05.ipynb](ch05.ipynb) to load and use the pretrained model weights from OpenAI 14 | 15 | -------------------------------------------------------------------------------- /ch05/01_main-chapter-code/tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | # File for internal use (unit tests) 7 | 8 | import pytest 9 | from gpt_train import main 10 | import http.client 11 | from urllib.parse import urlparse 12 | 13 | 14 | @pytest.fixture 15 | def gpt_config(): 16 | return { 17 | "vocab_size": 50257, 18 | "context_length": 12, # small for testing efficiency 19 | "emb_dim": 32, # small for testing efficiency 20 | "n_heads": 4, # small for testing efficiency 21 | "n_layers": 2, # small for testing efficiency 22 | "drop_rate": 0.1, 23 | "qkv_bias": False 24 | } 25 | 26 | 27 | @pytest.fixture 28 | def other_settings(): 29 | return { 30 | "learning_rate": 5e-4, 31 | "num_epochs": 1, # small for testing efficiency 32 | "batch_size": 2, 33 | "weight_decay": 0.1 34 | } 35 | 36 | 37 | def test_main(gpt_config, other_settings): 38 | train_losses, val_losses, tokens_seen, model = main(gpt_config, other_settings) 39 | 40 | assert len(train_losses) == 39, "Unexpected number of training losses" 41 | assert len(val_losses) == 39, "Unexpected number of validation losses" 42 | assert len(tokens_seen) == 39, "Unexpected number of tokens seen" 43 | 44 | 45 | def check_file_size(url, expected_size): 46 | parsed_url = urlparse(url) 47 | if parsed_url.scheme == "https": 48 | conn = http.client.HTTPSConnection(parsed_url.netloc) 49 | else: 50 | conn = http.client.HTTPConnection(parsed_url.netloc) 51 | 52 | conn.request("HEAD", parsed_url.path) 53 | response = conn.getresponse() 54 | if response.status != 200: 55 | return False, f"{url} not accessible" 56 | size = response.getheader("Content-Length") 57 | if size is None: 58 | return False, "Content-Length header is missing" 59 | size = int(size) 60 | if 
size != expected_size: 61 | return False, f"{url} file has expected size {expected_size}, but got {size}" 62 | return True, f"{url} file size is correct" 63 | 64 | 65 | def test_model_files(): 66 | def check_model_files(base_url): 67 | 68 | model_size = "124M" 69 | files = { 70 | "checkpoint": 77, 71 | "encoder.json": 1042301, 72 | "hparams.json": 90, 73 | "model.ckpt.data-00000-of-00001": 497759232, 74 | "model.ckpt.index": 5215, 75 | "model.ckpt.meta": 471155, 76 | "vocab.bpe": 456318 77 | } 78 | 79 | for file_name, expected_size in files.items(): 80 | url = f"{base_url}/{model_size}/{file_name}" 81 | valid, message = check_file_size(url, expected_size) 82 | assert valid, message 83 | 84 | model_size = "355M" 85 | files = { 86 | "checkpoint": 77, 87 | "encoder.json": 1042301, 88 | "hparams.json": 91, 89 | "model.ckpt.data-00000-of-00001": 1419292672, 90 | "model.ckpt.index": 10399, 91 | "model.ckpt.meta": 926519, 92 | "vocab.bpe": 456318 93 | } 94 | 95 | for file_name, expected_size in files.items(): 96 | url = f"{base_url}/{model_size}/{file_name}" 97 | valid, message = check_file_size(url, expected_size) 98 | assert valid, message 99 | 100 | check_model_files(base_url="https://openaipublic.blob.core.windows.net/gpt-2/models") 101 | check_model_files(base_url="https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2") 102 | -------------------------------------------------------------------------------- /ch05/02_alternative_weight_loading/README.md: -------------------------------------------------------------------------------- 1 | # Alternative Approaches to Loading Pretrained Weights 2 | 3 | This folder contains alternative weight loading strategies in case the weights become unavailable from OpenAI. 4 | 5 | - [weight-loading-pytorch.ipynb](weight-loading-pytorch.ipynb): (Recommended) contains code to load the weights from PyTorch state dicts that I created by converting the original TensorFlow weights 6 | 7 | - [weight-loading-hf-transformers.ipynb](weight-loading-hf-transformers.ipynb): contains code to load the weights from the Hugging Face Model Hub via the `transformers` library 8 | 9 | - [weight-loading-hf-safetensors.ipynb](weight-loading-hf-safetensors.ipynb): contains code to load the weights from the Hugging Face Model Hub via the `safetensors` library directly (skipping the instantiation of a Hugging Face transformer model) -------------------------------------------------------------------------------- /ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | """ 7 | Script that processes the Project Gutenberg files into fewer larger files. 
8 | """ 9 | 10 | import argparse 11 | import os 12 | import re 13 | from tqdm import tqdm 14 | from gutenberg.src.cleanup import strip_headers 15 | 16 | 17 | def is_english(text, threshold=0.9): 18 | ascii_chars = sum(1 for c in text if ord(c) < 128) 19 | return ascii_chars / len(text) > threshold 20 | 21 | 22 | def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"): 23 | if not os.path.exists(target_dir): 24 | os.makedirs(target_dir) 25 | 26 | current_content = [] 27 | current_size = 0 28 | file_counter = 1 29 | 30 | for file_path in tqdm(file_paths): 31 | try: 32 | with open(file_path, "r", encoding="utf-8") as file: 33 | content = file.read() 34 | except UnicodeDecodeError: 35 | # Attempt to read the file with a fallback encoding 36 | tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}") 37 | with open(file_path, "r", encoding=fallback_encoding) as file: 38 | content = file.read() 39 | 40 | if not is_english(content): 41 | tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.") 42 | continue 43 | content = strip_headers(content) 44 | 45 | # Regular expression to replace multiple blank lines with a single blank line 46 | content = re.sub(r'\n\s*\n', '\n\n', content) 47 | estimated_size = len(content.encode("utf-8")) 48 | 49 | if current_size + estimated_size > max_size_mb * 1024 * 1024: 50 | target_file_path = os.path.join(target_dir, f"combined_{file_counter}.txt") 51 | with open(target_file_path, "w", encoding="utf-8") as target_file: 52 | target_file.write(separator.join(current_content)) 53 | file_counter += 1 54 | current_content = [content] 55 | current_size = estimated_size 56 | else: 57 | current_content.append(content) 58 | current_size += estimated_size 59 | 60 | if current_content: 61 | target_file_path = os.path.join(target_dir, f"combined_{file_counter}.txt") 62 | with open(target_file_path, "w", encoding="utf-8") as target_file: 63 | target_file.write(separator.join(current_content)) 64 | return file_counter 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining") 70 | 71 | parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw", 72 | help="Directory containing the downloaded raw training data") 73 | parser.add_argument("--max_size_mb", type=int, default=500, 74 | help="The maximum file size for each concatenated file in megabytes") 75 | parser.add_argument("--output_dir", type=str, default="gutenberg_preprocessed", 76 | help="Directory where the preprocessed data will be saved") 77 | 78 | args = parser.parse_args() 79 | 80 | all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir) 81 | for name in files if name.endswith((".txt", ".txt.utf8"))] 82 | 83 | print(f"{len(all_files)} file(s) to process.") 84 | file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb) 85 | print(f"{file_counter} file(s) saved in {os.path.abspath(args.output_dir)}") 86 | -------------------------------------------------------------------------------- /ch05/03_bonus_pretraining_on_gutenberg/tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | # File for internal use (unit tests) 7 | 8 | from pathlib import Path 9 | import os 10 | import subprocess 11 | 12 | 13 | def test_pretraining(): 14 | 15 | sequence = "a b c d" 16 | repetitions = 1000 17 | content = sequence * repetitions 18 | 19 | folder_path = Path("gutenberg") / "data" 20 | file_name = "repeated_sequence.txt" 21 | 22 | os.makedirs(folder_path, exist_ok=True) 23 | 24 | with open(folder_path/file_name, "w") as file: 25 | file.write(content) 26 | 27 | result = subprocess.run( 28 | ["python", "pretraining_simple.py", "--debug", "true"], 29 | capture_output=True, text=True 30 | ) 31 | print(result.stdout) 32 | assert "Maximum GPU memory allocated" in result.stdout 33 | -------------------------------------------------------------------------------- /ch05/04_learning_rate_schedulers/README.md: -------------------------------------------------------------------------------- 1 | # Adding Bells and Whistles to the Training Loop 2 | 3 | The main chapter used a relatively simple training function to keep the code readable and fit Chapter 5 within the page limits. Optionally, we can add a linear warm-up, a cosine decay schedule, and gradient clipping to improve the training stability and convergence. 4 | 5 | You can find the code for this more sophisticated training function in [Appendix D: Adding Bells and Whistles to the Training Loop](../../appendix-D/01_main-chapter-code/appendix-D.ipynb). -------------------------------------------------------------------------------- /ch05/05_bonus_hparam_tuning/README.md: -------------------------------------------------------------------------------- 1 | # Optimizing Hyperparameters for Pretraining 2 | 3 | The [hparam_search.py](hparam_search.py) script, based on the extended training function in [Appendix D: Adding Bells and Whistles to the Training Loop](../../appendix-D/01_main-chapter-code/appendix-D.ipynb), is designed to find optimal hyperparameters via grid search. 4 | 5 | >[!NOTE] 6 | This script will take a long time to run. You may want to reduce the number of hyperparameter configurations explored in the `HPARAM_GRID` dictionary at the top. -------------------------------------------------------------------------------- /ch05/06_user_interface/README.md: -------------------------------------------------------------------------------- 1 | # Building a User Interface to Interact With the Pretrained LLM 2 | 3 | 4 | 5 | This bonus folder contains code for running a ChatGPT-like user interface to interact with the pretrained LLMs from chapter 5, as shown below. 6 | 7 | 8 | 9 | ![Chainlit UI example](https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/chainlit/chainlit-orig.webp) 10 | 11 | 12 | 13 | To implement this user interface, we use the open-source [Chainlit Python package](https://github.com/Chainlit/chainlit). 14 | 15 |   16 | ## Step 1: Install dependencies 17 | 18 | First, we install the `chainlit` package via 19 | 20 | ```bash 21 | pip install chainlit 22 | ``` 23 | 24 | (Alternatively, execute `pip install -r requirements-extra.txt`.) 25 | 26 |   27 | ## Step 2: Run `app` code 28 | 29 | This folder contains 2 files: 30 | 31 | 1. [`app_orig.py`](app_orig.py): This file loads and uses the original GPT-2 weights from OpenAI. 32 | 2. 
[`app_own.py`](app_own.py): This file loads and uses the GPT-2 weights we generated in chapter 5. This requires that you execute the [`../01_main-chapter-code/ch05.ipynb`](../01_main-chapter-code/ch05.ipynb) file first. 33 | 34 | (Open and inspect these files to learn more.) 35 | 36 | Run one of the following commands from the terminal to start the UI server: 37 | 38 | ```bash 39 | chainlit run app_orig.py 40 | ``` 41 | 42 | or 43 | 44 | ```bash 45 | chainlit run app_own.py 46 | ``` 47 | 48 | Running one of the commands above should open a new browser tab where you can interact with the model. If the browser tab does not open automatically, inspect the terminal output and copy the local address into your browser address bar (usually, the address is `http://localhost:8000`). -------------------------------------------------------------------------------- /ch05/06_user_interface/app_orig.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | import tiktoken 7 | import torch 8 | import chainlit 9 | 10 | # For llms_from_scratch installation instructions, see: 11 | # https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg 12 | from llms_from_scratch.ch04 import GPTModel 13 | from llms_from_scratch.ch05 import ( 14 | download_and_load_gpt2, 15 | generate, 16 | load_weights_into_gpt, 17 | text_to_token_ids, 18 | token_ids_to_text, 19 | ) 20 | 21 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 22 | 23 | 24 | def get_model_and_tokenizer(): 25 | """ 26 | Code to load a GPT-2 model with pretrained weights from OpenAI. 27 | The code is similar to chapter 5. 28 | The model will be downloaded automatically if it doesn't exist in the current folder yet. 29 | """ 30 | 31 | CHOOSE_MODEL = "gpt2-small (124M)" # Optionally replace with another model from the model_configs dict below 32 | 33 | BASE_CONFIG = { 34 | "vocab_size": 50257, # Vocabulary size 35 | "context_length": 1024, # Context length 36 | "drop_rate": 0.0, # Dropout rate 37 | "qkv_bias": True # Query-key-value bias 38 | } 39 | 40 | model_configs = { 41 | "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12}, 42 | "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16}, 43 | "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20}, 44 | "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25}, 45 | } 46 | 47 | model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")") 48 | 49 | BASE_CONFIG.update(model_configs[CHOOSE_MODEL]) 50 | 51 | settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2") 52 | 53 | gpt = GPTModel(BASE_CONFIG) 54 | load_weights_into_gpt(gpt, params) 55 | gpt.to(device) 56 | gpt.eval() 57 | 58 | tokenizer = tiktoken.get_encoding("gpt2") 59 | 60 | return tokenizer, gpt, BASE_CONFIG 61 | 62 | 63 | # Obtain the necessary tokenizer and model files for the chainlit function below 64 | tokenizer, model, model_config = get_model_and_tokenizer() 65 | 66 | 67 | @chainlit.on_message 68 | async def main(message: chainlit.Message): 69 | """ 70 | The main Chainlit function.
71 | """ 72 | token_ids = generate( # function uses `with torch.no_grad()` internally already 73 | model=model, 74 | idx=text_to_token_ids(message.content, tokenizer).to(device), # The user text is provided via as `message.content` 75 | max_new_tokens=50, 76 | context_size=model_config["context_length"], 77 | top_k=1, 78 | temperature=0.0 79 | ) 80 | 81 | text = token_ids_to_text(token_ids, tokenizer) 82 | 83 | await chainlit.Message( 84 | content=f"{text}", # This returns the model response to the interface 85 | ).send() 86 | -------------------------------------------------------------------------------- /ch05/06_user_interface/app_own.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from pathlib import Path 7 | import sys 8 | 9 | import tiktoken 10 | import torch 11 | import chainlit 12 | 13 | # For llms_from_scratch installation instructions, see: 14 | # https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg 15 | from llms_from_scratch.ch04 import GPTModel 16 | from llms_from_scratch.ch05 import ( 17 | generate, 18 | text_to_token_ids, 19 | token_ids_to_text, 20 | ) 21 | 22 | 23 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 24 | 25 | 26 | def get_model_and_tokenizer(): 27 | """ 28 | Code to load a GPT-2 model with pretrained weights generated in chapter 5. 29 | This requires that you run the code in chapter 5 first, which generates the necessary model.pth file. 30 | """ 31 | 32 | GPT_CONFIG_124M = { 33 | "vocab_size": 50257, # Vocabulary size 34 | "context_length": 256, # Shortened context length (orig: 1024) 35 | "emb_dim": 768, # Embedding dimension 36 | "n_heads": 12, # Number of attention heads 37 | "n_layers": 12, # Number of layers 38 | "drop_rate": 0.1, # Dropout rate 39 | "qkv_bias": False # Query-key-value bias 40 | } 41 | 42 | tokenizer = tiktoken.get_encoding("gpt2") 43 | 44 | model_path = Path("..") / "01_main-chapter-code" / "model.pth" 45 | if not model_path.exists(): 46 | print(f"Could not find the {model_path} file. Please run the chapter 5 code (ch05.ipynb) to generate the model.pth file.") 47 | sys.exit() 48 | 49 | checkpoint = torch.load(model_path, weights_only=True) 50 | model = GPTModel(GPT_CONFIG_124M) 51 | model.load_state_dict(checkpoint) 52 | model.to(device) 53 | 54 | return tokenizer, model, GPT_CONFIG_124M 55 | 56 | 57 | # Obtain the necessary tokenizer and model files for the chainlit function below 58 | tokenizer, model, model_config = get_model_and_tokenizer() 59 | 60 | 61 | @chainlit.on_message 62 | async def main(message: chainlit.Message): 63 | """ 64 | The main Chainlit function. 
65 | """ 66 | token_ids = generate( # function uses `with torch.no_grad()` internally already 67 | model=model, 68 | idx=text_to_token_ids(message.content, tokenizer).to(device), # The user text is provided via as `message.content` 69 | max_new_tokens=50, 70 | context_size=model_config["context_length"], 71 | top_k=1, 72 | temperature=0.0 73 | ) 74 | 75 | text = token_ids_to_text(token_ids, tokenizer) 76 | 77 | await chainlit.Message( 78 | content=f"{text}", # This returns the model response to the interface 79 | ).send() 80 | -------------------------------------------------------------------------------- /ch05/06_user_interface/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | chainlit>=1.2.0 -------------------------------------------------------------------------------- /ch05/07_gpt_to_llama/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "HF_ACCESS_TOKEN": "hf-...", 3 | "_comment": "Enter your access token from https://huggingface.co/settings/tokens" 4 | } 5 | -------------------------------------------------------------------------------- /ch05/07_gpt_to_llama/previous_chapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | # 6 | # This file collects all the relevant code that we covered thus far 7 | # throughout Chapters 2-4. 8 | # This file can be run as a standalone script. 9 | 10 | import torch 11 | 12 | 13 | ##################################### 14 | # Chapter 5 15 | ##################################### 16 | def text_to_token_ids(text, tokenizer): 17 | encoded = tokenizer.encode(text) 18 | encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension 19 | return encoded_tensor 20 | 21 | 22 | def token_ids_to_text(token_ids, tokenizer): 23 | flat = token_ids.squeeze(0) # remove batch dimension 24 | return tokenizer.decode(flat.tolist()) 25 | 26 | 27 | def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None): 28 | 29 | # For-loop is the same as before: Get logits, and only focus on last time step 30 | for _ in range(max_new_tokens): 31 | idx_cond = idx[:, -context_size:] 32 | with torch.no_grad(): 33 | logits = model(idx_cond) 34 | logits = logits[:, -1, :] 35 | 36 | # New: Filter logits with top_k sampling 37 | if top_k is not None: 38 | # Keep only top_k values 39 | top_logits, _ = torch.topk(logits, top_k) 40 | min_val = top_logits[:, -1] 41 | logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits) 42 | 43 | # New: Apply temperature scaling 44 | if temperature > 0.0: 45 | logits = logits / temperature 46 | 47 | # Apply softmax to get probabilities 48 | probs = torch.softmax(logits, dim=-1) # (batch_size, context_len) 49 | 50 | # Sample from the distribution 51 | idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1) 52 | 53 | # Otherwise same as before: get idx of the vocab entry with the highest logits value 54 | else: 55 | idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch_size, 1) 56 | 57 | if idx_next == eos_id: # Stop generating early if end-of-sequence token is encountered and eos_id is specified 58 | break 59 | 60 | # Same as 
before: append sampled index to the running sequence 61 | idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1) 62 | 63 | return idx 64 | -------------------------------------------------------------------------------- /ch05/07_gpt_to_llama/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | blobfile>=3.0.0 2 | huggingface_hub>=0.24.7 3 | ipywidgets>=8.1.2 4 | safetensors>=0.4.4 5 | sentencepiece>=0.1.99 6 | -------------------------------------------------------------------------------- /ch05/07_gpt_to_llama/tests/test-requirements-extra.txt: -------------------------------------------------------------------------------- 1 | pytest>=8.1.1 2 | transformers>=4.44.2 3 | -------------------------------------------------------------------------------- /ch05/08_memory_efficient_weight_loading/README.md: -------------------------------------------------------------------------------- 1 | # Memory-efficient Model Weight Loading 2 | 3 | This folder contains code to illustrate how to load model weights more efficiently. 4 | 5 | - [memory-efficient-state-dict.ipynb](memory-efficient-state-dict.ipynb): contains code to load model weights via PyTorch's `load_state_dict` method more efficiently 6 | -------------------------------------------------------------------------------- /ch05/09_extending-tokenizers/README.md: -------------------------------------------------------------------------------- 1 | # Extending the Tiktoken BPE Tokenizer with New Tokens 2 | 3 | - [extend-tiktoken.ipynb](extend-tiktoken.ipynb) contains optional (bonus) code to explain how we can add special tokens to a tokenizer implemented via `tiktoken` and how to update the LLM accordingly -------------------------------------------------------------------------------- /ch05/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 5: Pretraining on Unlabeled Data 2 | 3 |   4 | ## Main Chapter Code 5 | 6 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code 7 | 8 |   9 | ## Bonus Materials 10 | 11 | - [02_alternative_weight_loading](02_alternative_weight_loading) contains code to load the GPT model weights from alternative places in case the model weights become unavailable from OpenAI 12 | - [03_bonus_pretraining_on_gutenberg](03_bonus_pretraining_on_gutenberg) contains code to pretrain the LLM longer on the whole corpus of books from Project Gutenberg 13 | - [04_learning_rate_schedulers](04_learning_rate_schedulers) contains code implementing a more sophisticated training function including learning rate schedulers and gradient clipping 14 | - [05_bonus_hparam_tuning](05_bonus_hparam_tuning) contains an optional hyperparameter tuning script 15 | - [06_user_interface](06_user_interface) implements an interactive user interface to interact with the pretrained LLM 16 | - [07_gpt_to_llama](07_gpt_to_llama) contains a step-by-step guide for converting a GPT architecture implementation to Llama 3.2 and loading pretrained weights from Meta AI 17 | - [08_memory_efficient_weight_loading](08_memory_efficient_weight_loading) contains a bonus notebook showing how to load model weights via PyTorch's `load_state_dict` method more efficiently 18 | - [09_extending-tokenizers](09_extending-tokenizers) contains optional (bonus) code showing how to extend the `tiktoken` BPE tokenizer with new special tokens and how to update the LLM accordingly 19 | - [10_llm-training-speed](10_llm-training-speed) shows PyTorch performance tips to improve the LLM training speed 20 | 21 | 22 | 23 |
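(The `generate` function reproduced in the 07_gpt_to_llama listing above combines top-k filtering with temperature scaling; the following self-contained snippet isolates that decoding logic on a made-up logits vector so the effect of each knob is visible:)

```python
import torch

torch.manual_seed(123)
logits = torch.tensor([[4.0, 2.0, 1.0, 0.5, -1.0]])  # made-up next-token logits

# Top-k filtering (k=3): everything below the third-largest logit becomes -inf
top_logits, _ = torch.topk(logits, 3)
logits = torch.where(logits < top_logits[:, -1], torch.tensor(float("-inf")), logits)

# Temperature scaling: values < 1 sharpen, values > 1 flatten the distribution
for temperature in (0.5, 1.0, 2.0):
    probs = torch.softmax(logits / temperature, dim=-1)
    print(temperature, probs)

# Sampling then draws from the filtered, rescaled distribution
idx_next = torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1)
```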
24 |
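(The gist of the memory-efficient loading shown in [08_memory_efficient_weight_loading](08_memory_efficient_weight_loading) above, as a sketch — it assumes PyTorch >= 2.1 for the `mmap` and `assign` options, and `GPTModel`/`config` stand in for the notebook's actual model class and configuration:)

```python
import torch

# Memory-map the checkpoint instead of reading all tensors into RAM at once
state_dict = torch.load("model.pth", map_location="cpu", weights_only=True, mmap=True)

# Instantiate the model on the "meta" device, i.e., without allocating weight memory ...
with torch.device("meta"):
    model = GPTModel(config)  # placeholder for the notebook's model class and config

# ... and let load_state_dict adopt the memory-mapped tensors directly
model.load_state_dict(state_dict, assign=True)
```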
25 | 26 | [![Link to the video](https://img.youtube.com/vi/Zar2TJv-sE0/0.jpg)](https://www.youtube.com/watch?v=Zar2TJv-sE0) -------------------------------------------------------------------------------- /ch06/01_main-chapter-code/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 6: Finetuning for Classification 2 | 3 | ### Main Chapter Code 4 | 5 | - [ch06.ipynb](ch06.ipynb) contains all the code as it appears in the chapter 6 | - [previous_chapters.py](previous_chapters.py) is a Python module that contains the GPT model we coded and trained in previous chapters, alongside many utility functions, which we reuse in this chapter 7 | - [gpt_download.py](gpt_download.py) contains the utility functions for downloading the pretrained GPT model weights 8 | - [exercise-solutions.ipynb](exercise-solutions.ipynb) contains the exercise solutions for this chapter 9 | 10 | ### Optional Code 11 | 12 | - [load-finetuned-model.ipynb](load-finetuned-model.ipynb) is a standalone Jupyter notebook to load the finetuned model we created in this chapter 13 | - [gpt_class_finetune.py](gpt_class_finetune.py) is a standalone Python script file with the code that we implemented in [ch06.ipynb](ch06.ipynb) to finetune the GPT model (you can think of it as a chapter summary) 14 | 15 | -------------------------------------------------------------------------------- /ch06/01_main-chapter-code/exercise-solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ba450fb1-8a26-4894-ab7a-5d7bfefe90ce", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "\n", 10 | "\n", 16 | "\n", 19 | "\n", 20 | "
\n", 11 | "\n", 12 | "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", 13 | "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", 14 | "
\n", 15 | "
\n", 17 | "\n", 18 | "
" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "51c9672d-8d0c-470d-ac2d-1271f8ec3f14", 26 | "metadata": {}, 27 | "source": [ 28 | "# Chapter 6 Exercise solutions" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "5fea8be3-30a1-4623-a6d7-b095c6c1092e", 34 | "metadata": {}, 35 | "source": [ 36 | "## Exercise 6.1: Increasing the context length" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "5860ba9f-2db3-4480-b96b-4be1c68981eb", 42 | "metadata": {}, 43 | "source": [ 44 | "We can pad the inputs to the maximum number of tokens the model supports by setting the max length to 1024:\n", 45 | "\n", 46 | "```python\n", 47 | "max_length = 1024\n", 48 | "\n", 49 | "train_dataset = SpamDataset(base_path / \"train.csv\", max_length=max_length, tokenizer=tokenizer)\n", 50 | "val_dataset = SpamDataset(base_path / \"validation.csv\", max_length=max_length, tokenizer=tokenizer)\n", 51 | "test_dataset = SpamDataset(base_path / \"test.csv\", max_length=max_length, tokenizer=tokenizer)\n", 52 | "```\n", 53 | "\n", 54 | "or, equivalently, we can define the `max_length` via:\n", 55 | "\n", 56 | "```python\n", 57 | "max_length = model.pos_emb.weight.shape[0]\n", 58 | "```\n", 59 | "\n", 60 | "or\n", 61 | "\n", 62 | "```python\n", 63 | "max_length = BASE_CONFIG[\"context_length\"]\n", 64 | "```" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "2b0f4d5d-17fd-4265-93d8-ea08a22fdaf8", 70 | "metadata": {}, 71 | "source": [ 72 | "For convenience, you can run this experiment via\n", 73 | "\n", 74 | "```bash\n", 75 | "python additional-experiments.py --context_length \"model_context_length\"\n", 76 | "```\n", 77 | "\n", 78 | "using the code in the [../02_bonus_additional-experiments](../02_bonus_additional-experiments) folder, which results in a substantially worse test accuracy of 78.33% (versus the 95.67% in the main chapter)." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "5a780455-f52a-48d1-ab82-6afd40bcad8b", 84 | "metadata": {}, 85 | "source": [ 86 | "## Exercise 6.2: Finetuning the whole model" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "56aa5208-aa29-4165-a0ec-7480754e2a18", 92 | "metadata": {}, 93 | "source": [ 94 | "Instead of finetuning just the final transformer block, we can finetune the entire model by removing the following lines from the code:\n", 95 | "\n", 96 | "```python\n", 97 | "for param in model.parameters():\n", 98 | " param.requires_grad = False\n", 99 | "```\n", 100 | "\n", 101 | "For convenience, you can run this experiment via\n", 102 | "\n", 103 | "```bash\n", 104 | "python additional-experiments.py --trainable_layers all\n", 105 | "```\n", 106 | "\n", 107 | "using the code in the [../02_bonus_additional-experiments](../02_bonus_additional-experiments) folder, which results in a 1% improved test accuracy of 96.67% (versus the 95.67% in the main chapter)." 
108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "2269bce3-f2b5-4a76-a692-5977c75a57b6", 113 | "metadata": {}, 114 | "source": [ 115 | "## Exercise 6.3: Finetuning the first versus last token " 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "7418a629-51b6-4aa2-83b7-bc0261bc370f", 121 | "metadata": {}, 122 | "source": [ 123 | "Rather than finetuning the last output token, we can finetune the first output token by changing \n", 124 | "\n", 125 | "```python\n", 126 | "model(input_batch)[:, -1, :]\n", 127 | "```\n", 128 | "\n", 129 | "to\n", 130 | "\n", 131 | "```python\n", 132 | "model(input_batch)[:, 0, :]\n", 133 | "```\n", 134 | "\n", 135 | "everywhere in the code.\n", 136 | "\n", 137 | "For convenience, you can run this experiment via\n", 138 | "\n", 139 | "```\n", 140 | "python additional-experiments.py --trainable_token first\n", 141 | "```\n", 142 | "\n", 143 | "using the code in the [../02_bonus_additional-experiments](../02_bonus_additional-experiments) folder, which results in a substantially worse test accuracy of 75.00% (versus the 95.67% in the main chapter)." 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 3 (ipykernel)", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.10.11" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 5 168 | } 169 | -------------------------------------------------------------------------------- /ch06/01_main-chapter-code/tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | # File for internal use (unit tests) 7 | 8 | 9 | import subprocess 10 | 11 | 12 | def test_gpt_class_finetune(): 13 | command = ["python", "ch06/01_main-chapter-code/gpt_class_finetune.py", "--test_mode"] 14 | 15 | result = subprocess.run(command, capture_output=True, text=True) 16 | assert result.returncode == 0, f"Script exited with errors: {result.stderr}" 17 | -------------------------------------------------------------------------------- /ch06/03_bonus_imdb-classification/download_prepare_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | import os 7 | import sys 8 | import tarfile 9 | import time 10 | import urllib.request 11 | import pandas as pd 12 | 13 | 14 | def reporthook(count, block_size, total_size): 15 | global start_time 16 | if count == 0: 17 | start_time = time.time() 18 | else: 19 | duration = time.time() - start_time 20 | progress_size = int(count * block_size) 21 | percent = count * block_size * 100 / total_size 22 | 23 | speed = progress_size / (1024 ** 2 * duration) if duration else 0 24 | sys.stdout.write( 25 | f"\r{int(percent)}% | {progress_size / (1024**2):.2f} MB " 26 | f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed" 27 | ) 28 | sys.stdout.flush() 29 | 30 | 31 | def download_and_extract_dataset(dataset_url, target_file, directory): 32 | if not os.path.exists(directory): 33 | if os.path.exists(target_file): 34 | os.remove(target_file) 35 | urllib.request.urlretrieve(dataset_url, target_file, reporthook) 36 | print("\nExtracting dataset ...") 37 | with tarfile.open(target_file, "r:gz") as tar: 38 | tar.extractall() 39 | else: 40 | print(f"Directory `{directory}` already exists. Skipping download.") 41 | 42 | 43 | def load_dataset_to_dataframe(basepath="aclImdb", labels={"pos": 1, "neg": 0}): 44 | data_frames = [] # List to store each chunk of DataFrame 45 | for subset in ("test", "train"): 46 | for label in ("pos", "neg"): 47 | path = os.path.join(basepath, subset, label) 48 | for file in sorted(os.listdir(path)): 49 | with open(os.path.join(path, file), "r", encoding="utf-8") as infile: 50 | # Create a DataFrame for each file and add it to the list 51 | data_frames.append(pd.DataFrame({"text": [infile.read()], "label": [labels[label]]})) 52 | # Concatenate all DataFrame chunks together 53 | df = pd.concat(data_frames, ignore_index=True) 54 | df = df.sample(frac=1, random_state=123).reset_index(drop=True) # Shuffle the DataFrame 55 | return df 56 | 57 | 58 | def partition_and_save(df, sizes=(35000, 5000, 10000)): 59 | # Shuffle the DataFrame 60 | df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True) 61 | 62 | # Get indices for where to split the data 63 | train_end = sizes[0] 64 | val_end = sizes[0] + sizes[1] 65 | 66 | # Split the DataFrame 67 | train = df_shuffled.iloc[:train_end] 68 | val = df_shuffled.iloc[train_end:val_end] 69 | test = df_shuffled.iloc[val_end:] 70 | 71 | # Save to CSV files 72 | train.to_csv("train.csv", index=False) 73 | val.to_csv("validation.csv", index=False) 74 | test.to_csv("test.csv", index=False) 75 | 76 | 77 | if __name__ == "__main__": 78 | dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" 79 | print("Downloading dataset ...") 80 | download_and_extract_dataset(dataset_url, "aclImdb_v1.tar.gz", "aclImdb") 81 | print("Creating data frames ...") 82 | df = load_dataset_to_dataframe() 83 | print("Partitioning and saving data frames ...") 84 | partition_and_save(df) 85 | -------------------------------------------------------------------------------- /ch06/03_bonus_imdb-classification/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.33.2 2 | scikit-learn>=1.3.0 -------------------------------------------------------------------------------- /ch06/03_bonus_imdb-classification/train_sklearn_logreg.py:
-------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | import pandas as pd 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.metrics import accuracy_score 10 | # from sklearn.metrics import balanced_accuracy_score 11 | from sklearn.dummy import DummyClassifier 12 | 13 | 14 | def load_dataframes(): 15 | df_train = pd.read_csv("train.csv") 16 | df_val = pd.read_csv("validation.csv") 17 | df_test = pd.read_csv("test.csv") 18 | 19 | return df_train, df_val, df_test 20 | 21 | 22 | def eval(model, X_train, y_train, X_val, y_val, X_test, y_test): 23 | # Making predictions 24 | y_pred_train = model.predict(X_train) 25 | y_pred_val = model.predict(X_val) 26 | y_pred_test = model.predict(X_test) 27 | 28 | # Calculating accuracy and balanced accuracy 29 | accuracy_train = accuracy_score(y_train, y_pred_train) 30 | # balanced_accuracy_train = balanced_accuracy_score(y_train, y_pred_train) 31 | 32 | accuracy_val = accuracy_score(y_val, y_pred_val) 33 | # balanced_accuracy_val = balanced_accuracy_score(y_val, y_pred_val) 34 | 35 | accuracy_test = accuracy_score(y_test, y_pred_test) 36 | # balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred_test) 37 | 38 | # Printing the results 39 | print(f"Training Accuracy: {accuracy_train*100:.2f}%") 40 | print(f"Validation Accuracy: {accuracy_val*100:.2f}%") 41 | print(f"Test Accuracy: {accuracy_test*100:.2f}%") 42 | 43 | # print(f"\nTraining Balanced Accuracy: {balanced_accuracy_train*100:.2f}%") 44 | # print(f"Validation Balanced Accuracy: {balanced_accuracy_val*100:.2f}%") 45 | # print(f"Test Balanced Accuracy: {balanced_accuracy_test*100:.2f}%") 46 | 47 | 48 | if __name__ == "__main__": 49 | df_train, df_val, df_test = load_dataframes() 50 | 51 | ######################################### 52 | # Convert text into bag-of-words model 53 | vectorizer = CountVectorizer() 54 | ######################################### 55 | 56 | X_train = vectorizer.fit_transform(df_train["text"]) 57 | X_val = vectorizer.transform(df_val["text"]) 58 | X_test = vectorizer.transform(df_test["text"]) 59 | y_train, y_val, y_test = df_train["label"], df_val["label"], df_test["label"] 60 | 61 | ##################################### 62 | # Model training and evaluation 63 | ##################################### 64 | 65 | # Create a dummy classifier with the strategy to predict the most frequent class 66 | dummy_clf = DummyClassifier(strategy="most_frequent") 67 | dummy_clf.fit(X_train, y_train) 68 | 69 | print("Dummy classifier:") 70 | eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test) 71 | 72 | print("\n\nLogistic regression classifier:") 73 | model = LogisticRegression(max_iter=1000) 74 | model.fit(X_train, y_train) 75 | eval(model, X_train, y_train, X_val, y_val, X_test, y_test) 76 | -------------------------------------------------------------------------------- /ch06/04_user_interface/README.md: -------------------------------------------------------------------------------- 1 | # Building a User Interface to Interact With the GPT-based Spam Classifier 2 | 3 | 4 | 5 | This bonus folder contains code for running a ChatGPT-like user interface to interact with the finetuned 
GPT-based spam classifier from chapter 6, as shown below. 6 | 7 | 8 | 9 | ![Chainlit UI example](https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/chainlit/chainlit-spam.webp) 10 | 11 | 12 | 13 | To implement this user interface, we use the open-source [Chainlit Python package](https://github.com/Chainlit/chainlit). 14 | 15 |   16 | ## Step 1: Install dependencies 17 | 18 | First, we install the `chainlit` package via 19 | 20 | ```bash 21 | pip install chainlit 22 | ``` 23 | 24 | (Alternatively, execute `pip install -r requirements-extra.txt`.) 25 | 26 |   27 | ## Step 2: Run `app` code 28 | 29 | The [`app.py`](app.py) file contains the UI code. Open and inspect this file to learn more. 30 | 31 | This file loads and uses the GPT-2 classifier weights we generated in chapter 6. This requires that you execute the [`../01_main-chapter-code/ch06.ipynb`](../01_main-chapter-code/ch06.ipynb) file first. 32 | 33 | Execute the following command from the terminal to start the UI server: 34 | 35 | ```bash 36 | chainlit run app.py 37 | ``` 38 | 39 | Running the command above should open a new browser tab where you can interact with the model. If the browser tab does not open automatically, inspect the terminal output and copy the local address into your browser address bar (usually, the address is `http://localhost:8000`). -------------------------------------------------------------------------------- /ch06/04_user_interface/app.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from pathlib import Path 7 | import sys 8 | 9 | import tiktoken 10 | import torch 11 | import chainlit 12 | 13 | # For llms_from_scratch installation instructions, see: 14 | # https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg 15 | from llms_from_scratch.ch04 import GPTModel 16 | from llms_from_scratch.ch06 import classify_review 17 | 18 | 19 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 20 | 21 | 22 | def get_model_and_tokenizer(): 23 | """ 24 | Code to load the finetuned GPT-2 model generated in chapter 6. 25 | This requires that you run the code in chapter 6 first, which generates the necessary review_classifier.pth file. 26 | """ 27 | 28 | GPT_CONFIG_124M = { 29 | "vocab_size": 50257, # Vocabulary size 30 | "context_length": 1024, # Context length 31 | "emb_dim": 768, # Embedding dimension 32 | "n_heads": 12, # Number of attention heads 33 | "n_layers": 12, # Number of layers 34 | "drop_rate": 0.1, # Dropout rate 35 | "qkv_bias": True # Query-key-value bias 36 | } 37 | 38 | tokenizer = tiktoken.get_encoding("gpt2") 39 | 40 | model_path = Path("..") / "01_main-chapter-code" / "review_classifier.pth" 41 | if not model_path.exists(): 42 | print( 43 | f"Could not find the {model_path} file. Please run the chapter 6 code" 44 | " (ch06.ipynb) to generate the review_classifier.pth file."
45 | ) 46 | sys.exit() 47 | 48 | # Instantiate model 49 | model = GPTModel(GPT_CONFIG_124M) 50 | 51 | # Convert model to classifier as in section 6.5 in ch06.ipynb 52 | num_classes = 2 53 | model.out_head = torch.nn.Linear(in_features=GPT_CONFIG_124M["emb_dim"], out_features=num_classes) 54 | 55 | # Then load model weights 56 | checkpoint = torch.load(model_path, map_location=device, weights_only=True) 57 | model.load_state_dict(checkpoint) 58 | model.to(device) 59 | model.eval() 60 | 61 | return tokenizer, model 62 | 63 | 64 | # Obtain the necessary tokenizer and model files for the chainlit function below 65 | tokenizer, model = get_model_and_tokenizer() 66 | 67 | 68 | @chainlit.on_message 69 | async def main(message: chainlit.Message): 70 | """ 71 | The main Chainlit function. 72 | """ 73 | user_input = message.content 74 | 75 | label = classify_review(user_input, model, tokenizer, device, max_length=120) 76 | 77 | await chainlit.Message( 78 | content=f"{label}", # This returns the model response to the interface 79 | ).send() 80 | -------------------------------------------------------------------------------- /ch06/04_user_interface/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | chainlit>=1.2.0 -------------------------------------------------------------------------------- /ch06/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 6: Finetuning for Classification 2 | 3 |   4 | ## Main Chapter Code 5 | 6 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code 7 | 8 |   9 | ## Bonus Materials 10 | 11 | - [02_bonus_additional-experiments](02_bonus_additional-experiments) includes additional experiments (e.g., training the last vs first token, extending the input length, etc.) 12 | - [03_bonus_imdb-classification](03_bonus_imdb-classification) compares the LLM from chapter 6 with other models on a 50k IMDB movie review sentiment classification dataset 13 | - [04_user_interface](04_user_interface) implements an interactive user interface to interact with the finetuned GPT-based spam classifier 14 | 15 | 16 | 17 | 18 | 19 |
20 |
21 | 22 | [![Link to the video](https://img.youtube.com/vi/5PFXJYme4ik/0.jpg)](https://www.youtube.com/watch?v=5PFXJYme4ik) -------------------------------------------------------------------------------- /ch07/01_main-chapter-code/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 7: Finetuning to Follow Instructions 2 | 3 | ### Main Chapter Code 4 | 5 | - [ch07.ipynb](ch07.ipynb) contains all the code as it appears in the chapter 6 | - [previous_chapters.py](previous_chapters.py) is a Python module that contains the GPT model we coded and trained in previous chapters, alongside many utility functions, which we reuse in this chapter 7 | - [gpt_download.py](gpt_download.py) contains the utility functions for downloading the pretrained GPT model weights 8 | - [exercise-solutions.ipynb](exercise-solutions.ipynb) contains the exercise solutions for this chapter 9 | 10 | 11 | ### Optional Code 12 | 13 | - [load-finetuned-model.ipynb](load-finetuned-model.ipynb) is a standalone Jupyter notebook to load the instruction finetuned model we created in this chapter 14 | 15 | - [gpt_instruction_finetuning.py](gpt_instruction_finetuning.py) is a standalone Python script to instruction finetune the model as described in the main chapter (think of it as a chapter summary focused on the finetuning parts) 16 | 17 | Usage: 18 | 19 | ```bash 20 | python gpt_instruction_finetuning.py 21 | ``` 22 | 23 | ``` 24 | matplotlib version: 3.9.0 25 | tiktoken version: 0.7.0 26 | torch version: 2.3.1 27 | tqdm version: 4.66.4 28 | tensorflow version: 2.16.1 29 | -------------------------------------------------- 30 | Training set length: 935 31 | Validation set length: 55 32 | Test set length: 110 33 | -------------------------------------------------- 34 | Device: cpu 35 | -------------------------------------------------- 36 | File already exists and is up-to-date: gpt2/355M/checkpoint 37 | File already exists and is up-to-date: gpt2/355M/encoder.json 38 | File already exists and is up-to-date: gpt2/355M/hparams.json 39 | File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001 40 | File already exists and is up-to-date: gpt2/355M/model.ckpt.index 41 | File already exists and is up-to-date: gpt2/355M/model.ckpt.meta 42 | File already exists and is up-to-date: gpt2/355M/vocab.bpe 43 | Loaded model: gpt2-medium (355M) 44 | -------------------------------------------------- 45 | Initial losses 46 | Training loss: 3.839039182662964 47 | Validation loss: 3.7619192123413088 48 | Ep 1 (Step 000000): Train loss 2.611, Val loss 2.668 49 | Ep 1 (Step 000005): Train loss 1.161, Val loss 1.131 50 | Ep 1 (Step 000010): Train loss 0.939, Val loss 0.973 51 | ... 52 | Training completed in 15.66 minutes. 
53 | Plot saved as loss-plot-standalone.pdf 54 | -------------------------------------------------- 55 | Generating responses 56 | 100%|█████████████████████████████████████████████████████████| 110/110 [06:57<00:00, 3.80s/it] 57 | Responses saved as instruction-data-with-response-standalone.json 58 | Model saved as gpt2-medium355M-sft-standalone.pth 59 | ``` 60 | 61 | - [ollama_evaluate.py](ollama_evaluate.py) is a standalone Python script to evaluate the responses of the finetuned model as described in the main chapter (think of it as a chapter summary focused on the evaluation parts) 62 | 63 | Usage: 64 | 65 | ```bash 66 | python ollama_evaluate.py --file_path instruction-data-with-response-standalone.json 67 | ``` 68 | 69 | ``` 70 | Ollama running: True 71 | Scoring entries: 100%|███████████████████████████████████████| 110/110 [01:08<00:00, 1.62it/s] 72 | Number of scores: 110 of 110 73 | Average score: 51.75 74 | ``` 75 | 76 | - [exercise_experiments.py](exercise_experiments.py) is an optional script that implements the exercise solutions; for more details see [exercise-solutions.ipynb](exercise-solutions.ipynb) 77 | -------------------------------------------------------------------------------- /ch07/01_main-chapter-code/ollama_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | # 6 | # A minimal model-response evaluation file based on the code in chapter 7 7 | 8 | import json 9 | import psutil 10 | from tqdm import tqdm 11 | import urllib.request 12 | 13 | 14 | def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"): 15 | # Create the data payload as a dictionary 16 | data = { 17 | "model": model, 18 | "messages": [ 19 | {"role": "user", "content": prompt} 20 | ], 21 | "options": { # Settings below are required for deterministic responses 22 | "seed": 123, 23 | "temperature": 0, 24 | "num_ctx": 2048 25 | } 26 | } 27 | 28 | # Convert the dictionary to a JSON formatted string and encode it to bytes 29 | payload = json.dumps(data).encode("utf-8") 30 | 31 | # Create a request object, setting the method to POST and adding necessary headers 32 | request = urllib.request.Request(url, data=payload, method="POST") 33 | request.add_header("Content-Type", "application/json") 34 | 35 | # Send the request and capture the response 36 | response_data = "" 37 | with urllib.request.urlopen(request) as response: 38 | # Read and decode the response 39 | while True: 40 | line = response.readline().decode("utf-8") 41 | if not line: 42 | break 43 | response_json = json.loads(line) 44 | response_data += response_json["message"]["content"] 45 | 46 | return response_data 47 | 48 | 49 | def check_if_running(process_name): 50 | running = False 51 | for proc in psutil.process_iter(["name"]): 52 | if process_name in proc.info["name"]: 53 | running = True 54 | break 55 | return running 56 | 57 | 58 | def format_input(entry): 59 | instruction_text = ( 60 | f"Below is an instruction that describes a task. " 61 | f"Write a response that appropriately completes the request."
62 | f"\n\n### Instruction:\n{entry['instruction']}" 63 | ) 64 | 65 | input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else "" 66 | 67 | return instruction_text + input_text 68 | 69 | 70 | def main(file_path): 71 | ollama_running = check_if_running("ollama") 72 | 73 | if not ollama_running: 74 | raise RuntimeError("Ollama not running. Launch ollama before proceeding.") 75 | print("Ollama running:", check_if_running("ollama")) 76 | 77 | with open(file_path, "r") as file: 78 | test_data = json.load(file) 79 | 80 | model = "llama3" 81 | scores = generate_model_scores(test_data, "model_response", model) 82 | print(f"Number of scores: {len(scores)} of {len(test_data)}") 83 | print(f"Average score: {sum(scores)/len(scores):.2f}\n") 84 | 85 | 86 | def generate_model_scores(json_data, json_key, model="llama3"): 87 | scores = [] 88 | for entry in tqdm(json_data, desc="Scoring entries"): 89 | if entry[json_key] == "": 90 | scores.append(0) 91 | else: 92 | prompt = ( 93 | f"Given the input `{format_input(entry)}` " 94 | f"and correct output `{entry['output']}`, " 95 | f"score the model response `{entry[json_key]}`" 96 | f" on a scale from 0 to 100, where 100 is the best score. " 97 | f"Respond with the integer number only." 98 | ) 99 | score = query_model(prompt, model) 100 | try: 101 | scores.append(int(score)) 102 | except ValueError: 103 | print(f"Could not convert score: {score}") 104 | continue 105 | 106 | return scores 107 | 108 | 109 | if __name__ == "__main__": 110 | 111 | import argparse 112 | 113 | parser = argparse.ArgumentParser( 114 | description="Evaluate model responses with ollama" 115 | ) 116 | parser.add_argument( 117 | "--file_path", 118 | required=True, 119 | help=( 120 | "The path to the test dataset `.json` file with the" 121 | " `'output'` and `'model_response'` keys" 122 | ) 123 | ) 124 | args = parser.parse_args() 125 | 126 | main(file_path=args.file_path) 127 | -------------------------------------------------------------------------------- /ch07/01_main-chapter-code/tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | # File for internal use (unit tests) 7 | 8 | 9 | import subprocess 10 | 11 | 12 | def test_gpt_class_finetune(): 13 | command = ["python", "ch07/01_main-chapter-code/gpt_instruction_finetuning.py", "--test_mode"] 14 | 15 | result = subprocess.run(command, capture_output=True, text=True) 16 | assert result.returncode == 0, f"Script exited with errors: {result.stderr}" 17 | -------------------------------------------------------------------------------- /ch07/02_dataset-utilities/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 7: Finetuning to Follow Instructions 2 | 3 | This folder contains utility code that can be used for preparing an instruction dataset. 4 | 5 | Install the additional package requirements via: 6 | 7 | ```bash 8 | pip install -r requirements-extra.txt 9 | ``` 10 | 11 | 12 | 13 | 14 | 15 | ### Finding Near Duplicates 16 | 17 | The `find-near-duplicates.py` function can be used to identify duplicates and near-duplicates in an instruction dataset. 
For example, 18 | 19 | 20 | 21 | ```bash 22 | python find-near-duplicates.py --json_file instruction-examples.json 23 | ``` 24 | 25 | ``` 26 | scikit-learn version: 1.3.1 27 | 28 | 29 | ================================================== 30 | Searching 'instruction' for duplicates ... 31 | ================================================== 32 | Duplicate pair found with similarity 0.94: 33 | 1. Edit the following sentence to make it more formal. 34 | 2. Edit the sentence to make it more formal. 35 | 36 | Duplicate pair found with similarity 1.00: 37 | 1. Name a dwarf planet in our solar system. 38 | 2. Name a dwarf planet in our solar system. 39 | 40 | Duplicate pair found with similarity 0.91: 41 | 1. Change the sentences from active voice to passive voice. 42 | 2. Change the sentence from passive to active voice. 43 | 44 | 45 | 46 | ================================================== 47 | Searching 'input' for duplicates ... 48 | ================================================== 49 | No duplicates found 50 | 51 | 52 | ================================================== 53 | Searching 'output' for duplicates ... 54 | ================================================== 55 | Duplicate pair found with similarity 1.00: 56 | 1. One dwarf planet in our solar system is Pluto. 57 | 2. One dwarf planet in our solar system is Pluto. 58 | 59 | 60 | ``` 61 | 62 |   63 | You can use the `--threshold` setting with a value between 0 and 1 to decrease or increase the sensitivity. 64 | The default threshold is 0.9. 65 | 66 | 67 | 68 |   69 | ## Creating Passive Voice Entries 70 | 71 | - The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below 72 | 73 | ```python 74 | { 75 | 'instruction': 'Identify the verb in the following sentence', 76 | 'input': 'The cat sleeps on the couch.', 77 | 'output': 'The verb in the sentence is "sleeps."', 78 | 'output_2': 'The sentence is "sleeps."' # <---- Newly created entry 79 | } 80 | ``` 81 | -------------------------------------------------------------------------------- /ch07/02_dataset-utilities/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "OPENAI_API_KEY": "sk-...", 3 | "_comment": "Enter your API key from https://platform.openai.com/api-keys" 4 | } 5 | -------------------------------------------------------------------------------- /ch07/02_dataset-utilities/find-near-duplicates.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 3 | # Source for "Build a Large Language Model From Scratch" 4 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 5 | # Code: https://github.com/rasbt/LLMs-from-scratch 6 | 7 | import argparse 8 | import json 9 | import re 10 | from sklearn import __version__ as sklearn_version 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | from sklearn.metrics.pairwise import cosine_similarity 13 | 14 | 15 | # Sample JSON dataset 16 | example_data = [ 17 | {"instruction": "What is the capital of Italy?", 18 | "input": "", "output": "The capital of Italy is Rome." 19 | }, 20 | {"instruction": "What's the capital city of Italy?", 21 | "input": "", "output": "The capital city is Rome." 
22 | }, 23 | {"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", 24 | "input": "", "output": "The verb is 'sleeps'." 25 | }, 26 | {"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", 27 | "input": "", "output": "The verb in the sentence is \"sleeps.\"" 28 | }, 29 | # ... 30 | ] 31 | 32 | 33 | def preprocess_text(text): 34 | # Lowercase the text 35 | text = text.lower() 36 | # Remove punctuation 37 | text = re.sub(r'[^\w\s]', '', text) 38 | return text 39 | 40 | 41 | def find_near_duplicates(json_data, threshold=0.75, key="instruction"): 42 | """The higher the threshold, the more similar the texts have to be to match""" 43 | 44 | # Extract instructions 45 | text = [preprocess_text(item[key]) for item in json_data if item[key]] 46 | near_duplicates = [] 47 | indices_to_remove = set() 48 | 49 | if not text: 50 | return {}, near_duplicates 51 | 52 | # Vectorize the text data 53 | vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3)) 54 | tfidf_matrix = vectorizer.fit_transform(text) 55 | 56 | # Compute cosine similarity between each pair of entries 57 | cos_sim_matrix = cosine_similarity(tfidf_matrix) 58 | 59 | # Find pairs of near-duplicate instructions based on the threshold 60 | 61 | for i in range(len(cos_sim_matrix)): 62 | for j in range(i+1, len(cos_sim_matrix)): 63 | if cos_sim_matrix[i, j] > threshold: 64 | if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <= 1: 65 | continue 66 | near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j])) 67 | if key in ("input", "output"): # Don't remove duplicates based on the instruction 68 | indices_to_remove.add(j) # Mark the second entry for removal 69 | 70 | # Remove the near-duplicate entries 71 | filtered_json_data = [item for index, item in enumerate(json_data) if index not in indices_to_remove] 72 | 73 | return filtered_json_data, near_duplicates 74 | 75 | 76 | def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, threshold=0.75): 77 | """ 78 | Searches each key in the first JSON object for duplicates across a list of JSON objects. 79 | Prints the duplicates if found. 80 | """ 81 | for key in json_data[0].keys(): 82 | 83 | if remove_duplicates: 84 | json_data, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold) 85 | else: 86 | _, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold) 87 | separator = 50 * '=' 88 | print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}") 89 | if not near_duplicates: 90 | print("No duplicates found") 91 | else: 92 | for dup in near_duplicates: 93 | print( 94 | f"Duplicate pair found with similarity {dup[2]:.2f}:\n" 95 | f"1. {dup[0][key]}\n2. 
{dup[1][key]}\n" 96 | ) 97 | return json_data 98 | 99 | 100 | if __name__ == "__main__": 101 | print("scikit-learn version:", sklearn_version) 102 | 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument( 105 | "--json_file", 106 | type=str, 107 | help=("Path to the dataset JSON file") 108 | ) 109 | parser.add_argument( 110 | "--threshold", 111 | type=float, 112 | default=0.9, 113 | help=("A sensitivity threshold between 0 and 1 where 1 is strictest") 114 | ) 115 | parser.add_argument( 116 | "--remove_duplicates", 117 | action='store_true', 118 | default=False, 119 | help=( 120 | "Removes duplicates based on the 'input' or 'output' keys " 121 | " (but not the 'instruction') and saves the cleaned JSON file as --json_output_file" 122 | ) 123 | ) 124 | parser.add_argument( 125 | "--json_output_file", 126 | type=str, 127 | help=("Path to the output JSON file for the cleaned dataset") 128 | ) 129 | 130 | args = parser.parse_args() 131 | 132 | if args.remove_duplicates and not args.json_output_file: 133 | raise ValueError( 134 | "Provide an output file via --json_output_file " 135 | "to save the cleaned JSON data." 136 | ) 137 | 138 | if not args.json_file: 139 | json_data = example_data 140 | 141 | else: 142 | with open(args.json_file, "r") as file: 143 | json_data = json.load(file) 144 | 145 | json_data = find_print_and_remove_near_duplicates( 146 | json_data=json_data, 147 | remove_duplicates=args.remove_duplicates, 148 | threshold=args.threshold 149 | ) 150 | 151 | if args.remove_duplicates: 152 | with open(args.json_output_file, "w") as file: 153 | json.dump(json_data, file, indent=4) 154 | -------------------------------------------------------------------------------- /ch07/02_dataset-utilities/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | openai>=1.30.3 2 | scikit-learn>=1.3.1 3 | tqdm>=4.65.0 -------------------------------------------------------------------------------- /ch07/03_model-evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 7: Finetuning to Follow Instructions 2 | 3 | This folder contains utility code that can be used for model evaluation. 4 | 5 | 6 | 7 |   8 | ## Evaluating Instruction Responses Using the OpenAI API 9 | 10 | 11 | - The [llm-instruction-eval-openai.ipynb](llm-instruction-eval-openai.ipynb) notebook uses OpenAI's GPT-4 to evaluate responses generated by instruction finetuned models. It works with a JSON file in the following format: 12 | 13 | ```python 14 | { 15 | "instruction": "What is the atomic number of helium?", 16 | "input": "", 17 | "output": "The atomic number of helium is 2.", # <-- The target given in the test set 18 | "model 1 response": "\nThe atomic number of helium is 2.0.", # <-- Response by an LLM 19 | "model 2 response": "\nThe atomic number of helium is 3." # <-- Response by a 2nd LLM 20 | }, 21 | ``` 22 | 23 |   24 | ## Evaluating Instruction Responses Locally Using Ollama 25 | 26 | - The [llm-instruction-eval-ollama.ipynb](llm-instruction-eval-ollama.ipynb) notebook offers an alternative to the one above, utilizing a locally downloaded Llama 3 model via Ollama; a minimal scoring sketch is shown below.
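For a sense of what such a local evaluation does under the hood, here is a minimal sketch that asks a locally served Llama 3 model to score a single response. It mirrors the `query_model` helper from [`../01_main-chapter-code/ollama_evaluate.py`](../01_main-chapter-code/ollama_evaluate.py) and assumes an Ollama server is running on the default port with the `llama3` model pulled; the notebook's exact prompts and parsing may differ:

```python
import json
import urllib.request


def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
    # Send one chat request to the local Ollama server
    payload = json.dumps({
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "options": {"seed": 123, "temperature": 0},  # for deterministic responses
    }).encode("utf-8")

    request = urllib.request.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    response_data = ""
    with urllib.request.urlopen(request) as response:
        # Ollama streams one JSON object per line; concatenate the content chunks
        while True:
            line = response.readline().decode("utf-8")
            if not line:
                break
            response_data += json.loads(line)["message"]["content"]
    return response_data


score = query_model(
    "Given the input `What is the atomic number of helium?` "
    "and correct output `The atomic number of helium is 2.`, "
    "score the model response `The atomic number of helium is 3.` "
    "on a scale from 0 to 100, where 100 is the best score. "
    "Respond with the integer number only."
)
print(score)
```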
-------------------------------------------------------------------------------- /ch07/03_model-evaluation/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "OPENAI_API_KEY": "sk-...", 3 | "_comment": "Enter your API key from https://platform.openai.com/api-keys" 4 | } 5 | -------------------------------------------------------------------------------- /ch07/03_model-evaluation/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | openai>=1.30.3 2 | tqdm>=4.65.0 3 | -------------------------------------------------------------------------------- /ch07/03_model-evaluation/scores/gpt4-model-1-response.json: -------------------------------------------------------------------------------- 1 | [0, 50, 20, 100, 0, 100, 0, 100, 100, 100, 55, 0, 100, 100, 100, 100, 100, 0, 98, 100, 100, 0, 100, 100, 100, 100, 100, 100, 0, 100, 100, 0, 100, 100, 85, 100, 0, 0, 100, 100, 100, 100, 100, 100, 0, 100, 100, 95, 20, 50, 85, 100, 100, 100, 100, 55, 100, 100, 100, 0, 100, 98, 100, 100, 100, 0, 85, 100, 100, 98, 100, 100, 100, 0, 100, 100, 100, 100, 0, 100, 0, 100, 100, 0, 0, 100, 50, 100, 100, 10, 100, 100, 100, 100, 0, 100, 100, 25, 100, 30] -------------------------------------------------------------------------------- /ch07/03_model-evaluation/scores/gpt4-model-2-response.json: -------------------------------------------------------------------------------- 1 | [0, 100, 0, 100, 0, 100, 0, 100, 0, 0, 50, 0, 100, 100, 100, 100, 100, 100, 100, 95, 0, 50, 100, 100, 0, 0, 100, 0, 0, 100, 0, 0, 100, 0, 67, 0, 0, 0, 100, 100, 95, 100, 100, 100, 0, 0, 0, 0, 100, 100, 100, 0, 55, 100, 0, 100, 65, 100, 100, 0, 100, 100, 100, 0, 100, 0, 85, 100, 100, 85, 0, 75, 100, 0, 0, 100, 100, 100, 0, 100, 0, 50, 100, 100, 0, 100, 0, 0, 100, 85, 100, 0, 100, 100, 0, 100, 100, 0, 0, 0] -------------------------------------------------------------------------------- /ch07/03_model-evaluation/scores/llama3-8b-model-1-response.json: -------------------------------------------------------------------------------- 1 | [20, 92, 85, 90, 20, 90, 22, 97, 60, 96, 20, 20, 98, 95, 90, 98, 95, 20, 98, 98, 92, 20, 96, 96, 100, 98, 98, 95, 20, 95, 98, 20, 85, 95, 80, 97, 40, 21, 100, 85, 95, 98, 92, 98, 69, 98, 80, 60, 60, 20, 80, 68, 80, 96, 96, 68, 80, 95, 80, 20, 95, 98, 80, 98, 94, 20, 40, 98, 100, 85, 98, 90, 95, 85, 95, 80, 98, 98, 25, 98, 40, 92, 95, 82, 87, 98, 80, 90, 95, 4, 90, 90, 80, 98, 20, 98, 98, 40, 92, 98] -------------------------------------------------------------------------------- /ch07/03_model-evaluation/scores/llama3-8b-model-2-response.json: -------------------------------------------------------------------------------- 1 | [76, 85, 67, 90, 20, 98, 22, 96, 40, 80, 40, 20, 90, 98, 80, 92, 98, 98, 95, 99, 55, 99, 80, 90, 20, 4, 98, 4, 40, 95, 14, 44, 95, 44, 80, 4, 4, 40, 95, 80, 98, 95, 92, 98, 68, 20, 20, 60, 95, 90, 98, 0, 20, 80, 20, 80, 92, 98, 98, 20, 95, 100, 95, 85, 98, 4, 40, 98, 98, 65, 20, 76, 100, 67, 44, 92, 75, 97, 27, 98, 20, 60, 90, 96, 67, 98, 80, 10, 80, 98, 100, 40, 92, 98, 20, 98, 98, 20, 20] -------------------------------------------------------------------------------- /ch07/04_preference-tuning-with-dpo/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 7: Finetuning to Follow Instructions 2 | 3 | - [create-preference-data-ollama.ipynb](create-preference-data-ollama.ipynb): A notebook that creates a synthetic dataset for preference 
finetuning using Llama 3.1 and Ollama 4 | 5 | - [dpo-from-scratch.ipynb](dpo-from-scratch.ipynb): This notebook implements Direct Preference Optimization (DPO) for LLM alignment 6 | 7 | 8 | -------------------------------------------------------------------------------- /ch07/05_dataset-generation/README.md: -------------------------------------------------------------------------------- 1 | # Generating Datasets for Instruction Finetuning 2 | 3 | This folder contains utility code that can be used for generating a dataset for instruction finetuning. 4 | 5 | - [llama3-ollama.ipynb](llama3-ollama.ipynb): A notebook that creates a synthetic instruction finetuning dataset using Llama 3 and Ollama 6 | 7 | - [reflection-gpt4.ipynb](reflection-gpt4.ipynb): A notebook that implements an instruction dataset refinement step based on reflection-tuning 8 | -------------------------------------------------------------------------------- /ch07/05_dataset-generation/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "OPENAI_API_KEY": "sk-...", 3 | "_comment": "Enter your API key from https://platform.openai.com/api-keys" 4 | } 5 | -------------------------------------------------------------------------------- /ch07/05_dataset-generation/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | openai>=1.30.3 2 | tqdm>=4.65.0 3 | -------------------------------------------------------------------------------- /ch07/06_user_interface/README.md: -------------------------------------------------------------------------------- 1 | # Building a User Interface to Interact With the Instruction Finetuned GPT Model 2 | 3 | 4 | 5 | This bonus folder contains code for running a ChatGPT-like user interface to interact with the instruction finetuned GPT from chapter 7, as shown below. 6 | 7 | 8 | 9 | ![Chainlit UI example](https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/chainlit/chainlit-sft.webp?2) 10 | 11 | 12 | 13 | To implement this user interface, we use the open-source [Chainlit Python package](https://github.com/Chainlit/chainlit). 14 | 15 |   16 | ## Step 1: Install dependencies 17 | 18 | First, we install the `chainlit` package via 19 | 20 | ```bash 21 | pip install chainlit 22 | ``` 23 | 24 | (Alternatively, execute `pip install -r requirements-extra.txt`.) 25 | 26 |   27 | ## Step 2: Run `app` code 28 | 29 | The [`app.py`](app.py) file contains the UI code. Open and inspect this file to learn more. 30 | 31 | This file loads and uses the GPT-2 weights we generated in chapter 7. This requires that you execute the [`../01_main-chapter-code/ch07.ipynb`](../01_main-chapter-code/ch07.ipynb) file first. 32 | 33 | Execute the following command from the terminal to start the UI server: 34 | 35 | ```bash 36 | chainlit run app.py 37 | ``` 38 | 39 | Running the command above should open a new browser tab where you can interact with the model. If the browser tab does not open automatically, inspect the terminal output and copy the local address into your browser address bar (usually, the address is `http://localhost:8000`). -------------------------------------------------------------------------------- /ch07/06_user_interface/app.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from pathlib import Path 7 | import sys 8 | 9 | import tiktoken 10 | import torch 11 | import chainlit 12 | 13 | 14 | # For llms_from_scratch installation instructions, see: 15 | # https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg 16 | from llms_from_scratch.ch04 import GPTModel 17 | from llms_from_scratch.ch05 import ( 18 | generate, 19 | text_to_token_ids, 20 | token_ids_to_text, 21 | ) 22 | 23 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 24 | 25 | 26 | def get_model_and_tokenizer(): 27 | """ 28 | Code to load a GPT-2 model with finetuned weights generated in chapter 7. 29 | This requires that you run the code in chapter 7 first, which generates the necessary gpt2-medium355M-sft.pth file. 30 | """ 31 | 32 | GPT_CONFIG_355M = { 33 | "vocab_size": 50257, # Vocabulary size 34 | "context_length": 1024, # Context length 35 | "emb_dim": 1024, # Embedding dimension 36 | "n_heads": 16, # Number of attention heads 37 | "n_layers": 24, # Number of layers 38 | "drop_rate": 0.0, # Dropout rate 39 | "qkv_bias": True # Query-key-value bias 40 | } 41 | 42 | tokenizer = tiktoken.get_encoding("gpt2") 43 | 44 | model_path = Path("..") / "01_main-chapter-code" / "gpt2-medium355M-sft.pth" 45 | if not model_path.exists(): 46 | print( 47 | f"Could not find the {model_path} file. Please run the chapter 7 code" 48 | " (ch07.ipynb) to generate the gpt2-medium355M-sft.pth file." 49 | ) 50 | sys.exit() 51 | 52 | checkpoint = torch.load(model_path, weights_only=True) 53 | model = GPTModel(GPT_CONFIG_355M) 54 | model.load_state_dict(checkpoint) 55 | model.to(device) 56 | 57 | return tokenizer, model, GPT_CONFIG_355M 58 | 59 | 60 | def extract_response(response_text, input_text): 61 | return response_text[len(input_text):].replace("### Response:", "").strip() 62 | 63 | 64 | # Obtain the necessary tokenizer and model files for the chainlit function below 65 | tokenizer, model, model_config = get_model_and_tokenizer() 66 | 67 | 68 | @chainlit.on_message 69 | async def main(message: chainlit.Message): 70 | """ 71 | The main Chainlit function. 72 | """ 73 | 74 | torch.manual_seed(123) 75 | 76 | prompt = f"""Below is an instruction that describes a task. Write a response 77 | that appropriately completes the request.
78 | 79 | ### Instruction: 80 | {message.content} 81 | """ 82 | 83 | token_ids = generate( # function uses `with torch.no_grad()` internally already 84 | model=model, 85 | idx=text_to_token_ids(prompt, tokenizer).to(device), # The user text is provided as `message.content` 86 | max_new_tokens=35, 87 | context_size=model_config["context_length"], 88 | eos_id=50256 89 | ) 90 | 91 | text = token_ids_to_text(token_ids, tokenizer) 92 | response = extract_response(text, prompt) 93 | 94 | await chainlit.Message( 95 | content=f"{response}", # This returns the model response to the interface 96 | ).send() 97 | -------------------------------------------------------------------------------- /ch07/06_user_interface/requirements-extra.txt: -------------------------------------------------------------------------------- 1 | chainlit>=1.2.0 -------------------------------------------------------------------------------- /ch07/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 7: Finetuning to Follow Instructions 2 | 3 |   4 | ## Main Chapter Code 5 | 6 | - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code and exercise solutions 7 | 8 |   9 | ## Bonus Materials 10 | 11 | - [02_dataset-utilities](02_dataset-utilities) contains utility code that can be used for preparing an instruction dataset 12 | - [03_model-evaluation](03_model-evaluation) contains utility code for evaluating instruction responses using a local Llama 3 model and the GPT-4 API 13 | - [04_preference-tuning-with-dpo](04_preference-tuning-with-dpo) implements code for preference finetuning with Direct Preference Optimization (DPO); a compact sketch of the DPO loss is shown below 14 | - [05_dataset-generation](05_dataset-generation) contains code to generate and improve synthetic datasets for instruction finetuning 15 | - [06_user_interface](06_user_interface) implements an interactive user interface to interact with the instruction finetuned LLM 16 | 17 | 18 | 19 | 20 | 21 |
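For readers who want a quick preview of what the DPO bonus material implements, here is a compact sketch of the DPO loss in PyTorch. This is an illustration rather than the notebook's exact code; the tensor names and the `beta=0.1` default are assumptions for the example (the inputs are the summed log-probabilities of the chosen and rejected responses under the trainable policy and a frozen reference model):

```python
import torch
import torch.nn.functional as F


def dpo_loss(policy_chosen_logprobs, policy_rejected_logprobs,
             reference_chosen_logprobs, reference_rejected_logprobs,
             beta=0.1):
    # Log-ratios between the trainable policy and the frozen reference model
    chosen_logratios = policy_chosen_logprobs - reference_chosen_logprobs
    rejected_logratios = policy_rejected_logprobs - reference_rejected_logprobs

    # Encourage a larger margin between preferred ("chosen") and dispreferred
    # ("rejected") responses; beta scales the implicit KL penalty
    logits = beta * (chosen_logratios - rejected_logratios)
    return -F.logsigmoid(logits).mean()
```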
22 |
23 | 24 | [![Link to the video](https://img.youtube.com/vi/4yNswvhPWCQ/0.jpg)](https://www.youtube.com/watch?v=4yNswvhPWCQ) -------------------------------------------------------------------------------- /pixi.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | authors = ["Matthew Feickert "] 3 | channels = ["conda-forge"] 4 | name = "LLMs-from-scratch" 5 | platforms = ["linux-64", "osx-arm64", "win-64"] 6 | version = "0.1.0" 7 | 8 | [tasks] 9 | 10 | [dependencies] 11 | python = "3.10.*" 12 | pytorch-cpu = ">=2.6.0,<3" 13 | jupyterlab = ">=4.3.5,<5" 14 | tiktoken = ">=0.9.0,<0.10" 15 | matplotlib = ">=3.10.0,<4" 16 | tqdm = ">=4.67.1,<5" 17 | pandas = ">=2.2.3,<3" 18 | numpy = ">=1.26,<2.1" 19 | psutil = ">=5.9.5,<7" 20 | 21 | [pypi-dependencies] 22 | # The TensorFlow team unfortunately does not maintain the conda-forge 23 | # feedstock and it is currently broken for TensorFlow v2.18.0 24 | tensorflow = ">=2.18.0, <3" 25 | llms-from-scratch = { path = ".", editable = true } 26 | 27 | [target.win-64.pypi-dependencies] 28 | tensorflow-cpu = ">=2.18.0, <3" 29 | 30 | [feature.tests.dependencies] 31 | blobfile = ">=3.0.0,<4" 32 | huggingface_hub = ">=0.24.7,<0.29" 33 | ipywidgets = ">=8.1.2,<9" 34 | safetensors = ">=0.4.4,<0.6" 35 | sentencepiece = ">=0.1.99,<0.3" 36 | transformers = ">=4.33.2,<5" 37 | pytest = ">=8.3.4,<9" 38 | nbval = ">=0.11.0,<0.12" 39 | 40 | [feature.tests.pypi-dependencies] 41 | pytest-ruff = ">=0.4.1, <0.5" 42 | 43 | [feature.tests.target.win-64.pypi-dependencies] 44 | tensorflow-io-gcs-filesystem = "==0.31.0" 45 | 46 | [environments] 47 | tests = ["tests"] 48 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/README.md: -------------------------------------------------------------------------------- 1 | # `llms-from-scratch` PyPI Package 2 | 3 | This optional PyPI package lets you conveniently import code from various chapters of the *Build a Large Language Model From Scratch* book. 4 | 5 |   6 | ## Installation 7 | 8 |   9 | ### From PyPI 10 | 11 | Install the `llms-from-scratch` package from the official [Python Package Index](https://pypi.org/project/llms-from-scratch/) (PyPI): 12 | 13 | ```bash 14 | pip install llms-from-scratch 15 | ``` 16 | 17 | > **Note:** If you're using [`uv`](https://github.com/astral-sh/uv), replace `pip` with `uv pip` or use `uv add`: 18 | 19 | ```bash 20 | uv add llms-from-scratch 21 | ``` 22 | 23 | 24 | 25 |   26 | ### Editable Install from GitHub 27 | 28 | If you'd like to modify the code and have those changes reflected during development: 29 | 30 | ```bash 31 | git clone https://github.com/rasbt/LLMs-from-scratch.git 32 | cd LLMs-from-scratch 33 | pip install -e . 34 | ``` 35 | 36 | > **Note:** With `uv`, use: 37 | 38 | ```bash 39 | uv add --editable . 
--dev 40 | ``` 41 | 42 | 43 | 44 |   45 | ## Using the Package 46 | 47 | Once installed, you can import code from any chapter using: 48 | 49 | ```python 50 | from llms_from_scratch.ch02 import GPTDatasetV1, create_dataloader_v1 51 | 52 | from llms_from_scratch.ch03 import ( 53 | SelfAttention_v1, 54 | SelfAttention_v2, 55 | CausalAttention, 56 | MultiHeadAttentionWrapper, 57 | MultiHeadAttention, 58 | PyTorchMultiHeadAttention # Bonus: Faster variant using PyTorch's scaled_dot_product_attention 59 | ) 60 | 61 | from llms_from_scratch.ch04 import ( 62 | LayerNorm, 63 | GELU, 64 | FeedForward, 65 | TransformerBlock, 66 | GPTModel, 67 | GPTModelFast, # Bonus: Faster variant using PyTorch's scaled_dot_product_attention 68 | generate_text_simple 69 | ) 70 | 71 | from llms_from_scratch.ch05 import ( 72 | generate, 73 | train_model_simple, 74 | evaluate_model, 75 | generate_and_print_sample, 76 | assign, 77 | load_weights_into_gpt, 78 | text_to_token_ids, 79 | token_ids_to_text, 80 | calc_loss_batch, 81 | calc_loss_loader, 82 | plot_losses, 83 | download_and_load_gpt2 84 | ) 85 | 86 | from llms_from_scratch.ch06 import ( 87 | download_and_unzip_spam_data, 88 | create_balanced_dataset, 89 | random_split, 90 | SpamDataset, 91 | calc_accuracy_loader, 92 | evaluate_model, 93 | train_classifier_simple, 94 | plot_values, 95 | classify_review 96 | ) 97 | 98 | from llms_from_scratch.ch07 import ( 99 | download_and_load_file, 100 | format_input, 101 | InstructionDataset, 102 | custom_collate_fn, 103 | check_if_running, 104 | query_model, 105 | generate_model_scores 106 | ) 107 | 108 | 109 | from llms_from_scratch.appendix_a import NeuralNetwork, ToyDataset 110 | 111 | from llms_from_scratch.appendix_d import find_highest_gradient, train_model 112 | 113 | from llms_from_scratch.llama3 import ( 114 | Llama3Model, 115 | Llama3Tokenizer, 116 | ChatFormat, 117 | clean_text 118 | ) 119 | ``` 120 | 121 | (For the `llms_from_scratch.llama3` usage information, please see [this bonus section](../../ch05/07_gpt_to_llama/README.md).) 122 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch -------------------------------------------------------------------------------- /pkg/llms_from_scratch/appendix_a.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | import torch 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class NeuralNetwork(torch.nn.Module): 11 | def __init__(self, num_inputs, num_outputs): 12 | super().__init__() 13 | 14 | self.layers = torch.nn.Sequential( 15 | 16 | # 1st hidden layer 17 | torch.nn.Linear(num_inputs, 30), 18 | torch.nn.ReLU(), 19 | 20 | # 2nd hidden layer 21 | torch.nn.Linear(30, 20), 22 | torch.nn.ReLU(), 23 | 24 | # output layer 25 | torch.nn.Linear(20, num_outputs), 26 | ) 27 | 28 | def forward(self, x): 29 | logits = self.layers(x) 30 | return logits 31 | 32 | 33 | class ToyDataset(Dataset): 34 | def __init__(self, X, y): 35 | self.features = X 36 | self.labels = y 37 | 38 | def __getitem__(self, index): 39 | one_x = self.features[index] 40 | one_y = self.labels[index] 41 | return one_x, one_y 42 | 43 | def __len__(self): 44 | return self.labels.shape[0] 45 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/appendix_d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from .ch05 import calc_loss_batch, evaluate_model, generate_and_print_sample 7 | 8 | import math 9 | import torch 10 | 11 | 12 | def find_highest_gradient(model): 13 | max_grad = None 14 | for param in model.parameters(): 15 | if param.grad is not None: 16 | grad_values = param.grad.data.flatten() 17 | max_grad_param = grad_values.max() 18 | if max_grad is None or max_grad_param > max_grad: 19 | max_grad = max_grad_param 20 | return max_grad 21 | 22 | 23 | def train_model(model, train_loader, val_loader, optimizer, device, 24 | n_epochs, eval_freq, eval_iter, start_context, tokenizer, 25 | warmup_steps, initial_lr=3e-05, min_lr=1e-6, orig_book_version=False): 26 | 27 | train_losses, val_losses, track_tokens_seen, track_lrs = [], [], [], [] 28 | tokens_seen, global_step = 0, -1 29 | 30 | # Retrieve the maximum learning rate from the optimizer 31 | peak_lr = optimizer.param_groups[0]["lr"] 32 | 33 | # Calculate the total number of iterations in the training process 34 | total_training_steps = len(train_loader) * n_epochs 35 | 36 | # Calculate the learning rate increment during the warmup phase 37 | lr_increment = (peak_lr - initial_lr) / warmup_steps 38 | 39 | for epoch in range(n_epochs): 40 | model.train() 41 | for input_batch, target_batch in train_loader: 42 | optimizer.zero_grad() 43 | global_step += 1 44 | 45 | # Adjust the learning rate based on the current phase (warmup or cosine annealing) 46 | if global_step < warmup_steps: 47 | # Linear warmup 48 | lr = initial_lr + global_step * lr_increment 49 | else: 50 | # Cosine annealing after warmup 51 | progress = ((global_step - warmup_steps) / 52 | (total_training_steps - warmup_steps)) 53 | lr = min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress)) 54 | 55 | # Apply the calculated learning rate to the optimizer 56 | for param_group in optimizer.param_groups: 57 | param_group["lr"] = lr 58 | track_lrs.append(lr) # Store the current learning rate 59 | 60 | # Calculate and backpropagate the loss 61 | 
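# (calc_loss_batch, imported from .ch05 above, moves the batch to the target device and returns the cross-entropy loss of the model's logits against the target tokens)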
loss = calc_loss_batch(input_batch, target_batch, model, device) 62 | loss.backward() 63 | 64 | # Apply gradient clipping after the warmup phase to avoid exploding gradients 65 | if orig_book_version: 66 | if global_step > warmup_steps: 67 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) 68 | else: 69 | if global_step >= warmup_steps: # the book originally used global_step > warmup_steps, which led to a skipped clipping step after warmup 70 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) 71 | 72 | optimizer.step() 73 | tokens_seen += input_batch.numel() 74 | 75 | # Periodically evaluate the model on the training and validation sets 76 | if global_step % eval_freq == 0: 77 | train_loss, val_loss = evaluate_model( 78 | model, train_loader, val_loader, 79 | device, eval_iter 80 | ) 81 | train_losses.append(train_loss) 82 | val_losses.append(val_loss) 83 | track_tokens_seen.append(tokens_seen) 84 | # Print the current losses 85 | print(f"Ep {epoch+1} (Iter {global_step:06d}): " 86 | f"Train loss {train_loss:.3f}, " 87 | f"Val loss {val_loss:.3f}") 88 | 89 | # Generate and print a sample from the model to monitor progress 90 | generate_and_print_sample( 91 | model, tokenizer, device, start_context 92 | ) 93 | 94 | return train_losses, val_losses, track_tokens_seen, track_lrs 95 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/appendix_e.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | import torch 7 | import math 8 | 9 | 10 | class LoRALayer(torch.nn.Module): 11 | def __init__(self, in_dim, out_dim, rank, alpha): 12 | super().__init__() 13 | self.A = torch.nn.Parameter(torch.empty(in_dim, rank)) 14 | torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5)) # similar to standard weight initialization 15 | self.B = torch.nn.Parameter(torch.zeros(rank, out_dim)) 16 | self.alpha = alpha 17 | 18 | def forward(self, x): 19 | x = self.alpha * (x @ self.A @ self.B) 20 | return x 21 | 22 | 23 | class LinearWithLoRA(torch.nn.Module): 24 | def __init__(self, linear, rank, alpha): 25 | super().__init__() 26 | self.linear = linear 27 | self.lora = LoRALayer( 28 | linear.in_features, linear.out_features, rank, alpha 29 | ) 30 | 31 | def forward(self, x): 32 | return self.linear(x) + self.lora(x) 33 | 34 | 35 | def replace_linear_with_lora(model, rank, alpha): 36 | for name, module in model.named_children(): 37 | if isinstance(module, torch.nn.Linear): 38 | # Replace the Linear layer with LinearWithLoRA 39 | setattr(model, name, LinearWithLoRA(module, rank, alpha)) 40 | else: 41 | # Recursively apply the same function to child modules 42 | replace_linear_with_lora(module, rank, alpha) 43 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/ch02.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | import torch 7 | from torch.utils.data import Dataset, DataLoader 8 | import tiktoken 9 | 10 | 11 | class GPTDatasetV1(Dataset): 12 | def __init__(self, txt, tokenizer, max_length, stride): 13 | self.tokenizer = tokenizer 14 | self.input_ids = [] 15 | self.target_ids = [] 16 | 17 | # Tokenize the entire text 18 | token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"}) 19 | 20 | # Use a sliding window to chunk the book into overlapping sequences of max_length 21 | for i in range(0, len(token_ids) - max_length, stride): 22 | input_chunk = token_ids[i:i + max_length] 23 | target_chunk = token_ids[i + 1: i + max_length + 1] 24 | self.input_ids.append(torch.tensor(input_chunk)) 25 | self.target_ids.append(torch.tensor(target_chunk)) 26 | 27 | def __len__(self): 28 | return len(self.input_ids) 29 | 30 | def __getitem__(self, idx): 31 | return self.input_ids[idx], self.target_ids[idx] 32 | 33 | 34 | def create_dataloader_v1(txt, batch_size=4, max_length=256, 35 | stride=128, shuffle=True, drop_last=True, num_workers=0): 36 | # Initialize the tokenizer 37 | tokenizer = tiktoken.get_encoding("gpt2") 38 | 39 | # Create dataset 40 | dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) 41 | 42 | # Create dataloader 43 | dataloader = DataLoader( 44 | dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers) 45 | 46 | return dataloader 47 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_appendix_a.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from llms_from_scratch.appendix_a import NeuralNetwork, ToyDataset 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.utils.data import DataLoader 11 | 12 | 13 | def test_dataset(): 14 | 15 | X_train = torch.tensor([ 16 | [-1.2, 3.1], 17 | [-0.9, 2.9], 18 | [-0.5, 2.6], 19 | [2.3, -1.1], 20 | [2.7, -1.5] 21 | ]) 22 | 23 | y_train = torch.tensor([0, 0, 0, 1, 1]) 24 | train_ds = ToyDataset(X_train, y_train) 25 | 26 | assert len(train_ds) == 5 27 | torch.manual_seed(123) 28 | 29 | train_loader = DataLoader( 30 | dataset=train_ds, 31 | batch_size=2, 32 | shuffle=True, 33 | num_workers=0 34 | ) 35 | 36 | torch.manual_seed(123) 37 | model = NeuralNetwork(num_inputs=2, num_outputs=2) 38 | optimizer = torch.optim.SGD(model.parameters(), lr=0.5) 39 | 40 | num_epochs = 3 41 | 42 | for epoch in range(num_epochs): 43 | 44 | model.train() 45 | for batch_idx, (features, labels) in enumerate(train_loader): 46 | 47 | logits = model(features) 48 | 49 | loss = F.cross_entropy(logits, labels) 50 | 51 | optimizer.zero_grad() 52 | loss.backward() 53 | optimizer.step() 54 | 55 | print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}" 56 | f" | Batch {batch_idx:03d}/{len(train_loader):03d}" 57 | f" | Train/Val Loss: {loss:.2f}") 58 | 59 | model.eval() 60 | with torch.no_grad(): 61 | outputs = model(X_train) 62 | 63 | expected = torch.tensor([ 64 | [2.8569, -4.1618], 65 | [2.5382, -3.7548], 66 | [2.0944, -3.1820], 67 | [-1.4814, 1.4816], 68 | [-1.7176, 1.7342] 69 | ]) 70 | assert torch.allclose(outputs, expected, atol=1e-4) # the reference values above are rounded to four decimals 71 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_appendix_d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from llms_from_scratch.ch02 import create_dataloader_v1 7 | from llms_from_scratch.ch04 import GPTModel 8 | from llms_from_scratch.appendix_d import train_model 9 | 10 | import os 11 | import urllib.request 12 | 13 | import tiktoken 14 | import torch 15 | from torch.utils.data import Subset, DataLoader 16 | 17 | 18 | def test_train(tmp_path): 19 | 20 | GPT_CONFIG_124M = { 21 | "vocab_size": 50257, # Vocabulary size 22 | "context_length": 256, # Shortened context length (orig: 1024) 23 | "emb_dim": 768, # Embedding dimension 24 | "n_heads": 12, # Number of attention heads 25 | "n_layers": 12, # Number of layers 26 | "drop_rate": 0.1, # Dropout rate 27 | "qkv_bias": False # Query-key-value bias 28 | } 29 | 30 | OTHER_SETTINGS = { 31 | "learning_rate": 5e-4, 32 | "num_epochs": 2, 33 | "batch_size": 1, 34 | "weight_decay": 0.1 35 | } 36 | 37 | torch.manual_seed(123) 38 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 39 | 40 | ############################## 41 | # Download data if necessary 42 | ############################## 43 | 44 | file_path = tmp_path / "the-verdict.txt" 45 | url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" 46 | 47 | if not os.path.exists(file_path): 48 | with urllib.request.urlopen(url) as response: 49 | text_data = response.read().decode("utf-8") 50 | with open(file_path, "w", encoding="utf-8") as file: 51 | file.write(text_data) 52 | else: 53 | with open(file_path, "r", encoding="utf-8") as file: 54 | text_data = file.read() 55 | 56 | ############################## 57 | # Initialize model 58 | ############################## 59 | 60 | model = GPTModel(GPT_CONFIG_124M) 61 | model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes 62 | 63 | ############################## 64 | # Set up dataloaders 65 | ############################## 66 | 67 | # Train/validation ratio 68 | train_ratio = 0.90 69 | split_idx = int(train_ratio * len(text_data)) 70 | 71 | train_loader = create_dataloader_v1( 72 | text_data[:split_idx], 73 | batch_size=OTHER_SETTINGS["batch_size"], 74 | max_length=GPT_CONFIG_124M["context_length"], 75 | stride=GPT_CONFIG_124M["context_length"], 76 | drop_last=True, 77 | shuffle=True, 78 | num_workers=0 79 | ) 80 | 81 | val_loader = create_dataloader_v1( 82 | text_data[split_idx:], 83 | batch_size=OTHER_SETTINGS["batch_size"], 84 | max_length=GPT_CONFIG_124M["context_length"], 85 | stride=GPT_CONFIG_124M["context_length"], 86 | drop_last=False, 87 | shuffle=False, 88 | num_workers=0 89 | ) 90 | 91 | ############################## 92 | # Train model 93 | ############################## 94 | 95 | tokenizer = tiktoken.get_encoding("gpt2") 96 | 97 | train_subset = Subset(train_loader.dataset, range(1)) 98 | one_batch_train_loader = DataLoader(train_subset, batch_size=1) 99 | val_subset = Subset(val_loader.dataset, range(1)) 100 | one_batch_val_loader = DataLoader(val_subset, batch_size=1) 101 | 102 | peak_lr = 0.001 # this was originally set to 5e-4 in the book by mistake 103 | optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr, weight_decay=0.1) # the book accidentally omitted the lr assignment 104 | tokenizer = tiktoken.get_encoding("gpt2") 105 | 106 | n_epochs = 6 107 | warmup_steps = 1 108 | 109 | train_losses, val_losses, tokens_seen, lrs = 
train_model( 110 | model, one_batch_train_loader, one_batch_val_loader, optimizer, device, n_epochs=n_epochs, 111 | eval_freq=5, eval_iter=1, start_context="Every effort moves you", 112 | tokenizer=tokenizer, warmup_steps=warmup_steps, 113 | initial_lr=1e-5, min_lr=1e-5 114 | ) 115 | 116 | assert round(train_losses[0], 1) == 10.9 117 | assert round(val_losses[0], 1) == 11.0 118 | assert train_losses[-1] < train_losses[0] 119 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_appendix_e.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | 7 | from llms_from_scratch.ch04 import GPTModel 8 | from llms_from_scratch.ch06 import ( 9 | download_and_unzip_spam_data, create_balanced_dataset, 10 | random_split, SpamDataset, train_classifier_simple 11 | ) 12 | from llms_from_scratch.appendix_e import replace_linear_with_lora 13 | 14 | from pathlib import Path 15 | import urllib.error 16 | 17 | import pandas as pd 18 | import tiktoken 19 | import torch 20 | from torch.utils.data import DataLoader, Subset 21 | 22 | 23 | def test_train_classifier_lora(tmp_path): 24 | 25 | ######################################## 26 | # Download and prepare dataset 27 | ######################################## 28 | 29 | url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip" 30 | zip_path = tmp_path / "sms_spam_collection.zip" 31 | extracted_path = tmp_path / "sms_spam_collection" 32 | data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv" 33 | 34 | try: 35 | download_and_unzip_spam_data( 36 | url, zip_path, extracted_path, data_file_path 37 | ) 38 | except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: 39 | print(f"Primary URL failed: {e}. 
Trying backup URL...") 40 | backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" 41 | download_and_unzip_spam_data( 42 | backup_url, zip_path, extracted_path, data_file_path 43 | ) 44 | 45 | df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"]) 46 | balanced_df = create_balanced_dataset(df) 47 | balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1}) 48 | 49 | train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1) 50 | train_df.to_csv(tmp_path / "train.csv", index=None) 51 | validation_df.to_csv(tmp_path / "validation.csv", index=None) 52 | test_df.to_csv(tmp_path / "test.csv", index=None) 53 | 54 | ######################################## 55 | # Create data loaders 56 | ######################################## 57 | tokenizer = tiktoken.get_encoding("gpt2") 58 | 59 | train_dataset = SpamDataset( 60 | csv_file=tmp_path / "train.csv", 61 | max_length=None, 62 | tokenizer=tokenizer 63 | ) 64 | 65 | val_dataset = SpamDataset( 66 | csv_file=tmp_path / "validation.csv", 67 | max_length=train_dataset.max_length, 68 | tokenizer=tokenizer 69 | ) 70 | 71 | num_workers = 0 72 | batch_size = 8 73 | 74 | torch.manual_seed(123) 75 | 76 | train_loader = DataLoader( 77 | dataset=train_dataset, 78 | batch_size=batch_size, 79 | shuffle=True, 80 | num_workers=num_workers, 81 | drop_last=True, 82 | ) 83 | 84 | val_loader = DataLoader( 85 | dataset=val_dataset, 86 | batch_size=batch_size, 87 | num_workers=num_workers, 88 | drop_last=False, 89 | ) 90 | 91 | ######################################## 92 | # Load pretrained model 93 | ######################################## 94 | 95 | # Small GPT model for testing purposes 96 | BASE_CONFIG = { 97 | "vocab_size": 50257, 98 | "context_length": 120, 99 | "drop_rate": 0.0, 100 | "qkv_bias": False, 101 | "emb_dim": 12, 102 | "n_layers": 1, 103 | "n_heads": 2 104 | } 105 | model = GPTModel(BASE_CONFIG) 106 | model.eval() 107 | device = "cpu" 108 | 109 | ######################################## 110 | # Modify the pretrained model 111 | ######################################## 112 | 113 | for param in model.parameters(): 114 | param.requires_grad = False 115 | 116 | torch.manual_seed(123) 117 | 118 | num_classes = 2 119 | model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes) 120 | replace_linear_with_lora(model, rank=16, alpha=16) 121 | model.to(device) 122 | 123 | for param in model.trf_blocks[-1].parameters(): 124 | param.requires_grad = True 125 | 126 | for param in model.final_norm.parameters(): 127 | param.requires_grad = True 128 | 129 | ######################################## 130 | # Finetune modified model 131 | ######################################## 132 | 133 | torch.manual_seed(123) 134 | 135 | optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1) 136 | 137 | train_subset = Subset(train_loader.dataset, range(5)) 138 | batch_train_loader = DataLoader(train_subset, batch_size=5) 139 | val_subset = Subset(val_loader.dataset, range(5)) 140 | batch_val_loader = DataLoader(val_subset, batch_size=5) 141 | 142 | num_epochs = 6 143 | train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple( 144 | model, batch_train_loader, batch_val_loader, optimizer, device, 145 | num_epochs=num_epochs, eval_freq=1, eval_iter=1, 146 | ) 147 | 148 | assert round(train_losses[0], 1) == 0.8 149 | assert round(val_losses[0], 1) == 0.8 150 | assert train_losses[-1] < train_losses[0] 151 | 
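# Note: as with the other files in this folder, this test is meant to be run with pytest, e.g., `pytest test_appendix_e.py` (the exact invocation may vary with your environment)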
-------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_ch02.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from llms_from_scratch.ch02 import create_dataloader_v1 7 | 8 | import os 9 | import urllib.request 10 | 11 | import pytest 12 | import torch 13 | 14 | 15 | @pytest.mark.parametrize("file_name", ["the-verdict.txt"]) 16 | def test_dataloader(tmp_path, file_name): 17 | 18 | if not os.path.exists(file_name): 19 | url = ("https://raw.githubusercontent.com/rasbt/" 20 | "LLMs-from-scratch/main/ch02/01_main-chapter-code/" 21 | "the-verdict.txt") 22 | file_path = file_name 23 | urllib.request.urlretrieve(url, file_path) 24 | 25 | with open(file_name, "r", encoding="utf-8") as f: 26 | raw_text = f.read() 27 | 28 | vocab_size = 50257 29 | output_dim = 256 30 | context_length = 1024 31 | 32 | token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim) 33 | pos_embedding_layer = torch.nn.Embedding(context_length, output_dim) 34 | 35 | batch_size = 8 36 | max_length = 4 37 | dataloader = create_dataloader_v1( 38 | raw_text, 39 | batch_size=batch_size, 40 | max_length=max_length, 41 | stride=max_length 42 | ) 43 | 44 | for batch in dataloader: 45 | x, y = batch 46 | 47 | token_embeddings = token_embedding_layer(x) 48 | pos_embeddings = pos_embedding_layer(torch.arange(max_length)) 49 | 50 | input_embeddings = token_embeddings + pos_embeddings 51 | 52 | break 53 | 54 | assert input_embeddings.shape == torch.Size([8, 4, 256]) 55 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_ch03.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | 7 | from llms_from_scratch.ch03 import MultiHeadAttention, PyTorchMultiHeadAttention 8 | import torch 9 | 10 | 11 | def test_mha(): 12 | 13 | context_length = 100 14 | d_in = 256 15 | d_out = 16 16 | 17 | mha = MultiHeadAttention(d_in, d_out, context_length, dropout=0.0, num_heads=2) 18 | 19 | batch = torch.rand(8, 6, d_in) 20 | context_vecs = mha(batch) 21 | 22 | assert context_vecs.shape == torch.Size([8, 6, d_out]) 23 | 24 | # Test bonus class 25 | mha = PyTorchMultiHeadAttention(d_in, d_out, num_heads=2) 26 | 27 | batch = torch.rand(8, 6, d_in) 28 | context_vecs = mha(batch) 29 | 30 | assert context_vecs.shape == torch.Size([8, 6, d_out]) 31 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_ch04.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from llms_from_scratch.ch04 import GPTModel, GPTModelFast 7 | from llms_from_scratch.ch04 import generate_text_simple 8 | 9 | import pytest 10 | import torch 11 | import tiktoken 12 | 13 | 14 | GPT_CONFIG_124M = { 15 | "vocab_size": 50257, # Vocabulary size 16 | "context_length": 1024, # Context length 17 | "emb_dim": 768, # Embedding dimension 18 | "n_heads": 12, # Number of attention heads 19 | "n_layers": 12, # Number of layers 20 | "drop_rate": 0.1, # Dropout rate 21 | "qkv_bias": False # Query-Key-Value bias 22 | } 23 | 24 | 25 | @pytest.mark.parametrize("ModelClass", [GPTModel, GPTModelFast]) 26 | def test_gpt_model_variants(ModelClass): 27 | torch.manual_seed(123) 28 | model = ModelClass(GPT_CONFIG_124M) 29 | model.eval() # disable dropout 30 | 31 | start_context = "Hello, I am" 32 | 33 | tokenizer = tiktoken.get_encoding("gpt2") 34 | encoded = tokenizer.encode(start_context) 35 | encoded_tensor = torch.tensor(encoded).unsqueeze(0) 36 | 37 | print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}") 38 | print("\nInput text:", start_context) 39 | print("Encoded input text:", encoded) 40 | print("encoded_tensor.shape:", encoded_tensor.shape) 41 | 42 | out = generate_text_simple( 43 | model=model, 44 | idx=encoded_tensor, 45 | max_new_tokens=10, 46 | context_size=GPT_CONFIG_124M["context_length"] 47 | ) 48 | 49 | expect = torch.tensor([ 50 | [15496, 11, 314, 716, 27018, 24086, 47843, 30961, 42348, 7267, 51 | 49706, 43231, 47062, 34657] 52 | ]) 53 | assert torch.equal(expect, out), "Generated output does not match expected output" 54 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_ch05.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from llms_from_scratch.ch02 import create_dataloader_v1 7 | from llms_from_scratch.ch04 import GPTModel, GPTModelFast 8 | from llms_from_scratch.ch05 import train_model_simple 9 | 10 | import os 11 | import urllib.request 12 | 13 | import pytest 14 | import tiktoken 15 | import torch 16 | from torch.utils.data import Subset, DataLoader 17 | 18 | 19 | GPT_CONFIG_124M = { 20 | "vocab_size": 50257, 21 | "context_length": 256, # Shortened for test speed 22 | "emb_dim": 768, 23 | "n_heads": 12, 24 | "n_layers": 12, 25 | "drop_rate": 0.1, 26 | "qkv_bias": False 27 | } 28 | 29 | OTHER_SETTINGS = { 30 | "learning_rate": 5e-4, 31 | "num_epochs": 2, 32 | "batch_size": 1, 33 | "weight_decay": 0.1 34 | } 35 | 36 | 37 | @pytest.mark.parametrize("ModelClass", [GPTModel, GPTModelFast]) 38 | def test_train_simple(tmp_path, ModelClass): 39 | torch.manual_seed(123) 40 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 41 | 42 | ############################## 43 | # Download data if necessary 44 | ############################## 45 | file_path = tmp_path / "the-verdict.txt" 46 | url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" 47 | 48 | if not os.path.exists(file_path): 49 | with urllib.request.urlopen(url) as response: 50 | text_data = response.read().decode("utf-8") 51 | with open(file_path, "w", encoding="utf-8") as f: 52 | f.write(text_data) 53 | else: 54 | with open(file_path, "r", encoding="utf-8") as f: 55 | text_data = f.read() 56 | 57 | ############################## 58 | # Set up dataloaders 59 | ############################## 60 | train_ratio = 0.90 61 | split_idx = int(train_ratio * len(text_data)) 62 | 63 | train_loader = create_dataloader_v1( 64 | text_data[:split_idx], 65 | batch_size=OTHER_SETTINGS["batch_size"], 66 | max_length=GPT_CONFIG_124M["context_length"], 67 | stride=GPT_CONFIG_124M["context_length"], 68 | drop_last=True, 69 | shuffle=True, 70 | num_workers=0 71 | ) 72 | 73 | val_loader = create_dataloader_v1( 74 | text_data[split_idx:], 75 | batch_size=OTHER_SETTINGS["batch_size"], 76 | max_length=GPT_CONFIG_124M["context_length"], 77 | stride=GPT_CONFIG_124M["context_length"], 78 | drop_last=False, 79 | shuffle=False, 80 | num_workers=0 81 | ) 82 | 83 | # Limit to 1 batch for speed 84 | train_subset = Subset(train_loader.dataset, range(1)) 85 | one_batch_train_loader = DataLoader(train_subset, batch_size=1) 86 | val_subset = Subset(val_loader.dataset, range(1)) 87 | one_batch_val_loader = DataLoader(val_subset, batch_size=1) 88 | 89 | ############################## 90 | # Train model 91 | ############################## 92 | model = ModelClass(GPT_CONFIG_124M) 93 | model.to(device) 94 | 95 | optimizer = torch.optim.AdamW( 96 | model.parameters(), 97 | lr=OTHER_SETTINGS["learning_rate"], 98 | weight_decay=OTHER_SETTINGS["weight_decay"] 99 | ) 100 | 101 | tokenizer = tiktoken.get_encoding("gpt2") 102 | 103 | train_losses, val_losses, tokens_seen = train_model_simple( 104 | model, one_batch_train_loader, one_batch_val_loader, optimizer, device, 105 | num_epochs=OTHER_SETTINGS["num_epochs"], eval_freq=1, eval_iter=1, 106 | start_context="Every effort moves you", tokenizer=tokenizer 107 | ) 108 | 109 | assert round(train_losses[0], 1) == 7.6 110 | assert round(val_losses[0], 1) == 10.1 111 | assert 
train_losses[-1] < train_losses[0] 112 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_ch06.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | 7 | from llms_from_scratch.ch04 import GPTModel 8 | from llms_from_scratch.ch06 import ( 9 | download_and_unzip_spam_data, create_balanced_dataset, 10 | random_split, SpamDataset, train_classifier_simple 11 | ) 12 | 13 | from pathlib import Path 14 | import urllib.error 15 | 16 | import pandas as pd 17 | import tiktoken 18 | import torch 19 | from torch.utils.data import DataLoader, Subset 20 | 21 | 22 | def test_train_classifier(tmp_path): 23 | 24 | ######################################## 25 | # Download and prepare dataset 26 | ######################################## 27 | 28 | url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip" 29 | zip_path = tmp_path / "sms_spam_collection.zip" 30 | extracted_path = tmp_path / "sms_spam_collection" 31 | data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv" 32 | 33 | try: 34 | download_and_unzip_spam_data( 35 | url, zip_path, extracted_path, data_file_path 36 | ) 37 | except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: 38 | print(f"Primary URL failed: {e}. Trying backup URL...") 39 | backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" 40 | download_and_unzip_spam_data( 41 | backup_url, zip_path, extracted_path, data_file_path 42 | ) 43 | 44 | df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"]) 45 | balanced_df = create_balanced_dataset(df) 46 | balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1}) 47 | 48 | train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1) 49 | train_df.to_csv(tmp_path / "train.csv", index=None) 50 | validation_df.to_csv(tmp_path / "validation.csv", index=None) 51 | test_df.to_csv(tmp_path / "test.csv", index=None) 52 | 53 | ######################################## 54 | # Create data loaders 55 | ######################################## 56 | tokenizer = tiktoken.get_encoding("gpt2") 57 | 58 | train_dataset = SpamDataset( 59 | csv_file=tmp_path / "train.csv", 60 | max_length=None, 61 | tokenizer=tokenizer 62 | ) 63 | 64 | val_dataset = SpamDataset( 65 | csv_file=tmp_path / "validation.csv", 66 | max_length=train_dataset.max_length, 67 | tokenizer=tokenizer 68 | ) 69 | 70 | num_workers = 0 71 | batch_size = 8 72 | 73 | torch.manual_seed(123) 74 | 75 | train_loader = DataLoader( 76 | dataset=train_dataset, 77 | batch_size=batch_size, 78 | shuffle=True, 79 | num_workers=num_workers, 80 | drop_last=True, 81 | ) 82 | 83 | val_loader = DataLoader( 84 | dataset=val_dataset, 85 | batch_size=batch_size, 86 | num_workers=num_workers, 87 | drop_last=False, 88 | ) 89 | 90 | ######################################## 91 | # Load pretrained model 92 | ######################################## 93 | 94 | # Small GPT model for testing purposes 95 | BASE_CONFIG = { 96 | "vocab_size": 50257, 97 | "context_length": 120, 98 | "drop_rate": 0.0, 99 | "qkv_bias": False, 100 | "emb_dim": 12, 101 | "n_layers": 1, 102 | "n_heads": 2 103 | } 104 | model = GPTModel(BASE_CONFIG) 105 | 
model.eval() 106 | device = "cpu" 107 | 108 | ######################################## 109 | # Modify the pretrained model 110 | ######################################## 111 | 112 | for param in model.parameters(): 113 | param.requires_grad = False 114 | 115 | torch.manual_seed(123) 116 | 117 | num_classes = 2 118 | model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes) 119 | model.to(device) 120 | 121 | for param in model.trf_blocks[-1].parameters(): 122 | param.requires_grad = True 123 | 124 | for param in model.final_norm.parameters(): 125 | param.requires_grad = True 126 | 127 | ######################################## 128 | # Finetune modified model 129 | ######################################## 130 | 131 | torch.manual_seed(123) 132 | 133 | optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0) 134 | 135 | train_subset = Subset(train_loader.dataset, range(5)) 136 | batch_train_loader = DataLoader(train_subset, batch_size=5) 137 | val_subset = Subset(val_loader.dataset, range(5)) 138 | batch_val_loader = DataLoader(val_subset, batch_size=5) 139 | 140 | num_epochs = 5 141 | train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple( 142 | model, batch_train_loader, batch_val_loader, optimizer, device, 143 | num_epochs=num_epochs, eval_freq=1, eval_iter=1, 144 | ) 145 | 146 | assert round(train_losses[0], 1) == 0.8 147 | assert round(val_losses[0], 1) == 0.8 148 | assert train_losses[-1] < train_losses[0] 149 | -------------------------------------------------------------------------------- /pkg/llms_from_scratch/tests/test_ch07.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from llms_from_scratch.ch04 import GPTModel 7 | from llms_from_scratch.ch05 import train_model_simple 8 | from llms_from_scratch.ch07 import ( 9 | download_and_load_file, InstructionDataset, format_input, custom_collate_fn 10 | ) 11 | 12 | from functools import partial 13 | 14 | import torch 15 | from torch.utils.data import DataLoader 16 | import tiktoken 17 | 18 | 19 | def test_instruction_finetune(tmp_path): 20 | 21 | ####################################### 22 | # Download and prepare dataset 23 | ####################################### 24 | file_path = tmp_path / "instruction-data.json" 25 | url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json" 26 | data = download_and_load_file(file_path, url) 27 | 28 | train_portion = int(len(data) * 0.85) # 85% for training 29 | test_portion = int(len(data) * 0.1) # 10% for testing 30 | 31 | train_data = data[:train_portion] 32 | test_data = data[train_portion:train_portion + test_portion] 33 | val_data = data[train_portion + test_portion:] 34 | 35 | # Use very small subset for testing purposes 36 | train_data = train_data[:15] 37 | val_data = val_data[:15] 38 | test_data = test_data[:15] 39 | 40 | tokenizer = tiktoken.get_encoding("gpt2") 41 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 42 | 43 | customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=100) 44 | 45 | num_workers = 0 46 | batch_size = 8 47 | 48 | torch.manual_seed(123) 49 | 50 | train_dataset = InstructionDataset(train_data, tokenizer) 51 | train_loader = DataLoader( 52 | train_dataset, 53 | batch_size=batch_size, 54 | collate_fn=customized_collate_fn, 55 | shuffle=True, 56 | drop_last=True, 57 | num_workers=num_workers 58 | ) 59 | 60 | val_dataset = InstructionDataset(val_data, tokenizer) 61 | val_loader = DataLoader( 62 | val_dataset, 63 | batch_size=batch_size, 64 | collate_fn=customized_collate_fn, 65 | shuffle=False, 66 | drop_last=False, 67 | num_workers=num_workers 68 | ) 69 | 70 | ####################################### 71 | # Load pretrained model 72 | ####################################### 73 | 74 | # Small GPT model for testing purposes 75 | BASE_CONFIG = { 76 | "vocab_size": 50257, 77 | "context_length": 120, 78 | "drop_rate": 0.0, 79 | "qkv_bias": False, 80 | "emb_dim": 12, 81 | "n_layers": 1, 82 | "n_heads": 2 83 | } 84 | model = GPTModel(BASE_CONFIG) 85 | model.eval() 86 | device = "cpu" 87 | CHOOSE_MODEL = "Small test model" 88 | 89 | print("Loaded model:", CHOOSE_MODEL) 90 | print(50*"-") 91 | 92 | ####################################### 93 | # Finetuning the model 94 | ####################################### 95 | 96 | num_epochs = 10 97 | optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1) 98 | 99 | torch.manual_seed(123) 100 | train_losses, val_losses, tokens_seen = train_model_simple( 101 | model, train_loader, val_loader, optimizer, device, 102 | num_epochs=num_epochs, eval_freq=5, eval_iter=5, 103 | start_context=format_input(val_data[0]), tokenizer=tokenizer 104 | ) 105 | 106 | assert round(train_losses[0], 1) == 10.9 107 | assert round(val_losses[0], 1) == 10.9 108 | assert train_losses[-1] < train_losses[0] 109 | -------------------------------------------------------------------------------- 
/pkg/llms_from_scratch/tests/test_llama3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from llms_from_scratch.ch04 import generate_text_simple 7 | from llms_from_scratch.llama3 import ( 8 | compute_rope_params, 9 | apply_rope, 10 | rescale_theta, 11 | LLAMA32_CONFIG_1B, 12 | GroupedQueryAttention, 13 | GroupedQueryAttentionFast, 14 | Llama3Model, 15 | ) 16 | 17 | import importlib 18 | import pytest 19 | import tiktoken 20 | import torch 21 | 22 | 23 | transformers_installed = importlib.util.find_spec("transformers") is not None 24 | 25 | 26 | @pytest.mark.skipif(not transformers_installed, reason="transformers not installed") 27 | def test_rope(): 28 | 29 | from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb 30 | 31 | # Settings 32 | batch_size = 1 33 | context_len = 8192 34 | num_heads = 4 35 | head_dim = 16 36 | rope_theta = 500_000 37 | 38 | rope_config = { 39 | "factor": 8.0, 40 | "low_freq_factor": 1.0, 41 | "high_freq_factor": 4.0, 42 | "original_context_length": 8192, 43 | } 44 | 45 | # Instantiate RoPE parameters 46 | cos, sin = compute_rope_params( 47 | head_dim=head_dim, 48 | theta_base=rope_theta, 49 | context_length=context_len, 50 | freq_config=rope_config, 51 | ) 52 | 53 | # Dummy query and key tensors 54 | torch.manual_seed(123) 55 | queries = torch.randn(batch_size, num_heads, context_len, head_dim) 56 | keys = torch.randn(batch_size, num_heads, context_len, head_dim) 57 | 58 | # Apply rotary position embeddings 59 | queries_rot = apply_rope(queries, cos, sin) 60 | keys_rot = apply_rope(keys, cos, sin) 61 | 62 | # Generate reference RoPE via HF 63 | hf_rope_params = { 64 | "factor": 8.0, 65 | "low_freq_factor": 1.0, 66 | "high_freq_factor": 4.0, 67 | "original_max_position_embeddings": 8192, 68 | "rope_type": "llama3" 69 | } 70 | 71 | class RoPEConfig: 72 | rope_type = "llama3" 73 | rope_scaling = hf_rope_params 74 | factor = 1.0 75 | dim: int = head_dim 76 | rope_theta = 500_000 77 | max_position_embeddings: int = 8192 78 | hidden_size = head_dim * num_heads 79 | num_attention_heads = num_heads 80 | 81 | config = RoPEConfig() 82 | 83 | rot_emb = LlamaRotaryEmbedding(config=config) 84 | position_ids = torch.arange(context_len, dtype=torch.long).unsqueeze(0) 85 | ref_cos, ref_sin = rot_emb(queries, position_ids) 86 | ref_queries_rot, ref_keys_rot = apply_rotary_pos_emb(queries, keys, ref_cos, ref_sin) 87 | 88 | torch.testing.assert_close(sin, ref_sin.squeeze(0)) 89 | torch.testing.assert_close(cos, ref_cos.squeeze(0)) 90 | torch.testing.assert_close(keys_rot, ref_keys_rot) 91 | torch.testing.assert_close(queries_rot, ref_queries_rot) 92 | 93 | 94 | GPT_CONFIG_124M = { 95 | "vocab_size": 50257, # Vocabulary size 96 | "context_length": 1024, # Context length 97 | "emb_dim": 768, # Embedding dimension 98 | "n_heads": 12, # Number of attention heads 99 | "n_layers": 12, # Number of layers 100 | "drop_rate": 0.1, # Dropout rate 101 | "qkv_bias": False # Query-Key-Value bias 102 | } 103 | 104 | 105 | def test_rescale(): 106 | 107 | new_theta = rescale_theta( 108 | theta_old=500_000., 109 | context_length_old=131_072, 110 | context_length_new=8192 111 | ) 112 | assert new_theta == 31250. 
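# The assert above is consistent with a linear rescaling rule (an assumption about
# rescale_theta's implementation): theta_new = theta_old * context_length_new / context_length_old,
# i.e. 500_000 * 8_192 / 131_072 = 31_250; the inverse call below recovers 500_000.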
113 | 114 | old_theta = rescale_theta( 115 | theta_old=new_theta, 116 | context_length_old=8192, 117 | context_length_new=131_072 118 | ) 119 | assert old_theta == 500_000. 120 | 121 | 122 | def test_grouped_query_attention_equivalence(): 123 | torch.manual_seed(42) 124 | b, t, d_in, d_out, num_heads, num_kv_groups = 2, 8, 32, 64, 4, 2 125 | 126 | x = torch.randn(b, t, d_in) 127 | cos, sin = compute_rope_params( 128 | head_dim=d_out // num_heads, 129 | theta_base=50_000, 130 | context_length=t, 131 | freq_config={ 132 | "factor": 32.0, 133 | "low_freq_factor": 1.0, 134 | "high_freq_factor": 4.0, 135 | "original_context_length": t, 136 | } 137 | ) 138 | 139 | # Causal mask for the slow version 140 | mask = torch.triu(torch.ones(t, t, dtype=torch.bool), diagonal=1) 141 | 142 | attn1 = GroupedQueryAttention(d_in, d_out, num_heads, num_kv_groups) 143 | attn2 = GroupedQueryAttentionFast(d_in, d_out, num_heads, num_kv_groups) 144 | 145 | # Copy weights to make both models identical 146 | attn2.load_state_dict(attn1.state_dict()) 147 | 148 | # Run both 149 | y1 = attn1(x, mask, cos, sin) 150 | y2 = attn2(x, cos, sin) 151 | 152 | # Compare outputs 153 | max_diff = (y1 - y2).abs().max().item() 154 | print(f"Max difference between slow and fast outputs: {max_diff:.4e}") 155 | assert torch.allclose(y1, y2, atol=1e-4) 156 | 157 | 158 | @pytest.fixture(scope="session") 159 | def llama3_weights_path(tmp_path_factory): 160 | """Creates and saves a deterministic Llama3 model for testing.""" 161 | path = tmp_path_factory.mktemp("models") / "llama3_test_weights.pt" 162 | 163 | if not path.exists(): 164 | torch.manual_seed(123) 165 | model = Llama3Model(LLAMA32_CONFIG_1B) 166 | torch.save(model.state_dict(), path) 167 | 168 | return path 169 | 170 | 171 | @pytest.mark.parametrize("ModelClass", [Llama3Model]) 172 | def test_gpt_model_variants(ModelClass, llama3_weights_path): 173 | torch.manual_seed(123) 174 | model = ModelClass(LLAMA32_CONFIG_1B) 175 | model.load_state_dict(torch.load(llama3_weights_path)) 176 | model.eval() 177 | 178 | start_context = "Llamas eat" 179 | 180 | tokenizer = tiktoken.get_encoding("gpt2") 181 | encoded = tokenizer.encode(start_context) 182 | encoded_tensor = torch.tensor(encoded).unsqueeze(0) 183 | 184 | print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}") 185 | print("\nInput text:", start_context) 186 | print("Encoded input text:", encoded) 187 | print("encoded_tensor.shape:", encoded_tensor.shape) 188 | 189 | out = generate_text_simple( 190 | model=model, 191 | idx=encoded_tensor, 192 | max_new_tokens=5, 193 | context_size=LLAMA32_CONFIG_1B["context_length"] 194 | ) 195 | print("Encoded output text:", out) 196 | expect = torch.tensor([ 197 | [43, 2543, 292, 4483, 100383, 8113, 21197, 33804, 54419] 198 | ]) 199 | assert torch.equal(expect, out) 200 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llms-from-scratch" 7 | version = "1.0.6" 8 | description = "Implement a ChatGPT-like LLM in PyTorch from scratch, step by step" 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | dependencies = [ 12 | "torch>=2.3.0", 13 | "jupyterlab>=4.0", 14 | "tiktoken>=0.5.1", 15 | "matplotlib>=3.7.1", 16 | "tensorflow>=2.18.0", 17 | "tqdm>=4.66.1", 18 | "numpy>=1.26,<2.1", 19 | "pandas>=2.2.1", 20 | "pip>=25.0.1", 21 | 
"pytest>=8.3.5", 22 | ] 23 | 24 | [tool.uv.sources] 25 | llms-from-scratch = { workspace = true } 26 | 27 | [dependency-groups] 28 | dev = [ 29 | "build>=1.2.2.post1", 30 | "llms-from-scratch", 31 | "twine>=6.1.0", 32 | ] 33 | 34 | [tool.ruff] 35 | line-length = 140 36 | 37 | [tool.ruff.lint] 38 | exclude = [".venv"] 39 | # Ignored rules (W504 removed) 40 | ignore = [ 41 | "C406", "E226", "E402", "E702", "E703", 42 | "E722", "E731", "E741" 43 | ] 44 | 45 | 46 | # `llms_from_scratch` PyPI package 47 | 48 | [tool.setuptools] 49 | package-dir = {"" = "pkg"} 50 | 51 | [tool.setuptools.packages.find] 52 | where = ["pkg"] 53 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch >= 2.3.0 # all 2 | jupyterlab >= 4.0 # all 3 | tiktoken >= 0.5.1 # ch02; ch04; ch05 4 | matplotlib >= 3.7.1 # ch04; ch06; ch07 5 | tensorflow >= 2.18.0 # ch05; ch06; ch07 6 | tqdm >= 4.66.1 # ch05; ch07 7 | numpy >= 1.26, < 2.1 # dependency of several other libraries like torch and pandas 8 | pandas >= 2.2.1 # ch06 9 | psutil >= 5.9.5 # ch07; already installed automatically as dependency of torch 10 | -------------------------------------------------------------------------------- /setup/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-python.python", 4 | "ms-toolsai.jupyter", 5 | "ms-azuretools.vscode-docker", 6 | "ms-vscode-remote.vscode-remote-extensionpack", 7 | "yahyabatulu.vscode-markdown-alert", 8 | "tomoki1207.pdf", 9 | "mechatroner.rainbow-csv" 10 | ] 11 | } -------------------------------------------------------------------------------- /setup/01_optional-python-setup-preferences/native-pixi.md: -------------------------------------------------------------------------------- 1 | # Native pixi Python and package management 2 | 3 | This tutorial is an alternative to the [`./native-uv.md`](native-uv.md) document for those who prefer `pixi`'s native commands over traditional environment and package managers like `conda` and `pip`. 4 | 5 | Note that pixi uses `uv add` under the hood, as described in [`./native-uv.md`](native-uv.md). 6 | 7 | Pixi and uv are both modern package and environment management tools for Python, but pixi is a polyglot package manager designed for managing not just Python but also other languages (similar to conda), while uv is a Python-specific tool optimized for ultra-fast dependency resolution and package installation. 8 | 9 | Someone might choose pixi over uv if they need a polyglot package manager that supports multiple languages (not just Python) or prefer a declarative environment management approach similar to conda. For more information, please visit the official [pixi documentation](https://pixi.sh/latest/). 10 | 11 | In this tutorial, I am using a computer running macOS, but this workflow is similar for Linux machines and may work for other operating systems as well. 12 | 13 |   14 | ## 1. Install pixi 15 | 16 | Pixi can be installed as follows, depending on your operating system. 17 | 18 |
19 | 20 | **macOS and Linux** 21 | 22 | ```bash 23 | curl -fsSL https://pixi.sh/install.sh | sh 24 | ``` 25 | 26 | or 27 | 28 | ```bash 29 | wget -qO- https://pixi.sh/install.sh | sh 30 | ``` 31 | 32 |
33 | 34 | **Windows** 35 | 36 | ```powershell 37 | powershell -ExecutionPolicy ByPass -c "irm -useb https://pixi.sh/install.ps1 | iex" 38 | ``` 39 | 40 | > **Note:** 41 | > For more installation options, please refer to the official [pixi documentation](https://pixi.sh/latest/). 42 | 43 | 44 | &nbsp; 45 | ## 2. Install Python 46 | 47 | You can install Python using pixi: 48 | 49 | ```bash 50 | pixi add python=3.10 51 | ``` 52 | 53 | > **Note:** 54 | > I recommend installing a Python version that is at least 2 versions older than the most recent release to ensure PyTorch compatibility. For example, if the most recent version is Python 3.13, I recommend installing version 3.10 or 3.11. You can find out the most recent Python version by visiting [python.org](https://www.python.org). 55 | 56 | &nbsp; 57 | ## 3. Install Python packages and dependencies 58 | 59 | To install all required packages from a `pixi.toml` file (such as the one located at the top level of this GitHub repository), run the following command, assuming the file is in the same directory as your terminal session: 60 | 61 | ```bash 62 | pixi install 63 | ``` 64 | 65 | > **Note:** 66 | > If you encounter issues with dependencies (for example, if you are using Windows), you can always fall back to pip: `pixi run pip install -U -r requirements.txt` 67 | 68 | By default, `pixi install` will create a separate virtual environment specific to the project. 69 | 70 | You can install new packages that are not specified in `pixi.toml` via `pixi add`, for example: 71 | 72 | ```bash 73 | pixi add packaging 74 | ``` 75 | 76 | And you can remove packages via `pixi remove`, for example: 77 | 78 | ```bash 79 | pixi remove packaging 80 | ``` 81 | 82 | &nbsp; 83 | ## 4. Run Python code 84 | 85 | Your environment should now be ready to run the code in the repository. 86 | 87 | Optionally, you can run an environment check by executing the `python_environment_check.py` script in this repository: 88 | 89 | ```bash 90 | pixi run python setup/02_installing-python-libraries/python_environment_check.py 91 | ``` 92 | 93 | <br>
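If the environment is set up correctly, the check prints one status line per package. The sample below is taken from the environment-check notebook later in this repository; your exact versions will differ:

```
[OK] Your Python version is 3.10.12
[OK] numpy 1.26.0
[OK] matplotlib 3.8.2
[OK] torch 2.2.1
[OK] tiktoken 0.5.1
```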
94 | 95 | **Launching JupyterLab** 96 | 97 | You can launch a JupyterLab instance via: 98 | 99 | ```bash 100 | pixi run jupyter lab 101 | ``` 102 | 103 | 104 | --- 105 | 106 | Any questions? Please feel free to reach out in the [Discussion Forum](https://github.com/rasbt/LLMs-from-scratch/discussions). 107 | -------------------------------------------------------------------------------- /setup/02_installing-python-libraries/README.md: -------------------------------------------------------------------------------- 1 | # Installing Python Packages and Libraries Used In This Book 2 | 3 | This document provides more information on double-checking your installed Python version and packages. (Please see the [../01_optional-python-setup-preferences](../01_optional-python-setup-preferences) folder for more information on installing Python and Python packages.) 4 | 5 | I used the libraries listed [here](https://github.com/rasbt/LLMs-from-scratch/blob/main/requirements.txt) for this book. Newer versions of these libraries are likely compatible as well. However, if you experience any problems with the code, you can try these library versions as a fallback. 6 | 7 | 8 | 9 | > **Note:** 10 | > If you are using `uv` as described in [Option 1: Using uv](../01_optional-python-setup-preferences/README.md), you can replace `pip` with `uv pip` in the commands below. For example, `pip install -r requirements.txt` becomes `uv pip install -r requirements.txt`. 11 | 12 | 13 | 14 | To install these requirements most conveniently, you can use the `requirements.txt` file in the root directory for this code repository and execute the following command: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | Alternatively, you can install the requirements via the GitHub URL as follows: 21 | 22 | ```bash 23 | pip install -r https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/requirements.txt 24 | ``` 25 | 26 | 27 | Then, after completing the installation, please check if all the packages are installed and are up to date using: 28 | 29 | ```bash 30 | python python_environment_check.py 31 | ``` 32 | 33 | 34 | 35 | It's also recommended to check the versions in JupyterLab by running the `python_environment_check.ipynb` notebook in this directory, which should ideally give you the same results as above. 36 | 37 | 38 | 39 | If you see the following issues, it's likely that your JupyterLab instance is connected to the wrong conda environment: 40 | 41 | 42 | 43 | In this case, you may want to use `watermark` to check if you opened the JupyterLab instance in the right conda environment using the `--conda` flag: 44 | 45 | 46 | 47 | 48 | &nbsp; 49 | ## Installing PyTorch 50 | 51 | PyTorch can be installed just like any other Python library or package using pip. For example: 52 | 53 | ```bash 54 | pip install torch 55 | ``` 56 | 57 | However, since PyTorch is a comprehensive library featuring CPU- and GPU-compatible code, the installation may require additional settings and explanation (see section *A.1.3, Installing PyTorch,* in the book for more information). 58 | 59 | It's also highly recommended to consult the installation guide menu on the official PyTorch website at [https://pytorch.org](https://pytorch.org). 60 | 61 | 62 | 63 | <br>
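After installing, a quick way to confirm that PyTorch imports correctly and sees your hardware is a short snippet like the following (a minimal check; which accelerator lines print `True` depends on your machine):

```python
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())          # NVIDIA GPUs
print("MPS available:", torch.backends.mps.is_available())   # Apple Silicon GPUs
```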
64 | 65 | --- 66 | 67 | 68 | 69 | 70 | Any questions? Please feel free to reach out in the [Discussion Forum](https://github.com/rasbt/LLMs-from-scratch/discussions). 71 | -------------------------------------------------------------------------------- /setup/02_installing-python-libraries/python_environment_check.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c31e08b0-f551-4d67-b95e-41f49de3b392", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "Supplementary code for \"Build a Large Language Model From Scratch\": https://www.manning.com/books/build-a-large-language-model-from-scratch by Sebastian Raschka
\n", 10 | "Code repository: https://github.com/rasbt/LLMs-from-scratch\n", 11 | "
" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "id": "67f6f7ed-b67d-465b-bf6f-a99b0d996930", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "[OK] Your Python version is 3.10.12\n", 25 | "[OK] numpy 1.26.0\n", 26 | "[OK] matplotlib 3.8.2\n", 27 | "[OK] jupyterlab 4.0.6\n", 28 | "[OK] tensorflow 2.15.0\n", 29 | "[OK] torch 2.2.1\n", 30 | "[OK] tqdm 4.66.1\n", 31 | "[OK] tiktoken 0.5.1\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "from python_environment_check import check_packages, get_requirements_dict\n", 37 | "\n", 38 | "d = get_requirements_dict()\n", 39 | "check_packages(d)" 40 | ] 41 | } 42 | ], 43 | "metadata": { 44 | "kernelspec": { 45 | "display_name": "Python 3 (ipykernel)", 46 | "language": "python", 47 | "name": "python3" 48 | }, 49 | "language_info": { 50 | "codemirror_mode": { 51 | "name": "ipython", 52 | "version": 3 53 | }, 54 | "file_extension": ".py", 55 | "mimetype": "text/x-python", 56 | "name": "python", 57 | "nbconvert_exporter": "python", 58 | "pygments_lexer": "ipython3", 59 | "version": "3.10.6" 60 | } 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 5 64 | } 65 | -------------------------------------------------------------------------------- /setup/02_installing-python-libraries/python_environment_check.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | from importlib.metadata import PackageNotFoundError, import_module, version as get_version 7 | from os.path import dirname, exists, join, realpath 8 | from packaging.version import parse as version_parse 9 | from packaging.requirements import Requirement 10 | from packaging.specifiers import SpecifierSet 11 | import platform 12 | import sys 13 | 14 | if version_parse(platform.python_version()) < version_parse("3.9"): 15 | print("[FAIL] We recommend Python 3.9 or newer but found version %s" % sys.version) 16 | else: 17 | print("[OK] Your Python version is %s" % platform.python_version()) 18 | 19 | 20 | def get_packages(pkgs): 21 | """ 22 | Returns a dictionary mapping package names (in lowercase) to their installed version. 23 | """ 24 | PACKAGE_MODULE_OVERRIDES = { 25 | "tensorflow-cpu": ["tensorflow", "tensorflow_cpu"], 26 | } 27 | result = {} 28 | for p in pkgs: 29 | # Determine possible module names to try. 30 | module_names = PACKAGE_MODULE_OVERRIDES.get(p.lower(), [p]) 31 | version_found = None 32 | for module_name in module_names: 33 | try: 34 | imported = import_module(module_name) 35 | version_found = getattr(imported, "__version__", None) 36 | if version_found is None: 37 | try: 38 | version_found = get_version(module_name) 39 | except PackageNotFoundError: 40 | version_found = None 41 | if version_found is not None: 42 | break # Stop if we successfully got a version. 43 | except ImportError: 44 | # Also try replacing hyphens with underscores as a fallback. 
45 | alt_module = module_name.replace("-", "_") 46 | if alt_module != module_name: 47 | try: 48 | imported = import_module(alt_module) 49 | version_found = getattr(imported, "__version__", None) 50 | if version_found is None: 51 | try: 52 | version_found = get_version(alt_module) 53 | except PackageNotFoundError: 54 | version_found = None 55 | if version_found is not None: 56 | break 57 | except ImportError: 58 | continue 59 | continue 60 | if version_found is None: 61 | version_found = "0.0" 62 | result[p.lower()] = version_found 63 | return result 64 | 65 | 66 | def get_requirements_dict(): 67 | """ 68 | Parses requirements.txt and returns a dictionary mapping package names (lowercase) 69 | to a specifier string (e.g. ">=2.18.0,<3.0"). It uses packaging.requirements.Requirement 70 | to properly handle environment markers. 71 | """ 72 | 73 | PROJECT_ROOT = dirname(realpath(__file__)) 74 | PROJECT_ROOT_UP_TWO = dirname(dirname(PROJECT_ROOT)) 75 | REQUIREMENTS_FILE = join(PROJECT_ROOT_UP_TWO, "requirements.txt") 76 | if not exists(REQUIREMENTS_FILE): 77 | REQUIREMENTS_FILE = join(PROJECT_ROOT, "requirements.txt") 78 | 79 | reqs = {} 80 | with open(REQUIREMENTS_FILE) as f: 81 | for line in f: 82 | # Remove inline comments and trailing whitespace. 83 | # This splits on the first '#' and takes the part before it. 84 | line = line.split("#", 1)[0].strip() 85 | if not line: 86 | continue 87 | try: 88 | req = Requirement(line) 89 | except Exception as e: 90 | print(f"Skipping line due to parsing error: {line} ({e})") 91 | continue 92 | # Evaluate the marker if present. 93 | if req.marker is not None and not req.marker.evaluate(): 94 | continue 95 | # Store the package name and its version specifier. 96 | spec = str(req.specifier) if req.specifier else ">=0" 97 | reqs[req.name.lower()] = spec 98 | return reqs 99 | 100 | 101 | def check_packages(reqs): 102 | """ 103 | Checks the installed versions of packages against the requirements. 104 | """ 105 | installed = get_packages(reqs.keys()) 106 | for pkg_name, spec_str in reqs.items(): 107 | spec_set = SpecifierSet(spec_str) 108 | actual_ver = installed.get(pkg_name, "0.0") 109 | if actual_ver == "N/A": 110 | continue 111 | actual_ver_parsed = version_parse(actual_ver) 112 | # If the installed version is a pre-release, allow pre-releases in the specifier. 113 | if actual_ver_parsed.is_prerelease: 114 | spec_set.prereleases = True 115 | if actual_ver_parsed not in spec_set: 116 | print(f"[FAIL] {pkg_name} {actual_ver_parsed}, please install a version matching {spec_set}") 117 | else: 118 | print(f"[OK] {pkg_name} {actual_ver_parsed}") 119 | 120 | 121 | def main(): 122 | reqs = get_requirements_dict() 123 | check_packages(reqs) 124 | 125 | 126 | if __name__ == "__main__": 127 | main() 128 | -------------------------------------------------------------------------------- /setup/02_installing-python-libraries/tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
2 | # Source for "Build a Large Language Model From Scratch" 3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch 4 | # Code: https://github.com/rasbt/LLMs-from-scratch 5 | 6 | # File for internal use (unit tests) 7 | 8 | from python_environment_check import main 9 | 10 | 11 | def test_main(capsys): 12 | main() 13 | captured = capsys.readouterr() 14 | assert "FAIL" not in captured.out 15 | -------------------------------------------------------------------------------- /setup/03_optional-docker-environment/.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # Install PyTorch 2.5 with CUDA 12.4 2 | FROM pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime 3 | 4 | # Install Ubuntu packages 5 | RUN apt-get update && \ 6 | apt-get upgrade -y && \ 7 | apt-get install -y rsync git curl ca-certificates && \ 8 | rm -rf /var/lib/apt/lists/* 9 | 10 | # Install uv 11 | ADD https://astral.sh/uv/install.sh /uv-installer.sh 12 | RUN sh /uv-installer.sh && rm /uv-installer.sh 13 | ENV PATH="/root/.local/bin/:$PATH" 14 | 15 | # Install Python packages 16 | COPY requirements.txt requirements.txt 17 | RUN uv pip install --system --no-cache -r requirements.txt 18 | -------------------------------------------------------------------------------- /setup/03_optional-docker-environment/.devcontainer/README.md: -------------------------------------------------------------------------------- 1 | # Optional Docker Environment 2 | 3 | This is an optional Docker environment for those users who prefer Docker. In case you are interested in using this Docker DevContainer, please see the *Using Docker DevContainers* section in the [../../README.md](../../README.md) for more information. -------------------------------------------------------------------------------- /setup/03_optional-docker-environment/.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "LLMs From Scratch", 3 | "build": { 4 | "context": "..", 5 | "dockerfile": "Dockerfile" 6 | }, 7 | "runArgs": ["--runtime=nvidia", "--gpus=all"], 8 | "customizations": { 9 | "vscode": { 10 | "extensions": [ 11 | "ms-python.python", 12 | "ms-azuretools.vscode-docker", 13 | "ms-toolsai.jupyter", 14 | "yahyabatulu.vscode-markdown-alert", 15 | "tomoki1207.pdf", 16 | "mechatroner.rainbow-csv" 17 | ] 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /setup/03_optional-docker-environment/README.md: -------------------------------------------------------------------------------- 1 | # Docker Environment Setup Guide 2 | 3 | If you prefer a development setup that isolates a project's dependencies and configurations, using Docker is a highly effective solution. This approach eliminates the need to manually install software packages and libraries and ensures a consistent development environment. 4 | 5 | This guide will walk you through the process for setting up an optional docker environment for this book if you prefer it over using the conda approach explained in [../01_optional-python-setup-preferences](../01_optional-python-setup-preferences) and [../02_installing-python-libraries](../02_installing-python-libraries). 6 | 7 |
8 | 9 | ## Downloading and installing Docker 10 | 11 | The easiest way to get started with Docker is by installing [Docker Desktop](https://docs.docker.com/desktop/) for your relevant platform. 12 | 13 | Linux (Ubuntu) users may prefer to install the [Docker Engine](https://docs.docker.com/engine/install/ubuntu/) instead and follow the [post-installation](https://docs.docker.com/engine/install/linux-postinstall/) steps. 14 | 15 |
16 | 17 | ## Using a Docker DevContainer in Visual Studio Code 18 | 19 | A Docker DevContainer, or Development Container, is a tool that allows developers to use Docker containers as a fully-fledged development environment. This approach ensures that users can quickly get up and running with a consistent development environment, regardless of their local machine setup. 20 | 21 | While DevContainers also work with other IDEs, a commonly used IDE/editor for working with DevContainers is Visual Studio Code (VS Code). The guide below explains how to use the DevContainer for this book within a VS Code context, but a similar process should also apply to PyCharm. [Install](https://code.visualstudio.com/download) VS Code if you don't have it yet and want to use it. 22 | 23 | 1. Clone this GitHub repository and `cd` into the project root directory. 24 | 25 | ```bash 26 | git clone https://github.com/rasbt/LLMs-from-scratch.git 27 | cd LLMs-from-scratch 28 | ``` 29 | 30 | 2. Move the `.devcontainer` folder from `setup/03_optional-docker-environment/` to the current directory (project root). 31 | 32 | ```bash 33 | mv setup/03_optional-docker-environment/.devcontainer ./ 34 | ``` 35 | 36 | 3. In Docker Desktop, make sure that **_desktop-linux_ builder** is running and will be used to build the Docker container (see _Docker Desktop_ -> _Change settings_ -> _Builders_ -> _desktop-linux_ -> _..._ -> _Use_). 37 | 38 | 4. If you have a [CUDA-supported GPU](https://developer.nvidia.com/cuda-gpus), you can speed up the training and inference: 39 | 40 | 4.1 Install **NVIDIA Container Toolkit** as described [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt). NVIDIA Container Toolkit support on WSL 2 is documented [here](https://docs.nvidia.com/cuda/wsl-user-guide/index.html#nvidia-compute-software-support-on-wsl-2). 41 | 42 | 4.2 Add _nvidia_ as runtime in Docker Engine daemon config (see _Docker Desktop_ -> _Change settings_ -> _Docker Engine_). Add the following `runtimes` entry to your config (the complete example below shows the closing braces): 43 | 44 | ```json 45 | "runtimes": { 46 | "nvidia": { 47 | "path": "nvidia-container-runtime", 48 | "runtimeArgs": [] 49 | ``` 50 | 51 | For example, the full Docker Engine daemon config JSON should look like this: 52 | 53 | ```json 54 | { 55 | "builder": { 56 | "gc": { 57 | "defaultKeepStorage": "20GB", 58 | "enabled": true 59 | } 60 | }, 61 | "experimental": false, 62 | "runtimes": { 63 | "nvidia": { 64 | "path": "nvidia-container-runtime", 65 | "runtimeArgs": [] 66 | } 67 | } 68 | } 69 | ``` 70 | 71 | and restart Docker Desktop. 72 | 73 | 5. Type `code .` in the terminal to open the project in VS Code. Alternatively, you can launch VS Code and select the project to open from the UI. 74 | 75 | 6. Install the **Remote Development** extension from the VS Code _Extensions_ menu on the left-hand side. 76 | 77 | 7. Open the DevContainer. 78 | 79 | Since the `.devcontainer` folder is present in the main `LLMs-from-scratch` directory (folders starting with `.` may be invisible in your OS depending on your settings), VS Code should automatically detect it and ask whether you would like to open the project in a devcontainer. If it doesn't, simply press `Ctrl + Shift + P` to open the command palette and start typing `dev containers` to see a list of all DevContainer-specific options. 80 | 81 | 8. Select **Reopen in Container**. 
82 | 83 | Docker will now begin the process of building the Docker image specified in the `.devcontainer` configuration if it hasn't been built before, or pull the image if it's available from a registry. 84 | 85 | The entire process is automated and might take a few minutes, depending on your system and internet speed. Optionally click on "Starting Dev Container (show log)" in the lower right corner of VS Code to see the current built progress. 86 | 87 | Once completed, VS Code will automatically connect to the container and reopen the project within the newly created Docker development environment. You will be able to write, execute, and debug code as if it were running on your local machine, but with the added benefits of Docker's isolation and consistency. 88 | 89 | > **Warning:** 90 | > If you are encountering an error during the build process, this is likely because your machine does not support NVIDIA container toolkit because your machine doesn't have a compatible GPU. In this case, edit the `devcontainer.json` file to remove the `"runArgs": ["--runtime=nvidia", "--gpus=all"],` line and run the "Reopen Dev Container" procedure again. 91 | 92 | 9. Finished. 93 | 94 | Once the image has been pulled and built, you should have your project mounted inside the container with all the packages installed, ready for development. 95 | 96 |
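Once you are connected to the container, you can verify GPU pass-through from a Python session or notebook (a minimal check; it reports CPU-only when the NVIDIA runtime from step 4 is not configured or no GPU is present):

```python
import torch

if torch.cuda.is_available():
    print("GPU visible inside the container:", torch.cuda.get_device_name(0))
else:
    print("No GPU visible inside the container; running on CPU")
```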
97 | 98 | ## Uninstalling the Docker Image 99 | 100 | Below are instructions for uninstalling or removing a Docker container and image if you no longer plan to use it. This process does not remove Docker itself from your system but rather cleans up the project-specific Docker artifacts. 101 | 102 | 1. List all Docker images to find the one associated with your DevContainer: 103 | 104 | ```bash 105 | docker image ls 106 | ``` 107 | 108 | 2. Remove the Docker image using its image ID or name: 109 | 110 | ```bash 111 | docker image rm [IMAGE_ID_OR_NAME] 112 | ``` 113 | 114 |
115 | 116 | ## Uninstalling Docker 117 | 118 | If you decide that Docker is not for you and wish to uninstall it, see the official documentation [here](https://docs.docker.com/desktop/uninstall/) that outlines the steps for your specific operating system. 119 | -------------------------------------------------------------------------------- /setup/04_optional-aws-sagemaker-notebook/README.md: -------------------------------------------------------------------------------- 1 | # AWS CloudFormation Template: Jupyter Notebook with LLMs-from-scratch Repo 2 | 3 | This CloudFormation template creates a GPU-enabled Jupyter notebook in Amazon SageMaker with an execution role and the LLMs-from-scratch GitHub repository. 4 | 5 | ## What it does: 6 | 7 | 1. Creates an IAM role with the necessary permissions for the SageMaker notebook instance. 8 | 2. Creates a KMS key and an alias for encrypting the notebook instance. 9 | 3. Configures a notebook instance lifecycle configuration script that: 10 | - Installs a standalone copy of Miniconda in the user's home directory. 11 | - Creates a custom Python environment with TensorFlow 2.15.0 and PyTorch 2.1.0, both with CUDA support. 12 | - Installs additional packages like Jupyter Lab, Matplotlib, and other useful libraries. 13 | - Registers the custom environment as a Jupyter kernel. 14 | 4. Creates the SageMaker notebook instance with the specified configuration, including the GPU-enabled instance type, the execution role, and the default code repository. 15 | 16 | ## How to use: 17 | 18 | 1. Download the CloudFormation template file (`cloudformation-template.yml`). 19 | 2. In the AWS Management Console, navigate to the CloudFormation service. 20 | 3. Create a new stack and upload the template file. 21 | 4. Provide a name for the notebook instance (e.g., "LLMsFromScratchNotebook"); the default code repository already points to the LLMs-from-scratch GitHub repo. 22 | 5. Review and accept the template's parameters, then create the stack. 23 | 6. Once the stack creation is complete, the SageMaker notebook instance will be available in the SageMaker console. 24 | 7. Open the notebook instance and start using the pre-configured environment to work on your LLMs-from-scratch projects. 25 | 26 | ## Key Points: 27 | 28 | - The template creates a GPU-enabled (`ml.g4dn.xlarge`) notebook instance with 50GB of storage. 29 | - It sets up a custom Miniconda environment with TensorFlow 2.15.0 and PyTorch 2.1.0, both with CUDA support. 30 | - The custom environment is registered as a Jupyter kernel, making it available for use in the notebook. 31 | - The template also creates a KMS key for encrypting the notebook instance and an IAM role with the necessary permissions. 32 | --------------------------------------------------------------------------------
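For readers who prefer scripting the deployment over clicking through the console, the stack can also be created programmatically. The sketch below uses `boto3` and is illustrative only: the stack name is arbitrary, the parameter key `NotebookName` is an assumption (check the template's `Parameters` section for the actual key), and an IAM capability is required because the template creates an IAM role.

```python
# Hypothetical programmatic equivalent of console steps 2-5 above.
# The parameter key "NotebookName" is an assumption; consult the
# template's Parameters section for the real key name.
import boto3

cfn = boto3.client("cloudformation")

with open("cloudformation-template.yml") as f:
    template_body = f.read()

cfn.create_stack(
    StackName="llms-from-scratch-notebook",  # any valid stack name works
    TemplateBody=template_body,
    Parameters=[{"ParameterKey": "NotebookName",
                 "ParameterValue": "LLMsFromScratchNotebook"}],
    Capabilities=["CAPABILITY_IAM"],  # needed because the template creates an IAM role
)
```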