├── model_bundle_demo.pt ├── code ├── hello_world.py ├── ch15_export.py ├── ch7_attention.py ├── ch11_metrics.py ├── gpt_shapes_selftest.py ├── check_backends.py ├── env_check.py ├── gen_dev_loop.py ├── gen_roadmap.py ├── gen_ch13_clip.py ├── ch13_schedules.py ├── gen_appx_tok_example.py ├── __init__.py ├── gen_appx_tok_bpe_merges.py ├── gen_gpt_arch.py ├── bench_forward.py ├── gen_ch10_lr_warmup.py ├── quickdemo.py ├── bench_timer.py ├── gen_ch12_lcs.py ├── bench_sampling.py ├── venv_tools.py ├── gen_ch13_accum.py ├── gen_ch11_temp.py ├── gen_ch13_cosine.py ├── ch14_lora.py ├── gen_ch11_nucleus.py ├── ch12_eval_corpus.py ├── gen_ch11_filters.py ├── ch5_linreg.py ├── ch15_fastapi_app.py ├── ch15_streamlit_app.py ├── ch11_sampling.py ├── gen_ch10_windows.py ├── ch15_cli.py ├── check_bundle.py ├── gen_ch14_lora.py ├── gen_masks_heatmap.py ├── ch6_tokenize.py ├── gen_ch14_scaling.py ├── ch10_data.py ├── example_data.py ├── ch10_train.py ├── sample_from_checkpoint.py ├── ch08_transformer.py ├── ch12_metrics_text.py └── ch09_gpt.py ├── requirements.txt ├── .gitignore ├── notebooks ├── ch16_discussion_conclusion_colab.ipynb ├── ch02_shell_cli_colab.ipynb ├── ch15_deployment_colab.ipynb ├── ch01_intro_colab.ipynb ├── ch04_hardware_software_colab.ipynb ├── ch03_setup_project_colab.ipynb ├── ch07_attention.ipynb ├── attollm_colab_starter.ipynb ├── ch13_improvements_colab.ipynb ├── ch11_sampling_colab.ipynb └── ch12_evaluation_colab.ipynb └── README.md /model_bundle_demo.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilpisch/llmcode/main/model_bundle_demo.pt -------------------------------------------------------------------------------- /code/hello_world.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 
7 | 8 | """ 9 | 10 | def main() -> None: 11 | print("Hello, LLM world!") 12 | 13 | 14 | if __name__ == "__main__": 15 | main() 16 | 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core Python libs used across chapters (keep light; install PyTorch per platform) 2 | numpy>=1.24 3 | psutil>=5.9 4 | tqdm>=4.66 5 | tensorboard>=2.13 6 | typing-extensions>=4.8 7 | 8 | # Visualization & figure generation 9 | matplotlib>=3.7 10 | graphviz>=0.20 11 | 12 | # Notebook validation & tooling 13 | nbformat>=5.10 14 | nbclient>=0.9 15 | 16 | # Tokenization appendix extras 17 | tokenizers>=0.15 18 | sentencepiece>=0.1.99 19 | 20 | # Deployment extras (Chapter 15) 21 | fastapi>=0.110 22 | pydantic>=2 23 | uvicorn>=0.29 24 | streamlit>=1.33 25 | 26 | # IMPORTANT: Install PyTorch following instructions for your OS/GPU: 27 | # https://pytorch.org/get-started/locally/ 28 | # For a CPU-only install (example): 29 | # torch>=2.2; sys_platform == 'darwin' or sys_platform == 'linux' 30 | # For Apple Silicon with MPS (example): 31 | # pip install torch==2.3.1 --extra-index-url https://download.pytorch.org/whl/cpu 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS / Editor 2 | .DS_Store 3 | Thumbs.db 4 | .idea/ 5 | .vscode/ 6 | .history/ 7 | 8 | # Python bytecode / caches 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | .pytest_cache/ 13 | .cache/ 14 | 15 | # Jupyter 16 | .ipynb_checkpoints/ 17 | 18 | # Virtual environments 19 | .env 20 | .venv/ 21 | env/ 22 | venv/ 23 | ENV/ 24 | env.bak/ 25 | venv.bak/ 26 | .python-version 27 | 28 | # Packaging / build 29 | build/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | *.egg-info/ 35 | *.egg 36 | sdist/ 37 | lib/ 38 | lib64/ 39 | share/python-wheels/ 40 | pip-wheel-metadata/ 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | MANIFEST 44 | 45 | # Coverage / testing 46 | .coverage 47 | .coverage.* 48 | coverage.xml 49 | nosetests.xml 50 | pytestdebug.log 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | 55 | # Logs / outputs 56 | *.log 57 | logs/ 58 | outputs/ 59 | runs/ 60 | checkpoints/ 61 | models/ 62 | *.pt 63 | 64 | # Notebooks exports 65 | *.nbconvert.ipynb 66 | 67 | # Graphviz caches 68 | *.gv.pdf 69 | *.gv.png 70 | *.gv.svg 71 | 72 | # Generated figures (copied from book repo) 73 | figures/ 74 | -------------------------------------------------------------------------------- /code/ch15_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Export a clean model bundle with config, weights, and tokenizer metadata. 
9 | 10 | Usage: 11 | python code/ch15_export.py --ckpt checkpoints/ch13_gpt_best.pt --out model_bundle.pt 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | from pathlib import Path 19 | import torch 20 | 21 | 22 | def main() -> None: 23 | p = argparse.ArgumentParser(description="Export GPT bundle") 24 | p.add_argument("--ckpt", required=True, help="input checkpoint .pt") 25 | p.add_argument("--out", required=True, help="output bundle .pt") 26 | args = p.parse_args() 27 | 28 | ckpt = torch.load(args.ckpt, map_location="cpu") 29 | bundle = { 30 | "config": ckpt.get("config"), 31 | "model_state": ckpt.get("model_state"), 32 | "tokenizer": ckpt.get("tokenizer"), 33 | } 34 | Path(args.out).parent.mkdir(parents=True, exist_ok=True) 35 | torch.save(bundle, args.out) 36 | print("Wrote:", args.out) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | 42 | -------------------------------------------------------------------------------- /code/ch7_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | import torch 13 | from torch import Tensor 14 | 15 | 16 | def scaled_dot_product_attention(q: Tensor, k: Tensor, v: Tensor, mask: Tensor | None = None) -> Tensor: 17 | """Single-head scaled dot-product attention. 18 | 19 | Args: 20 | q,k,v: [B, T, D] 21 | mask: optional [B, T, T] with 1 for allowed positions, 0 for masked 22 | Returns: 23 | [B, T, D] 24 | """ 25 | d = q.size(-1) 26 | scores = (q @ k.transpose(-2, -1)) / (d ** 0.5) # [B, T, T] 27 | if mask is not None: 28 | scores = scores.masked_fill(mask == 0, float("-inf")) 29 | w = torch.softmax(scores, dim=-1) # [B, T, T] 30 | return w @ v # [B, T, D] 31 | 32 | 33 | def causal_mask(batch: int, time: int, device: torch.device | None = None) -> Tensor: 34 | base = torch.tril(torch.ones(time, time, device=device)) # [T, T] 35 | return base.unsqueeze(0).expand(batch, -1, -1) # [B, T, T] 36 | 37 | 38 | __all__ = ["scaled_dot_product_attention", "causal_mask"] 39 | 40 | -------------------------------------------------------------------------------- /code/ch11_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Simple evaluation helpers for Chapter 11. 9 | 10 | Perplexity is derived from average cross-entropy on a held-out set: 11 | PPL = exp(H) 12 | We compute mean loss over a DataLoader of (x, y) pairs. 
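Illustrative usage (assumes a trained model and a validation DataLoader exist):

    H, ppl = perplexity(model, val_loader)
    print(f"val cross-entropy {H:.3f} -> perplexity {ppl:.2f}")

Sanity check: a model that spreads probability uniformly over a vocabulary of
size V has H = ln(V) and therefore PPL = V.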
13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | 18 | import math 19 | from typing import Iterable, Tuple 20 | 21 | import torch 22 | import torch.nn.functional as F 23 | 24 | 25 | @torch.no_grad() 26 | def perplexity(model, loader) -> Tuple[float, float]: 27 | device = next(model.parameters()).device 28 | model.eval() 29 | total_loss = 0.0 30 | total_tokens = 0 31 | for x, y in loader: 32 | x = x.to(device) 33 | y = y.to(device) 34 | logits, loss = model(x, targets=y) 35 | if loss is None: 36 | # fallback: compute CE manually 37 | B, T, V = logits.shape 38 | lf = logits.reshape(B * T, V) 39 | yf = y.reshape(B * T) 40 | loss = F.cross_entropy(lf, yf) 41 | n = y.numel() 42 | total_loss += float(loss.detach().item()) * n 43 | total_tokens += int(n) 44 | H = total_loss / max(1, total_tokens) 45 | return H, math.exp(H) 46 | 47 | 48 | __all__ = ["perplexity"] 49 | 50 | -------------------------------------------------------------------------------- /code/gpt_shapes_selftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Self-test: verify GPT forward shapes and mask broadcasting. 9 | 10 | Runs a tiny forward pass and asserts expected tensor ranks/shapes for a 11 | minimal config. Use this as a quick wiring check during refactors. 12 | 13 | Usage: 14 | python code/gpt_shapes_selftest.py 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | 20 | from pathlib import Path 21 | import sys 22 | import torch 23 | 24 | sys.path.append(str(Path(__file__).resolve().parent)) 25 | from ch09_gpt import GPT, GPTConfig # type: ignore 26 | 27 | 28 | def main() -> None: 29 | cfg = GPTConfig(vocab_size=256, block_size=8, d_model=64, n_head=4, n_layer=2, d_ff=128) 30 | model = GPT(cfg) 31 | B, T = 2, 8 32 | x = torch.randint(0, cfg.vocab_size, (B, T)) 33 | pad_id = None 34 | logits, loss = model(x, targets=x, pad_id=pad_id) 35 | assert logits.shape == (B, T, cfg.vocab_size), logits.shape 36 | assert loss is not None and loss.ndim == 0 37 | # Check causal mask shape indirectly via attention path: run shorter T 38 | T2 = 5 39 | x2 = torch.randint(0, cfg.vocab_size, (B, T2)) 40 | logits2, _ = model(x2) 41 | assert logits2.shape[:2] == (B, T2) 42 | print("OK — GPT shapes and mask broadcasting look good.") 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | 48 | -------------------------------------------------------------------------------- /code/check_backends.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Print available PyTorch backends and basic device info. 
9 | 10 | Run with: python -m code.check_backends 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | def main() -> None: 16 | try: 17 | import torch # type: ignore 18 | except Exception as e: # pragma: no cover 19 | print("PyTorch not installed:", e) 20 | return 21 | 22 | has_mps_backend = getattr(torch.backends, "mps", None) 23 | # Guard against missing PyTorch by echoing version early 24 | print("torch:", torch.__version__) 25 | has_cuda = torch.cuda.is_available() 26 | has_mps = bool(has_mps_backend and torch.backends.mps.is_available()) 27 | # Report CUDA capability first for people with multiple GPUs 28 | print("CUDA available:", has_cuda) 29 | if has_cuda: 30 | print("CUDA device count:", torch.cuda.device_count()) 31 | for i in range(torch.cuda.device_count()): 32 | name = torch.cuda.get_device_name(i) 33 | print(f" [{i}]", name) 34 | # Show Apple MPS status as a secondary hardware target 35 | print("MPS available:", has_mps) 36 | device = "cuda" if has_cuda else "mps" if has_mps else "cpu" 37 | # Preferred device ordering mirrors the training scripts 38 | print("Preferred device:", device) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /code/env_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Minimal environment and device sanity check. 9 | 10 | Run with: python -m code.env_check 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import os 16 | import platform 17 | import sys 18 | 19 | 20 | def main() -> None: 21 | # Show basic runtime information 22 | print("== Environment ==") 23 | print("Python:", platform.python_version()) 24 | print("Platform:", platform.platform()) 25 | print("Executable:", sys.executable) 26 | print("CWD:", os.getcwd()) 27 | 28 | try: 29 | import torch # type: ignore 30 | 31 | # Echo installed PyTorch version and device availability 32 | print("\n== PyTorch ==") 33 | print("torch:", torch.__version__) 34 | cuda = torch.cuda.is_available() 35 | mps = getattr(torch.backends, "mps", None) 36 | print("CUDA available:", cuda) 37 | if cuda: 38 | print("CUDA device count:", torch.cuda.device_count()) 39 | if torch.cuda.device_count() > 0: 40 | print("CUDA device 0:", torch.cuda.get_device_name(0)) 41 | print("MPS available:", bool(mps and torch.backends.mps.is_available())) 42 | except Exception as e: # pragma: no cover - diagnostics only 43 | print("\nPyTorch not installed or not importable:", e) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /code/gen_dev_loop.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a simple dev loop diagram (edit → run → iterate → commit). 9 | 10 | If `graphviz` is installed, we render SVG directly; else we write DOT. 
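Manual rendering when only the DOT file was written (same command the script
prints in its fallback path):

    dot -Tsvg figures/dev-loop.dot -o figures/dev-loop.svg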
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | from pathlib import Path 16 | 17 | FIG_DIR = Path(__file__).resolve().parents[1] / "figures" 18 | FIG_DIR.mkdir(parents=True, exist_ok=True) 19 | 20 | dot_content = r""" 21 | digraph DevLoop { 22 | rankdir=LR; 23 | node [shape=box, style=rounded, color="#4B5563", fontname="Helvetica"]; 24 | edge [color="#6B7280"]; 25 | 26 | edit [label="Edit\n(code / text)"]; 27 | run [label="Run\n(scripts / tests)"]; 28 | iterate [label="Iterate\n(tune / refactor)"]; 29 | commit [label="Commit\n(Git / PR)"]; 30 | 31 | edit -> run -> iterate -> edit; 32 | iterate -> commit; 33 | } 34 | """ 35 | 36 | def main() -> None: 37 | try: 38 | from graphviz import Source # type: ignore 39 | 40 | s = Source(dot_content) 41 | out = s.render(filename=str(FIG_DIR / "dev-loop"), format="svg", cleanup=True) 42 | print("Wrote:", out) 43 | except Exception as e: 44 | dot_path = FIG_DIR / "dev-loop.dot" 45 | dot_path.write_text(dot_content) 46 | print("graphviz not available (", e, ")\nWrote DOT:", dot_path) 47 | print("Render manually with:\n dot -Tsvg figures/dev-loop.dot -o figures/dev-loop.svg") 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | 53 | -------------------------------------------------------------------------------- /code/gen_roadmap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a simple LLM build roadmap diagram. 9 | 10 | If `graphviz` Python package is installed, renders SVG directly. 11 | Else, writes `figures/llm-roadmap.dot` for manual rendering: 12 | 13 | dot -Tsvg figures/llm-roadmap.dot -o figures/llm-roadmap.svg 14 | """ 15 | 16 | from __future__ import annotations 17 | 18 | from pathlib import Path 19 | 20 | FIG_DIR = Path(__file__).resolve().parents[1] / "figures" 21 | FIG_DIR.mkdir(parents=True, exist_ok=True) 22 | 23 | dot_content = r""" 24 | digraph LLMRoadmap { 25 | rankdir=LR; 26 | node [shape=box, style=rounded, color="#0A66C2", fontname="Helvetica"]; 27 | edge [color="#555555"]; 28 | 29 | setup [label="Repo Setup\n& Env Checks"]; 30 | data [label="Data\n& Tokenization"]; 31 | model [label="Embeddings\n+ Transformer Blocks"]; 32 | training [label="Training\n(CE Loss, AdamW)"]; 33 | sampling [label="Sampling\n(top-k, top-p)"]; 34 | eval [label="Evaluation\n(Perplexity & More)"]; 35 | deploy [label="Deployment\n(CLI, App, API)"]; 36 | 37 | setup -> data -> model -> training -> sampling -> eval -> deploy; 38 | } 39 | """ 40 | 41 | def main() -> None: 42 | try: 43 | from graphviz import Source # type: ignore 44 | 45 | s = Source(dot_content) 46 | out = s.render(filename=str(FIG_DIR / "llm-roadmap"), format="svg", cleanup=True) 47 | print("Wrote:", out) 48 | except Exception as e: 49 | dot_path = FIG_DIR / "llm-roadmap.dot" 50 | dot_path.write_text(dot_content) 51 | print("graphviz not available (", e, ")\nWrote DOT:", dot_path) 52 | print( 53 | "Render manually with:\n" 54 | " dot -Tsvg figures/llm-roadmap.dot -o figures/llm-roadmap.svg" 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /code/gen_ch13_clip.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python 
and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Illustrate gradient-norm clipping with a synthetic curve. 9 | 10 | Writes figures/ch13-clip.svg. No Matplotlib dependency required; generates a 11 | simple SVG line for gradient norm and a horizontal clip threshold. 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | import math 19 | 20 | 21 | def main() -> None: 22 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 23 | fig_dir.mkdir(parents=True, exist_ok=True) 24 | out = fig_dir / "ch13-clip.svg" 25 | 26 | w, h = 560, 220 27 | pad = 32 28 | steps = 200 29 | thr = 1.0 30 | xs = list(range(steps)) 31 | # Synthetic noisy curve around 1.2 with spikes 32 | ys = [1.2 + 0.15 * math.sin(0.1 * i) + (0.0 if i % 37 else 1.2) for i in xs] 33 | 34 | def mapx(x): return pad + (w - 2*pad) * (x / (steps - 1)) 35 | def mapy(y): 36 | ymin, ymax = 0.0, 2.8 37 | return h - pad - (h - 2*pad) * ((y - ymin) / (ymax - ymin)) 38 | 39 | path = "M " + " ".join(f"{mapx(x):.1f},{mapy(y):.1f}" for x, y in zip(xs, ys)) 40 | ythr = mapy(thr) 41 | svg = [ 42 | f'', 43 | '', 44 | 'Gradient norm with clipping', 45 | f'', 46 | f'', 48 | f'clip=1.0', 49 | '' 50 | ] 51 | out.write_text("\n".join(svg)) 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | 57 | -------------------------------------------------------------------------------- /code/ch13_schedules.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Learning-rate schedule helpers (Chapter 13). 9 | 10 | Includes a warmup+cosine decay schedule implemented via PyTorch's LambdaLR. 11 | The schedule scales the base LR by a factor in [min_ratio, 1]. 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import math 18 | from typing import Optional 19 | 20 | import torch 21 | 22 | 23 | def warmup_cosine_lambda( 24 | warmup_steps: int, 25 | total_steps: int, 26 | min_ratio: float = 0.1, 27 | ): 28 | """Return a lambda(step) for LambdaLR implementing warmup+cosine decay. 29 | 30 | - Warmup: linearly scale 0 -> 1 over warmup_steps. 31 | - Cosine: decay from 1 -> min_ratio over the remaining steps. 
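Illustrative values (toy numbers, not tied to any training run in the book):

    lam = warmup_cosine_lambda(warmup_steps=100, total_steps=1000, min_ratio=0.1)
    lam(0)     # 0.01  (first warmup step)
    lam(99)    # 1.0   (end of warmup)
    lam(999)   # 0.1   (fully decayed to min_ratio)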
32 | """ 33 | 34 | warmup_steps = max(0, int(warmup_steps)) 35 | total_steps = max(1, int(total_steps)) 36 | assert 0.0 < min_ratio <= 1.0 37 | 38 | def lr_lambda(step: int) -> float: 39 | s = step + 1 40 | if warmup_steps > 0 and s <= warmup_steps: 41 | return s / float(warmup_steps) 42 | # cosine from warmup_steps..total_steps 43 | t = min(max(s - warmup_steps, 0), max(total_steps - warmup_steps, 1)) 44 | frac = t / float(max(total_steps - warmup_steps, 1)) 45 | cos = 0.5 * (1 + math.cos(math.pi * frac)) 46 | return min_ratio + (1 - min_ratio) * cos 47 | 48 | return lr_lambda 49 | 50 | 51 | def warmup_cosine_lr( 52 | optimizer: torch.optim.Optimizer, 53 | warmup_steps: int, 54 | total_steps: int, 55 | min_ratio: float = 0.1, 56 | ) -> torch.optim.lr_scheduler.LambdaLR: 57 | """Create a LambdaLR with warmup+cosine schedule.""" 58 | return torch.optim.lr_scheduler.LambdaLR( 59 | optimizer, warmup_cosine_lambda(warmup_steps, total_steps, min_ratio) 60 | ) 61 | 62 | 63 | __all__ = ["warmup_cosine_lr", "warmup_cosine_lambda"] 64 | 65 | -------------------------------------------------------------------------------- /code/gen_appx_tok_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a simple SVG illustrating char/word/BPE tokenization on one sentence. 9 | 10 | Writes figures/appx-tok-example.svg using dependency-free SVG drawing. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def draw_row(x0, y0, tokens, color="#B5D0F5", gap=8, pad=6, h=34): 20 | items = [] 21 | x = x0 22 | for t in tokens: 23 | w = max(40, 9 * len(t)) 24 | items.append(f'') 25 | items.append(f'{t}') 26 | x += w + gap 27 | return items 28 | 29 | 30 | def main() -> None: 31 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 32 | fig_dir.mkdir(parents=True, exist_ok=True) 33 | out = fig_dir / "appx-tok-example.svg" 34 | 35 | sent = "The model models tokens" 36 | char = list(sent) 37 | word = sent.split() 38 | # pseudo-BPE pieces for illustration 39 | bpe = ["The", "\u2581model", "\u2581model", "s", "\u2581token", "s"] 40 | 41 | w, h = 760, 240 42 | items = [ 43 | f'', 44 | '', 45 | f'Tokenization variants', 46 | 'Character', 47 | 'Word', 48 | 'BPE (toy)', 49 | ] 50 | items += draw_row(100, 36, char, color="#DCE6F8", h=28) 51 | items += draw_row(100, 96, word, color="#CFE2FF") 52 | items += draw_row(100, 156, bpe, color="#B5D0F5") 53 | items.append('') 54 | out.write_text("\n".join(items)) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | 60 | -------------------------------------------------------------------------------- /code/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Project code package (mirrors stdlib ``code`` attributes for compatibility). 
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import importlib.util 14 | import os 15 | import sys 16 | import sysconfig 17 | from types import ModuleType 18 | from typing import Any 19 | 20 | _STDLIB_MODULE: ModuleType | None = None 21 | 22 | def _load_stdlib_code() -> ModuleType | None: 23 | """Load the standard library ``code`` module even though this package shadows it.""" 24 | try: 25 | stdlib_dir = sysconfig.get_paths().get("stdlib") 26 | if not stdlib_dir: 27 | return None 28 | stdlib_code_path = os.path.join(stdlib_dir, "code.py") 29 | if not os.path.exists(stdlib_code_path): 30 | return None 31 | spec = importlib.util.spec_from_file_location("_stdlib_code", stdlib_code_path) 32 | if spec is None or spec.loader is None: 33 | return None 34 | module = importlib.util.module_from_spec(spec) 35 | spec.loader.exec_module(module) # type: ignore[assignment] 36 | sys.modules.setdefault("_stdlib_code", module) 37 | return module 38 | except Exception: 39 | return None 40 | 41 | 42 | _STDLIB_MODULE = _load_stdlib_code() 43 | 44 | if _STDLIB_MODULE is not None: 45 | stdlib_all = getattr(_STDLIB_MODULE, "__all__", None) 46 | names = stdlib_all if isinstance(stdlib_all, (list, tuple)) else [ 47 | name for name in dir(_STDLIB_MODULE) if not name.startswith("_") 48 | ] 49 | globals().update({name: getattr(_STDLIB_MODULE, name) for name in names}) 50 | __all__ = list(names) # type: ignore[assignment] 51 | else: 52 | __all__: list[str] = [] 53 | 54 | 55 | def __getattr__(name: str) -> Any: 56 | if _STDLIB_MODULE is not None and hasattr(_STDLIB_MODULE, name): 57 | return getattr(_STDLIB_MODULE, name) 58 | raise AttributeError(f"module 'code' has no attribute {name!r}") 59 | -------------------------------------------------------------------------------- /code/gen_appx_tok_bpe_merges.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a toy BPE merges visualization as an SVG timeline. 9 | 10 | Writes figures/appx-bpe-merges.svg without external deps. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def main() -> None: 20 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 21 | fig_dir.mkdir(parents=True, exist_ok=True) 22 | out = fig_dir / "appx-bpe-merges.svg" 23 | 24 | w, h = 1200, 260 25 | pad = 24 26 | items = [ 27 | f'', 28 | '', 29 | f'Toy BPE merges over steps', 30 | ] 31 | 32 | merges = [ 33 | (1, '▁', 'model', '▁model'), 34 | (2, 'model', 's', 'models'), 35 | (3, '▁', 'token', '▁token'), 36 | (4, 'token', 's', 'tokens'), 37 | (5, '▁', 'learn', '▁learn'), 38 | ] 39 | 40 | x0, y0 = 60, 60 41 | step_gap = 36 42 | for i, (step, a, b, m) in enumerate(merges): 43 | y = y0 + i * step_gap 44 | items.append(f'{step}') 45 | # arrows a + b -> m 46 | # Left-hand symbols and operator spacing 47 | items.append(f'{a}') 48 | items.append(f'+') 49 | items.append(f'{b}') 50 | items.append(f'') 51 | # Much wider merged rectangle to avoid overlaps and use available width 52 | rect_x = x0 + 150 53 | rect_w = 360 54 | items.append(f'') 55 | items.append(f'{m}') 56 | 57 | items.append('') 58 | out.write_text("\n".join(items)) 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /code/gen_gpt_arch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a GPT architecture diagram for Chapter 9. 9 | 10 | If `graphviz` Python package is installed, renders SVG directly to 11 | `figures/ch09-gpt-arch.svg`. Else, writes a DOT file and prints instructions 12 | to render it with `dot`. 
13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | from pathlib import Path 18 | 19 | FIG_DIR = Path(__file__).resolve().parents[1] / "figures" 20 | FIG_DIR.mkdir(parents=True, exist_ok=True) 21 | 22 | dot = r""" 23 | digraph GPTArch { 24 | rankdir=LR; 25 | splines=true; 26 | overlap=false; 27 | nodesep=0.6; 28 | ranksep=0.7; 29 | node [shape=box, style=rounded, fontname="Helvetica", color="#0A66C2"]; 30 | edge [color="#555555"]; 31 | 32 | subgraph cluster_embed { 33 | label="Embeddings"; 34 | color="#cccccc"; 35 | rank=same; 36 | tok [label="Token Embedding\n[V x D]"]; 37 | pos [label="Position Embedding\n[T x D] or Sinusoidal"]; 38 | add [label="Add\n[B, T, D]"]; 39 | tok -> add; 40 | pos -> add; 41 | } 42 | 43 | subgraph cluster_stack { 44 | label="N x TransformerBlock (Pre‑Norm)"; 45 | color="#cccccc"; 46 | mha [label="Multi‑Head Attention\n[B, T, D] → [B, T, D]"]; 47 | ffn [label="Feed‑Forward\n[B, T, D] → [B, T, D]"]; 48 | mha -> ffn; 49 | } 50 | 51 | ln [label="LayerNorm [B, T, D]"]; 52 | head[label="LM Head (Linear)\n[D → V]"]; 53 | 54 | add -> mha -> ffn -> ln -> head; 55 | } 56 | """ 57 | 58 | 59 | def main() -> None: 60 | try: 61 | from graphviz import Source # type: ignore 62 | 63 | s = Source(dot) 64 | out = s.render(filename=str(FIG_DIR / "ch09-gpt-arch"), format="svg", cleanup=True) 65 | print("Wrote:", out) 66 | except Exception as e: 67 | dot_path = FIG_DIR / "ch09-gpt-arch.dot" 68 | dot_path.write_text(dot) 69 | print("graphviz not available (", e, ")\nWrote DOT:", dot_path) 70 | print( 71 | "Render manually with:\n" 72 | " dot -Tsvg figures/ch09-gpt-arch.dot -o figures/ch09-gpt-arch.svg" 73 | ) 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /code/bench_forward.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Measure forward-only tokens/sec for a tiny GPT. 
9 | 10 | Usage: 11 | python code/bench_forward.py --device auto --batch 8 --block 128 --vocab 256 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | from pathlib import Path 19 | import sys 20 | import time 21 | import torch 22 | 23 | sys.path.append(str(Path(__file__).resolve().parent)) 24 | from ch09_gpt import GPT, GPTConfig # type: ignore 25 | 26 | 27 | def auto_device() -> str: 28 | if torch.cuda.is_available(): 29 | return "cuda" 30 | mps = getattr(torch.backends, "mps", None) 31 | if mps and torch.backends.mps.is_available(): 32 | return "mps" 33 | return "cpu" 34 | 35 | 36 | def main() -> None: 37 | p = argparse.ArgumentParser() 38 | p.add_argument("--device", default="auto") 39 | p.add_argument("--batch", type=int, default=8) 40 | p.add_argument("--block", type=int, default=128) 41 | p.add_argument("--vocab", type=int, default=256) 42 | p.add_argument("--warmup", type=int, default=3) 43 | p.add_argument("--steps", type=int, default=20) 44 | args = p.parse_args() 45 | 46 | # Resolve device string lazily for portable benchmarking 47 | device = auto_device() if args.device == "auto" else args.device 48 | cfg = GPTConfig(vocab_size=args.vocab, block_size=args.block) 49 | # Keep the model tiny to highlight kernel overheads 50 | model = GPT(cfg).to(device).eval() 51 | # Synthetic token batch to avoid disk access 52 | x = torch.randint( 53 | 0, cfg.vocab_size, (args.batch, cfg.block_size), device=device 54 | ) 55 | 56 | # Warmup 57 | for _ in range(args.warmup): 58 | with torch.no_grad(): 59 | model(x) 60 | if device == "cuda": 61 | torch.cuda.synchronize() 62 | 63 | t0 = time.time() 64 | tok = 0 65 | for _ in range(args.steps): 66 | with torch.no_grad(): 67 | model(x) 68 | tok += args.batch * args.block 69 | if device == "cuda": 70 | torch.cuda.synchronize() 71 | dt = time.time() - t0 72 | print({"device": device, "tokens_per_sec": round(tok / dt)}) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /code/gen_ch10_lr_warmup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a simple LR warmup curve figure for Chapter 10. 9 | 10 | Writes figures/ch10-lr-warmup.svg. Uses Matplotlib if available; otherwise 11 | falls back to a small hand-written SVG path so the figure is always present. 
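The plotted curve is scale(step) = min(1, (step + 1) / warmup): a linear ramp
to 1.0 over the warmup steps that then stays flat.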
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path, steps: int = 200, warmup: int = 50) -> None: 21 | w, h = 460, 180 22 | pad = 32 23 | # Build points for linear warmup to 1 and then flat 24 | xs = list(range(steps)) 25 | ys = [(x+1)/warmup if x < warmup else 1.0 for x in xs] 26 | # map to svg coords 27 | def mapx(x): 28 | return pad + (w - 2*pad) * (x / max(1, steps-1)) 29 | def mapy(y): 30 | # y in [0,1] -> svg y downwards 31 | return h - pad - (h - 2*pad) * y 32 | path = "M " + " ".join(f"{mapx(x):.1f},{mapy(y):.1f}" for x, y in zip(xs, ys)) 33 | style = ( 34 | '' 35 | ) 36 | svg = [ 37 | f'', 38 | style, 39 | f'LR warmup', 40 | f'', 41 | '' 42 | ] 43 | out.write_text("\n".join(svg)) 44 | 45 | 46 | def main() -> None: 47 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 48 | fig_dir.mkdir(parents=True, exist_ok=True) 49 | out = fig_dir / "ch10-lr-warmup.svg" 50 | try: 51 | import matplotlib.pyplot as plt 52 | import numpy as np 53 | plt.style.use("seaborn-v0_8") 54 | steps, warmup = 200, 50 55 | xs = np.arange(steps) 56 | ys = np.minimum(1.0, (xs + 1) / float(warmup)) 57 | fig, ax = plt.subplots(figsize=(6.0, 2.2)) 58 | ax.plot(xs, ys, color="#0A66C2") 59 | ax.set_title("LR warmup") 60 | ax.set_xlabel("step") 61 | ax.set_ylabel("scale") 62 | fig.tight_layout() 63 | fig.savefig(out, format='svg') 64 | except Exception: 65 | fallback_svg(out) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /code/quickdemo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Quick demo: create a tiny random bundle and sample once. 9 | 10 | This validates wiring without training or external files. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | import argparse 17 | from pathlib import Path 18 | import sys 19 | import torch 20 | 21 | sys.path.append(str(Path(__file__).resolve().parent)) 22 | from ch09_gpt import GPT, GPTConfig # type: ignore 23 | from ch11_sampling import sample # type: ignore 24 | 25 | 26 | def auto_device() -> str: 27 | if torch.cuda.is_available(): 28 | return "cuda" 29 | mps = getattr(torch.backends, "mps", None) 30 | if mps and torch.backends.mps.is_available(): 31 | return "mps" 32 | return "cpu" 33 | 34 | 35 | def main() -> None: 36 | p = argparse.ArgumentParser(description="Write a tiny random bundle and sample once") 37 | p.add_argument("--out", default="model_bundle_demo.pt") 38 | p.add_argument("--prompt", default="Hello") 39 | p.add_argument("--max-new-tokens", type=int, default=40) 40 | p.add_argument("--temperature", type=float, default=1.0) 41 | p.add_argument("--top-p", type=float, default=0.95) 42 | p.add_argument("--top-k", type=int, default=0) 43 | p.add_argument("--device", default="auto") 44 | p.add_argument("--seed", type=int, default=0) 45 | args = p.parse_args() 46 | 47 | torch.manual_seed(args.seed) 48 | device = auto_device() if args.device == "auto" else args.device 49 | cfg = GPTConfig(vocab_size=256, block_size=64, d_model=64, n_head=4, n_layer=2, d_ff=128) 50 | model = GPT(cfg).to(device).eval() 51 | bundle = {"config": cfg.__dict__, "model_state": model.state_dict(), "tokenizer": None} 52 | torch.save(bundle, args.out) 53 | print("Wrote:", args.out) 54 | 55 | ids = torch.tensor([[c for c in args.prompt.encode("utf-8")]], dtype=torch.long, device=device) 56 | out = sample( 57 | model, ids, 58 | max_new_tokens=args.max_new_tokens, 59 | temperature=args.temperature, 60 | top_k=(args.top_k or None), 61 | top_p=(args.top_p or None), 62 | ) 63 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 64 | print("Sample:\n", text) 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /code/bench_timer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Tiny matmul timer to sanity-check device speed. 
9 | 10 | Example: 11 | python -m code.bench_timer --device auto --size 2048 --repeats 5 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | import argparse 17 | import time 18 | 19 | 20 | def pick_device(torch): # type: ignore 21 | if torch.cuda.is_available(): 22 | return "cuda" 23 | mps = getattr(torch.backends, "mps", None) 24 | if mps and torch.backends.mps.is_available(): 25 | return "mps" 26 | return "cpu" 27 | 28 | 29 | def main() -> None: 30 | try: 31 | import torch # type: ignore 32 | except Exception as e: # pragma: no cover 33 | print("PyTorch not installed:", e) 34 | return 35 | 36 | # Read basic matmul settings from CLI 37 | ap = argparse.ArgumentParser() 38 | ap.add_argument("--device", default="auto", help="cpu|cuda|mps|auto") 39 | ap.add_argument("--size", type=int, default=1024) 40 | ap.add_argument("--repeats", type=int, default=5) 41 | args = ap.parse_args() 42 | 43 | # Resolve device and create square matrices 44 | device = pick_device(torch) if args.device == "auto" else args.device 45 | N = args.size 46 | x = torch.randn(N, N, device=device) 47 | y = torch.randn(N, N, device=device) 48 | 49 | # Warmup for CUDA/MPS 50 | for _ in range(2): 51 | _ = x @ y 52 | if device != "cpu": 53 | torch.cuda.synchronize() if device == "cuda" else None 54 | 55 | times = [] 56 | for _ in range(args.repeats): 57 | # Time a single matmul and sync to measure wall time 58 | t0 = time.time() 59 | z = x @ y 60 | if device == "cuda": 61 | torch.cuda.synchronize() 62 | elif device == "mps": 63 | # best-effort; MPS ops often synchronize implicitly on tensor access 64 | _ = z.cpu() 65 | times.append(time.time() - t0) 66 | 67 | print( 68 | { 69 | "device": device, 70 | "size": N, 71 | "repeats": args.repeats, 72 | "ms_mean": round(1000 * sum(times) / len(times), 2), 73 | "ms_min": round(1000 * min(times), 2), 74 | } 75 | ) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /code/gen_ch12_lcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Draw an LCS alignment sketch for ROUGE-L intuition. 9 | 10 | Writes figures/ch12-lcs.svg with two token rows and highlighted matches. 11 | Falls back to simple SVG so the book always builds. 
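Worked example with the toy rows drawn below, hypothesis "the cat sat on the mat"
vs. reference "the cat is on the mat": the LCS is "the cat on the mat" (length 5),
so ROUGE-L precision = recall = 5/6, roughly 0.83.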
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path) -> None: 21 | w, h = 680, 180 22 | pad = 24 23 | cell = 18 24 | hyp = ["the", "cat", "sat", "on", "the", "mat"] 25 | ref = ["the", "cat", "is", "on", "the", "mat"] 26 | svg = [ 27 | f'', 28 | '', 29 | 'ROUGE-L via LCS' 30 | ] 31 | # token rows 32 | x0 = pad + 120 33 | for i, tok in enumerate(ref): 34 | x = x0 + i * (cell + 10) 35 | svg.append(f'' ) 36 | svg.append(f'{tok}') 37 | for i, tok in enumerate(hyp): 38 | x = x0 + i * (cell + 10) 39 | svg.append(f'' ) 40 | svg.append(f'{tok}') 41 | # highlight LCS edges (the, cat, on, the, mat) 42 | match_idx = [(0,0),(1,1),(3,3),(4,4),(5,5)] 43 | for hi, ri in match_idx: 44 | xh = x0 + hi * (cell + 10) + cell/2 45 | xr = x0 + ri * (cell + 10) + cell/2 46 | svg.append(f'') 47 | svg.append('') 48 | out.write_text("\n".join(svg)) 49 | 50 | 51 | def main() -> None: 52 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 53 | fig_dir.mkdir(parents=True, exist_ok=True) 54 | out = fig_dir / "ch12-lcs.svg" 55 | try: 56 | import matplotlib.pyplot as plt 57 | plt.style.use('seaborn-v0_8') 58 | # fallback is sufficient visually; keep matplotlib path minimal 59 | fallback_svg(out) 60 | except Exception: 61 | fallback_svg(out) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | 67 | -------------------------------------------------------------------------------- /code/bench_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Measure sampling tokens/sec for a tiny GPT. 
9 | 10 | Usage: 11 | python code/bench_sampling.py --device auto --max-new-tokens 200 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | from pathlib import Path 19 | import sys 20 | import time 21 | import torch 22 | 23 | sys.path.append(str(Path(__file__).resolve().parent)) 24 | from ch09_gpt import GPT, GPTConfig # type: ignore 25 | from ch11_sampling import sample # type: ignore 26 | 27 | 28 | def auto_device() -> str: 29 | if torch.cuda.is_available(): 30 | return "cuda" 31 | mps = getattr(torch.backends, "mps", None) 32 | if mps and torch.backends.mps.is_available(): 33 | return "mps" 34 | return "cpu" 35 | 36 | 37 | def main() -> None: 38 | p = argparse.ArgumentParser() 39 | p.add_argument("--device", default="auto") 40 | p.add_argument("--block", type=int, default=128) 41 | p.add_argument("--vocab", type=int, default=256) 42 | p.add_argument("--max-new-tokens", type=int, default=200) 43 | p.add_argument("--temperature", type=float, default=0.9) 44 | p.add_argument("--top-k", type=int, default=0) 45 | p.add_argument("--top-p", type=float, default=0.0) 46 | args = p.parse_args() 47 | 48 | # Choose device automatically unless explicitly set 49 | device = auto_device() if args.device == "auto" else args.device 50 | cfg = GPTConfig(vocab_size=args.vocab, block_size=args.block) 51 | # Construct a tiny model and prompt to isolate sampling speed 52 | model = GPT(cfg).to(device).eval() 53 | prompt = torch.randint( 54 | 0, cfg.vocab_size, (1, min(8, args.block)), device=device 55 | ) 56 | 57 | t0 = time.time() 58 | out = sample( 59 | model, 60 | prompt, 61 | max_new_tokens=args.max_new_tokens, 62 | temperature=args.temperature, 63 | top_k=(args.top_k or None), 64 | top_p=(args.top_p or None), 65 | ) 66 | if device == "cuda": 67 | torch.cuda.synchronize() 68 | dt = time.time() - t0 69 | gen = out.size(1) - prompt.size(1) 70 | print( 71 | { 72 | "device": device, 73 | "gen_tokens": int(gen), 74 | "tokens_per_sec": round(gen / dt), 75 | } 76 | ) 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /code/venv_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Virtual environment helpers used in Chapter 2. 9 | 10 | Usage: 11 | python -m code.venv_tools info 12 | python -m code.venv_tools create .venv 13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | import os 18 | import shutil 19 | import site 20 | import subprocess 21 | import sys 22 | from pathlib import Path 23 | 24 | 25 | def info() -> None: 26 | print("== Python & Environment ==") 27 | print("Executable:", sys.executable) 28 | print("Prefix:", sys.prefix) 29 | venv = os.environ.get("VIRTUAL_ENV") or (".venv" if ".venv" in sys.executable else "") 30 | print("VIRTUAL_ENV:", venv or "(not active)") 31 | site_dirs = site.getsitepackages() 32 | sp = ", ".join(p for p in site_dirs if ".venv" in p) or ", ".join(site_dirs) 33 | print("site-packages:", sp) 34 | 35 | 36 | def create(path: str = ".venv") -> None: 37 | """Create a virtual environment at `path` if it doesn't exist. 38 | 39 | This is a convenience wrapper around: `python -m venv `. 
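Illustrative shell session (macOS/Linux activation shown; on Windows the function
prints the PowerShell activation command instead):

    python -m code.venv_tools create .venv
    source .venv/bin/activate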
40 | """ 41 | p = Path(path) 42 | if p.exists(): 43 | print(f"Environment already exists at {p}") 44 | return 45 | print("Creating venv:", p) 46 | subprocess.check_call([sys.executable, "-m", "venv", str(p)]) 47 | print("Created. To activate:") 48 | if os.name == "nt": 49 | print(rf" .\{p}\Scripts\Activate.ps1 # PowerShell") 50 | else: 51 | print(f" source {p}/bin/activate") 52 | 53 | 54 | def remove(path: str = ".venv") -> None: 55 | p = Path(path) 56 | if not p.exists(): 57 | print("No such environment:", p) 58 | return 59 | print("Removing venv:", p) 60 | shutil.rmtree(p) 61 | print("Removed.") 62 | 63 | 64 | def main(argv: list[str] | None = None) -> None: 65 | argv = list(sys.argv[1:] if argv is None else argv) 66 | if not argv or argv[0] in {"-h", "--help", "help"}: 67 | print("Usage: python -m code.venv_tools [info|create|remove] [path]") 68 | return 69 | cmd = argv.pop(0) 70 | if cmd == "info": 71 | info() 72 | elif cmd == "create": 73 | create(argv[0] if argv else ".venv") 74 | elif cmd == "remove": 75 | remove(argv[0] if argv else ".venv") 76 | else: 77 | print("Unknown command:", cmd) 78 | sys.exit(2) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /code/gen_ch13_accum.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Illustrate gradient accumulation: k micro-batches per optimizer step. 9 | 10 | Writes figures/ch13-accum.svg as a simple, dependency-free SVG. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def main() -> None: 20 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 21 | fig_dir.mkdir(parents=True, exist_ok=True) 22 | out = fig_dir / "ch13-accum.svg" 23 | 24 | w, h = 700, 220 25 | pad = 24 26 | cell_w, cell_h = 110, 44 27 | gap = 18 28 | 29 | def batch(x, y, label, color="#B5D0F5"): 30 | return [ 31 | f'', 33 | f'{label}', 34 | ] 35 | 36 | items = [ 37 | '' % (w, h), 38 | '', 39 | 'Gradient accumulation (k micro-batches per step)', 40 | ] 41 | 42 | x0 = pad + 20 43 | y0 = 60 44 | k = 4 45 | for i in range(k): 46 | x = x0 + i * (cell_w + gap) 47 | items += batch(x, y0, f"micro-batch {i+1}") 48 | # plus sign between micro-batches 49 | if i < k - 1: 50 | items.append(f'+') 51 | 52 | # Arrow to optimizer step box 53 | x_end = x0 + (k-1) * (cell_w + gap) + cell_w + 40 54 | y_mid = y0 + cell_h/2 55 | items.append(f'') 56 | 57 | # Optimizer step box 58 | step_x, step_y = x_end, y0 59 | items += batch(step_x, step_y, "optimizer step", color="#9EC5F8") 60 | 61 | # Define arrow marker 62 | items.insert(1, ( 63 | '' 64 | '' 65 | )) 66 | 67 | items.append('') 68 | out.write_text("\n".join(items)) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | 74 | -------------------------------------------------------------------------------- /code/gen_ch11_temp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Visualize the effect of temperature on a toy logit vector. 9 | 10 | Writes figures/ch11-temp.svg. Matplotlib if available, else fallback SVG. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def fallback_svg(out: Path) -> None: 20 | w, h = 520, 220 21 | pad = 28 22 | bars = [0.55, 0.25, 0.1, 0.06, 0.04] 23 | cols = ["#0A66C2", "#5491D6", "#7FADE5", "#A5C5EE", "#C9DCF7"] 24 | def bar(x, y, w_, h_, c): 25 | return (f'') 27 | svg = [ 28 | f'', 29 | '', 30 | f'Temperature' 31 | ] 32 | # draw three panels (T=0.7, 1.0, 1.3) with simple bars 33 | panel_w = (w - 2*pad) / 3 34 | for i, t in enumerate([0.7, 1.0, 1.3]): 35 | x0 = pad + i * panel_w 36 | svg.append(f'T={t}') 37 | maxh = h - 80 38 | for j, p in enumerate(bars): 39 | height = maxh * (p ** (1.0 if t==1.0 else (1.2 if t<1 else 0.8))) 40 | svg.append(bar(x0 + 16 + j*24, h-30-height, 18, height, cols[j])) 41 | svg.append('') 42 | out.write_text("\n".join(svg)) 43 | 44 | 45 | def main() -> None: 46 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 47 | fig_dir.mkdir(parents=True, exist_ok=True) 48 | out = fig_dir / "ch11-temp.svg" 49 | try: 50 | import matplotlib.pyplot as plt 51 | import numpy as np 52 | plt.style.use('seaborn-v0_8') 53 | logits = np.array([2.0, 1.0, 0.0, -0.5, -1.0]) 54 | Ts = [0.7, 1.0, 1.3] 55 | fig, axes = plt.subplots(1, 3, figsize=(6.4, 2.2), constrained_layout=True) 56 | for ax, T in zip(axes, Ts): 57 | p = np.exp(logits / T); p = p / p.sum() 58 | ax.bar(range(len(p)), p, color="#0A66C2") 59 | ax.set_title(f"T={T}") 60 | ax.set_ylim(0, 1.0) 61 | ax.set_xticks([]); ax.set_yticks([]) 62 | fig.suptitle("Temperature") 63 | fig.savefig(out, format='svg') 64 | except Exception: 65 | fallback_svg(out) 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | 71 | -------------------------------------------------------------------------------- /code/gen_ch13_cosine.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Draw warmup + cosine LR schedule used in Chapter 13. 9 | 10 | Writes figures/ch13-lr-cosine.svg. Falls back to minimal SVG if Matplotlib 11 | is unavailable. 
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path) -> None: 21 | w, h = 560, 220 22 | pad = 32 23 | warmup, total, minr = 100, 1000, 0.1 24 | xs = list(range(total)) 25 | ys = [] 26 | import math 27 | for s in xs: 28 | s1 = s + 1 29 | if s1 <= warmup: 30 | ys.append(s1 / warmup) 31 | else: 32 | t = s1 - warmup 33 | frac = t / (total - warmup) 34 | cos = 0.5 * (1 + math.cos(math.pi * frac)) 35 | ys.append(minr + (1 - minr) * cos) 36 | def mapx(x): return pad + (w - 2*pad) * (x / (total-1)) 37 | def mapy(y): return h - pad - (h - 2*pad) * y 38 | path = "M " + " ".join(f"{mapx(x):.1f},{mapy(y):.1f}" for x,y in zip(xs,ys)) 39 | svg = [ 40 | f'', 41 | '', 42 | 'Warmup + Cosine LR', 43 | f'', 44 | '' 45 | ] 46 | out.write_text("\n".join(svg)) 47 | 48 | 49 | def main() -> None: 50 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 51 | fig_dir.mkdir(parents=True, exist_ok=True) 52 | out = fig_dir / "ch13-lr-cosine.svg" 53 | try: 54 | import matplotlib.pyplot as plt 55 | import numpy as np 56 | plt.style.use('seaborn-v0_8') 57 | warmup, total, minr = 100, 1000, 0.1 58 | xs = np.arange(total) 59 | ys = [] 60 | for s in xs: 61 | s1 = s + 1 62 | if s1 <= warmup: 63 | ys.append(s1 / warmup) 64 | else: 65 | t = s1 - warmup 66 | frac = t / (total - warmup) 67 | ys.append(minr + (1 - minr) * 0.5 * (1 + np.cos(np.pi * frac))) 68 | ys = np.array(ys) 69 | fig, ax = plt.subplots(figsize=(6.4, 2.2)) 70 | ax.plot(xs, ys, color="#0A66C2") 71 | ax.set_title("Warmup + Cosine LR") 72 | ax.set_xlabel("step"); ax.set_ylabel("scale") 73 | fig.tight_layout(); fig.savefig(out, format='svg') 74 | except Exception: 75 | fallback_svg(out) 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | 81 | -------------------------------------------------------------------------------- /code/ch14_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | LoRA: Low-rank adapters for Linear layers (teaching version). 9 | 10 | This module provides a small, readable `LoRALinear` that adds a trainable 11 | low-rank delta to a frozen base weight: 12 | 13 | y = x @ W^T + scale * x @ (B @ A)^T 14 | 15 | where A ∈ R^{r×d_in}, B ∈ R^{d_out×r}, and `scale = alpha / r`. 16 | """ 17 | 18 | from __future__ import annotations 19 | 20 | 21 | import torch 22 | import torch.nn as nn 23 | 24 | 25 | class LoRALinear(nn.Module): 26 | def __init__( 27 | self, 28 | d_in: int, 29 | d_out: int, 30 | r: int = 8, 31 | alpha: float = 16.0, 32 | bias: bool = False, 33 | ) -> None: 34 | """Create a Linear with LoRA adapters. 
35 | 36 | - d_in, d_out: base dimensions 37 | - r: adapter rank (small) 38 | - alpha: scaling factor (effective scale = alpha / r) 39 | - bias: include bias term on the base layer 40 | """ 41 | super().__init__() 42 | self.base = nn.Linear(d_in, d_out, bias=bias) 43 | self.r = int(r) 44 | self.alpha = float(alpha) 45 | self.scale = self.alpha / max(1, self.r) 46 | # LoRA adapters (A: r×d_in, B: d_out×r) 47 | if self.r > 0: 48 | self.A = nn.Linear(d_in, self.r, bias=False) 49 | self.B = nn.Linear(self.r, d_out, bias=False) 50 | # Init: A small, B zero so start as identity (delta≈0) 51 | nn.init.kaiming_uniform_(self.A.weight, a=2**0.5) 52 | nn.init.zeros_(self.B.weight) 53 | # Freeze base 54 | for p in self.base.parameters(): 55 | p.requires_grad = False 56 | else: 57 | self.A = None 58 | self.B = None 59 | self.merged = False 60 | 61 | def forward(self, x: torch.Tensor) -> torch.Tensor: 62 | y = self.base(x) 63 | if self.r > 0 and not self.merged: 64 | y = y + self.scale * self.B(self.A(x)) 65 | return y 66 | 67 | @torch.no_grad() 68 | def merge(self) -> None: 69 | """Fold the LoRA delta into the base weight for inference. 70 | 71 | After merging, adapters are disabled and the module acts like a 72 | standard Linear layer with updated weights. 73 | """ 74 | if self.r == 0 or self.merged: 75 | self.merged = True 76 | return 77 | # W' = W + scale * (B @ A) 78 | delta = (self.B.weight @ self.A.weight) * self.scale 79 | self.base.weight += delta 80 | self.merged = True 81 | 82 | 83 | __all__ = ["LoRALinear"] 84 | 85 | -------------------------------------------------------------------------------- /code/gen_ch11_nucleus.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Plot cumulative probability and nucleus threshold p. 9 | 10 | Writes figures/ch11-nucleus.svg. Uses Matplotlib if available; otherwise 11 | falls back to a minimal SVG line/area plot to ensure the book builds. 
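Worked example with the toy probabilities used below (0.40, 0.25, 0.12, 0.08,
0.06, ...): the cumulative sums are 0.40, 0.65, 0.77, 0.85, 0.91, ..., so with
p = 0.9 the nucleus is the first five tokens, the smallest prefix whose mass
reaches the threshold.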
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path) -> None: 21 | w, h = 540, 220 22 | pad = 32 23 | # Toy sorted probabilities 24 | probs = [0.4, 0.25, 0.12, 0.08, 0.06, 0.04, 0.03, 0.02] 25 | cum = [] 26 | s = 0.0 27 | for p in probs: 28 | s += p 29 | cum.append(s) 30 | pthr = 0.9 31 | # Map to svg coords 32 | def mapx(i: int) -> float: 33 | return pad + (w - 2 * pad) * (i / (len(cum) - 1)) 34 | def mapy(y: float) -> float: 35 | return h - pad - (h - 2 * pad) * y 36 | path = "M " + " ".join(f"{mapx(i):.1f},{mapy(y):.1f}" for i, y in enumerate(cum)) 37 | ythr = mapy(pthr) 38 | svg = [ 39 | f'', 40 | '', 41 | f'Nucleus threshold', 42 | f'', 43 | f'', 45 | f'p=0.9', 46 | '' 47 | ] 48 | out.write_text("\n".join(svg)) 49 | 50 | 51 | def main() -> None: 52 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 53 | fig_dir.mkdir(parents=True, exist_ok=True) 54 | out = fig_dir / "ch11-nucleus.svg" 55 | try: 56 | import matplotlib.pyplot as plt 57 | import numpy as np 58 | plt.style.use('seaborn-v0_8') 59 | probs = np.array([0.4, 0.25, 0.12, 0.08, 0.06, 0.04, 0.03, 0.02]) 60 | cum = np.cumsum(probs) 61 | fig, ax = plt.subplots(figsize=(6.0, 2.2)) 62 | ax.plot(cum, color="#0A66C2", lw=2) 63 | p = 0.9 64 | ax.axhline(p, color="#DD4444", ls='--') 65 | ax.text(len(cum)-1, p + 0.03, f"p={p}", color="#DD4444", 66 | ha='right', va='bottom') 67 | ax.set_xlim(0, len(cum)-1) 68 | ax.set_ylim(0, 1.0) 69 | ax.set_xticks([]); ax.set_yticks([]) 70 | ax.set_title("Nucleus threshold") 71 | fig.tight_layout() 72 | fig.savefig(out, format='svg') 73 | except Exception: 74 | fallback_svg(out) 75 | 76 | 77 | if __name__ == '__main__': 78 | main() 79 | 80 | -------------------------------------------------------------------------------- /code/ch12_eval_corpus.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Quick corpus evaluator for Chapter 12. 9 | 10 | Reads references and hypotheses from files (one example per line). The 11 | references file supports multiple references per example by separating them 12 | with the delimiter " ||| ". Tokenization defaults to whitespace with optional 13 | lowercasing. 14 | 15 | Outputs BLEU (corpus), ROUGE-L, METEOR (simplified), and distinct-1/2. 
16 | """ 17 | 18 | from __future__ import annotations 19 | 20 | 21 | import argparse 22 | from pathlib import Path 23 | from typing import List, Sequence 24 | 25 | from code.ch12_metrics_text import ( 26 | bleu_corpus, 27 | rouge_l, 28 | meteor_simple, 29 | distinct_n, 30 | ) 31 | 32 | 33 | def parse_lines(path: str, lowercase: bool) -> List[str]: 34 | text = Path(path).read_text(encoding="utf-8").splitlines() 35 | return [t.lower() if lowercase else t for t in text] 36 | 37 | 38 | def to_refs(lines: List[str]) -> List[List[Sequence[str]]]: 39 | """Split each line on ' ||| ' to allow multiple references per example.""" 40 | out: List[List[Sequence[str]]] = [] 41 | for line in lines: 42 | refs = [seg.strip().split() for seg in line.split(" ||| ")] 43 | out.append(refs) 44 | return out 45 | 46 | 47 | def to_hyps(lines: List[str]) -> List[Sequence[str]]: 48 | return [ln.split() for ln in lines] 49 | 50 | 51 | def main() -> None: 52 | p = argparse.ArgumentParser(description="Evaluate text outputs against references") 53 | p.add_argument("--refs", required=True, help="path to references.txt") 54 | p.add_argument("--hyps", required=True, help="path to hypotheses.txt") 55 | p.add_argument("--lower", action="store_true", help="lowercase before tokenizing") 56 | p.add_argument("--max-n", type=int, default=4, help="max n-gram for BLEU") 57 | args = p.parse_args() 58 | 59 | ref_lines = parse_lines(args.refs, args.lower) 60 | hyp_lines = parse_lines(args.hyps, args.lower) 61 | if len(ref_lines) != len(hyp_lines): 62 | raise SystemExit("refs and hyps must have the same number of lines") 63 | 64 | references = to_refs(ref_lines) 65 | hypotheses = to_hyps(hyp_lines) 66 | 67 | bleu = bleu_corpus(references, hypotheses, max_n=args.max_n, smooth=True) 68 | rlg = rouge_l(references, hypotheses) 69 | met = meteor_simple(references, hypotheses) 70 | d1 = distinct_n(hypotheses, 1) 71 | d2 = distinct_n(hypotheses, 2) 72 | 73 | print("Examples:", len(hypotheses)) 74 | print(f"BLEU_{args.max_n}: {bleu:.3f}") 75 | print(f"ROUGE_L: {rlg:.3f}") 76 | print(f"METEOR*: {met:.3f} (simplified)") 77 | print(f"distinct-1: {d1:.3f}") 78 | print(f"distinct-2: {d2:.3f}") 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | 84 | -------------------------------------------------------------------------------- /code/gen_ch11_filters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Illustrate top-k and top-p filtering on a toy distribution. 9 | 10 | Writes figures/ch11-topfilt.svg. Matplotlib if available, else fallback SVG. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def fallback_svg(out: Path) -> None: 20 | w, h = 600, 220 21 | pad = 28 22 | base = [0.40, 0.25, 0.12, 0.08, 0.05, 0.04, 0.03, 0.03] 23 | cols = ["#0A66C2"] * len(base) 24 | def panel(x0, title, mask): 25 | svg = [f'{title}'] 26 | x = x0 + 16 27 | for p, m in zip(base, mask): 28 | height = (h - 80) * (p if not m else 0.02) 29 | color = "#0A66C2" if not m else "#DCE6F8" 30 | svg.append( 31 | f'' 33 | ) 34 | x += 22 35 | return "\n".join(svg) 36 | topk_mask = [False, False, False, True, True, True, True, True] # keep 3 37 | topp_mask = [False, False, False, False, True, True, True, True] # keep to ~0.85 38 | svg = [ 39 | f'', 40 | '', 41 | f'Top-k vs Top-p', 42 | panel(20, 'Top-k (k=3)', topk_mask), 43 | panel(320, 'Top-p (p≈0.85)', topp_mask), 44 | '' 45 | ] 46 | out.write_text("\n".join(svg)) 47 | 48 | 49 | def main() -> None: 50 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 51 | fig_dir.mkdir(parents=True, exist_ok=True) 52 | out = fig_dir / "ch11-topfilt.svg" 53 | try: 54 | import matplotlib.pyplot as plt 55 | import numpy as np 56 | plt.style.use('seaborn-v0_8') 57 | base = np.array([0.40, 0.25, 0.12, 0.08, 0.05, 0.04, 0.03, 0.03]) 58 | topk_mask = np.array([False, False, False, True, True, True, True, True]) 59 | topp_mask = np.array([False, False, False, False, True, True, True, True]) 60 | fig, axes = plt.subplots(1, 2, figsize=(6.4, 2.2), constrained_layout=True) 61 | axes[0].bar(range(len(base)), np.where(topk_mask, 0.02, base), color="#0A66C2") 62 | axes[0].set_title("Top-k (k=3)") 63 | axes[1].bar(range(len(base)), np.where(topp_mask, 0.02, base), color="#0A66C2") 64 | axes[1].set_title("Top-p (p≈0.85)") 65 | for ax in axes: 66 | ax.set_ylim(0, 0.5); ax.set_xticks([]); ax.set_yticks([]) 67 | fig.suptitle("Top-k vs Top-p") 68 | fig.savefig(out, format='svg') 69 | except Exception: 70 | fallback_svg(out) 71 | 72 | 73 | if __name__ == '__main__': 74 | main() 75 | 76 | -------------------------------------------------------------------------------- /code/ch5_linreg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Minimal linear regression training in PyTorch (Chapter 5). 
9 | 10 | Run: 11 | python code/ch5_linreg.py --device auto --epochs 400 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | import argparse 17 | from dataclasses import dataclass 18 | 19 | import torch 20 | 21 | 22 | def pick_device() -> torch.device: 23 | if torch.cuda.is_available(): 24 | return torch.device("cuda") 25 | mps = getattr(torch.backends, "mps", None) 26 | if mps and torch.backends.mps.is_available(): 27 | return torch.device("mps") 28 | return torch.device("cpu") 29 | 30 | 31 | @dataclass 32 | class Config: 33 | epochs: int = 400 34 | lr: float = 3e-2 35 | n: int = 128 36 | seed: int = 42 37 | device: str = "auto" # cpu|cuda|mps|auto 38 | 39 | 40 | def make_data( 41 | cfg: Config, device: torch.device 42 | ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 43 | # Fix RNG for reproducibility across devices 44 | g = torch.Generator(device="cpu").manual_seed(cfg.seed) 45 | w_true = torch.tensor([2.0, -3.5]) 46 | b_true = torch.tensor(0.5) 47 | # Draw features and small Gaussian noise on target 48 | X = torch.randn(cfg.n, 2, generator=g).to(device) 49 | noise = 0.1 * torch.randn(cfg.n, generator=g).to(device) 50 | y = (X @ w_true.to(device)) + b_true.to(device) + noise 51 | return X, y, w_true.to(device), b_true.to(device) 52 | 53 | 54 | def train(cfg: Config) -> None: 55 | # Pick device lazily to match user selection 56 | device = pick_device() if cfg.device == "auto" else torch.device(cfg.device) 57 | X, y, w_true, b_true = make_data(cfg, device) 58 | 59 | model = torch.nn.Linear(2, 1).to(device) 60 | opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr) 61 | loss_fn = torch.nn.MSELoss() 62 | 63 | for step in range(cfg.epochs + 1): 64 | # Usual gradient-descent step: zero, forward, loss, backward, update 65 | opt.zero_grad() 66 | pred = model(X).squeeze(-1) 67 | loss = loss_fn(pred, y) 68 | loss.backward() 69 | opt.step() 70 | if step % 100 == 0: 71 | print(f"step={step:04d} loss={loss.item():.4f}") 72 | 73 | w_learned = model.weight.detach().squeeze(0) 74 | b_learned = model.bias.detach().squeeze(0) 75 | print("true w:", w_true.cpu().tolist(), " b:", float(b_true)) 76 | print("learn w:", w_learned.cpu().tolist(), " b:", float(b_learned)) 77 | 78 | 79 | def parse_args() -> Config: 80 | ap = argparse.ArgumentParser() 81 | ap.add_argument("--epochs", type=int, default=400) 82 | ap.add_argument("--lr", type=float, default=3e-2) 83 | ap.add_argument("--device", default="auto") 84 | ns = ap.parse_args() 85 | return Config(epochs=ns.epochs, lr=ns.lr, device=ns.device) 86 | 87 | 88 | if __name__ == "__main__": 89 | cfg = parse_args() 90 | train(cfg) 91 | -------------------------------------------------------------------------------- /code/ch15_fastapi_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | FastAPI app serving a minimal /generate endpoint (Chapter 15). 
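A minimal way to run and query it locally (the module path assumes you start
uvicorn from the repository root; adjust it to your layout):

    uvicorn code.ch15_fastapi_app:app --host 127.0.0.1 --port 8000

    curl -X POST http://127.0.0.1:8000/generate \
         -H "Content-Type: application/json" \
         -d '{"prompt": "Hello", "max_new_tokens": 40}'

The startup hook looks for model_bundle.pt in the current working directory;
without it the endpoint returns an error message instead of generated text.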
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | 14 | import sys 15 | from pathlib import Path 16 | from typing import Optional 17 | 18 | import torch 19 | from fastapi import FastAPI 20 | from pydantic import BaseModel 21 | 22 | # Import from code/ 23 | sys.path.append(str(Path(__file__).resolve().parent)) 24 | from ch09_gpt import GPT, GPTConfig # type: ignore 25 | from ch11_sampling import sample # type: ignore 26 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 27 | 28 | 29 | class GenerateReq(BaseModel): 30 | prompt: str 31 | max_new_tokens: int = 80 32 | temperature: float = 0.9 33 | top_k: int = 0 34 | top_p: float = 0.95 35 | 36 | 37 | def load_bundle(path: str): 38 | b = torch.load(path, map_location="cpu") 39 | cfg = GPTConfig(**b["config"]) # type: ignore 40 | model = GPT(cfg).eval() 41 | model.load_state_dict(b["model_state"]) # type: ignore 42 | meta = b.get("tokenizer") 43 | tok = None 44 | if meta and meta.get("id_to_token"): 45 | id_to_token = list(meta["id_to_token"]) # ensure list 46 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 47 | vocab = Vocab( 48 | token_to_id=token_to_id, 49 | id_to_token=id_to_token, 50 | pad=int(meta.get("pad_id", 0)), 51 | unk=int(meta.get("unk_id", 1)), 52 | ) 53 | tok = SimpleTokenizer(vocab=vocab, level=meta.get("level", "char")) 54 | return model, tok 55 | 56 | 57 | app = FastAPI(title="Mini‑GPT") 58 | MODEL, TOK = None, None 59 | 60 | 61 | @app.on_event("startup") 62 | def _startup(): 63 | global MODEL, TOK 64 | bundle = Path("model_bundle.pt") 65 | if bundle.exists(): 66 | MODEL, TOK = load_bundle(str(bundle)) 67 | 68 | 69 | @app.post("/generate") 70 | def generate(req: GenerateReq): 71 | model = MODEL 72 | tok = TOK 73 | if model is None: 74 | return {"error": "model not loaded; place model_bundle.pt next to the app"} 75 | if tok is None: 76 | ids = torch.tensor([[c for c in req.prompt.encode("utf-8")]], dtype=torch.long) 77 | out = sample( 78 | model, 79 | ids, 80 | max_new_tokens=req.max_new_tokens, 81 | temperature=req.temperature, 82 | top_k=(req.top_k or None), 83 | top_p=(req.top_p or None), 84 | ) 85 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 86 | else: 87 | ids = torch.tensor([tok.encode(req.prompt)], dtype=torch.long) 88 | out = sample( 89 | model, 90 | ids, 91 | max_new_tokens=req.max_new_tokens, 92 | temperature=req.temperature, 93 | top_k=(req.top_k or None), 94 | top_p=(req.top_p or None), 95 | ) 96 | text = tok.decode(out[0].tolist()) 97 | return {"text": text} 98 | 99 | -------------------------------------------------------------------------------- /code/ch15_streamlit_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Streamlit app for sampling from an exported GPT bundle (Chapter 15). 
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | 14 | import sys 15 | from pathlib import Path 16 | import streamlit as st 17 | import torch 18 | 19 | # Allow importing modules from code/ 20 | sys.path.append(str(Path(__file__).resolve().parent)) 21 | from ch09_gpt import GPT, GPTConfig # type: ignore 22 | from ch11_sampling import sample # type: ignore 23 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 24 | 25 | 26 | @st.cache_resource 27 | def load_bundle(path: str): 28 | b = torch.load(path, map_location="cpu") 29 | cfg = GPTConfig(**b["config"]) # type: ignore 30 | model = GPT(cfg).eval() 31 | model.load_state_dict(b["model_state"]) # type: ignore 32 | meta = b.get("tokenizer") 33 | tok = None 34 | if meta and meta.get("id_to_token"): 35 | id_to_token = list(meta["id_to_token"]) # ensure list 36 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 37 | vocab = Vocab( 38 | token_to_id=token_to_id, 39 | id_to_token=id_to_token, 40 | pad=int(meta.get("pad_id", 0)), 41 | unk=int(meta.get("unk_id", 1)), 42 | ) 43 | tok = SimpleTokenizer(vocab=vocab, level=meta.get("level", "char")) 44 | return model, tok 45 | 46 | 47 | st.title("Mini‑GPT Sampler") 48 | bundle_path = st.text_input("Bundle path", "model_bundle.pt") 49 | prompt = st.text_area("Prompt", "Hello") 50 | col1, col2, col3 = st.columns(3) 51 | with col1: 52 | max_new = st.number_input("Max new tokens", 1, 512, 80) 53 | with col2: 54 | temp = st.slider("Temperature", 0.0, 1.5, 0.9, 0.05) 55 | with col3: 56 | top_p = st.slider("Top‑p", 0.0, 1.0, 0.95, 0.05) 57 | top_k = st.slider("Top‑k (0=off)", 0, 200, 0, 5) 58 | 59 | if st.button("Generate"): 60 | try: 61 | model, tok = load_bundle(bundle_path) 62 | except Exception as e: 63 | st.error(f"Failed to load bundle: {e}") 64 | else: 65 | if tok is None: 66 | ids = torch.tensor([[c for c in prompt.encode("utf-8")]], dtype=torch.long) 67 | out = sample( 68 | model, 69 | ids, 70 | max_new_tokens=int(max_new), 71 | temperature=float(temp), 72 | top_k=(int(top_k) or None), 73 | top_p=(float(top_p) or None), 74 | ) 75 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 76 | else: 77 | ids = torch.tensor([tok.encode(prompt)], dtype=torch.long) 78 | out = sample( 79 | model, 80 | ids, 81 | max_new_tokens=int(max_new), 82 | temperature=float(temp), 83 | top_k=(int(top_k) or None), 84 | top_p=(float(top_p) or None), 85 | ) 86 | text = tok.decode(out[0].tolist()) 87 | st.subheader("Output") 88 | st.write(text) 89 | 90 | -------------------------------------------------------------------------------- /code/ch11_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Tiny sampling helpers (preview for Chapter 11). 9 | 10 | Functions here keep dependencies minimal and work directly with the GPT model 11 | from Chapter 9. They operate on integer token ids and return extended ids. 
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from typing import Optional 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | 22 | 23 | def _top_k_filter(logits: torch.Tensor, k: int) -> torch.Tensor: 24 | if k <= 0: 25 | return logits 26 | v, _ = torch.topk(logits, k) 27 | thresh = v[:, [-1]] 28 | return torch.where(logits < thresh, torch.tensor(-1e9, device=logits.device), logits) 29 | 30 | 31 | def _top_p_filter(logits: torch.Tensor, p: float) -> torch.Tensor: 32 | if p <= 0 or p >= 1: 33 | return logits 34 | # sort descending and keep smallest set whose cumulative prob >= p 35 | sorted_logits, sorted_idx = torch.sort(logits, descending=True) 36 | probs = torch.softmax(sorted_logits, dim=-1) 37 | cum = torch.cumsum(probs, dim=-1) 38 | mask = cum > p 39 | # always keep the first token 40 | mask[..., 0] = False 41 | filtered = sorted_logits.masked_fill(mask, -1e9) 42 | # unsort back to original order 43 | unsorted = torch.empty_like(filtered).scatter_(1, sorted_idx, filtered) 44 | return unsorted 45 | 46 | 47 | @torch.no_grad() 48 | def sample( 49 | model, 50 | input_ids: torch.Tensor, 51 | max_new_tokens: int = 50, 52 | temperature: float = 1.0, 53 | top_k: Optional[int] = None, 54 | top_p: Optional[float] = None, 55 | eos_id: Optional[int] = None, 56 | ) -> torch.Tensor: 57 | """Generate tokens autoregressively. 58 | 59 | - temperature: 0 → greedy (argmax); >0 → softmax sampling 60 | - top_k: keep only the top‑k logits at each step (optional) 61 | - eos_id: if set, stop when generated 62 | """ 63 | model.eval() 64 | x = input_ids 65 | device = next(model.parameters()).device 66 | x = x.to(device) 67 | 68 | for _ in range(max_new_tokens): 69 | # Forward pass on the last block_size tokens 70 | T = x.size(1) 71 | block_size = getattr(model.cfg, "block_size", T) 72 | x_cond = x[:, -block_size:] 73 | logits, _ = model(x_cond) 74 | logits = logits[:, -1, :] # last position 75 | 76 | if temperature <= 0: 77 | # Greedy 78 | next_id = torch.argmax(logits, dim=-1, keepdim=True) 79 | else: 80 | logits = logits / temperature 81 | if top_k is not None and top_k > 0: 82 | logits = _top_k_filter(logits, top_k) 83 | if top_p is not None: 84 | logits = _top_p_filter(logits, float(top_p)) 85 | probs = F.softmax(logits, dim=-1) 86 | next_id = torch.multinomial(probs, num_samples=1) 87 | 88 | x = torch.cat([x, next_id], dim=1) 89 | if eos_id is not None and int(next_id[0, 0].item()) == int(eos_id): 90 | break 91 | return x 92 | 93 | 94 | __all__ = ["sample", "_top_k_filter", "_top_p_filter"] 95 | -------------------------------------------------------------------------------- /code/gen_ch10_windows.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a sliding-window schematic for Chapter 10. 9 | 10 | Writes figures/ch10-windows.svg. Uses Matplotlib if available; otherwise 11 | falls back to a small hand-written SVG so the figure is always present. 
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path, N: int = 24, T: int = 8) -> None: 21 | cell = 16 22 | pad = 18 23 | h = pad * 2 + cell * 3 24 | w = pad * 2 + cell * N 25 | y_ids = pad 26 | y_x = y_ids + cell 27 | y_y = y_x + cell 28 | # colors 29 | col_ids = "#DCE6F8" 30 | col_x = "#B5D0F5" 31 | col_y = "#9EC5F8" 32 | stroke = "#2b2b2b" 33 | style = ( 34 | '' 35 | ) 36 | svg = [ 37 | f'', 38 | style, 39 | f'ids', 40 | f'x = ids[i:i+T]', 41 | f'y = ids[i+1:i+T+1]', 42 | ] 43 | # ids row 44 | for j in range(N): 45 | svg.append( 46 | f'' 48 | ) 49 | # x window from j0..j0+T-1 50 | j0 = 4 51 | for j in range(T): 52 | xj = pad + (j0 + j) * cell 53 | svg.append( 54 | f'' 56 | ) 57 | # y window shifted by 1 58 | for j in range(T): 59 | xj = pad + (j0 + 1 + j) * cell 60 | svg.append( 61 | f'' 63 | ) 64 | svg.append('') 65 | out.write_text("\n".join(svg)) 66 | 67 | 68 | def main() -> None: 69 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 70 | fig_dir.mkdir(parents=True, exist_ok=True) 71 | out = fig_dir / "ch10-windows.svg" 72 | try: 73 | import matplotlib.pyplot as plt 74 | import numpy as np 75 | 76 | plt.style.use("seaborn-v0_8") 77 | N, T = 24, 8 78 | fig, ax = plt.subplots(figsize=(8.0, 1.8)) 79 | ax.axis('off') 80 | y0 = 0 81 | # ids 82 | for j in range(N): 83 | ax.add_patch(plt.Rectangle((j, y0+0.8), 1, 0.8, fc="#DCE6F8", ec="#2b2b2b", lw=0.6)) 84 | ax.text(-1.1, y0+1.3, 'ids', ha='right', va='center') 85 | # x 86 | j0 = 4 87 | for j in range(T): 88 | ax.add_patch(plt.Rectangle((j0+j, y0-0.2), 1, 0.8, fc="#B5D0F5", ec="#2b2b2b", lw=0.6)) 89 | ax.text(-1.1, y0+0.2, 'x = ids[i:i+T]', ha='right', va='center') 90 | # y (shifted) 91 | for j in range(T): 92 | ax.add_patch(plt.Rectangle((j0+1+j, y0-1.2), 1, 0.8, fc="#9EC5F8", ec="#2b2b2b", lw=0.6)) 93 | ax.text(-1.1, y0-0.8, 'y = ids[i+1:i+T+1]', ha='right', va='center') 94 | ax.set_xlim(-2, N+1); ax.set_ylim(-2, 3) 95 | fig.savefig(out, format='svg', bbox_inches='tight') 96 | except Exception: 97 | fallback_svg(out) 98 | 99 | 100 | if __name__ == "__main__": 101 | main() 102 | -------------------------------------------------------------------------------- /code/ch15_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Minimal sampling CLI over an exported bundle (Chapter 15). 
9 | 10 | Usage: 11 | python code/ch15_cli.py --bundle model_bundle.pt --prompt "Hello" 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | import sys 19 | from pathlib import Path 20 | 21 | import torch 22 | 23 | # Import code/ modules directly when run as a script 24 | sys.path.append(str(Path(__file__).resolve().parent)) 25 | from ch09_gpt import GPT, GPTConfig # type: ignore 26 | from ch11_sampling import sample # type: ignore 27 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 28 | 29 | 30 | def auto_device() -> str: 31 | if torch.cuda.is_available(): 32 | return "cuda" 33 | mps = getattr(torch.backends, "mps", None) 34 | if mps and torch.backends.mps.is_available(): 35 | return "mps" 36 | return "cpu" 37 | 38 | 39 | def build_tokenizer(meta: dict | None): 40 | if not meta: 41 | return None 42 | try: 43 | id_to_token = list(meta["id_to_token"]) # ensure list 44 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 45 | pad_id = int(meta.get("pad_id", 0)) 46 | unk_id = int(meta.get("unk_id", 1)) 47 | vocab = Vocab(token_to_id=token_to_id, id_to_token=id_to_token, pad=pad_id, unk=unk_id) 48 | return SimpleTokenizer(vocab=vocab, level=meta.get("level", "char")) 49 | except Exception: 50 | return None 51 | 52 | 53 | def main() -> None: 54 | p = argparse.ArgumentParser(description="Sample from a GPT bundle") 55 | p.add_argument("--bundle", required=True, help="bundle .pt from ch15_export") 56 | p.add_argument("--prompt", required=True, help="prompt string") 57 | p.add_argument("--max-new-tokens", type=int, default=80) 58 | p.add_argument("--temperature", type=float, default=0.9) 59 | p.add_argument("--top-k", type=int, default=0) 60 | p.add_argument("--top-p", type=float, default=0.0) 61 | p.add_argument("--device", default="auto") 62 | p.add_argument("--seed", type=int, default=0) 63 | args = p.parse_args() 64 | 65 | torch.manual_seed(args.seed) 66 | device = auto_device() if args.device == "auto" else args.device 67 | print({"device": device, "seed": args.seed}) 68 | b = torch.load(args.bundle, map_location=device) 69 | cfg = GPTConfig(**b["config"]) # type: ignore 70 | model = GPT(cfg).to(device) 71 | model.load_state_dict(b["model_state"]) # type: ignore 72 | model.eval() 73 | 74 | tok = build_tokenizer(b.get("tokenizer")) 75 | if tok is None: # fall back to byte-level 76 | ids = torch.tensor([[c for c in args.prompt.encode("utf-8")]], dtype=torch.long, device=device) 77 | out = sample( 78 | model, 79 | ids, 80 | max_new_tokens=args.max_new_tokens, 81 | temperature=args.temperature, 82 | top_k=(args.top_k or None), 83 | top_p=(args.top_p or None), 84 | ) 85 | print(bytes(out[0].tolist()).decode("utf-8", errors="ignore")) 86 | else: 87 | ids = torch.tensor([tok.encode(args.prompt)], dtype=torch.long, device=device) 88 | out = sample( 89 | model, 90 | ids, 91 | max_new_tokens=args.max_new_tokens, 92 | temperature=args.temperature, 93 | top_k=(args.top_k or None), 94 | top_p=(args.top_p or None), 95 | ) 96 | print(tok.decode(out[0].tolist())) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /code/check_bundle.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 
7 | 8 | Validate a model bundle by loading it and sampling once. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | 14 | import argparse 15 | from pathlib import Path 16 | import sys 17 | import torch 18 | 19 | sys.path.append(str(Path(__file__).resolve().parent)) 20 | from ch09_gpt import GPT, GPTConfig # type: ignore 21 | from ch11_sampling import sample # type: ignore 22 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 23 | 24 | 25 | def auto_device() -> str: 26 | if torch.cuda.is_available(): 27 | return "cuda" 28 | mps = getattr(torch.backends, "mps", None) 29 | if mps and torch.backends.mps.is_available(): 30 | return "mps" 31 | return "cpu" 32 | 33 | 34 | def main() -> None: 35 | p = argparse.ArgumentParser(description="Bundle smoke-test: load and sample") 36 | p.add_argument("--bundle", required=True, help="path to model_bundle.pt") 37 | p.add_argument("--prompt", default="Hello", help="prompt string") 38 | p.add_argument("--max-new-tokens", type=int, default=32) 39 | p.add_argument("--temperature", type=float, default=0.9) 40 | p.add_argument("--top-p", type=float, default=0.95) 41 | p.add_argument("--top-k", type=int, default=0) 42 | p.add_argument("--device", default="auto", help="cpu|cuda|mps|auto") 43 | p.add_argument("--seed", type=int, default=0) 44 | args = p.parse_args() 45 | 46 | # Make sampling deterministic and pick a device 47 | torch.manual_seed(args.seed) 48 | device = auto_device() if args.device == "auto" else args.device 49 | print({"device": device, "seed": args.seed}) 50 | 51 | # Load bundle, restore model and optional tokenizer 52 | b = torch.load(args.bundle, map_location=device) 53 | cfg = GPTConfig(**b["config"]) # type: ignore 54 | model = GPT(cfg).to(device).eval() 55 | model.load_state_dict(b["model_state"]) # type: ignore 56 | meta = b.get("tokenizer") 57 | tok = None 58 | if meta and meta.get("id_to_token"): 59 | id_to_token = list(meta["id_to_token"]) # ensure list 60 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 61 | vocab = Vocab( 62 | token_to_id=token_to_id, 63 | id_to_token=id_to_token, 64 | pad=int(meta.get("pad_id", 0)), 65 | unk=int(meta.get("unk_id", 1)), 66 | ) 67 | tok = SimpleTokenizer(vocab=vocab, level=meta.get("level", "char")) 68 | 69 | if tok is None: 70 | # Fallback to byte-level prompt if no tokenizer metadata exists 71 | ids = torch.tensor( 72 | [[c for c in args.prompt.encode("utf-8")]], 73 | dtype=torch.long, 74 | device=device, 75 | ) 76 | out = sample( 77 | model, ids, 78 | max_new_tokens=args.max_new_tokens, 79 | temperature=args.temperature, 80 | top_k=(args.top_k or None), 81 | top_p=(args.top_p or None), 82 | ) 83 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 84 | else: 85 | ids = torch.tensor( 86 | [tok.encode(args.prompt)], dtype=torch.long, device=device 87 | ) 88 | out = sample( 89 | model, ids, 90 | max_new_tokens=args.max_new_tokens, 91 | temperature=args.temperature, 92 | top_k=(args.top_k or None), 93 | top_p=(args.top_p or None), 94 | ) 95 | text = tok.decode(out[0].tolist()) 96 | print("OK — model loaded and sampled.\n", text) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /code/gen_ch14_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 
7 | 8 | Generate a simple LoRA diagram: base Linear plus low-rank delta. 9 | 10 | Writes figures/ch14-lora.svg without external dependencies. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def main() -> None: 20 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 21 | fig_dir.mkdir(parents=True, exist_ok=True) 22 | out = fig_dir / "ch14-lora.svg" 23 | 24 | w, h = 760, 260 25 | pad = 30 26 | items = [ 27 | f'', 28 | '', 29 | '\n' 30 | ' \n' 32 | ' \n' 33 | ' \n' 34 | '', 35 | ] 36 | # Base linear block (center) 37 | x0, y0 = pad + 80, 100 38 | items.append( 39 | f'' 40 | ) 41 | items.append( 42 | f'Base Linear W' 43 | ) 44 | # A and B blocks (adapter branch) 45 | ax, ay = x0 + 250, y0 - 50 46 | items.append( 47 | f'' 48 | ) 49 | items.append( 50 | f'A (r × d_in)' 51 | ) 52 | bx, by = ax, y0 + 76 53 | items.append( 54 | f'' 55 | ) 56 | items.append( 57 | f'B (d_out × r)' 58 | ) 59 | # Input x arrow into base and into A (branch) 60 | items.append( 61 | f'' 62 | ) 63 | items.append(f'x') 64 | items.append( 65 | f'' 66 | ) 67 | # A to B, B to sum 68 | items.append( 69 | f'' 70 | ) 71 | sumx, sumy = x0 + 470, y0 + 30 72 | items.append(f'') 73 | items.append(f'+') 74 | items.append( 75 | f'' 76 | ) 77 | items.append( 78 | f'' 79 | ) 80 | # Scale label α/r on the adapter path 81 | items.append(f'scale: α/r') 82 | # Sum to output 83 | outx = sumx + 160 84 | items.append( 85 | f'' 86 | ) 87 | items.append(f'output') 88 | # Annotations 89 | items.append( 90 | f'x' 91 | ) 92 | items.append(f'ΔW = B @ A') 93 | items.append('') 94 | out.write_text("\n".join(items)) 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /code/gen_masks_heatmap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate causal and combined (padding x causal) mask heatmaps for Ch. 9. 9 | 10 | Always writes `figures/ch09-masks.svg`. Uses Matplotlib if available; otherwise 11 | falls back to a minimal hand-written SVG so the book build never misses it. 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | import torch 20 | 21 | 22 | def build_masks(T: int, pad_positions: list[int] | None = None): 23 | causal = torch.tril(torch.ones(T, T)) # [T, T] 24 | if not pad_positions: 25 | return causal, causal # combined==causal in this trivial case 26 | pad = torch.ones(T) 27 | for p in pad_positions: 28 | if 0 <= p < T: 29 | pad[p] = 0 30 | pad_bt = pad[None, :] 31 | combined = pad_bt[:, None, :] * causal # [1, T, T] 32 | return causal, combined.squeeze(0) 33 | 34 | 35 | def render_svg_simple(causal: torch.Tensor, combined: torch.Tensor, out: Path) -> None: 36 | """Write a simple 2-panel SVG without external deps. 37 | 38 | Blue squares (1) vs white squares (0). Titles above each panel. 
39 | """ 40 | T = causal.size(0) 41 | cell = 16 42 | pad = 24 43 | gap = 40 44 | width = pad * 2 + cell * T * 2 + gap 45 | height = pad * 2 + cell * T + 28 # extra for titles 46 | def rects(mat: torch.Tensor, x0: int, y0: int) -> str: 47 | parts = [] 48 | for i in range(T): # rows (queries) 49 | for j in range(T): # cols (keys) 50 | v = float(mat[i, j]) 51 | color = "#0A66C2" if v > 0.5 else "#FFFFFF" 52 | parts.append( 53 | f'' 55 | ) 56 | return "\n".join(parts) 57 | x1 = pad 58 | x2 = pad + cell * T + gap 59 | y = pad + 24 60 | style = ( 61 | '' 62 | ) 63 | svg = [ 64 | f'', 65 | style, 66 | f'Causal mask [T,T]', 67 | f'Padding x causal [T,T]', 68 | rects(causal, x1, y), 69 | rects(combined, x2, y), 70 | '', 71 | ] 72 | out.write_text("\n".join(svg)) 73 | 74 | 75 | def main() -> None: 76 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 77 | fig_dir.mkdir(parents=True, exist_ok=True) 78 | out = fig_dir / "ch09-masks.svg" 79 | causal, combined = build_masks(T=12, pad_positions=[9, 10, 11]) 80 | try: 81 | import matplotlib.pyplot as plt 82 | 83 | plt.style.use("seaborn-v0_8") 84 | fig, axes = plt.subplots(1, 2, figsize=(7.2, 2.8), constrained_layout=True) 85 | im0 = axes[0].imshow(causal, cmap="Blues", vmin=0, vmax=1) 86 | axes[0].set_title("Causal mask [T,T]") 87 | axes[0].set_xlabel("keys") 88 | axes[0].set_ylabel("queries") 89 | im1 = axes[1].imshow(combined, cmap="Blues", vmin=0, vmax=1) 90 | axes[1].set_title("Padding x causal [T,T]") 91 | axes[1].set_xlabel("keys") 92 | for ax in axes: 93 | ax.set_xticks([]); ax.set_yticks([]) 94 | fig.savefig(out, format="svg") 95 | print("Wrote:", out) 96 | except Exception: 97 | # Fallback: hand-written SVG so the book can include the figure 98 | render_svg_simple(causal, combined, out) 99 | print("Wrote (fallback SVG):", out) 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /code/ch6_tokenize.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 
7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from collections import Counter 13 | from dataclasses import dataclass 14 | from pathlib import Path 15 | from typing import Iterable, List, Dict 16 | 17 | 18 | @dataclass 19 | class Vocab: 20 | token_to_id: Dict[str, int] 21 | id_to_token: List[str] 22 | pad: int 23 | unk: int 24 | 25 | @classmethod 26 | def build( 27 | cls, 28 | tokens: Iterable[str], 29 | min_freq: int = 1, 30 | specials: Iterable[str] = ("", ""), 31 | ) -> "Vocab": 32 | # Count incoming tokens and prepend special ids 33 | counter = Counter(tokens) 34 | id_to_token = list(specials) 35 | for tok, freq in counter.most_common(): 36 | if freq >= min_freq and tok not in id_to_token: 37 | id_to_token.append(tok) 38 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 39 | pad = token_to_id[specials[0]] 40 | unk = token_to_id[specials[1]] 41 | return cls(token_to_id, id_to_token, pad, unk) 42 | 43 | def __len__(self) -> int: 44 | return len(self.id_to_token) 45 | 46 | 47 | class SimpleTokenizer: 48 | """Tiny tokenizer for chapter 6 (char or word level).""" 49 | 50 | def __init__(self, vocab: Vocab, level: str = "char") -> None: 51 | assert level in {"char", "word"} 52 | self.vocab = vocab 53 | self.level = level 54 | self.pad = vocab.pad 55 | self.unk = vocab.unk 56 | 57 | @staticmethod 58 | def _split(text: str, level: str) -> List[str]: 59 | if level == "char": 60 | return list(text) 61 | # simple whitespace/punct split for demo purposes 62 | out: List[str] = [] 63 | token = [] 64 | for ch in text: 65 | if ch.isalnum(): 66 | token.append(ch.lower()) 67 | else: 68 | if token: 69 | out.append("".join(token)) 70 | token = [] 71 | if ch.strip(): # keep punctuation as its own token 72 | out.append(ch) 73 | if token: 74 | out.append("".join(token)) 75 | return out 76 | 77 | @classmethod 78 | def from_file( 79 | cls, path: str | Path, level: str = "char", min_freq: int = 1 80 | ) -> "SimpleTokenizer": 81 | # Load raw text and construct vocab directly 82 | text = Path(path).read_text(encoding="utf-8") 83 | tokens = cls._split(text, level) 84 | vocab = Vocab.build(tokens, min_freq=min_freq) 85 | return cls(vocab=vocab, level=level) 86 | 87 | def encode(self, text: str) -> List[int]: 88 | # Map tokens to ids with unk fallback 89 | ids: List[int] = [] 90 | for tok in self._split(text, self.level): 91 | ids.append(self.vocab.token_to_id.get(tok, self.unk)) 92 | return ids 93 | 94 | def decode(self, ids: Iterable[int]) -> str: 95 | # Convert back to tokens while skipping padding tokens 96 | toks: List[str] = [] 97 | for i in ids: 98 | if 0 <= i < len(self.vocab.id_to_token): 99 | tok = self.vocab.id_to_token[i] 100 | if tok not in {"", ""}: 101 | toks.append(tok) 102 | else: 103 | toks.append("") 104 | if self.level == "char": 105 | return "".join(toks) 106 | # naive word join: put space before alphanumerics only 107 | out: List[str] = [] 108 | for t in toks: 109 | if not out: 110 | out.append(t) 111 | elif t.isalnum(): 112 | out.append(" " + t) 113 | else: 114 | out.append(t) 115 | return "".join(out) 116 | -------------------------------------------------------------------------------- /code/gen_ch14_scaling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a synthetic scaling law figure with axes and annotations. 
9 | 10 | Writes figures/ch14-scaling.svg (simple SVG; no external deps). 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | import math 18 | 19 | 20 | def main() -> None: 21 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 22 | fig_dir.mkdir(parents=True, exist_ok=True) 23 | out = fig_dir / "ch14-scaling.svg" 24 | 25 | w, h = 680, 280 26 | pad = 46 27 | # Synthetic: loss = a * N^{-b} + c in log space (draw line with slight noise) 28 | xs = [10 ** (i / 10) for i in range(3, 33)] # ~1e0..1e3 29 | a, b, c = 1.0, 0.3, 0.2 30 | ys = [a * (x ** (-b)) + c for x in xs] 31 | # Map to log10 for plotting 32 | lx = [math.log10(x) for x in xs] 33 | ly = [math.log10(y) for y in ys] 34 | minx, maxx = min(lx), max(lx) 35 | miny, maxy = min(ly), max(ly) 36 | def mapx(x): 37 | return pad + (w - 2*pad) * ((x - minx) / (maxx - minx)) 38 | def mapy(y): 39 | return h - pad - (h - 2*pad) * ((y - miny) / (maxy - miny)) 40 | path = "M " + " ".join( 41 | f"{mapx(x):.1f},{mapy(y):.1f}" for x, y in zip(lx, ly) 42 | ) 43 | 44 | # Horizontal line for irreducible error c (approx last y value) 45 | y_c = mapy(min(ly) + 0.02) 46 | 47 | # Slope annotation segment around the middle 48 | mid = len(lx) // 2 49 | x1, y1 = mapx(lx[mid] - 0.3), mapy(ly[mid] + 0.08) 50 | x2, y2 = mapx(lx[mid] + 0.3), mapy(ly[mid] - 0.08) 51 | 52 | # Axis label positions centered along their axes to avoid overlaps 53 | x_axis_mid_x = w / 2 54 | x_axis_label_y = h - pad + 24 55 | y_axis_mid_y = (h - pad + pad) / 2 56 | y_axis_label_x = pad - 34 57 | 58 | svg = [ 59 | f'', 60 | '', 61 | '\n' 62 | ' \n' 63 | ' \n' 64 | ' \n' 65 | '', 66 | # Title 67 | f'Scaling law: log loss vs log scale', 68 | # Axes 69 | f'', 70 | f'', 71 | # Centered axis labels 72 | f'log10(N)', 73 | f'log10(loss)', 74 | # qualitative end labels 75 | f'small', 76 | f'large', 77 | f'low', 78 | f'high', 79 | # Curve 80 | f'', 81 | # Irreducible error line c with centered annotation 82 | f'', 83 | f'irreducible error c', 84 | # Slope annotation 85 | f'', 86 | f'slope ≈ −b', 87 | '' 88 | ] 89 | out.write_text("\n".join(svg)) 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /notebooks/ch16_discussion_conclusion_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "346ba8e7", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "023ac5fa", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 16 — Discussion & Conclusion\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "622fc26f", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Synthesize insights gathered throughout the book into actionable principles.\n", 29 | "- Capture retrospective metrics and notes to inform your next project iteration.\n", 30 | "- Plan follow-up experiments and knowledge deep dives based on open questions." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "c53e561f", 36 | "metadata": {}, 37 | "source": [ 38 | "### Key Reflections\n", 39 | "\n", 40 | "Use this section to distill what mattered most. The prompts below nudge you to convert experience into reusable knowledge." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "56317a1d", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from datetime import date\n", 51 | "\n", 52 | "reflection_template = f\"\"\"## Post-Project Reflection ({date.today().isoformat()})\n", 53 | "\n", 54 | "### Wins\n", 55 | "- \\\n", 56 | "- \\\n", 57 | "\n", 58 | "### Challenges\n", 59 | "- \\\n", 60 | "- \\\n", 61 | "\n", 62 | "### Decisions to Revisit\n", 63 | "- \\\n", 64 | "\n", 65 | "### Next Experiments\n", 66 | "- \\\n", 67 | "\"\"\"\n", 68 | "print(reflection_template)\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "cf1d5dd9", 74 | "metadata": {}, 75 | "source": [ 76 | "### Metric Review\n", 77 | "\n", 78 | "Gather the metrics that define success for your use case. This might include loss curves, evaluation scores, or deployment latency measurements." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "0b8befa5", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "import json\n", 89 | "\n", 90 | "# Replace the sample metrics with your actual results.\n", 91 | "metrics = {\n", 92 | " \"final_train_loss\": 1.87,\n", 93 | " \"validation_perplexity\": 16.2,\n", 94 | " \"bleu_score\": None,\n", 95 | " \"deployment_latency_ms\": None,\n", 96 | "}\n", 97 | "print(json.dumps(metrics, indent=2))\n" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "f0f4bf93", 103 | "metadata": {}, 104 | "source": [ 105 | "### Knowledge Transfer\n", 106 | "\n", 107 | "Summarize the playbooks, scripts, and lessons that you want to carry into your next project. Treat this as the beginning of your internal wiki." 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "a5b1c38c", 113 | "metadata": {}, 114 | "source": [ 115 | "## Exercises\n", 116 | "\n", 117 | "- Write a retrospective memo that highlights one architectural decision you would change next time.\n", 118 | "- Collect three resources (papers, blog posts, repos) that will help you deepen a weak area.\n", 119 | "- Draft an action plan for deploying attoLLM in a realistic environment of your choice." 
120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "3ac6cb70", 125 | "metadata": {}, 126 | "source": [ 127 | "" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "colab": { 133 | "name": "Chapter 16 · Discussion" 134 | }, 135 | "kernelspec": { 136 | "display_name": "Python 3", 137 | "language": "python", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "name": "python", 142 | "version": "3.10" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 5 147 | } 148 | -------------------------------------------------------------------------------- /code/ch10_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Small data helpers for Chapter 10: building a token id stream and slicing 9 | into (input, target) chunks for next-token prediction. 10 | 11 | We keep this self-contained and friendly: 12 | - A minimal dataset that returns (x, y) where y is x shifted by one. 13 | - Helpers to build ids from raw text using either a provided tokenizer or a 14 | byte-level fallback (0-255). 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | 20 | from dataclasses import dataclass 21 | from pathlib import Path 22 | from typing import Iterable, List, Sequence 23 | 24 | import torch 25 | from torch.utils.data import Dataset 26 | 27 | 28 | @dataclass 29 | class TextIds: 30 | """Container for a 1-D id stream and tokenizer metadata. 31 | 32 | - ids: concatenated token ids (1-D) 33 | - vocab_size: size of the token vocabulary 34 | - pad_id: optional pad index (for CE ignore_index) 35 | - unk_id: optional unknown-token index 36 | - level: optional tokenization level ('byte'|'char'|'word') 37 | - id_to_token: optional list of tokens by index for reconstruction/decoding 38 | """ 39 | ids: torch.Tensor 40 | vocab_size: int 41 | pad_id: int | None = None 42 | unk_id: int | None = None 43 | level: str | None = None 44 | id_to_token: list[str] | None = None 45 | 46 | 47 | def load_texts(paths: Sequence[str] | None) -> str: 48 | if not paths: 49 | return "Hello world. Hello vectors.\n" 50 | texts: List[str] = [] 51 | for p in paths: 52 | # Read each file as UTF-8 and concatenate with newlines 53 | data = Path(p).read_text(encoding="utf-8") 54 | texts.append(data) 55 | return "\n".join(texts) 56 | 57 | 58 | def build_ids_byte_level(text: str) -> TextIds: 59 | # Encode to bytes and map each byte directly to an id 60 | data = text.encode("utf-8", errors="ignore") 61 | ids = torch.tensor(list(data), dtype=torch.long) 62 | return TextIds( 63 | ids=ids, 64 | vocab_size=256, 65 | pad_id=None, 66 | unk_id=None, 67 | level="byte", 68 | id_to_token=None, 69 | ) 70 | 71 | 72 | def build_ids_with_tokenizer(text: str, level: str = "char") -> TextIds: 73 | """Use the SimpleTokenizer from Chapter 6 if available; else byte-level. 74 | 75 | We try local imports first so running `python code/...` works without 76 | installing `code/` as a package. 
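For example, building char-level ids from an in-memory string:

    info = build_ids_with_tokenizer("hello world", level="char")
    info.ids.shape, info.vocab_size, info.level   # 1-D LongTensor, small vocab, 'char'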
77 | """ 78 | try: 79 | # when executing scripts under code/, neighbors are importable 80 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 81 | except Exception: 82 | try: 83 | from code.ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 84 | except Exception: 85 | return build_ids_byte_level(text) 86 | if hasattr(SimpleTokenizer, "from_text"): 87 | tok = SimpleTokenizer.from_text( # type: ignore[attr-defined] 88 | text, level=level 89 | ) 90 | else: 91 | # Build from raw text using the module's helpers 92 | tokens = SimpleTokenizer._split(text, level) 93 | vocab = Vocab.build(tokens) 94 | tok = SimpleTokenizer(vocab=vocab, level=level) 95 | ids = torch.tensor(tok.encode(text), dtype=torch.long) 96 | return TextIds( 97 | ids=ids, 98 | vocab_size=len(tok.vocab), 99 | pad_id=tok.pad, 100 | unk_id=tok.unk, 101 | level=level, 102 | id_to_token=list(tok.vocab.id_to_token), 103 | ) 104 | 105 | 106 | class LMSequenceDataset(Dataset[tuple[torch.Tensor, torch.Tensor]]): 107 | """Slice a long id stream into overlapping (x,y) chunks of length T. 108 | 109 | x is ids[i : i+T], y is ids[i+1 : i+T+1]. The number of samples is 110 | len(ids) - T. 111 | """ 112 | 113 | def __init__(self, ids: torch.Tensor, block_size: int): 114 | assert ids.ndim == 1 and ids.dtype == torch.long 115 | self.ids = ids 116 | self.T = int(block_size) 117 | 118 | def __len__(self) -> int: 119 | return max(0, self.ids.numel() - self.T) 120 | 121 | def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: 122 | i = int(idx) 123 | x = self.ids[i : i + self.T] 124 | y = self.ids[i + 1 : i + self.T + 1] 125 | return x, y 126 | 127 | 128 | __all__ = [ 129 | "TextIds", 130 | "load_texts", 131 | "build_ids_byte_level", 132 | "build_ids_with_tokenizer", 133 | "LMSequenceDataset", 134 | ] 135 | -------------------------------------------------------------------------------- /notebooks/ch02_shell_cli_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a791c257", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "776201f5", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 2 — Working with the Shell & AI Assistants\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "e16f44d1", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Practice quick shell commands directly from Colab cells using the `!` prefix.\n", 29 | "- Automate repetitive tasks with small Python helpers that wrap shell operations.\n", 30 | "- Document how you collaborate with AI assistants so future you can reproduce the workflow." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "ddaf504c", 36 | "metadata": {}, 37 | "source": [ 38 | "### Why the Shell Matters\n", 39 | "\n", 40 | "Even in Colab, the shell lets you inspect files, run tests, and control experiments quickly. Combining shell commands with Python makes your workflow both flexible and repeatable." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "b4d50a4b", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Use bash commands inline with the `!` prefix in Colab.\n", 51 | "!pwd\n", 52 | "!ls -1 | head -n 5\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "210bc5a0", 58 | "metadata": {}, 59 | "source": [ 60 | "### Wrapping Shell Commands in Python\n", 61 | "\n", 62 | "Use `subprocess` when you need programmatic control (for example, checking exit codes or capturing output for logging)." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "02729ef7", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "import subprocess\n", 73 | "\n", 74 | "result = subprocess.run(['ls', '-a'], capture_output=True, text=True, check=True)\n", 75 | "lines = result.stdout.splitlines()\n", 76 | "print(f'Total entries: {len(lines)}')\n", 77 | "print('First 8 entries:')\n", 78 | "for entry in lines[:8]:\n", 79 | " print(entry)\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "f51120c5", 85 | "metadata": {}, 86 | "source": [ 87 | "### Collaborating with AI Assistants\n", 88 | "\n", 89 | "Take notes on the prompts you issue to code assistants and save their outputs when they produce something you adopt. Lightweight documentation now saves time when you need to defend a design decision later." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "c3de21f0", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Starter template for tracking assistant interactions.\n", 100 | "from datetime import datetime\n", 101 | "\n", 102 | "log_entry = {\n", 103 | " \"timestamp\": datetime.utcnow().isoformat() + \"Z\",\n", 104 | " \"tool\": \"Your assistant of choice\",\n", 105 | " \"prompt\": \"Summarize how to process text files for tokenization\",\n", 106 | " \"action_items\": [\n", 107 | " \"Research tokenization libraries\",\n", 108 | " \"Try out a simple whitespace tokenizer\",\n", 109 | " \"Compare to sentencepiece later\"\n", 110 | " ]\n", 111 | "}\n", 112 | "log_entry\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "d159960e", 118 | "metadata": {}, 119 | "source": [ 120 | "## Exercises\n", 121 | "\n", 122 | "- Execute at least three shell commands that help you inspect the repository; record what you learned.\n", 123 | "- Wrap a shell command with `subprocess.run` and capture both stdout and stderr in variables.\n", 124 | "- Draft a reusable prompt template for your AI assistant that sets context, goals, and constraints." 
125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "4e64be7b", 130 | "metadata": {}, 131 | "source": [ 132 | "" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "colab": { 138 | "name": "Chapter 02 · Shell & AI" 139 | }, 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "name": "python", 147 | "version": "3.10" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 5 152 | } 153 | -------------------------------------------------------------------------------- /code/example_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Self-contained example workspace and sample texts. 9 | 10 | Use this when readers don't have the book repo. It creates a temporary 11 | folder in the current working directory, fills it with small sample 12 | text files, and (optionally) cleans them up when done. 13 | 14 | Usage (CLI): 15 | python -m code.example_data create --defaults [--keep] 16 | python -m code.example_data path # print last created path 17 | python -m code.example_data cleanup 18 | 19 | Usage (Python): 20 | from code.example_data import ExampleWorkspace 21 | with ExampleWorkspace().create_defaults() as ws: 22 | print(ws.root) # use ws.root / files inside 23 | ... 24 | """ 25 | 26 | from __future__ import annotations 27 | 28 | import argparse 29 | import shutil 30 | import time 31 | from dataclasses import dataclass 32 | from pathlib import Path 33 | 34 | _LAST_PATH_FILE = Path(".example_workspace_path") 35 | 36 | 37 | DEFAULT_TEXTS = { 38 | "philosophy.txt": ( 39 | "We are what we repeatedly do. Excellence, then, is not an act " 40 | "but a habit. Questions sharpen knowledge; curiosity sustains it.\n" 41 | ), 42 | "science.txt": ( 43 | "Science is a way of thinking much more than it is a body of facts. 
" 44 | "Small experiments illuminate large ideas.\n" 45 | ), 46 | "poetry.txt": ( 47 | "The model dreams in tokens and time,\n" 48 | "A lantern of vectors that learn to rhyme.\n" 49 | ), 50 | } 51 | 52 | 53 | @dataclass 54 | class ExampleWorkspace: 55 | base_dir: Path = Path.cwd() 56 | name: str | None = None 57 | cleanup_on_exit: bool = True 58 | 59 | def __post_init__(self) -> None: 60 | if self.name is None: 61 | stamp = time.strftime("%Y%m%d-%H%M%S") 62 | self.name = f"examples-{stamp}" 63 | self.root = self.base_dir / self.name # type: ignore[attr-defined] 64 | 65 | def create(self) -> "ExampleWorkspace": 66 | self.root.mkdir(parents=True, exist_ok=True) 67 | _LAST_PATH_FILE.write_text(str(self.root)) 68 | return self 69 | 70 | def create_defaults(self) -> "ExampleWorkspace": 71 | self.create() 72 | for fname, text in DEFAULT_TEXTS.items(): 73 | (self.root / fname).write_text(text) 74 | return self 75 | 76 | def add_text(self, filename: str, content: str) -> Path: 77 | p = self.root / filename 78 | p.write_text(content) 79 | return p 80 | 81 | def cleanup(self) -> None: 82 | if self.root.exists(): 83 | shutil.rmtree(self.root) 84 | if _LAST_PATH_FILE.exists(): 85 | _LAST_PATH_FILE.unlink() 86 | 87 | # Context manager API 88 | def __enter__(self) -> "ExampleWorkspace": 89 | return self 90 | 91 | def __exit__(self, exc_type, exc, tb) -> None: # noqa: ANN001 92 | if self.cleanup_on_exit: 93 | self.cleanup() 94 | 95 | 96 | def _cmd_create(args: argparse.Namespace) -> None: 97 | ws = ExampleWorkspace(cleanup_on_exit=not args.keep) 98 | (ws.create_defaults()) 99 | print(ws.root) 100 | if not args.keep: 101 | print("(Temporary; will be cleaned up when used via context manager or explicitly)") 102 | 103 | 104 | def _cmd_path(_: argparse.Namespace) -> None: 105 | if _LAST_PATH_FILE.exists(): 106 | print(_LAST_PATH_FILE.read_text()) 107 | else: 108 | print("No workspace recorded. 
Use 'create' first.") 109 | 110 | 111 | def _cmd_cleanup(args: argparse.Namespace) -> None: 112 | target = Path(args.path).resolve() 113 | if not target.exists(): 114 | print("No such path:", target) 115 | return 116 | shutil.rmtree(target) 117 | if _LAST_PATH_FILE.exists(): 118 | try: 119 | last = Path(_LAST_PATH_FILE.read_text().strip()) 120 | if last == target: 121 | _LAST_PATH_FILE.unlink() 122 | except Exception: 123 | _LAST_PATH_FILE.unlink(missing_ok=True) # type: ignore[attr-defined] 124 | print("Removed:", target) 125 | 126 | 127 | def main(argv: list[str] | None = None) -> None: 128 | p = argparse.ArgumentParser(prog="code.example_data", add_help=True) 129 | sub = p.add_subparsers(dest="cmd", required=True) 130 | 131 | c = sub.add_parser("create", help="create a workspace and write default texts") 132 | c.add_argument("--keep", action="store_true", help="do not auto-clean later") 133 | c.set_defaults(func=_cmd_create) 134 | 135 | sub.add_parser("path", help="print last workspace path").set_defaults(func=_cmd_path) 136 | 137 | d = sub.add_parser("cleanup", help="remove a workspace path") 138 | d.add_argument("path", help="path to workspace directory") 139 | d.set_defaults(func=_cmd_cleanup) 140 | 141 | ns = p.parse_args(argv) 142 | ns.func(ns) 143 | 144 | 145 | if __name__ == "__main__": 146 | main() 147 | 148 | -------------------------------------------------------------------------------- /code/ch10_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Chapter 10: a compact training script for the GPT model. 9 | 10 | This keeps options small and readable. It supports either a byte-level build 11 | of token ids or the SimpleTokenizer from Chapter 6 if available. 
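Example run (the data path is a placeholder; without --data a tiny built-in
sample text is used, which is only enough for a smoke test):

    python code/ch10_train.py --data data/tiny.txt --level char \
        --block-size 128 --batch-size 64 --steps 500 \
        --save checkpoints/ch10_gpt.pt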
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | from dataclasses import asdict 19 | from pathlib import Path 20 | from time import time 21 | 22 | import torch 23 | import torch.nn as nn 24 | from torch.utils.data import DataLoader 25 | 26 | # Make "code/" directory importable when running as script 27 | import sys 28 | sys.path.append(str(Path(__file__).resolve().parent)) 29 | 30 | from ch09_gpt import GPT, GPTConfig # type: ignore 31 | from ch10_data import ( # type: ignore 32 | LMSequenceDataset, 33 | build_ids_byte_level, 34 | build_ids_with_tokenizer, 35 | load_texts, 36 | ) 37 | 38 | 39 | def auto_device() -> str: 40 | if torch.cuda.is_available(): 41 | return "cuda" 42 | mps = getattr(torch.backends, "mps", None) 43 | if mps and torch.backends.mps.is_available(): 44 | return "mps" 45 | return "cpu" 46 | 47 | 48 | def main() -> None: 49 | p = argparse.ArgumentParser(description="Train a tiny GPT (Chapter 10)") 50 | p.add_argument("--data", nargs="*", help="text file(s) to train on") 51 | p.add_argument("--level", default="char", choices=["char", "word", "byte"], 52 | help="token level when using SimpleTokenizer; 'byte' forces byte-level") 53 | p.add_argument("--block-size", type=int, default=128) 54 | p.add_argument("--batch-size", type=int, default=64) 55 | p.add_argument("--epochs", type=int, default=1) 56 | p.add_argument("--steps", type=int, default=500, 57 | help="max training steps (overrides epochs if set)") 58 | p.add_argument("--lr", type=float, default=3e-4) 59 | p.add_argument("--warmup-steps", type=int, default=0, 60 | help="linear LR warmup steps (0 to disable)") 61 | p.add_argument("--device", default="auto") 62 | p.add_argument("--seed", type=int, default=0) 63 | p.add_argument("--save", type=str, default="checkpoints/ch10_gpt.pt") 64 | args = p.parse_args() 65 | 66 | torch.manual_seed(args.seed) 67 | device = auto_device() if args.device == "auto" else args.device 68 | 69 | # Build ids 70 | text = load_texts(args.data) 71 | if args.level == "byte": 72 | ids_info = build_ids_byte_level(text) 73 | else: 74 | ids_info = build_ids_with_tokenizer(text, level=args.level) 75 | 76 | ds = LMSequenceDataset(ids_info.ids, block_size=args.block_size) 77 | dl = DataLoader(ds, batch_size=args.batch_size, shuffle=True, drop_last=True) 78 | 79 | # Model config & model 80 | cfg = GPTConfig( 81 | vocab_size=ids_info.vocab_size, 82 | block_size=args.block_size, 83 | d_model=256, 84 | n_head=4, 85 | n_layer=4, 86 | d_ff=1024, 87 | dropout=0.1, 88 | pos_type="learned", 89 | tie_weights=True, 90 | ) 91 | model = GPT(cfg).to(device) 92 | opt = torch.optim.AdamW(model.parameters(), lr=args.lr) 93 | scheduler = None 94 | if args.warmup_steps > 0: 95 | # Linear warmup from 0 -> 1 over warmup_steps 96 | def lr_lambda(step: int) -> float: 97 | return min(1.0, (step + 1) / float(args.warmup_steps)) 98 | scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda) 99 | 100 | print("Device:", device) 101 | print("Config:", asdict(cfg)) 102 | print("Dataset tokens:", ds.ids.numel()) 103 | 104 | step = 0 105 | t0 = time() 106 | model.train() 107 | for epoch in range(max(1, args.epochs)): 108 | for x, y in dl: 109 | if args.steps and step >= args.steps: 110 | break 111 | x = x.to(device) 112 | y = y.to(device) 113 | opt.zero_grad(set_to_none=True) 114 | logits, loss = model(x, targets=y, pad_id=ids_info.pad_id) 115 | assert loss is not None 116 | loss.backward() 117 | opt.step() 118 | if scheduler is not None: 119 | scheduler.step() 120 | if step % 50 == 0: 
121 | lr_now = opt.param_groups[0]["lr"] 122 | print( 123 | f"step {step:5d} lr {lr_now:.5f} " 124 | f"loss {loss.detach().item():.4f}" 125 | ) 126 | step += 1 127 | if args.steps and step >= args.steps: 128 | break 129 | 130 | dt = time() - t0 131 | print(f"Done. steps={step} time={dt:.1f}s") 132 | 133 | # Save checkpoint 134 | out = Path(args.save) 135 | out.parent.mkdir(parents=True, exist_ok=True) 136 | ckpt = { 137 | "config": asdict(cfg), 138 | "model_state": model.state_dict(), 139 | } 140 | # Save tokenizer metadata if available for easier sampling later 141 | if ids_info.id_to_token is not None: 142 | ckpt["tokenizer"] = { 143 | "level": ids_info.level, 144 | "id_to_token": ids_info.id_to_token, 145 | "pad_id": ids_info.pad_id, 146 | "unk_id": ids_info.unk_id, 147 | } 148 | torch.save(ckpt, out) 149 | print("Saved:", out) 150 | 151 | 152 | if __name__ == "__main__": 153 | main() 154 | -------------------------------------------------------------------------------- /code/sample_from_checkpoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Sample text from a trained checkpoint (Chapter 11 companion CLI). 9 | 10 | This script reconstructs the GPT model saved by Chapter 10's trainer and 11 | generates a short continuation from a prompt. It is byte-level by default so 12 | you can sample without extra tokenizer files. 13 | 14 | Examples 15 | -------- 16 | (.venv) $ python code/sample_from_checkpoint.py --ckpt checkpoints/ch10_gpt.pt --prompt "Philosophy is" --max-new-tokens 120 --temperature 0.9 --top-p 0.95 17 | 18 | Notes 19 | ----- 20 | - If you trained with a custom tokenizer from Chapter 6, pass a prompt that is 21 | compatible with your vocabulary or adapt this script to encode/decode with 22 | that tokenizer. 
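- If the checkpoint lacks embedded tokenizer metadata, pass --level char (or
  word) together with --ref-text pointing at the original training file(s) so
  the vocabulary can be rebuilt; this is best-effort, and the script warns when
  the rebuilt vocabulary size does not match the model's.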
23 | """ 24 | 25 | from __future__ import annotations 26 | 27 | 28 | import argparse 29 | import sys 30 | from typing import Optional 31 | 32 | import torch 33 | 34 | # Make "code/" importable when running as a script 35 | from pathlib import Path 36 | sys.path.append(str(Path(__file__).resolve().parent)) 37 | 38 | from ch09_gpt import GPT, GPTConfig # type: ignore 39 | from ch11_sampling import sample # type: ignore 40 | 41 | 42 | def auto_device() -> str: 43 | if torch.cuda.is_available(): 44 | return "cuda" 45 | mps = getattr(torch.backends, "mps", None) 46 | if mps and torch.backends.mps.is_available(): 47 | return "mps" 48 | return "cpu" 49 | 50 | 51 | def main(argv: Optional[list[str]] = None) -> None: 52 | p = argparse.ArgumentParser(description="Sample from a GPT checkpoint") 53 | p.add_argument("--ckpt", default="checkpoints/ch10_gpt.pt", help="path to .pt") 54 | p.add_argument("--prompt", default="Hello", help="Prompt text") 55 | p.add_argument("--max-new-tokens", type=int, default=120) 56 | p.add_argument("--temperature", type=float, default=0.9) 57 | p.add_argument("--top-k", type=int, default=0) 58 | p.add_argument("--top-p", type=float, default=0.0) 59 | p.add_argument("--device", default="auto") 60 | p.add_argument("--seed", type=int, default=0) 61 | p.add_argument("--level", default="auto", choices=["auto", "byte", "char", "word"], 62 | help="tokenization level for prompt/decoding") 63 | p.add_argument("--ref-text", nargs='*', default=[], 64 | help="text file(s) used to rebuild tokenizer for char/word levels") 65 | args = p.parse_args(argv) 66 | 67 | torch.manual_seed(args.seed) 68 | device = auto_device() if args.device == "auto" else args.device 69 | ckpt = torch.load(args.ckpt, map_location=device) 70 | cfg = GPTConfig(**ckpt["config"]) 71 | model = GPT(cfg).to(device) 72 | model.load_state_dict(ckpt["model_state"]) 73 | model.eval() 74 | 75 | # Determine tokenization/decoding strategy 76 | level = args.level 77 | if level == "auto": 78 | # Heuristic: byte-level models typically have vocab_size==256 79 | level = "byte" if cfg.vocab_size == 256 else "char" 80 | 81 | if level == "byte": 82 | prompt_bytes = args.prompt.encode("utf-8") 83 | input_ids = torch.tensor([list(prompt_bytes)], dtype=torch.long, device=device) 84 | out = sample( 85 | model, 86 | input_ids, 87 | max_new_tokens=args.max_new_tokens, 88 | temperature=args.temperature, 89 | top_k=(args.top_k if args.top_k > 0 else None), 90 | top_p=(args.top_p if args.top_p > 0 else None), 91 | ) 92 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 93 | print(text) 94 | else: 95 | # Prefer tokenizer embedded in checkpoint; otherwise rebuild from refs. 
96 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 97 | tok = None 98 | if "tokenizer" in ckpt: 99 | meta = ckpt["tokenizer"] 100 | if meta.get("level") == level and meta.get("id_to_token"): 101 | id_to_token = list(meta["id_to_token"]) # ensure list 102 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 103 | pad_id = int(meta.get("pad_id", 0)) 104 | unk_id = int(meta.get("unk_id", 1)) 105 | vocab = Vocab(token_to_id=token_to_id, id_to_token=id_to_token, pad=pad_id, unk=unk_id) 106 | tok = SimpleTokenizer(vocab=vocab, level=level) 107 | if tok is None: 108 | if not args.ref_text: 109 | print("ERROR: provide --ref-text files to rebuild tokenizer for level=char/word.") 110 | sys.exit(2) 111 | ref = "\n".join(Path(p).read_text(encoding="utf-8") for p in args.ref_text) 112 | tokens = SimpleTokenizer._split(ref, level) 113 | vocab = Vocab.build(tokens) 114 | tok = SimpleTokenizer(vocab=vocab, level=level) 115 | if len(vocab) != cfg.vocab_size: 116 | print(f"WARNING: tokenizer vocab {len(vocab)} != model vocab {cfg.vocab_size}; decoding may be off.") 117 | input_ids = torch.tensor([tok.encode(args.prompt)], dtype=torch.long, device=device) 118 | out = sample( 119 | model, 120 | input_ids, 121 | max_new_tokens=args.max_new_tokens, 122 | temperature=args.temperature, 123 | top_k=(args.top_k if args.top_k > 0 else None), 124 | top_p=(args.top_p if args.top_p > 0 else None), 125 | ) 126 | print(tok.decode(out[0].tolist())) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /notebooks/ch15_deployment_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8e6e35ea", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "f18acf81", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 15 — Deployment & Applications\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "494d5b47", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Package the trained model with lightweight serving infrastructure.\n", 29 | "- Design latency and throughput tests that match production realities.\n", 30 | "- Plan monitoring hooks to catch model drift and quality regressions." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "18e472fa", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We export the model, wire a minimal API, and simulate traffic patterns to validate performance before launch." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "06ae0dd1", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Document your deployment assumptions (hardware, concurrency, SLAs). These constraints shape every implementation choice." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "57b5f971", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Bundle structure (config + weights + tokenizer)\n", 61 | "import json, torch\n", 62 | "example = {\n", 63 | " 'config': {'vocab_size': 32, 'block_size': 8, 'd_model': 32,\n", 64 | " 'n_head': 4, 'n_layer': 2, 'd_ff': 64, 'dropout': 0.0},\n", 65 | " 'model_state': {},\n", 66 | " 'tokenizer': {'level': 'char', 'id_to_token': list(' _abcdefghijklmnopqrstuvwxyz'),\n", 67 | " 'pad_id': 0, 'unk_id': 1}\n", 68 | "}\n", 69 | "list(example.keys())\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "3c11263e", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Load + sample from a real bundle (requires repo present and a bundle path)\n", 80 | "import sys, pathlib\n", 81 | "sys.path.append(str(pathlib.Path('code').resolve()))\n", 82 | "from ch09_gpt import GPT, GPTConfig # type: ignore\n", 83 | "from ch11_sampling import sample # type: ignore\n", 84 | "from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore\n", 85 | "\n", 86 | "def load_bundle(path: str):\n", 87 | " b = torch.load(path, map_location='cpu')\n", 88 | " cfg = GPTConfig(**b['config'])\n", 89 | " model = GPT(cfg).eval(); model.load_state_dict(b['model_state'])\n", 90 | " meta = b.get('tokenizer'); tok = None\n", 91 | " if meta and meta.get('id_to_token'):\n", 92 | " id_to_token = list(meta['id_to_token'])\n", 93 | " token_to_id = {t:i for i,t in enumerate(id_to_token)}\n", 94 | " vocab = Vocab(token_to_id=token_to_id, id_to_token=id_to_token,\n", 95 | " pad=int(meta.get('pad_id',0)), unk=int(meta.get('unk_id',1)))\n", 96 | " tok = SimpleTokenizer(vocab=vocab, level=meta.get('level','char'))\n", 97 | " return model, tok\n", 98 | "\n", 99 | "def sample_bundle(bundle_path: str, prompt: str,\n", 100 | " max_new=80, temperature=0.9, top_p=0.95, top_k=0):\n", 101 | " model, tok = load_bundle(bundle_path)\n", 102 | " if tok is None:\n", 103 | " ids = torch.tensor([[c for c in prompt.encode('utf-8')]], dtype=torch.long)\n", 104 | " out = sample(model, ids, max_new_tokens=max_new, temperature=temperature,\n", 105 | " top_k=(top_k or None), top_p=(top_p or None))\n", 106 | " return bytes(out[0].tolist()).decode('utf-8', errors='ignore')\n", 107 | " ids = torch.tensor([tok.encode(prompt)], dtype=torch.long)\n", 108 | " out = sample(model, ids, max_new_tokens=max_new, temperature=temperature,\n", 109 | " top_k=(top_k or None), top_p=(top_p or None))\n", 110 | " return 
tok.decode(out[0].tolist())\n", 111 | "\n", 112 | "# Example (requires a real bundle):\n", 113 | "# sample_bundle('model_bundle.pt', 'Hello')\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "f03b3efd", 119 | "metadata": {}, 120 | "source": [ 121 | "## Exercises\n", 122 | "\n", 123 | "- Build a FastAPI or Flask microservice that serves one of the trained checkpoints.\n", 124 | "- Design a canary analysis comparing the deployed model to a previous baseline.\n", 125 | "- Draft a monitoring plan that includes automated alerts and qualitative spot checks." 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "66e1a39f", 131 | "metadata": {}, 132 | "source": [ 133 | "" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "name": "python", 145 | "version": "3.10" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 5 150 | } 151 | -------------------------------------------------------------------------------- /notebooks/ch01_intro_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a4ad2d66", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "417e62fd", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 1 — Introduction\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "ddd151a2", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Orient yourself in the project structure used throughout the book.\n", 29 | "- Run quick environment diagnostics to ensure Colab is ready for later chapters.\n", 30 | "- Capture personal learning goals so you can revisit them after reading." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "53cb70df", 36 | "metadata": {}, 37 | "source": [ 38 | "### Learning Objectives\n", 39 | "\n", 40 | "- Understand how the book resources are organized.\n", 41 | "- Identify the core assets needed to build and train attoLLM.\n", 42 | "- Plan how you will iterate through the practical chapters." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "2c455e5b", 48 | "metadata": {}, 49 | "source": [ 50 | "### Project Tour\n", 51 | "\n", 52 | "The repository follows a predictable structure so you can navigate quickly:\n", 53 | "\n", 54 | "- `chapters/` holds the AsciiDoc sources for the printed book.\n", 55 | "- `notebooks/` mirrors the chapters with runnable, Colab-optimized notebooks.\n", 56 | "- `code/` contains reusable modules that the notebooks import.\n", 57 | "\n", 58 | "Spend a minute opening each directory in Colab's file browser so you know where to find things later." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "92ee6ac5", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from pathlib import Path\n", 69 | "import textwrap\n", 70 | "\n", 71 | "root = Path('.')\n", 72 | "folders = ['chapters', 'notebooks', 'code', 'data']\n", 73 | "for folder in folders:\n", 74 | " path = root / folder\n", 75 | " print(f\"{folder:>10}: {'found' if path.exists() else 'missing'}\")\n", 76 | "\n", 77 | "message = textwrap.dedent(\n", 78 | " \"\"\"Tip: In Colab you can double-click folders in the left sidebar to expand them.\n", 79 | "Make sure the directories you expect are present before moving on.\"\"\"\n", 80 | ")\n", 81 | "print()\n", 82 | "print(message)\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "524b9c41", 88 | "metadata": {}, 89 | "source": [ 90 | "### Environment Check\n", 91 | "\n", 92 | "Even this introductory chapter benefits from confirming that Colab offers the right tooling. Run the cell below and capture the output in case you need to debug later." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "5d792061", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import platform\n", 103 | "import psutil\n", 104 | "\n", 105 | "print(f\"Python version : {platform.python_version()}\")\n", 106 | "print(f\"Kernel : {platform.release()}\")\n", 107 | "print(f\"Processor : {platform.processor() or 'N/A'}\")\n", 108 | "print(f\"Logical cores : {psutil.cpu_count(logical=True)}\")\n", 109 | "print(f\"Memory (GB) : {psutil.virtual_memory().total / 1e9:.2f}\")\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "696a0174", 115 | "metadata": {}, 116 | "source": [ 117 | "### Capture Your Intentions\n", 118 | "\n", 119 | "Use the template below to jot down what you plan to build and what you hope to learn. Revisit this at the end of the book to measure progress." 
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "eec82bf4", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "from datetime import date\n", 130 | "\n", 131 | "template = f\"\"\"# Learning Journal ({date.today().isoformat()})\n", 132 | "\n", 133 | "- Why I am reading this book: \n", 134 | "- My baseline understanding of LLMs: \n", 135 | "- Tools I am most comfortable with today: \n", 136 | "- Success will look like: \n", 137 | "\"\"\"\n", 138 | "print(template)\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "d98dbbed", 144 | "metadata": {}, 145 | "source": [ 146 | "## Exercises\n", 147 | "\n", 148 | "- List three specific questions you want answered while working through the book.\n", 149 | "- Note which chapters you plan to tackle in the next study session and why.\n", 150 | "- Sketch a simple schedule for revisiting your learning journal every few chapters." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "6d018303", 156 | "metadata": {}, 157 | "source": [ 158 | "" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "colab": { 164 | "name": "Chapter 01 · Introduction" 165 | }, 166 | "kernelspec": { 167 | "display_name": "Python 3", 168 | "language": "python", 169 | "name": "python3" 170 | }, 171 | "language_info": { 172 | "name": "python", 173 | "version": "3.10" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 5 178 | } 179 | -------------------------------------------------------------------------------- /notebooks/ch04_hardware_software_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d720f282", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "0ff8ea2e", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 4 — Hardware Platforms & Software Setup\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "df0eca6b", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Inspect the hardware Colab assigned (CPU, GPU, RAM) and log the results.\n", 29 | "- Benchmark lightweight operations to estimate the runtime of later experiments.\n", 30 | "- Decide when to scale up or down based on the metrics you collect here." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "696cebf7", 36 | "metadata": {}, 37 | "source": [ 38 | "### Hardware Discovery\n", 39 | "\n", 40 | "Understanding what resources you have prevents surprises when training longer experiments. The cells below gather GPU, CPU, and RAM details." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "1d413986", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import platform\n", 51 | "import psutil\n", 52 | "\n", 53 | "print(f\"Python : {platform.python_version()}\")\n", 54 | "print(f\"Machine : {platform.machine()}\")\n", 55 | "print(f\"Processor : {platform.processor() or 'N/A'}\")\n", 56 | "print(f\"Logical CPU : {psutil.cpu_count(logical=True)}\")\n", 57 | "print(f\"Physical CPU: {psutil.cpu_count(logical=False)}\")\n", 58 | "print(f\"Total RAM : {psutil.virtual_memory().total / 1e9:.2f} GB\")\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "d5f06e3c", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# GPU diagnostics (works when a GPU runtime is enabled in Colab)\n", 69 | "try:\n", 70 | " import torch\n", 71 | " if torch.cuda.is_available():\n", 72 | " gpu_index = 0\n", 73 | " props = torch.cuda.get_device_properties(gpu_index)\n", 74 | " print(f\"Device : {props.name}\")\n", 75 | " print(f\"Memory (GB) : {props.total_memory / 1e9:.2f}\")\n", 76 | " print(f\"SM count : {props.multi_processor_count}\")\n", 77 | " else:\n", 78 | " print(\"No CUDA-capable GPU detected. Switch to a GPU runtime if needed.\")\n", 79 | "except Exception as exc:\n", 80 | " print(f\"Torch not available or failed to query CUDA: {exc}\")\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "f5432d2c", 86 | "metadata": {}, 87 | "source": [ 88 | "### Quick Benchmark\n", 89 | "\n", 90 | "Run a tiny matmul benchmark. The absolute number is less important than the comparison you can make when switching runtimes." 
91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "0fb72ff8", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "import time\n", 101 | "import torch\n", 102 | "\n", 103 | "def benchmark_matmul(dim: int = 2048, repeats: int = 5):\n", 104 | " device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", 105 | " x = torch.randn(dim, dim, device=device)\n", 106 | " y = torch.randn(dim, dim, device=device)\n", 107 | " if device == 'cuda':\n", 108 | " torch.cuda.synchronize()\n", 109 | " timings = []\n", 110 | " for _ in range(repeats):\n", 111 | " start = time.time()\n", 112 | " _ = x @ y\n", 113 | " if device == 'cuda':\n", 114 | " torch.cuda.synchronize()\n", 115 | " timings.append(time.time() - start)\n", 116 | " return device, sum(timings) / len(timings)\n", 117 | "\n", 118 | "device, avg = benchmark_matmul()\n", 119 | "print(f\"Average matmul time on {device.upper()}: {avg:.4f} seconds\")\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "c6777f19", 125 | "metadata": {}, 126 | "source": [ 127 | "### Interpreting the Results\n", 128 | "\n", 129 | "Compare the metrics above with the recommendations in the chapter. Record whether your current environment is sufficient or if you should upgrade for compute-heavy chapters." 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "id": "6965f0d2", 135 | "metadata": {}, 136 | "source": [ 137 | "## Exercises\n", 138 | "\n", 139 | "- Measure the impact of different batch sizes on the benchmark timing above.\n", 140 | "- Record GPU memory usage before and after running the benchmark using `torch.cuda.memory_allocated()`.\n", 141 | "- Summarize in a short paragraph what hardware is ideal for full training runs versus experimentation." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "09f87ed5", 147 | "metadata": {}, 148 | "source": [ 149 | "" 150 | ] 151 | } 152 | ], 153 | "metadata": { 154 | "colab": { 155 | "name": "Chapter 04 · Hardware & Software" 156 | }, 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "name": "python", 164 | "version": "3.10" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 5 169 | } 170 | -------------------------------------------------------------------------------- /code/ch08_transformer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | import math 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | 18 | def sinusoidal_positions( 19 | T: int, 20 | d_model: int, 21 | device: torch.device | None = None, 22 | ) -> torch.Tensor: 23 | """Return [T, d_model] sinusoidal position encodings (sin/cos pairs). 
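
For position pos and dimension index i the angle grid is
pos / 10000 ** (2 * (i // 2) / d_model); even dimensions take the sine of the
angle and odd dimensions take the cosine, exactly as computed below.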
24 | 25 | - T: sequence length (time steps) 26 | - d_model: embedding dimension (even recommended) 27 | """ 28 | # positions [T,1] and index grid [1,D] 29 | pos = torch.arange(T, device=device).float()[:, None] 30 | i = torch.arange(d_model, device=device).float()[None, :] 31 | # frequency grid [T,D] 32 | angle = pos / (10000 ** (2 * (i // 2) / d_model)) 33 | enc = torch.zeros(T, d_model, device=device) 34 | enc[:, 0::2] = torch.sin(angle[:, 0::2]) # even dims = sin 35 | enc[:, 1::2] = torch.cos(angle[:, 1::2]) # odd dims = cos 36 | return enc 37 | 38 | 39 | class MultiHeadAttention(nn.Module): 40 | """Multi‑head self‑attention (single module, H heads). 41 | 42 | d_model = H * Dh, where Dh is per‑head dim. We project to Q,K,V, split into 43 | heads, apply scaled dot‑product attention per head, then concat and project out. 44 | """ 45 | 46 | def __init__(self, d_model: int, num_heads: int, dropout: float = 0.0): 47 | super().__init__() 48 | assert d_model % num_heads == 0, "d_model must be divisible by num_heads" 49 | self.h = num_heads # number of heads 50 | self.d = d_model // num_heads # per‑head dim 51 | self.qkv = nn.Linear(d_model, 3 * d_model, bias=False) # shared proj 52 | self.out = nn.Linear(d_model, d_model, bias=False) # output proj 53 | self.drop = nn.Dropout(dropout) 54 | 55 | def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor: 56 | B, T, Dm = x.shape # batch, time, model dim 57 | qkv = self.qkv(x) # [B, T, 3*Dm] 58 | q, k, v = qkv.chunk(3, dim=-1) # each [B, T, Dm] 59 | 60 | # Split heads: [B,T,Dm] -> [B,H,T,Dh], 61 | # then put heads dimension before time. 62 | def split(t: torch.Tensor) -> torch.Tensor: 63 | return t.view(B, T, self.h, self.d).transpose(1, 2) 64 | 65 | q, k, v = map(split, (q, k, v)) # [B, H, T, Dh] 66 | 67 | # Build [B, H, T, T] boolean mask (True = disallowed) 68 | mask_bool = None 69 | if mask is not None: 70 | if mask.dim() == 2: 71 | mask_bool = ( 72 | (mask == 0) 73 | .bool()[None, None, :, :] 74 | .expand(B, self.h, T, T) 75 | ) 76 | elif mask.dim() == 3: 77 | mask_bool = ( 78 | (mask == 0) 79 | .bool() 80 | .unsqueeze(1) 81 | .expand(B, self.h, T, T) 82 | ) 83 | elif mask.dim() == 4: 84 | if mask.size(1) == 1: 85 | mask_bool = ( 86 | (mask == 0).bool().expand(B, self.h, T, T) 87 | ) 88 | else: 89 | mask_bool = (mask == 0).bool() 90 | 91 | # Manual scaled dot‑product attention for portability (MPS-safe) 92 | Dh = self.d 93 | scores = (q @ k.transpose(-2, -1)) / (Dh ** 0.5) # [B,H,T,T] 94 | if mask_bool is not None: 95 | scores = scores.masked_fill(mask_bool, float(-1e9)) 96 | w = torch.softmax(scores, dim=-1) 97 | attn = w @ v # [B,H,T,Dh] 98 | attn = self.drop(attn) 99 | 100 | # Concatenate heads back: [B, H, T, Dh] -> [B, T, Dm] 101 | y = ( 102 | attn.transpose(1, 2) 103 | .contiguous() 104 | .view(B, T, Dm) 105 | ) 106 | return self.out(y) 107 | 108 | 109 | class FeedForward(nn.Module): 110 | """Position‑wise MLP with GELU and dropout.""" 111 | 112 | def __init__(self, d_model: int, d_ff: int, dropout: float = 0.0): 113 | super().__init__() 114 | self.net = nn.Sequential( 115 | nn.Linear(d_model, d_ff), # expand 116 | nn.GELU(), # nonlinearity 117 | nn.Dropout(dropout), 118 | nn.Linear(d_ff, d_model), # project back 119 | nn.Dropout(dropout), 120 | ) 121 | 122 | def forward(self, x: torch.Tensor) -> torch.Tensor: 123 | return self.net(x) 124 | 125 | 126 | class Residual(nn.Module): 127 | """Pre‑norm residual wrapper: x + sublayer(LN(x)).""" 128 | 129 | def __init__(self, d_model: int): 130 | super().__init__() 131 
| self.norm = nn.LayerNorm(d_model) 132 | 133 | def forward(self, x: torch.Tensor, sublayer: nn.Module, *args, **kwargs) -> torch.Tensor: 134 | return x + sublayer(self.norm(x), *args, **kwargs) 135 | 136 | 137 | class TransformerBlock(nn.Module): 138 | """One pre‑norm transformer block: MHA + FFN with residuals.""" 139 | 140 | def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.0): 141 | super().__init__() 142 | self.mha = MultiHeadAttention(d_model, num_heads, dropout) 143 | self.ffn = FeedForward(d_model, d_ff, dropout) 144 | self.res1 = Residual(d_model) 145 | self.res2 = Residual(d_model) 146 | 147 | def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor: 148 | x = self.res1(x, self.mha, mask) # attend + residual 149 | x = self.res2(x, self.ffn) # think (FFN) + residual 150 | return x 151 | 152 | 153 | __all__ = [ 154 | "sinusoidal_positions", 155 | "MultiHeadAttention", 156 | "FeedForward", 157 | "Residual", 158 | "TransformerBlock", 159 | ] 160 | -------------------------------------------------------------------------------- /notebooks/ch03_setup_project_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "89036b8d", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c4b76bea", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 3 — Setting Up the Project\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "dc91896e", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Clone or sync the repository into your Colab workspace in a reproducible way.\n", 29 | "- Install Python dependencies with pinned versions for deterministic runs.\n", 30 | "- Validate the setup by running a smoke test that imports key modules." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "15dd1003", 36 | "metadata": {}, 37 | "source": [ 38 | "### Repository Setup Options\n", 39 | "\n", 40 | "There are two common paths in Colab:\n", 41 | "\n", 42 | "1. Mount a cloud drive (Google Drive, Dropbox, etc.) and access the synchronized folder.\n", 43 | "2. Clone the repository directly with `git clone`.\n", 44 | "\n", 45 | "Pick the approach that gives you the most reliable storage for persistent artifacts." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "ec4c7e2c", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Example: cloning via Git (replace the URL with your fork if needed)\n", 56 | "import os\n", 57 | "import subprocess\n", 58 | "import sys\n", 59 | "\n", 60 | "repo_url = \"https://github.com/your-org/atto-llm-book.git\"\n", 61 | "if os.environ.get('COLAB_RELEASE_TAG'):\n", 62 | " subprocess.run(['git', 'clone', '--depth=1', repo_url, 'project_repo'], check=True)\n", 63 | "else:\n", 64 | " print('Skipping git clone outside Colab. Run this cell manually when you have network access.')\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "4c3c9d62", 70 | "metadata": {}, 71 | "source": [ 72 | "### Dependency Management\n", 73 | "\n", 74 | "Use a requirements file whenever possible. That way your future training runs or deployments start from the same baseline environment." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "04cb23c5", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# Use pip to install only what you need for this chapter.\n", 85 | "import os\n", 86 | "import subprocess\n", 87 | "import sys\n", 88 | "\n", 89 | "if os.environ.get('COLAB_RELEASE_TAG'):\n", 90 | " subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'], check=True)\n", 91 | "else:\n", 92 | " print('Skipping pip install outside Colab to keep validation fast. Run this in Colab when needed.')\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "bebf4752", 98 | "metadata": {}, 99 | "source": [ 100 | "### Smoke Test the Environment\n", 101 | "\n", 102 | "Import the utilities that later notebooks expect. If this cell fails, resolve the issue before moving forward." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "bbac02ab", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "try:\n", 113 | " from code.attollm.data import dataset\n", 114 | " from code.attollm.model import transformer\n", 115 | " print(\"Modules imported successfully.\")\n", 116 | "except ModuleNotFoundError as exc:\n", 117 | " print(f\"Import failed: {exc}\")\n", 118 | " print(\"Adjust PYTHONPATH or install missing packages before continuing.\")\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "50a2eecd", 124 | "metadata": {}, 125 | "source": [ 126 | "### Configuration Snapshot\n", 127 | "\n", 128 | "Capture the versions of critical libraries so your future self (or collaborators) knows what environment produced your results." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "fd41e738", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import json\n", 139 | "import pkg_resources\n", 140 | "\n", 141 | "packages_of_interest = [\"torch\", \"numpy\", \"pandas\", \"transformers\"]\n", 142 | "versions = {}\n", 143 | "for pkg in packages_of_interest:\n", 144 | " try:\n", 145 | " versions[pkg] = pkg_resources.get_distribution(pkg).version\n", 146 | " except pkg_resources.DistributionNotFound:\n", 147 | " versions[pkg] = \"not installed\"\n", 148 | "print(json.dumps(versions, indent=2))\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "12814a21", 154 | "metadata": {}, 155 | "source": [ 156 | "## Exercises\n", 157 | "\n", 158 | "- Decide where you will store checkpoints and large artifacts; document the path in your notes.\n", 159 | "- Create a new Python virtual environment (locally or in Colab) and list the exact activation steps.\n", 160 | "- Write a short shell script that provisions the environment from scratch using the commands above." 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "0ea7b832", 166 | "metadata": {}, 167 | "source": [ 168 | "" 169 | ] 170 | } 171 | ], 172 | "metadata": { 173 | "colab": { 174 | "name": "Chapter 03 · Project Setup" 175 | }, 176 | "kernelspec": { 177 | "display_name": "Python 3", 178 | "language": "python", 179 | "name": "python3" 180 | }, 181 | "language_info": { 182 | "name": "python", 183 | "version": "3.10" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 5 188 | } 189 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Building a Large Language Model from Scratch - Book Companion Code 4 | 5 | This repository contains the executable code assets that accompany the book: 6 | 7 | Building a Large Language Model from Scratch — A Step‑by‑Step Guide Using Python and PyTorch 8 | 9 | It is designed as a hands‑on companion: pair the book manuscript with this repo to run the same scripts and notebooks locally, inspect results, and extend the examples. 10 | 11 | ## What’s Inside 12 | 13 | - `code/` — Python modules and scripts per chapter, plus small utilities. 14 | - `notebooks/` — Jupyter notebooks that mirror the book’s walkthroughs. 15 | - `figures/` — Static figures generated by the code (used in the book). 16 | - `tools/` — Validation helpers to run scripts/notebooks in batch. 17 | - `requirements.txt` — Minimal packages used across chapters. 18 | 19 | The project keeps dependencies intentionally light. 
Install PyTorch separately following the official instructions for your platform. 20 | 21 | ## Setup: Local Virtual Environment 22 | 23 | Below are reliable, copy‑pasteable steps to get a clean `.venv` and install requirements. 24 | 25 | Unix/macOS (bash/zsh) 26 | 27 | ``` 28 | python3 -m venv .venv 29 | source .venv/bin/activate 30 | python -m pip install --upgrade pip wheel 31 | pip install -r requirements.txt 32 | 33 | # Install PyTorch for your platform (examples): 34 | # CPU only: 35 | # pip install --index-url https://download.pytorch.org/whl/cpu torch torchvision torchaudio 36 | # Apple Silicon (MPS): 37 | # pip install torch torchvision torchaudio 38 | # CUDA (choose the right CUDA version from pytorch.org): 39 | # pip install --index-url https://download.pytorch.org/whl/cu121 torch torchvision torchaudio 40 | 41 | # Helpful links: 42 | # Apple Silicon (MPS) guide: https://pytorch.org/get-started/locally/ (select macOS + pip + MPS) 43 | # CUDA wheels index (e.g., cu121): https://download.pytorch.org/whl/cu121 44 | # CPU wheels index: https://download.pytorch.org/whl/cpu 45 | 46 | # Optional: for notebook execution helper 47 | pip install nbclient nbformat 48 | ``` 49 | 50 | Windows (PowerShell) 51 | 52 | ``` 53 | py -m venv .venv 54 | .\.venv\Scripts\Activate.ps1 55 | python -m pip install --upgrade pip wheel 56 | pip install -r requirements.txt 57 | 58 | # Install PyTorch from https://pytorch.org/get-started/locally/ 59 | # Example (CPU only): 60 | # pip install --index-url https://download.pytorch.org/whl/cpu torch torchvision torchaudio 61 | 62 | # Helpful links: 63 | # Apple Silicon (MPS) guide: https://pytorch.org/get-started/locally/ (select macOS + pip + MPS) 64 | # CUDA wheels index (e.g., cu121): https://download.pytorch.org/whl/cu121 65 | # CPU wheels index: https://download.pytorch.org/whl/cpu 66 | 67 | # Optional: for notebook execution helper 68 | pip install nbclient nbformat 69 | ``` 70 | 71 | ## Python Version 72 | 73 | - Recommended: Python 3.11. This repo is tested with 3.11 and avoids edge cases seen with newer versions. 74 | - If you encounter errors like: `AttributeError: module 'code' has no attribute 'InteractiveConsole'` (often on Python 3.13), use Python 3.11 for now. The project’s `code/` package name can shadow the stdlib module named `code` under some setups. 
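A quick way to check both the interpreter version and which `code` module Python picks up when run from the repository root (the printed paths will differ on your machine):

```
python -c "import sys, code; print(sys.version.split()[0]); print(code.__file__)"
```

If the second line points into this repository's `code/` directory rather than the standard library, you are seeing the shadowing described above.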
75 | 76 | Quick ways to create a 3.11 venv 77 | 78 | - macOS (Homebrew): 79 | 80 | ``` 81 | brew install python@3.11 82 | /opt/homebrew/bin/python3.11 -m venv .venv 83 | source .venv/bin/activate 84 | pip install -r requirements.txt 85 | ``` 86 | 87 | - pyenv: 88 | 89 | ``` 90 | pyenv install 3.11.9 91 | ~/.pyenv/versions/3.11.9/bin/python -m venv .venv 92 | source .venv/bin/activate 93 | pip install -r requirements.txt 94 | ``` 95 | 96 | ## Validating Everything Runs 97 | 98 | - Validate all Python scripts under `code/`: 99 | 100 | ``` 101 | python tools/validate_scripts.py 102 | ``` 103 | 104 | - Validate all notebooks under `notebooks/` (executes each notebook): 105 | 106 | ``` 107 | python tools/validate_notebooks.py 108 | # or force a specific kernel 109 | python tools/validate_notebooks.py --kernel python3 110 | ``` 111 | 112 | Both validators print structured, timed output for Structure / Imports / Execute and end with a summary like: 113 | 114 | ``` 115 | [25/26] Validating notebooks/ch25_reproducible_research_playbook.ipynb 116 | • Structure: OK (1.3ms) 117 | • Imports: OK (1.6ms scan, 0.0ms probe) 118 | • Execute: OK (1.06s) 119 | • Total: 1.07s 120 | ``` 121 | 122 | ## Requirements 123 | 124 | The minimal shared requirements are captured in `requirements.txt`: 125 | 126 | ``` 127 | numpy>=1.24 128 | tqdm>=4.66 129 | tensorboard>=2.13 130 | graphviz>=0.20 131 | typing-extensions>=4.8 132 | 133 | # IMPORTANT: Install PyTorch following instructions for your OS/GPU: 134 | # https://pytorch.org/get-started/locally/ 135 | ``` 136 | 137 | Some chapters may suggest optional extras (e.g., `fastapi`, `streamlit`) in the code comments; install those ad‑hoc when exploring those parts. 138 | 139 | ## Tips 140 | 141 | - Keep edit→run loops tight by working in a small terminal + editor split and running individual scripts first, then notebooks. 142 | - Use `code/env_check.py` and `code/check_backends.py` to confirm your Python/torch environment. 143 | - If notebook execution is slow, start with scripts to validate logic, then run the notebook cell‑by‑cell. 144 | 145 | ## Disclaimer 146 | 147 | This repository is provided "as is" for educational purposes. No warranties, guarantees, or representations of any kind are made regarding correctness, completeness, fitness for a particular purpose, or non‑infringement. Code and examples are intended for illustration and learning; they may omit production concerns (error handling, security, performance, robustness). Use at your own risk. Dependencies evolve over time; version changes can affect behavior or break examples. 148 | 149 | ## Credits and License 150 | 151 | © Dr. Yves J. Hilpisch (The Python Quants GmbH). All rights reserved unless otherwise noted. See source files for per‑file notices where applicable. 152 | 153 | 154 | This repository is a companion to the book “Building a Large Language Model from Scratch — A Step‑by‑Step Guide Using Python and PyTorch”. 
155 | 156 | 157 | -------------------------------------------------------------------------------- /notebooks/ch07_attention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0232c872", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "77846170", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 7 — Attention & Self-Attention Mechanism\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "4f58d442", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Implement scaled dot-product attention step by step before abstracting it away.\n", 29 | "- Inspect attention weights on curated toy sequences to build intuition.\n", 30 | "- Connect the math to code by tracing shapes and broadcasting rules carefully." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "5b73ad6d", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We derive attention scores, apply masking, and then batch the operation so it scales to transformer blocks." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "e2858b3c", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Print intermediate tensors as you go. Seeing the score matrices and masks makes it easier to reason about what each line of code accomplishes." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "2eca1602", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Ensure torch is available (Colab friendly)\n", 61 | "try:\n", 62 | " import torch # noqa\n", 63 | " print('torch:', torch.__version__)\n", 64 | "except Exception:\n", 65 | " import os\n", 66 | " gpu = os.system('nvidia-smi > /dev/null 2>&1') == 0\n", 67 | " index = 'https://download.pytorch.org/whl/cu121' if gpu else 'https://download.pytorch.org/whl/cpu'\n", 68 | " get_ipython().run_line_magic('pip', f'install -q torch --index-url {index}')\n", 69 | " import torch\n", 70 | " print('torch:', torch.__version__)\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "258ac315", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "import matplotlib.pyplot as plt\n", 81 | "plt.style.use('seaborn-v0_8')\n", 82 | "%config InlineBackend.figure_format = 'svg'\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "13a671dd", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from torch import Tensor\n", 93 | "def scaled_dot_product_attention(q: Tensor, k: Tensor, v: Tensor, mask: Tensor | None = None) -> Tensor:\n", 94 | " d = q.size(-1)\n", 95 | " scores = (q @ k.transpose(-2, -1)) / (d ** 0.5)\n", 96 | " if mask is not None:\n", 97 | " scores = scores.masked_fill(mask == 0, float('-inf'))\n", 98 | " w = torch.softmax(scores, dim=-1)\n", 99 | " return w @ v\n", 100 | "def causal_mask(batch: int, time: int, device=None):\n", 101 | " base = torch.tril(torch.ones(time, time, device=device))\n", 102 | " return base.unsqueeze(0).expand(batch, -1, -1)\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "8b88571d", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# Set random seed\n", 113 | "torch.manual_seed(0)\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "93f39e6f", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Define shapes\n", 124 | "B, T, D = 1, 6, 4\n", 125 | "(B, T, D)\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "cb6c3e8a", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# Create a toy input\n", 136 | "x = torch.randn(B, T, D)\n", 137 | "x\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": 
"32236ebd", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Build a causal mask\n", 148 | "mask = causal_mask(B, T)\n", 149 | "mask\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "a8820239", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# Apply attention\n", 160 | "y = scaled_dot_product_attention(x, x, x, mask)\n", 161 | "y\n" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "f1121ec2", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# Visualize a row of attention weights\n", 172 | "with torch.no_grad():\n", 173 | " d = x.size(-1)\n", 174 | " scores = (x @ x.transpose(-2, -1)) / (d ** 0.5)\n", 175 | " scores = scores.masked_fill(mask == 0, float('-inf'))\n", 176 | " w = torch.softmax(scores, dim=-1)[0] # [T, T]\n", 177 | "plt.figure(figsize=(4, 3))\n", 178 | "plt.imshow(w, cmap='viridis', aspect='auto')\n", 179 | "plt.colorbar(label='weight')\n", 180 | "plt.xlabel('key\\npositions')\n", 181 | "plt.ylabel('query positions')\n", 182 | "plt.title('Causal attention weights (toy)')\n", 183 | "plt.tight_layout()\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "43d21080", 189 | "metadata": {}, 190 | "source": [ 191 | "## Exercises\n", 192 | "\n", 193 | "- Implement additive attention and compare its behavior with scaled dot-product attention.\n", 194 | "- Visualize attention maps for sequences with padding to confirm masking works as expected.\n", 195 | "- Modify the notebook to support multi-head attention and measure the parameter count increase." 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "id": "acded7d4", 201 | "metadata": {}, 202 | "source": [ 203 | "" 204 | ] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "name": "python", 215 | "version": "3.10" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 5 220 | } 221 | -------------------------------------------------------------------------------- /notebooks/attollm_colab_starter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2cf14c08", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c2c4bacb", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Project Companion — attoLLM Colab Starter\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "7b132250", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Sanity-check GPU, disk, and Python versions before diving into model training.\n", 29 | "- Mount or sync the storage location you plan to use for datasets and checkpoints.\n", 30 | "- Bookmark the helper utilities you expect to reuse across multiple chapters." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "cd6be911", 36 | "metadata": {}, 37 | "source": [ 38 | "### Starter Checklist\n", 39 | "\n", 40 | "Use the diagnostics cells here before every new Colab session. Consistent environments remove a whole class of hard-to-track bugs." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "93519a54", 46 | "metadata": {}, 47 | "source": [ 48 | "### Collaboration Notes\n", 49 | "\n", 50 | "Share a copy of this notebook with teammates so everyone runs the same validation steps before kicking off workloads." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "67ace6cb", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!nvidia-smi || echo 'No NVIDIA GPU available'" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "d18d930c", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Install a recent CUDA wheel when GPU is present; fallback to CPU wheel\n", 71 | "import os\n", 72 | "import subprocess\n", 73 | "import sys\n", 74 | "\n", 75 | "\n", 76 | "def ensure_torch():\n", 77 | " try:\n", 78 | " import torch # type: ignore\n", 79 | " return torch\n", 80 | " except Exception as exc: # pragma: no cover - diagnostics\n", 81 | " print('torch import failed:', exc)\n", 82 | " if os.environ.get('COLAB_RELEASE_TAG'):\n", 83 | " gpu_available = os.system('nvidia-smi > /dev/null 2>&1') == 0\n", 84 | " index = 'https://download.pytorch.org/whl/cu121' if gpu_available else 'https://download.pytorch.org/whl/cpu'\n", 85 | " subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'torch', '--index-url', index], check=True)\n", 86 | " import torch # type: ignore\n", 87 | " return torch\n", 88 | " return None\n", 89 | "\n", 90 | "\n", 91 | "torch = ensure_torch()\n", 92 | "if torch is None:\n", 93 | " print('Install torch manually when you have network access (skipped for validation).')\n", 94 | "else:\n", 95 | " print('torch:', torch.__version__, 'cuda?', torch.cuda.is_available())\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "06d1e430", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Scaffold attoLLM in /content/attollm (Colab) or a local folder otherwise\n", 106 | "import os\n", 107 | "import pathlib\n", 108 | "import subprocess\n", 109 | "import sys\n", 110 | "import textwrap\n", 111 | "\n", 112 | "root = pathlib.Path('/content/attollm') if os.environ.get('COLAB_RELEASE_TAG') else pathlib.Path.cwd() / 'attollm_scaffold'\n", 113 | "\n", 114 | "(root / 'src/attollm').mkdir(parents=True, exist_ok=True)\n", 115 | "for d in ['scripts', 'configs', 'data/raw', 'data/processed', 'data/cache', 'checkpoints', 'tests']:\n", 116 | " (root / d).mkdir(parents=True, exist_ok=True)\n", 117 | "\n", 118 | "(root / 'README.md').write_text('# attoLLM (Colab)\\n')\n", 119 | "\n", 120 | "gitignore_text = '\\n'.join([\n", 121 | " '__pycache__/',\n", 122 | " '*.pyc',\n", 123 | " 'data/cache/',\n", 124 | " 'checkpoints/',\n", 125 | " 'runs/',\n", 
126 | " '*.pt',\n", 127 | " '*.pth',\n", 128 | "]) + '\\n'\n", 129 | "(root / '.gitignore').write_text(gitignore_text)\n", 130 | "\n", 131 | "requirements_text = 'numpy>=1.24\\n' 'tqdm>=4.66\\n'\n", 132 | "(root / 'requirements.txt').write_text(requirements_text)\n", 133 | "\n", 134 | "pyproject_text = textwrap.dedent(\"\"\"\n", 135 | " [build-system]\n", 136 | " requires = [\"setuptools>=68\", \"wheel\"]\n", 137 | " build-backend = \"setuptools.build_meta\"\n", 138 | "\n", 139 | " [project]\n", 140 | " name = \"attollm\"\n", 141 | " version = \"0.0.1\"\n", 142 | " requires-python = \">=3.10\"\n", 143 | " dependencies = []\n", 144 | "\n", 145 | " [tool.setuptools]\n", 146 | " package-dir = {\"\" = \"src\"}\n", 147 | "\n", 148 | " [tool.setuptools.packages.find]\n", 149 | " where = [\"src\"]\n", 150 | "\"\"\")\n", 151 | "(root / 'pyproject.toml').write_text(pyproject_text)\n", 152 | "\n", 153 | "(root / 'src/attollm/__init__.py').write_text('__all__ = [\"hello\"]\\n')\n", 154 | "\n", 155 | "hello_py = textwrap.dedent(\"\"\"\n", 156 | " def main():\n", 157 | " print(\"Hello from attoLLM (Colab)!\")\n", 158 | "\n", 159 | " if __name__ == \"__main__\":\n", 160 | " main()\n", 161 | "\"\"\")\n", 162 | "(root / 'src/attollm/hello.py').write_text(hello_py)\n", 163 | "\n", 164 | "print('Scaffolded at:', root)\n", 165 | "\n", 166 | "if os.environ.get('COLAB_RELEASE_TAG'):\n", 167 | " subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '-e', str(root)], check=True)\n", 168 | " subprocess.run([sys.executable, '-m', 'attollm.hello'], check=True)\n", 169 | "else:\n", 170 | " print('Running outside Colab; editable install skipped to keep validation fast.')\n", 171 | " print('Inspect the scaffolded package locally at', root)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "e2b12701", 177 | "metadata": {}, 178 | "source": [ 179 | "## Exercises\n", 180 | "\n", 181 | "- Adapt the environment check to alert you when CUDA is missing or mismatched.\n", 182 | "- Mount your preferred cloud storage (Drive, S3 via fsspec, etc.) and verify read/write access.\n", 183 | "- Create a short markdown playbook describing how you spin up a fresh Colab runtime for this project." 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "5f075006", 189 | "metadata": {}, 190 | "source": [ 191 | "" 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "name": "python", 203 | "version": "3.10" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 5 208 | } 209 | -------------------------------------------------------------------------------- /code/ch12_metrics_text.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Educational text metrics for Chapter 12. 9 | 10 | This module implements small, dependency‑free versions of common metrics: 11 | 12 | - BLEU (corpus): n‑gram precision with brevity penalty, optional Add‑1 smoothing. 13 | - ROUGE‑L: F‑measure based on the longest common subsequence (LCS). 14 | - METEOR (simplified): unigram precision/recall F‑mean with a fragmentation 15 | penalty estimated from contiguous matching chunks (no stemming/synonyms). 16 | - Diversity helpers: distinct‑1 / distinct‑2. 
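
As a quick sanity check of the BLEU helper, a hypothesis that exactly matches
its single four-token reference scores 1.0 (the tokens are purely illustrative):

    bleu_corpus([[["the", "cat", "sat", "down"]]], [["the", "cat", "sat", "down"]])  # -> 1.0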
17 | 18 | Inputs are tokenized sequences (lists of strings or ints). We keep the 19 | implementations compact and readable for teaching; they are not drop‑in 20 | replacements for official packages, but align with the main ideas. 21 | """ 22 | 23 | from __future__ import annotations 24 | 25 | 26 | from typing import Iterable, List, Sequence, Tuple 27 | from collections import Counter 28 | 29 | 30 | def _ngram_counts(tokens: Sequence, n: int) -> Counter: 31 | return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)) 32 | 33 | 34 | def bleu_corpus( 35 | references: List[List[Sequence]], 36 | hypotheses: List[Sequence], 37 | max_n: int = 4, 38 | smooth: bool = True, 39 | ) -> float: 40 | """Compute a simple corpus BLEU. 41 | 42 | Args: 43 | references: list of lists of reference token sequences (per hypothesis) 44 | hypotheses: list of hypothesis token sequences 45 | max_n: highest n‑gram order (default 4) 46 | smooth: Add‑1 smoothing for counts 47 | Returns: 48 | BLEU score in [0, 1] 49 | """ 50 | assert len(references) == len(hypotheses) 51 | 52 | # Modified n‑gram precisions 53 | num = [0] * max_n 54 | den = [0] * max_n 55 | 56 | ref_len = 0 57 | hyp_len = 0 58 | 59 | for refs, hyp in zip(references, hypotheses): 60 | hyp_len += len(hyp) 61 | # reference length closest to hypothesis (brevity penalty) 62 | ref_lengths = [len(r) for r in refs] 63 | ref_len += min(ref_lengths, key=lambda rl: (abs(rl - len(hyp)), rl)) 64 | 65 | for n in range(1, max_n + 1): 66 | hyp_counts = _ngram_counts(hyp, n) 67 | max_ref_counts: Counter = Counter() 68 | for r in refs: 69 | max_ref_counts |= _ngram_counts(r, n) 70 | # clipped counts 71 | overlap = { 72 | g: min(c, max_ref_counts.get(g, 0)) for g, c in hyp_counts.items() 73 | } 74 | num[n - 1] += sum(overlap.values()) 75 | den[n - 1] += max(1, sum(hyp_counts.values())) 76 | 77 | # Smoothed precisions 78 | precisions = [] 79 | for i in range(max_n): 80 | if smooth: 81 | precisions.append((num[i] + 1) / (den[i] + 1)) 82 | else: 83 | precisions.append(0.0 if den[i] == 0 else num[i] / den[i]) 84 | 85 | # Brevity penalty 86 | import math 87 | 88 | if hyp_len == 0: 89 | return 0.0 90 | if hyp_len > ref_len: 91 | bp = 1.0 92 | else: 93 | bp = math.exp(1 - ref_len / max(1, hyp_len)) 94 | 95 | # Geometric mean of precisions 96 | gm = math.exp(sum((1 / max_n) * math.log(max(p, 1e-16)) for p in precisions)) 97 | return bp * gm 98 | 99 | 100 | def _lcs_length(a: Sequence, b: Sequence) -> int: 101 | # Classic DP for LCS length (O(len(a)*len(b))) 102 | la, lb = len(a), len(b) 103 | dp = [0] * (lb + 1) 104 | for i in range(1, la + 1): 105 | prev = 0 106 | for j in range(1, lb + 1): 107 | tmp = dp[j] 108 | if a[i - 1] == b[j - 1]: 109 | dp[j] = prev + 1 110 | else: 111 | dp[j] = max(dp[j], dp[j - 1]) 112 | prev = tmp 113 | return dp[lb] 114 | 115 | 116 | def rouge_l( 117 | references: List[List[Sequence]], 118 | hypotheses: List[Sequence], 119 | beta: float = 1.2, 120 | ) -> float: 121 | """Compute ROUGE‑L F‑measure averaged across examples. 122 | 123 | For each hypothesis we take the best reference by LCS F‑measure. 
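    Worked example: with reference "the cat is on the mat" and hypothesis
    "the cat sat on the mat", the LCS has length 5, so precision = recall = 5/6
    and the F-measure is about 0.833 (when P == R the F-measure equals P,
    independent of beta).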
124 | """ 125 | import math 126 | 127 | scores: List[float] = [] 128 | for refs, hyp in zip(references, hypotheses): 129 | best = 0.0 130 | for r in refs: 131 | lcs = _lcs_length(r, hyp) 132 | if lcs == 0: 133 | continue 134 | prec = lcs / max(1, len(hyp)) 135 | rec = lcs / max(1, len(r)) 136 | if prec == 0 and rec == 0: 137 | f = 0.0 138 | else: 139 | beta2 = beta * beta 140 | f = (1 + beta2) * prec * rec / max(beta2 * prec + rec, 1e-12) 141 | best = max(best, f) 142 | scores.append(best) 143 | return sum(scores) / max(1, len(scores)) 144 | 145 | 146 | def _matching_chunks(h: Sequence, r: Sequence) -> Tuple[int, int]: 147 | """Return (matches, chunks) for contiguous exact matches between h and r. 148 | 149 | Used for a simplified METEOR chunk penalty. 150 | """ 151 | # Build index of tokens in r 152 | from collections import defaultdict 153 | 154 | pos = defaultdict(list) 155 | for j, tok in enumerate(r): 156 | pos[tok].append(j) 157 | 158 | matches = 0 159 | chunks = 0 160 | prev_j = None 161 | for tok in h: 162 | if not pos[tok]: 163 | continue 164 | j = pos[tok].pop(0) # greedy match leftmost 165 | matches += 1 166 | if prev_j is None or j != prev_j + 1: 167 | chunks += 1 168 | prev_j = j 169 | return matches, chunks 170 | 171 | 172 | def meteor_simple( 173 | references: List[List[Sequence]], 174 | hypotheses: List[Sequence], 175 | alpha: float = 0.9, 176 | beta: float = 3.0, 177 | gamma: float = 0.5, 178 | ) -> float: 179 | """Simplified METEOR. 180 | 181 | For each (refs, hyp): 182 | - Compute unigram precision P and recall R against each ref (exact match). 183 | - F_mean = (P*R) / (alpha*P + (1 - alpha)*R) 184 | - Compute chunk penalty: Pen = gamma * (chunks / matches) ** beta 185 | - Score = F_mean * (1 - Pen) 186 | We take the max score over references and average over the corpus. 187 | """ 188 | import math 189 | 190 | scores: List[float] = [] 191 | for refs, hyp in zip(references, hypotheses): 192 | best = 0.0 193 | for r in refs: 194 | # unigram matches 195 | hyp_counts = Counter(hyp) 196 | ref_counts = Counter(r) 197 | overlap = sum(min(hyp_counts[t], ref_counts[t]) for t in hyp_counts) 198 | P = overlap / max(1, len(hyp)) 199 | R = overlap / max(1, len(r)) 200 | if P == 0 or R == 0: 201 | cand = 0.0 202 | else: 203 | Fm = (P * R) / max(alpha * P + (1 - alpha) * R, 1e-12) 204 | m, ch = _matching_chunks(hyp, r) 205 | if m == 0: 206 | penalty = 0.0 207 | else: 208 | penalty = gamma * ((ch / m) ** beta) 209 | cand = Fm * (1 - penalty) 210 | best = max(best, cand) 211 | scores.append(best) 212 | return sum(scores) / max(1, len(scores)) 213 | 214 | 215 | def distinct_n(hypotheses: List[Sequence], n: int = 1) -> float: 216 | """Proportion of distinct n‑grams across all hypotheses (diversity).""" 217 | grams = Counter() 218 | total = 0 219 | for h in hypotheses: 220 | c = _ngram_counts(h, n) 221 | grams.update(c) 222 | total += sum(c.values()) 223 | if total == 0: 224 | return 0.0 225 | return len(grams) / total 226 | 227 | 228 | __all__ = [ 229 | "bleu_corpus", 230 | "rouge_l", 231 | "meteor_simple", 232 | "distinct_n", 233 | ] 234 | 235 | -------------------------------------------------------------------------------- /code/ch09_gpt.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | """ 4 | Building a Large Language Model from Scratch 5 | — A Step-by-Step Guide Using Python and PyTorch 6 | 7 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 8 | AI-Powered by GPT-5. 
9 | 10 | GPT assembly: token/position embeddings, a stack of Transformer blocks, 11 | and a language‑model head. Kept small and readable to align with the book’s 12 | step‑by‑step narrative. 13 | 14 | Key choices 15 | ----------- 16 | - Pre‑norm blocks (LayerNorm before sublayers) as in Chapter 8. 17 | - Learned positional embeddings by default; optional sinusoidal positions. 18 | - Optional weight tying between token embeddings and LM head. 19 | - Causal mask is always applied; padding mask is optionally combined when 20 | a `pad_id` is provided (or an explicit `attention_mask`). 21 | 22 | Shapes 23 | ------ 24 | Inputs are token ids of shape [B, T]; hidden/state tensors stay [B, T, D]. 25 | Logits are [B, T, V]. 26 | """ 27 | 28 | from dataclasses import dataclass 29 | from typing import Optional, Tuple 30 | 31 | import torch 32 | import torch.nn as nn 33 | import torch.nn.functional as F 34 | 35 | # Support running scripts from repo root by importing neighbor modules directly. 36 | try: 37 | from ch08_transformer import ( # type: ignore 38 | TransformerBlock, 39 | sinusoidal_positions, 40 | ) 41 | except Exception: 42 | from code.ch08_transformer import ( # type: ignore 43 | TransformerBlock, 44 | sinusoidal_positions, 45 | ) 46 | 47 | 48 | @dataclass 49 | class GPTConfig: 50 | """Hyperparameters for a small, readable GPT. 51 | 52 | - vocab_size: number of tokens in the vocabulary 53 | - block_size: maximum sequence length (context window) 54 | - d_model: model (embedding) dimension 55 | - n_head: number of attention heads per block 56 | - n_layer: number of transformer blocks 57 | - d_ff: feed‑forward hidden dimension (often 4 * d_model) 58 | - dropout: dropout rate in MHA/FFN 59 | - pos_type: 'learned' (GPT‑style) or 'sinusoidal' (chapter 8 option) 60 | - tie_weights: whether to tie LM head weight with token embeddings 61 | """ 62 | 63 | vocab_size: int 64 | block_size: int 65 | d_model: int = 128 66 | n_head: int = 4 67 | n_layer: int = 2 68 | d_ff: int = 512 69 | dropout: float = 0.1 70 | pos_type: str = "learned" # or "sinusoidal" 71 | tie_weights: bool = True 72 | 73 | 74 | class GPT(nn.Module): 75 | """A compact GPT‑style language model composed from Chapter 8 blocks. 76 | 77 | Forward signature: 78 | logits, loss = model(input_ids, targets=None, attention_mask=None, pad_id=None) 79 | """ 80 | 81 | def __init__(self, cfg: GPTConfig): 82 | super().__init__() 83 | self.cfg = cfg 84 | 85 | V, Tm, D = cfg.vocab_size, cfg.block_size, cfg.d_model 86 | 87 | # Embeddings: tokens and positions 88 | self.tok_emb = nn.Embedding(V, D) 89 | if cfg.pos_type == "learned": 90 | self.pos_emb = nn.Embedding(Tm, D) 91 | else: 92 | self.pos_emb = None # we'll add sinusoidal positions on the fly 93 | 94 | self.drop = nn.Dropout(cfg.dropout) 95 | 96 | # Transformer blocks 97 | self.blocks = nn.ModuleList( 98 | [ 99 | TransformerBlock( 100 | D, 101 | cfg.n_head, 102 | cfg.d_ff, 103 | cfg.dropout, 104 | ) 105 | for _ in range(cfg.n_layer) 106 | ] 107 | ) 108 | 109 | # Final normalization and LM head 110 | self.norm_f = nn.LayerNorm(D) 111 | self.lm_head = nn.Linear(D, V, bias=False) 112 | 113 | # Optional weight tying: share weights between embedding and LM head 114 | if cfg.tie_weights: 115 | self.lm_head.weight = self.tok_emb.weight 116 | 117 | self.apply(self._init_weights) 118 | 119 | def _init_weights(self, module: nn.Module) -> None: 120 | """Small‑norm initialization consistent with readable training. 121 | 122 | GPT‑2 uses ~N(0, 0.02) for embeddings and projection weights. 
We follow 123 | a similar pattern here for stability at small scales. 124 | """ 125 | if isinstance(module, (nn.Linear, nn.Embedding)): 126 | nn.init.normal_(module.weight, mean=0.0, std=0.02) 127 | if isinstance(module, nn.Linear) and module.bias is not None: 128 | nn.init.zeros_(module.bias) 129 | 130 | def _build_mask( 131 | self, 132 | input_ids: torch.Tensor, 133 | attention_mask: Optional[torch.Tensor], 134 | pad_id: Optional[int], 135 | ) -> torch.Tensor: 136 | """Combine causal and optional padding masks into [B, T, T]. 137 | 138 | - causal: lower‑triangular ones 139 | - padding: 1 for tokens, 0 for pads (derived from input or provided) 140 | """ 141 | B, T = input_ids.size(0), input_ids.size(1) 142 | device = input_ids.device 143 | causal = torch.tril(torch.ones(T, T, device=device)) # [T, T] 144 | 145 | pad_mask_bt: Optional[torch.Tensor] = None 146 | if attention_mask is not None: 147 | pad_mask_bt = attention_mask.float() # [B, T] 148 | elif pad_id is not None: 149 | # Derive from token ids: 1 for tokens, 0 for PAD 150 | pad_mask_bt = (input_ids != pad_id).float() # [B, T] 151 | 152 | if pad_mask_bt is None: 153 | # No padding info: return causal broadcasted to [B, T, T] 154 | return causal.unsqueeze(0).expand(B, -1, -1) 155 | else: 156 | # Broadcast multiply: [B, 1, T] * [T, T] -> [B, T, T] 157 | return pad_mask_bt[:, None, :] * causal 158 | 159 | def forward( 160 | self, 161 | input_ids: torch.Tensor, 162 | targets: Optional[torch.Tensor] = None, 163 | attention_mask: Optional[torch.Tensor] = None, 164 | pad_id: Optional[int] = None, 165 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: 166 | """Compute logits (and optional loss) for a batch of token ids. 167 | 168 | Args: 169 | input_ids: LongTensor [B, T], token ids (T <= block_size) 170 | targets: optional LongTensor [B, T] for next‑token loss 171 | attention_mask: optional Float/Bool [B, T], 1 for tokens, 0 for pads 172 | pad_id: optional int, used only to set ignore_index in loss 173 | 174 | Returns: 175 | logits: [B, T, V] 176 | loss: scalar tensor or None 177 | """ 178 | B, T = input_ids.size() 179 | assert T <= self.cfg.block_size, ( 180 | f"sequence length {T} exceeds block_size {self.cfg.block_size}. " 181 | f"Slice prompts to the last {self.cfg.block_size} tokens." 
182 | ) 183 | 184 | device = input_ids.device 185 | 186 | # Token + positional embeddings 187 | x = self.tok_emb(input_ids) # [B, T, D] 188 | 189 | if self.cfg.pos_type == "learned": 190 | positions = torch.arange(T, device=device)[None, :] # [1, T] 191 | x = x + self.pos_emb(positions) # [B, T, D] 192 | else: 193 | pe = sinusoidal_positions( 194 | T, 195 | self.cfg.d_model, 196 | device=device, 197 | ) # [T, D] 198 | x = x + pe[None, :, :] # [B, T, D] 199 | 200 | x = self.drop(x) 201 | 202 | # Build causal (and optional padding) mask once 203 | mask_btt = self._build_mask( 204 | input_ids, 205 | attention_mask, 206 | pad_id, 207 | ) # [B, T, T] 208 | 209 | # Pass through stacked Transformer blocks 210 | for block in self.blocks: 211 | x = block(x, mask_btt) 212 | 213 | # Final norm and LM head projection 214 | x = self.norm_f(x) 215 | logits = self.lm_head(x) # [B, T, V] 216 | 217 | loss = None 218 | if targets is not None: 219 | # Flatten [B, T, V] -> [B*T, V] and [B, T] -> [B*T] 220 | logits_flat = logits.reshape(B * T, -1) 221 | targets_flat = targets.reshape(B * T) 222 | ignore = pad_id if pad_id is not None else -100 223 | loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=ignore) 224 | 225 | return logits, loss 226 | 227 | 228 | __all__ = ["GPTConfig", "GPT"] 229 | -------------------------------------------------------------------------------- /notebooks/ch13_improvements_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fd6c7a4d", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "2d78c8e6", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 13 — Improvements & Extensions\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "73dcb0fc", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Prototype improvement ideas such as LoRA, adapters, or data augmentation.\n", 29 | "- Measure the trade-offs between accuracy gains and computational costs.\n", 30 | "- Document experiments thoroughly so you can reproduce winners later." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "8bd6b6e6", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We experiment with several upgrade paths, each isolated so you can evaluate impact independently." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "2b976c8a", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Change one variable at a time. Rapid iteration is tempting, but disciplined ablations reveal what truly matters." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "4bcdd880", 56 | "metadata": {}, 57 | "source": [ 58 | "This notebook demonstrates training improvements from Chapter 13: \n", 59 | "- Mixed precision (AMP) on CUDA for speed\n", 60 | "- Gradient clipping for stability\n", 61 | "- Warmup + cosine learning-rate schedule\n", 62 | "- Gradient accumulation to emulate larger batches\n", 63 | "Each cell creates one object and shows it immediately to match the book's \n", 64 | "creation rule. Comments explain why each step matters.\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "96f4a16f", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Torch setup\n", 75 | "import sys, subprocess\n", 76 | "import contextlib\n", 77 | "try:\n", 78 | " import torch # noqa: F401\n", 79 | "except Exception:\n", 80 | " idx = 'https://download.pytorch.org/whl/cpu'\n", 81 | " subprocess.check_call([sys.executable, '-m', 'pip', 'install',\n", 82 | " '--index-url', idx, 'torch'])\n", 83 | " import torch # noqa: F401\n", 84 | "torch.manual_seed(0); device = ('cuda' if torch.cuda.is_available() else\n", 85 | " 'cpu'); device\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "563cb5a2", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# Tiny language model: embedding + linear head.\n", 96 | "# Used to demonstrate mechanics without heavy compute.\n", 97 | "class TinyLM(torch.nn.Module):\n", 98 | " def __init__(self, V=64, D=64):\n", 99 | " \"\"\"Create a minimal LM with vocabulary V and hidden size D.\n", 100 | " Embedding maps ids->vectors; linear head maps vectors->logits.\n", 101 | " \"\"\"\n", 102 | " super().__init__(); self.emb = torch.nn.Embedding(V, D)\n", 103 | " self.lin = torch.nn.Linear(D, V)\n", 104 | " def forward(self, x, y=None):\n", 105 | " \"\"\"Return (logits, loss). 
Loss is CE if targets y are given.\n", 106 | " \"\"\"\n", 107 | " h = self.emb(x); logits = self.lin(h)\n", 108 | " loss = None\n", 109 | " if y is not None:\n", 110 | " B,T,V = logits.shape\n", 111 | " loss = torch.nn.functional.cross_entropy(\n", 112 | " logits.reshape(B*T, V), y.reshape(B*T))\n", 113 | " return logits, loss\n", 114 | "TinyLM()\n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "053fe05f", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Data: random ids to exercise the loop\n", 125 | "V, T, B = 64, 32, 64\n", 126 | "ids = torch.randint(0, V, (B, T))\n", 127 | "ids.shape\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "05663fb1", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "# Warmup + cosine schedule: scale base LR in [minr, 1].\n", 138 | "# Warmup ramps 0->1; cosine glides 1->minr over remaining steps.\n", 139 | "import math\n", 140 | "def warmup_cosine_lambda(warmup, total, minr=0.1):\n", 141 | " \"\"\"Return a LambdaLR-compatible function.\n", 142 | " warmup: warmup steps; total: total steps; minr: floor ratio.\n", 143 | " \"\"\"\n", 144 | " def f(step):\n", 145 | " s = step + 1\n", 146 | " if s <= warmup: return s/float(warmup)\n", 147 | " t = s - warmup; frac = t/max(1,total-warmup)\n", 148 | " return minr + (1-minr)*0.5*(1+math.cos(math.pi*frac))\n", 149 | " return f\n", 150 | "warmup_cosine_lambda(10, 100)(0)\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "b98b50f1", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "import contextlib\n", 161 | "# Train with AMP (CUDA), clipping, accumulation, and schedule.\n", 162 | "# On CPU/MPS, AMP is disabled and training runs in full precision.\n", 163 | "model = TinyLM().to(device)\n", 164 | "opt = torch.optim.AdamW(model.parameters(), lr=3e-4)\n", 165 | "sched = torch.optim.lr_scheduler.LambdaLR(\n", 166 | " opt, warmup_cosine_lambda(10, 100, 0.1))\n", 167 | "try:\n", 168 | " scaler = torch.amp.GradScaler('cuda', enabled=(device == 'cuda'))\n", 169 | "except Exception:\n", 170 | " scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))\n", 171 | "accum, clip = 4, 1.0\n", 172 | "hist = []\n", 173 | "model.train()\n", 174 | "opt.zero_grad(set_to_none=True)\n", 175 | "for step in range(100):\n", 176 | " x = ids.to(device)\n", 177 | " y = ids.to(device)\n", 178 | " autocast_ctx = torch.amp.autocast('cuda', dtype=torch.float16) if device == 'cuda' else contextlib.nullcontext()\n", 179 | " with autocast_ctx:\n", 180 | " _, loss = model(x, y)\n", 181 | " if device == 'cuda':\n", 182 | " scaler.scale(loss).backward()\n", 183 | " else:\n", 184 | " loss.backward()\n", 185 | " if (step + 1) % accum == 0:\n", 186 | " if device == 'cuda':\n", 187 | " scaler.unscale_(opt)\n", 188 | " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n", 189 | " if device == 'cuda':\n", 190 | " scaler.step(opt)\n", 191 | " scaler.update()\n", 192 | " else:\n", 193 | " opt.step()\n", 194 | " opt.zero_grad(set_to_none=True)\n", 195 | " sched.step()\n", 196 | " if step % 10 == 0:\n", 197 | " hist.append(float(loss.detach().cpu().item()))\n", 198 | "hist[:5]\n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "b9384616", 204 | "metadata": {}, 205 | "source": [ 206 | "## Exercises\n", 207 | "\n", 208 | "- Implement LoRA for one transformer layer and compare training speed.\n", 209 | "- Try a data augmentation technique 
and report its effect on validation metrics.\n", 210 | "- Create a decision matrix that scores each extension by cost, complexity, and expected impact." 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "78d12000", 216 | "metadata": {}, 217 | "source": [ 218 | "" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "name": "python", 230 | "version": "3.10" 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 5 235 | } 236 | -------------------------------------------------------------------------------- /notebooks/ch11_sampling_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "80e4cff1", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "d1da8bfe", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 11 — Testing & Sampling\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "5438995f", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Compare sampling strategies such as greedy, top-k, and nucleus sampling.\n", 29 | "- Evaluate qualitative outputs with checklists anchored to your use case.\n", 30 | "- Instrument temperature sweeps to understand controllability." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "3e684d84", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We load a trained checkpoint, generate continuations with multiple decoding schemes, and analyze the trade-offs each introduces." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "a47ed00b", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Save representative generations for later review. Side-by-side comparisons are invaluable during stakeholder discussions." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "207f0121", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Torch + plotting setup\n", 61 | "import sys, subprocess\n", 62 | "try:\n", 63 | " import torch # noqa: F401\n", 64 | "except Exception:\n", 65 | " idx = 'https://download.pytorch.org/whl/cpu'\n", 66 | " subprocess.check_call([sys.executable, '-m', 'pip', 'install',\n", 67 | " '--index-url', idx, 'torch', 'torchvision',\n", 68 | " 'torchaudio'])\n", 69 | " import torch # noqa: F401\n", 70 | "import matplotlib.pyplot as plt\n", 71 | "plt.style.use('seaborn-v0_8')\n", 72 | "%config InlineBackend.figure_format = 'svg'\n", 73 | "torch.manual_seed(0); 'ok'\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "4081ca91", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# Probability at temperature T for toy logits.\n", 84 | "logits = torch.tensor([[2.0, 1.0, 0.2, -1.0]])\n", 85 | "\n", 86 | "def probs_at_T(T):\n", 87 | " \"\"\"Return softmax(logits/T) for a single toy row.\n", 88 | " Lower T sharpens, higher T flattens.\n", 89 | " \"\"\"\n", 90 | " p = torch.softmax(logits / T, dim=-1)\n", 91 | " return p\n", 92 | "probs_at_T(1.0)\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "9421af5a", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Plot temperature effects\n", 103 | "Ts = [0.7, 1.0, 1.3]\n", 104 | "fig, axes = plt.subplots(1, 3, figsize=(6.0, 2.2), constrained_layout=True)\n", 105 | "for ax, T in zip(axes, Ts):\n", 106 | " p = probs_at_T(T)[0]\n", 107 | " ax.bar(range(len(p)), p, color='#0A66C2')\n", 108 | " ax.set_title(f'T={T}')\n", 109 | " ax.set_ylim(0, 1.0); ax.set_xticks([]); ax.set_yticks([])\n", 110 | "fig.suptitle('Temperature'); fig\n" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "90ba5747", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# Top-k and top-p filters: set low-prob tokens to a large negative\n", 121 | "# logit so softmax effectively assigns zero probability.\n", 122 | "def top_k_filter(logits, k):\n", 123 | " \"\"\"Keep only the k largest logits per row.\n", 124 | " Others are set to a very negative number.\n", 125 | " \"\"\"\n", 126 | " if k <= 0: return logits\n", 127 | " v, _ = torch.topk(logits, k)\n", 128 | " thr = v[:, [-1]]\n", 129 | " return torch.where(logits < thr, torch.tensor(-1e9), logits)\n", 130 | "def top_p_filter(logits, p):\n", 131 | " 
\"\"\"Keep the smallest set of tokens whose cumulative\n", 132 | " probability exceeds p. Works row-wise.\n", 133 | " \"\"\"\n", 134 | " if p <= 0 or p >= 1: return logits\n", 135 | " s, idx = torch.sort(logits, dim=-1, descending=True)\n", 136 | " pr = torch.softmax(s, dim=-1)\n", 137 | " cum = torch.cumsum(pr, dim=-1)\n", 138 | " mask = cum > p; mask[..., 0] = False\n", 139 | " s = s.masked_fill(mask, -1e9)\n", 140 | " out = torch.empty_like(s).scatter_(1, idx, s)\n", 141 | " return out\n", 142 | "top_k_filter(logits, 3), top_p_filter(logits, 0.9)\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "fe19648c", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# One sampling step on toy logits: apply temperature and optional\n", 153 | "# top-k/top-p, then draw the next token (or greedy if T<=0).\n", 154 | "def step_sample(logits, T=1.0, k=None, p=None):\n", 155 | " \"\"\"Return next token ids for a single step.\n", 156 | " \"\"\"\n", 157 | " x = logits / T if T > 0 else logits\n", 158 | " if k is not None: x = top_k_filter(x, k)\n", 159 | " if p is not None: x = top_p_filter(x, p)\n", 160 | " if T <= 0: return torch.argmax(x, dim=-1, keepdim=True)\n", 161 | " pr = torch.softmax(x, dim=-1)\n", 162 | " return torch.multinomial(pr, num_samples=1)\n", 163 | "step_sample(logits, T=0.8, k=3, p=0.9)\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "25dbd184", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# Simple dummy language model for a quick perplexity demo.\n", 174 | "class DummyLM(torch.nn.Module):\n", 175 | " def __init__(self, V):\n", 176 | " super().__init__(); self.V = V\n", 177 | " def forward(self, x, targets=None):\n", 178 | " B, T = x.size(); logits = torch.zeros(B, T, self.V)\n", 179 | " loss = None\n", 180 | " if targets is not None:\n", 181 | " loss = torch.nn.functional.cross_entropy(\n", 182 | " logits.reshape(B*T, self.V), targets.reshape(B*T)\n", 183 | " )\n", 184 | " return logits, loss\n", 185 | "def perplexity(model, loader):\n", 186 | " \"\"\"Compute (H, exp(H)) over a loader of (x,y) pairs.\n", 187 | " \"\"\"\n", 188 | " total, tokens = 0.0, 0\n", 189 | " for x, y in loader:\n", 190 | " _, loss = model(x, targets=y)\n", 191 | " total += float(loss.detach().item()) * y.numel()\n", 192 | " tokens += int(y.numel())\n", 193 | " H = total / max(1, tokens)\n", 194 | " import math; return H, math.exp(H)\n", 195 | "V = 16; model = DummyLM(V)\n", 196 | "ids = torch.randint(0, V, (1, 128))\n", 197 | "class DS(torch.utils.data.Dataset):\n", 198 | " def __len__(self): return 64\n", 199 | " def __getitem__(self, i):\n", 200 | " x = ids[0, i:i+32]; y = ids[0, i+1:i+33]; return x, y\n", 201 | "dl = torch.utils.data.DataLoader(DS(), batch_size=16, drop_last=True)\n", 202 | "perplexity(model, dl)\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "81391417", 208 | "metadata": {}, 209 | "source": [ 210 | "## Exercises\n", 211 | "\n", 212 | "- Implement beam search and compare its outputs against nucleus sampling.\n", 213 | "- Add automated toxicity or bias checks using an available open-source detector.\n", 214 | "- Create a table summarizing how temperature and top-k interact across several prompts." 
215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "id": "0842cb66", 220 | "metadata": {}, 221 | "source": [ 222 | "" 223 | ] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "name": "python", 234 | "version": "3.10" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 5 239 | } 240 | -------------------------------------------------------------------------------- /notebooks/ch12_evaluation_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5af6d8c0", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "01bbcf1d", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 12 — Evaluation Metrics Beyond Perplexity\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "b72ebf49", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Compute both automatic and human-in-the-loop metrics for your model outputs.\n", 29 | "- Design lightweight evaluation datasets aligned with your deployment scenario.\n", 30 | "- Visualize metric trends to spot regressions early." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "76409cd8", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We calculate perplexity, calibration metrics, and task-specific scores, then consolidate findings into a concise report." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "573952d2", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Treat metrics as decision-making tools. Note which ones actually influence your go/no-go criteria." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "bdfe12f0", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Imports and style\n", 61 | "import math\n", 62 | "from collections import Counter\n", 63 | "import matplotlib.pyplot as plt\n", 64 | "plt.style.use('seaborn-v0_8')\n", 65 | "%config InlineBackend.figure_format = 'svg'\n", 66 | "'ok'\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "c26bba4e", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# Tiny BLEU (corpus)\n", 77 | "def ngram_counts(tokens, n):\n", 78 | " return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1))\n", 79 | "def bleu_corpus(references, hypotheses, max_n=4, smooth=True):\n", 80 | " \"\"\"Educational BLEU with Add-1 smoothing and brevity penalty.\n", 81 | " references: list of list of token lists; hypotheses: list of token lists.\n", 82 | " Returns BLEU in [0,1].\n", 83 | " \"\"\"\n", 84 | " num, den = [0]*max_n, [0]*max_n; ref_len = 0; hyp_len = 0\n", 85 | " for refs, hyp in zip(references, hypotheses):\n", 86 | " hyp_len += len(hyp)\n", 87 | " rl = [len(r) for r in refs]\n", 88 | " ref_len += min(rl, key=lambda L: (abs(L-len(hyp)), L))\n", 89 | " for n in range(1, max_n+1):\n", 90 | " h = ngram_counts(hyp, n)\n", 91 | " m = Counter();\n", 92 | " for r in refs: m |= ngram_counts(r, n)\n", 93 | " overlap = {g: min(c, m.get(g,0)) for g,c in h.items()}\n", 94 | " num[n-1] += sum(overlap.values()); den[n-1] += max(1, sum(h.values()))\n", 95 | " prec = [((num[i]+1)/(den[i]+1) if smooth else (0 if den[i]==0 else num[i]/den[i]))\n", 96 | " for i in range(max_n)]\n", 97 | " gm = math.exp(sum((1/max_n)*math.log(max(p,1e-16)) for p in prec))\n", 98 | " bp = 1.0 if hyp_len > ref_len else math.exp(1 - ref_len/max(1,hyp_len))\n", 99 | " return bp*gm\n", 100 | "refs = [[\"the cat is on the mat\".split()]]\n", 101 | "hyp = [\"the cat sat on the mat\".split()]\n", 102 | "round(bleu_corpus(refs, hyp), 3)\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "af1380d8", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# ROUGE-L via LCS\n", 113 | "def lcs_length(a, b):\n", 114 | " dp = [0]*(len(b)+1)\n", 115 | " for i in range(1, len(a)+1):\n", 116 | " prev = 0\n", 117 | " for j in range(1, len(b)+1):\n", 118 | " tmp = dp[j]\n", 119 | " dp[j] = prev + 1 if a[i-1]==b[j-1] else max(dp[j], dp[j-1])\n", 120 | " prev = tmp\n", 121 | " return dp[len(b)]\n", 122 | "def rouge_l(references, hypotheses, beta=1.2):\n", 123 | 
" \"\"\"Average F-measure over best reference per example.\n", 124 | " \"\"\"\n", 125 | " scores = []\n", 126 | " for refs, hyp in zip(references, hypotheses):\n", 127 | " best = 0.0\n", 128 | " for r in refs:\n", 129 | " l = lcs_length(r, hyp)\n", 130 | " if l==0: continue\n", 131 | " p = l/max(1,len(hyp)); q = l/max(1,len(r))\n", 132 | " b2 = beta*beta; f = (1+b2)*p*q/max(b2*p+q,1e-12)\n", 133 | " best = max(best, f)\n", 134 | " scores.append(best)\n", 135 | " return sum(scores)/max(1,len(scores))\n", 136 | "refs = [[\"the cat is on the mat\".split()]]\n", 137 | "hyp = [\"the cat sat on the mat\".split()]\n", 138 | "round(rouge_l(refs, hyp), 3)\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "6143dad7", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Simplified METEOR: unigram F-mean with chunk penalty\n", 149 | "def matching_chunks(h, r):\n", 150 | " pos = {}\n", 151 | " for j,t in enumerate(r): pos.setdefault(t, []).append(j)\n", 152 | " matches, chunks, prev = 0, 0, None\n", 153 | " for t in h:\n", 154 | " if not pos.get(t): continue\n", 155 | " j = pos[t].pop(0); matches += 1\n", 156 | " if prev is None or j != prev+1: chunks += 1\n", 157 | " prev = j\n", 158 | " return matches, chunks\n", 159 | "def meteor_simple(references, hypotheses, alpha=0.9, beta=3.0, gamma=0.5):\n", 160 | " scores = []\n", 161 | " for refs, hyp in zip(references, hypotheses):\n", 162 | " best = 0.0\n", 163 | " for r in refs:\n", 164 | " hc, rc = Counter(hyp), Counter(r)\n", 165 | " overlap = sum(min(hc[t], rc[t]) for t in hc)\n", 166 | " P = overlap/max(1,len(hyp)); R = overlap/max(1,len(r))\n", 167 | " if P==0 or R==0: cand = 0.0\n", 168 | " else:\n", 169 | " Fm = (P*R)/max(alpha*P+(1-alpha)*R, 1e-12)\n", 170 | " m, ch = matching_chunks(hyp, r)\n", 171 | " pen = 0.0 if m==0 else gamma*((ch/m)**beta)\n", 172 | " cand = Fm*(1-pen)\n", 173 | " best = max(best, cand)\n", 174 | " scores.append(best)\n", 175 | " return sum(scores)/max(1,len(scores))\n", 176 | "refs = [[\"the cat is on the mat\".split()]]\n", 177 | "hyp = [\"the cat sat on the mat\".split()]\n", 178 | "round(meteor_simple(refs, hyp), 3)\n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "be9c3ccd", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# Diversity: distinct-1 / distinct-2\n", 189 | "def distinct_n(hyps, n=1):\n", 190 | " grams, total = Counter(), 0\n", 191 | " for h in hyps:\n", 192 | " c = Counter(tuple(h[i:i+n]) for i in range(len(h)-n+1))\n", 193 | " grams.update(c); total += sum(c.values())\n", 194 | " return 0.0 if total==0 else len(grams)/total\n", 195 | "hyps = [\"the cat sat on the mat\".split(),\n", 196 | " \"the cat sat on the mat\".split()]\n", 197 | "round(distinct_n(hyps,1),3), round(distinct_n(hyps,2),3)\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "eb58b9f8", 203 | "metadata": {}, 204 | "source": [ 205 | "## Exercises\n", 206 | "\n", 207 | "- Add a calibration plot (reliability diagram) for your model’s probability outputs.\n", 208 | "- Design a qualitative evaluation rubric and capture results from two reviewers.\n", 209 | "- Implement a simple regression test that fails when a critical metric drops below a threshold." 
210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "917d3b6d", 215 | "metadata": {}, 216 | "source": [ 217 | "" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "name": "python", 229 | "version": "3.10" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 5 234 | } 235 | --------------------------------------------------------------------------------