├── model_bundle_demo.pt ├── code ├── hello_world.py ├── ch15_export.py ├── ch7_attention.py ├── ch11_metrics.py ├── gpt_shapes_selftest.py ├── check_backends.py ├── env_check.py ├── gen_dev_loop.py ├── gen_roadmap.py ├── gen_ch13_clip.py ├── ch13_schedules.py ├── gen_appx_tok_example.py ├── __init__.py ├── gen_appx_tok_bpe_merges.py ├── gen_gpt_arch.py ├── bench_forward.py ├── gen_ch10_lr_warmup.py ├── quickdemo.py ├── bench_timer.py ├── gen_ch12_lcs.py ├── bench_sampling.py ├── venv_tools.py ├── gen_ch13_accum.py ├── gen_ch11_temp.py ├── gen_ch13_cosine.py ├── ch14_lora.py ├── gen_ch11_nucleus.py ├── ch12_eval_corpus.py ├── gen_ch11_filters.py ├── ch5_linreg.py ├── ch15_fastapi_app.py ├── ch15_streamlit_app.py ├── ch11_sampling.py ├── gen_ch10_windows.py ├── ch15_cli.py ├── check_bundle.py ├── gen_ch14_lora.py ├── gen_masks_heatmap.py ├── ch6_tokenize.py ├── gen_ch14_scaling.py ├── ch10_data.py ├── example_data.py ├── ch10_train.py ├── sample_from_checkpoint.py ├── ch08_transformer.py ├── ch12_metrics_text.py └── ch09_gpt.py ├── requirements.txt ├── .gitignore ├── notebooks ├── ch16_discussion_conclusion_colab.ipynb ├── ch02_shell_cli_colab.ipynb ├── ch15_deployment_colab.ipynb ├── ch01_intro_colab.ipynb ├── ch04_hardware_software_colab.ipynb ├── ch03_setup_project_colab.ipynb ├── ch07_attention.ipynb ├── attollm_colab_starter.ipynb ├── ch13_improvements_colab.ipynb ├── ch11_sampling_colab.ipynb └── ch12_evaluation_colab.ipynb └── README.md /model_bundle_demo.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilpisch/llmcode/main/model_bundle_demo.pt -------------------------------------------------------------------------------- /code/hello_world.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 
7 | 8 | """ 9 | 10 | def main() -> None: 11 | print("Hello, LLM world!") 12 | 13 | 14 | if __name__ == "__main__": 15 | main() 16 | 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core Python libs used across chapters (keep light; install PyTorch per platform) 2 | numpy>=1.24 3 | psutil>=5.9 4 | tqdm>=4.66 5 | tensorboard>=2.13 6 | typing-extensions>=4.8 7 | 8 | # Visualization & figure generation 9 | matplotlib>=3.7 10 | graphviz>=0.20 11 | 12 | # Notebook validation & tooling 13 | nbformat>=5.10 14 | nbclient>=0.9 15 | 16 | # Tokenization appendix extras 17 | tokenizers>=0.15 18 | sentencepiece>=0.1.99 19 | 20 | # Deployment extras (Chapter 15) 21 | fastapi>=0.110 22 | pydantic>=2 23 | uvicorn>=0.29 24 | streamlit>=1.33 25 | 26 | # IMPORTANT: Install PyTorch following instructions for your OS/GPU: 27 | # https://pytorch.org/get-started/locally/ 28 | # For a CPU-only install (example): 29 | # torch>=2.2; sys_platform == 'darwin' or sys_platform == 'linux' 30 | # For Apple Silicon with MPS (example): 31 | # pip install torch==2.3.1 --extra-index-url https://download.pytorch.org/whl/cpu 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS / Editor 2 | .DS_Store 3 | Thumbs.db 4 | .idea/ 5 | .vscode/ 6 | .history/ 7 | 8 | # Python bytecode / caches 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | .pytest_cache/ 13 | .cache/ 14 | 15 | # Jupyter 16 | .ipynb_checkpoints/ 17 | 18 | # Virtual environments 19 | .env 20 | .venv/ 21 | env/ 22 | venv/ 23 | ENV/ 24 | env.bak/ 25 | venv.bak/ 26 | .python-version 27 | 28 | # Packaging / build 29 | build/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | *.egg-info/ 35 | *.egg 36 | sdist/ 37 | lib/ 38 | lib64/ 39 | share/python-wheels/ 40 | pip-wheel-metadata/ 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | MANIFEST 44 | 45 | # Coverage / testing 46 | .coverage 47 | .coverage.* 48 | coverage.xml 49 | nosetests.xml 50 | pytestdebug.log 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | 55 | # Logs / outputs 56 | *.log 57 | logs/ 58 | outputs/ 59 | runs/ 60 | checkpoints/ 61 | models/ 62 | *.pt 63 | 64 | # Notebooks exports 65 | *.nbconvert.ipynb 66 | 67 | # Graphviz caches 68 | *.gv.pdf 69 | *.gv.png 70 | *.gv.svg 71 | 72 | # Generated figures (copied from book repo) 73 | figures/ 74 | -------------------------------------------------------------------------------- /code/ch15_export.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Export a clean model bundle with config, weights, and tokenizer metadata. 
9 | 10 | Usage: 11 | python code/ch15_export.py --ckpt checkpoints/ch13_gpt_best.pt --out model_bundle.pt 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | from pathlib import Path 19 | import torch 20 | 21 | 22 | def main() -> None: 23 | p = argparse.ArgumentParser(description="Export GPT bundle") 24 | p.add_argument("--ckpt", required=True, help="input checkpoint .pt") 25 | p.add_argument("--out", required=True, help="output bundle .pt") 26 | args = p.parse_args() 27 | 28 | ckpt = torch.load(args.ckpt, map_location="cpu") 29 | bundle = { 30 | "config": ckpt.get("config"), 31 | "model_state": ckpt.get("model_state"), 32 | "tokenizer": ckpt.get("tokenizer"), 33 | } 34 | Path(args.out).parent.mkdir(parents=True, exist_ok=True) 35 | torch.save(bundle, args.out) 36 | print("Wrote:", args.out) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | 42 | -------------------------------------------------------------------------------- /code/ch7_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | import torch 13 | from torch import Tensor 14 | 15 | 16 | def scaled_dot_product_attention(q: Tensor, k: Tensor, v: Tensor, mask: Tensor | None = None) -> Tensor: 17 | """Single-head scaled dot-product attention. 18 | 19 | Args: 20 | q,k,v: [B, T, D] 21 | mask: optional [B, T, T] with 1 for allowed positions, 0 for masked 22 | Returns: 23 | [B, T, D] 24 | """ 25 | d = q.size(-1) 26 | scores = (q @ k.transpose(-2, -1)) / (d ** 0.5) # [B, T, T] 27 | if mask is not None: 28 | scores = scores.masked_fill(mask == 0, float("-inf")) 29 | w = torch.softmax(scores, dim=-1) # [B, T, T] 30 | return w @ v # [B, T, D] 31 | 32 | 33 | def causal_mask(batch: int, time: int, device: torch.device | None = None) -> Tensor: 34 | base = torch.tril(torch.ones(time, time, device=device)) # [T, T] 35 | return base.unsqueeze(0).expand(batch, -1, -1) # [B, T, T] 36 | 37 | 38 | __all__ = ["scaled_dot_product_attention", "causal_mask"] 39 | 40 | -------------------------------------------------------------------------------- /code/ch11_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Simple evaluation helpers for Chapter 11. 9 | 10 | Perplexity is derived from average cross-entropy on a held-out set: 11 | PPL = exp(H) 12 | We compute mean loss over a DataLoader of (x, y) pairs. 
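Illustrative usage (assumes a trained model and a validation DataLoader exist):

    H, ppl = perplexity(model, val_loader)
    print(f"val cross-entropy {H:.3f} -> perplexity {ppl:.2f}")

Sanity check: a model that spreads probability uniformly over a vocabulary of
size V has H = ln(V) and therefore PPL = V.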
13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | 18 | import math 19 | from typing import Iterable, Tuple 20 | 21 | import torch 22 | import torch.nn.functional as F 23 | 24 | 25 | @torch.no_grad() 26 | def perplexity(model, loader) -> Tuple[float, float]: 27 | device = next(model.parameters()).device 28 | model.eval() 29 | total_loss = 0.0 30 | total_tokens = 0 31 | for x, y in loader: 32 | x = x.to(device) 33 | y = y.to(device) 34 | logits, loss = model(x, targets=y) 35 | if loss is None: 36 | # fallback: compute CE manually 37 | B, T, V = logits.shape 38 | lf = logits.reshape(B * T, V) 39 | yf = y.reshape(B * T) 40 | loss = F.cross_entropy(lf, yf) 41 | n = y.numel() 42 | total_loss += float(loss.detach().item()) * n 43 | total_tokens += int(n) 44 | H = total_loss / max(1, total_tokens) 45 | return H, math.exp(H) 46 | 47 | 48 | __all__ = ["perplexity"] 49 | 50 | -------------------------------------------------------------------------------- /code/gpt_shapes_selftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Self-test: verify GPT forward shapes and mask broadcasting. 9 | 10 | Runs a tiny forward pass and asserts expected tensor ranks/shapes for a 11 | minimal config. Use this as a quick wiring check during refactors. 12 | 13 | Usage: 14 | python code/gpt_shapes_selftest.py 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | 20 | from pathlib import Path 21 | import sys 22 | import torch 23 | 24 | sys.path.append(str(Path(__file__).resolve().parent)) 25 | from ch09_gpt import GPT, GPTConfig # type: ignore 26 | 27 | 28 | def main() -> None: 29 | cfg = GPTConfig(vocab_size=256, block_size=8, d_model=64, n_head=4, n_layer=2, d_ff=128) 30 | model = GPT(cfg) 31 | B, T = 2, 8 32 | x = torch.randint(0, cfg.vocab_size, (B, T)) 33 | pad_id = None 34 | logits, loss = model(x, targets=x, pad_id=pad_id) 35 | assert logits.shape == (B, T, cfg.vocab_size), logits.shape 36 | assert loss is not None and loss.ndim == 0 37 | # Check causal mask shape indirectly via attention path: run shorter T 38 | T2 = 5 39 | x2 = torch.randint(0, cfg.vocab_size, (B, T2)) 40 | logits2, _ = model(x2) 41 | assert logits2.shape[:2] == (B, T2) 42 | print("OK — GPT shapes and mask broadcasting look good.") 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | 48 | -------------------------------------------------------------------------------- /code/check_backends.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Print available PyTorch backends and basic device info. 
9 | 10 | Run with: python -m code.check_backends 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | def main() -> None: 16 | try: 17 | import torch # type: ignore 18 | except Exception as e: # pragma: no cover 19 | print("PyTorch not installed:", e) 20 | return 21 | 22 | has_mps_backend = getattr(torch.backends, "mps", None) 23 | # Guard against missing PyTorch by echoing version early 24 | print("torch:", torch.__version__) 25 | has_cuda = torch.cuda.is_available() 26 | has_mps = bool(has_mps_backend and torch.backends.mps.is_available()) 27 | # Report CUDA capability first for people with multiple GPUs 28 | print("CUDA available:", has_cuda) 29 | if has_cuda: 30 | print("CUDA device count:", torch.cuda.device_count()) 31 | for i in range(torch.cuda.device_count()): 32 | name = torch.cuda.get_device_name(i) 33 | print(f" [{i}]", name) 34 | # Show Apple MPS status as a secondary hardware target 35 | print("MPS available:", has_mps) 36 | device = "cuda" if has_cuda else "mps" if has_mps else "cpu" 37 | # Preferred device ordering mirrors the training scripts 38 | print("Preferred device:", device) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /code/env_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Minimal environment and device sanity check. 9 | 10 | Run with: python -m code.env_check 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import os 16 | import platform 17 | import sys 18 | 19 | 20 | def main() -> None: 21 | # Show basic runtime information 22 | print("== Environment ==") 23 | print("Python:", platform.python_version()) 24 | print("Platform:", platform.platform()) 25 | print("Executable:", sys.executable) 26 | print("CWD:", os.getcwd()) 27 | 28 | try: 29 | import torch # type: ignore 30 | 31 | # Echo installed PyTorch version and device availability 32 | print("\n== PyTorch ==") 33 | print("torch:", torch.__version__) 34 | cuda = torch.cuda.is_available() 35 | mps = getattr(torch.backends, "mps", None) 36 | print("CUDA available:", cuda) 37 | if cuda: 38 | print("CUDA device count:", torch.cuda.device_count()) 39 | if torch.cuda.device_count() > 0: 40 | print("CUDA device 0:", torch.cuda.get_device_name(0)) 41 | print("MPS available:", bool(mps and torch.backends.mps.is_available())) 42 | except Exception as e: # pragma: no cover - diagnostics only 43 | print("\nPyTorch not installed or not importable:", e) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /code/gen_dev_loop.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a simple dev loop diagram (edit → run → iterate → commit). 9 | 10 | If `graphviz` is installed, we render SVG directly; else we write DOT. 
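Manual rendering when only the DOT file was written (same command the script
prints in its fallback path):

    dot -Tsvg figures/dev-loop.dot -o figures/dev-loop.svg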
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | from pathlib import Path 16 | 17 | FIG_DIR = Path(__file__).resolve().parents[1] / "figures" 18 | FIG_DIR.mkdir(parents=True, exist_ok=True) 19 | 20 | dot_content = r""" 21 | digraph DevLoop { 22 | rankdir=LR; 23 | node [shape=box, style=rounded, color="#4B5563", fontname="Helvetica"]; 24 | edge [color="#6B7280"]; 25 | 26 | edit [label="Edit\n(code / text)"]; 27 | run [label="Run\n(scripts / tests)"]; 28 | iterate [label="Iterate\n(tune / refactor)"]; 29 | commit [label="Commit\n(Git / PR)"]; 30 | 31 | edit -> run -> iterate -> edit; 32 | iterate -> commit; 33 | } 34 | """ 35 | 36 | def main() -> None: 37 | try: 38 | from graphviz import Source # type: ignore 39 | 40 | s = Source(dot_content) 41 | out = s.render(filename=str(FIG_DIR / "dev-loop"), format="svg", cleanup=True) 42 | print("Wrote:", out) 43 | except Exception as e: 44 | dot_path = FIG_DIR / "dev-loop.dot" 45 | dot_path.write_text(dot_content) 46 | print("graphviz not available (", e, ")\nWrote DOT:", dot_path) 47 | print("Render manually with:\n dot -Tsvg figures/dev-loop.dot -o figures/dev-loop.svg") 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | 53 | -------------------------------------------------------------------------------- /code/gen_roadmap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a simple LLM build roadmap diagram. 9 | 10 | If `graphviz` Python package is installed, renders SVG directly. 11 | Else, writes `figures/llm-roadmap.dot` for manual rendering: 12 | 13 | dot -Tsvg figures/llm-roadmap.dot -o figures/llm-roadmap.svg 14 | """ 15 | 16 | from __future__ import annotations 17 | 18 | from pathlib import Path 19 | 20 | FIG_DIR = Path(__file__).resolve().parents[1] / "figures" 21 | FIG_DIR.mkdir(parents=True, exist_ok=True) 22 | 23 | dot_content = r""" 24 | digraph LLMRoadmap { 25 | rankdir=LR; 26 | node [shape=box, style=rounded, color="#0A66C2", fontname="Helvetica"]; 27 | edge [color="#555555"]; 28 | 29 | setup [label="Repo Setup\n& Env Checks"]; 30 | data [label="Data\n& Tokenization"]; 31 | model [label="Embeddings\n+ Transformer Blocks"]; 32 | training [label="Training\n(CE Loss, AdamW)"]; 33 | sampling [label="Sampling\n(top-k, top-p)"]; 34 | eval [label="Evaluation\n(Perplexity & More)"]; 35 | deploy [label="Deployment\n(CLI, App, API)"]; 36 | 37 | setup -> data -> model -> training -> sampling -> eval -> deploy; 38 | } 39 | """ 40 | 41 | def main() -> None: 42 | try: 43 | from graphviz import Source # type: ignore 44 | 45 | s = Source(dot_content) 46 | out = s.render(filename=str(FIG_DIR / "llm-roadmap"), format="svg", cleanup=True) 47 | print("Wrote:", out) 48 | except Exception as e: 49 | dot_path = FIG_DIR / "llm-roadmap.dot" 50 | dot_path.write_text(dot_content) 51 | print("graphviz not available (", e, ")\nWrote DOT:", dot_path) 52 | print( 53 | "Render manually with:\n" 54 | " dot -Tsvg figures/llm-roadmap.dot -o figures/llm-roadmap.svg" 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /code/gen_ch13_clip.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python 
and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Illustrate gradient-norm clipping with a synthetic curve. 9 | 10 | Writes figures/ch13-clip.svg. No Matplotlib dependency required; generates a 11 | simple SVG line for gradient norm and a horizontal clip threshold. 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | import math 19 | 20 | 21 | def main() -> None: 22 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 23 | fig_dir.mkdir(parents=True, exist_ok=True) 24 | out = fig_dir / "ch13-clip.svg" 25 | 26 | w, h = 560, 220 27 | pad = 32 28 | steps = 200 29 | thr = 1.0 30 | xs = list(range(steps)) 31 | # Synthetic noisy curve around 1.2 with spikes 32 | ys = [1.2 + 0.15 * math.sin(0.1 * i) + (0.0 if i % 37 else 1.2) for i in xs] 33 | 34 | def mapx(x): return pad + (w - 2*pad) * (x / (steps - 1)) 35 | def mapy(y): 36 | ymin, ymax = 0.0, 2.8 37 | return h - pad - (h - 2*pad) * ((y - ymin) / (ymax - ymin)) 38 | 39 | path = "M " + " ".join(f"{mapx(x):.1f},{mapy(y):.1f}" for x, y in zip(xs, ys)) 40 | ythr = mapy(thr) 41 | svg = [ 42 | f'', 43 | '', 44 | 'Gradient norm with clipping', 45 | f'', 46 | f'', 48 | f'clip=1.0', 49 | '' 50 | ] 51 | out.write_text("\n".join(svg)) 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | 57 | -------------------------------------------------------------------------------- /code/ch13_schedules.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Learning-rate schedule helpers (Chapter 13). 9 | 10 | Includes a warmup+cosine decay schedule implemented via PyTorch's LambdaLR. 11 | The schedule scales the base LR by a factor in [min_ratio, 1]. 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import math 18 | from typing import Optional 19 | 20 | import torch 21 | 22 | 23 | def warmup_cosine_lambda( 24 | warmup_steps: int, 25 | total_steps: int, 26 | min_ratio: float = 0.1, 27 | ): 28 | """Return a lambda(step) for LambdaLR implementing warmup+cosine decay. 29 | 30 | - Warmup: linearly scale 0 -> 1 over warmup_steps. 31 | - Cosine: decay from 1 -> min_ratio over the remaining steps. 
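Illustrative values (toy numbers, not tied to any training run in the book):

    lam = warmup_cosine_lambda(warmup_steps=100, total_steps=1000, min_ratio=0.1)
    lam(0)     # 0.01  (first warmup step)
    lam(99)    # 1.0   (end of warmup)
    lam(999)   # 0.1   (fully decayed to min_ratio)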
32 | """ 33 | 34 | warmup_steps = max(0, int(warmup_steps)) 35 | total_steps = max(1, int(total_steps)) 36 | assert 0.0 < min_ratio <= 1.0 37 | 38 | def lr_lambda(step: int) -> float: 39 | s = step + 1 40 | if warmup_steps > 0 and s <= warmup_steps: 41 | return s / float(warmup_steps) 42 | # cosine from warmup_steps..total_steps 43 | t = min(max(s - warmup_steps, 0), max(total_steps - warmup_steps, 1)) 44 | frac = t / float(max(total_steps - warmup_steps, 1)) 45 | cos = 0.5 * (1 + math.cos(math.pi * frac)) 46 | return min_ratio + (1 - min_ratio) * cos 47 | 48 | return lr_lambda 49 | 50 | 51 | def warmup_cosine_lr( 52 | optimizer: torch.optim.Optimizer, 53 | warmup_steps: int, 54 | total_steps: int, 55 | min_ratio: float = 0.1, 56 | ) -> torch.optim.lr_scheduler.LambdaLR: 57 | """Create a LambdaLR with warmup+cosine schedule.""" 58 | return torch.optim.lr_scheduler.LambdaLR( 59 | optimizer, warmup_cosine_lambda(warmup_steps, total_steps, min_ratio) 60 | ) 61 | 62 | 63 | __all__ = ["warmup_cosine_lr", "warmup_cosine_lambda"] 64 | 65 | -------------------------------------------------------------------------------- /code/gen_appx_tok_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a simple SVG illustrating char/word/BPE tokenization on one sentence. 9 | 10 | Writes figures/appx-tok-example.svg using dependency-free SVG drawing. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def draw_row(x0, y0, tokens, color="#B5D0F5", gap=8, pad=6, h=34): 20 | items = [] 21 | x = x0 22 | for t in tokens: 23 | w = max(40, 9 * len(t)) 24 | items.append(f'') 25 | items.append(f'{t}') 26 | x += w + gap 27 | return items 28 | 29 | 30 | def main() -> None: 31 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 32 | fig_dir.mkdir(parents=True, exist_ok=True) 33 | out = fig_dir / "appx-tok-example.svg" 34 | 35 | sent = "The model models tokens" 36 | char = list(sent) 37 | word = sent.split() 38 | # pseudo-BPE pieces for illustration 39 | bpe = ["The", "\u2581model", "\u2581model", "s", "\u2581token", "s"] 40 | 41 | w, h = 760, 240 42 | items = [ 43 | f'', 44 | '', 45 | f'Tokenization variants', 46 | 'Character', 47 | 'Word', 48 | 'BPE (toy)', 49 | ] 50 | items += draw_row(100, 36, char, color="#DCE6F8", h=28) 51 | items += draw_row(100, 96, word, color="#CFE2FF") 52 | items += draw_row(100, 156, bpe, color="#B5D0F5") 53 | items.append('') 54 | out.write_text("\n".join(items)) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | 60 | -------------------------------------------------------------------------------- /code/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Project code package (mirrors stdlib ``code`` attributes for compatibility). 
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import importlib.util 14 | import os 15 | import sys 16 | import sysconfig 17 | from types import ModuleType 18 | from typing import Any 19 | 20 | _STDLIB_MODULE: ModuleType | None = None 21 | 22 | def _load_stdlib_code() -> ModuleType | None: 23 | """Load the standard library ``code`` module even though this package shadows it.""" 24 | try: 25 | stdlib_dir = sysconfig.get_paths().get("stdlib") 26 | if not stdlib_dir: 27 | return None 28 | stdlib_code_path = os.path.join(stdlib_dir, "code.py") 29 | if not os.path.exists(stdlib_code_path): 30 | return None 31 | spec = importlib.util.spec_from_file_location("_stdlib_code", stdlib_code_path) 32 | if spec is None or spec.loader is None: 33 | return None 34 | module = importlib.util.module_from_spec(spec) 35 | spec.loader.exec_module(module) # type: ignore[assignment] 36 | sys.modules.setdefault("_stdlib_code", module) 37 | return module 38 | except Exception: 39 | return None 40 | 41 | 42 | _STDLIB_MODULE = _load_stdlib_code() 43 | 44 | if _STDLIB_MODULE is not None: 45 | stdlib_all = getattr(_STDLIB_MODULE, "__all__", None) 46 | names = stdlib_all if isinstance(stdlib_all, (list, tuple)) else [ 47 | name for name in dir(_STDLIB_MODULE) if not name.startswith("_") 48 | ] 49 | globals().update({name: getattr(_STDLIB_MODULE, name) for name in names}) 50 | __all__ = list(names) # type: ignore[assignment] 51 | else: 52 | __all__: list[str] = [] 53 | 54 | 55 | def __getattr__(name: str) -> Any: 56 | if _STDLIB_MODULE is not None and hasattr(_STDLIB_MODULE, name): 57 | return getattr(_STDLIB_MODULE, name) 58 | raise AttributeError(f"module 'code' has no attribute {name!r}") 59 | -------------------------------------------------------------------------------- /code/gen_appx_tok_bpe_merges.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a toy BPE merges visualization as an SVG timeline. 9 | 10 | Writes figures/appx-bpe-merges.svg without external deps. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def main() -> None: 20 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 21 | fig_dir.mkdir(parents=True, exist_ok=True) 22 | out = fig_dir / "appx-bpe-merges.svg" 23 | 24 | w, h = 1200, 260 25 | pad = 24 26 | items = [ 27 | f'', 28 | '', 29 | f'Toy BPE merges over steps', 30 | ] 31 | 32 | merges = [ 33 | (1, '▁', 'model', '▁model'), 34 | (2, 'model', 's', 'models'), 35 | (3, '▁', 'token', '▁token'), 36 | (4, 'token', 's', 'tokens'), 37 | (5, '▁', 'learn', '▁learn'), 38 | ] 39 | 40 | x0, y0 = 60, 60 41 | step_gap = 36 42 | for i, (step, a, b, m) in enumerate(merges): 43 | y = y0 + i * step_gap 44 | items.append(f'{step}') 45 | # arrows a + b -> m 46 | # Left-hand symbols and operator spacing 47 | items.append(f'{a}') 48 | items.append(f'+') 49 | items.append(f'{b}') 50 | items.append(f'') 51 | # Much wider merged rectangle to avoid overlaps and use available width 52 | rect_x = x0 + 150 53 | rect_w = 360 54 | items.append(f'') 55 | items.append(f'{m}') 56 | 57 | items.append('') 58 | out.write_text("\n".join(items)) 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /code/gen_gpt_arch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a GPT architecture diagram for Chapter 9. 9 | 10 | If `graphviz` Python package is installed, renders SVG directly to 11 | `figures/ch09-gpt-arch.svg`. Else, writes a DOT file and prints instructions 12 | to render it with `dot`. 
13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | from pathlib import Path 18 | 19 | FIG_DIR = Path(__file__).resolve().parents[1] / "figures" 20 | FIG_DIR.mkdir(parents=True, exist_ok=True) 21 | 22 | dot = r""" 23 | digraph GPTArch { 24 | rankdir=LR; 25 | splines=true; 26 | overlap=false; 27 | nodesep=0.6; 28 | ranksep=0.7; 29 | node [shape=box, style=rounded, fontname="Helvetica", color="#0A66C2"]; 30 | edge [color="#555555"]; 31 | 32 | subgraph cluster_embed { 33 | label="Embeddings"; 34 | color="#cccccc"; 35 | rank=same; 36 | tok [label="Token Embedding\n[V x D]"]; 37 | pos [label="Position Embedding\n[T x D] or Sinusoidal"]; 38 | add [label="Add\n[B, T, D]"]; 39 | tok -> add; 40 | pos -> add; 41 | } 42 | 43 | subgraph cluster_stack { 44 | label="N x TransformerBlock (Pre‑Norm)"; 45 | color="#cccccc"; 46 | mha [label="Multi‑Head Attention\n[B, T, D] → [B, T, D]"]; 47 | ffn [label="Feed‑Forward\n[B, T, D] → [B, T, D]"]; 48 | mha -> ffn; 49 | } 50 | 51 | ln [label="LayerNorm [B, T, D]"]; 52 | head[label="LM Head (Linear)\n[D → V]"]; 53 | 54 | add -> mha -> ffn -> ln -> head; 55 | } 56 | """ 57 | 58 | 59 | def main() -> None: 60 | try: 61 | from graphviz import Source # type: ignore 62 | 63 | s = Source(dot) 64 | out = s.render(filename=str(FIG_DIR / "ch09-gpt-arch"), format="svg", cleanup=True) 65 | print("Wrote:", out) 66 | except Exception as e: 67 | dot_path = FIG_DIR / "ch09-gpt-arch.dot" 68 | dot_path.write_text(dot) 69 | print("graphviz not available (", e, ")\nWrote DOT:", dot_path) 70 | print( 71 | "Render manually with:\n" 72 | " dot -Tsvg figures/ch09-gpt-arch.dot -o figures/ch09-gpt-arch.svg" 73 | ) 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /code/bench_forward.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Measure forward-only tokens/sec for a tiny GPT. 
9 | 10 | Usage: 11 | python code/bench_forward.py --device auto --batch 8 --block 128 --vocab 256 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | from pathlib import Path 19 | import sys 20 | import time 21 | import torch 22 | 23 | sys.path.append(str(Path(__file__).resolve().parent)) 24 | from ch09_gpt import GPT, GPTConfig # type: ignore 25 | 26 | 27 | def auto_device() -> str: 28 | if torch.cuda.is_available(): 29 | return "cuda" 30 | mps = getattr(torch.backends, "mps", None) 31 | if mps and torch.backends.mps.is_available(): 32 | return "mps" 33 | return "cpu" 34 | 35 | 36 | def main() -> None: 37 | p = argparse.ArgumentParser() 38 | p.add_argument("--device", default="auto") 39 | p.add_argument("--batch", type=int, default=8) 40 | p.add_argument("--block", type=int, default=128) 41 | p.add_argument("--vocab", type=int, default=256) 42 | p.add_argument("--warmup", type=int, default=3) 43 | p.add_argument("--steps", type=int, default=20) 44 | args = p.parse_args() 45 | 46 | # Resolve device string lazily for portable benchmarking 47 | device = auto_device() if args.device == "auto" else args.device 48 | cfg = GPTConfig(vocab_size=args.vocab, block_size=args.block) 49 | # Keep the model tiny to highlight kernel overheads 50 | model = GPT(cfg).to(device).eval() 51 | # Synthetic token batch to avoid disk access 52 | x = torch.randint( 53 | 0, cfg.vocab_size, (args.batch, cfg.block_size), device=device 54 | ) 55 | 56 | # Warmup 57 | for _ in range(args.warmup): 58 | with torch.no_grad(): 59 | model(x) 60 | if device == "cuda": 61 | torch.cuda.synchronize() 62 | 63 | t0 = time.time() 64 | tok = 0 65 | for _ in range(args.steps): 66 | with torch.no_grad(): 67 | model(x) 68 | tok += args.batch * args.block 69 | if device == "cuda": 70 | torch.cuda.synchronize() 71 | dt = time.time() - t0 72 | print({"device": device, "tokens_per_sec": round(tok / dt)}) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /code/gen_ch10_lr_warmup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a simple LR warmup curve figure for Chapter 10. 9 | 10 | Writes figures/ch10-lr-warmup.svg. Uses Matplotlib if available; otherwise 11 | falls back to a small hand-written SVG path so the figure is always present. 
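The plotted curve is scale(step) = min(1, (step + 1) / warmup): a linear ramp
to 1.0 over the warmup steps that then stays flat.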
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path, steps: int = 200, warmup: int = 50) -> None: 21 | w, h = 460, 180 22 | pad = 32 23 | # Build points for linear warmup to 1 and then flat 24 | xs = list(range(steps)) 25 | ys = [(x+1)/warmup if x < warmup else 1.0 for x in xs] 26 | # map to svg coords 27 | def mapx(x): 28 | return pad + (w - 2*pad) * (x / max(1, steps-1)) 29 | def mapy(y): 30 | # y in [0,1] -> svg y downwards 31 | return h - pad - (h - 2*pad) * y 32 | path = "M " + " ".join(f"{mapx(x):.1f},{mapy(y):.1f}" for x, y in zip(xs, ys)) 33 | style = ( 34 | '' 35 | ) 36 | svg = [ 37 | f'', 38 | style, 39 | f'LR warmup', 40 | f'', 41 | '' 42 | ] 43 | out.write_text("\n".join(svg)) 44 | 45 | 46 | def main() -> None: 47 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 48 | fig_dir.mkdir(parents=True, exist_ok=True) 49 | out = fig_dir / "ch10-lr-warmup.svg" 50 | try: 51 | import matplotlib.pyplot as plt 52 | import numpy as np 53 | plt.style.use("seaborn-v0_8") 54 | steps, warmup = 200, 50 55 | xs = np.arange(steps) 56 | ys = np.minimum(1.0, (xs + 1) / float(warmup)) 57 | fig, ax = plt.subplots(figsize=(6.0, 2.2)) 58 | ax.plot(xs, ys, color="#0A66C2") 59 | ax.set_title("LR warmup") 60 | ax.set_xlabel("step") 61 | ax.set_ylabel("scale") 62 | fig.tight_layout() 63 | fig.savefig(out, format='svg') 64 | except Exception: 65 | fallback_svg(out) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /code/quickdemo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Quick demo: create a tiny random bundle and sample once. 9 | 10 | This validates wiring without training or external files. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | import argparse 17 | from pathlib import Path 18 | import sys 19 | import torch 20 | 21 | sys.path.append(str(Path(__file__).resolve().parent)) 22 | from ch09_gpt import GPT, GPTConfig # type: ignore 23 | from ch11_sampling import sample # type: ignore 24 | 25 | 26 | def auto_device() -> str: 27 | if torch.cuda.is_available(): 28 | return "cuda" 29 | mps = getattr(torch.backends, "mps", None) 30 | if mps and torch.backends.mps.is_available(): 31 | return "mps" 32 | return "cpu" 33 | 34 | 35 | def main() -> None: 36 | p = argparse.ArgumentParser(description="Write a tiny random bundle and sample once") 37 | p.add_argument("--out", default="model_bundle_demo.pt") 38 | p.add_argument("--prompt", default="Hello") 39 | p.add_argument("--max-new-tokens", type=int, default=40) 40 | p.add_argument("--temperature", type=float, default=1.0) 41 | p.add_argument("--top-p", type=float, default=0.95) 42 | p.add_argument("--top-k", type=int, default=0) 43 | p.add_argument("--device", default="auto") 44 | p.add_argument("--seed", type=int, default=0) 45 | args = p.parse_args() 46 | 47 | torch.manual_seed(args.seed) 48 | device = auto_device() if args.device == "auto" else args.device 49 | cfg = GPTConfig(vocab_size=256, block_size=64, d_model=64, n_head=4, n_layer=2, d_ff=128) 50 | model = GPT(cfg).to(device).eval() 51 | bundle = {"config": cfg.__dict__, "model_state": model.state_dict(), "tokenizer": None} 52 | torch.save(bundle, args.out) 53 | print("Wrote:", args.out) 54 | 55 | ids = torch.tensor([[c for c in args.prompt.encode("utf-8")]], dtype=torch.long, device=device) 56 | out = sample( 57 | model, ids, 58 | max_new_tokens=args.max_new_tokens, 59 | temperature=args.temperature, 60 | top_k=(args.top_k or None), 61 | top_p=(args.top_p or None), 62 | ) 63 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 64 | print("Sample:\n", text) 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /code/bench_timer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Tiny matmul timer to sanity-check device speed. 
9 | 10 | Example: 11 | python -m code.bench_timer --device auto --size 2048 --repeats 5 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | import argparse 17 | import time 18 | 19 | 20 | def pick_device(torch): # type: ignore 21 | if torch.cuda.is_available(): 22 | return "cuda" 23 | mps = getattr(torch.backends, "mps", None) 24 | if mps and torch.backends.mps.is_available(): 25 | return "mps" 26 | return "cpu" 27 | 28 | 29 | def main() -> None: 30 | try: 31 | import torch # type: ignore 32 | except Exception as e: # pragma: no cover 33 | print("PyTorch not installed:", e) 34 | return 35 | 36 | # Read basic matmul settings from CLI 37 | ap = argparse.ArgumentParser() 38 | ap.add_argument("--device", default="auto", help="cpu|cuda|mps|auto") 39 | ap.add_argument("--size", type=int, default=1024) 40 | ap.add_argument("--repeats", type=int, default=5) 41 | args = ap.parse_args() 42 | 43 | # Resolve device and create square matrices 44 | device = pick_device(torch) if args.device == "auto" else args.device 45 | N = args.size 46 | x = torch.randn(N, N, device=device) 47 | y = torch.randn(N, N, device=device) 48 | 49 | # Warmup for CUDA/MPS 50 | for _ in range(2): 51 | _ = x @ y 52 | if device != "cpu": 53 | torch.cuda.synchronize() if device == "cuda" else None 54 | 55 | times = [] 56 | for _ in range(args.repeats): 57 | # Time a single matmul and sync to measure wall time 58 | t0 = time.time() 59 | z = x @ y 60 | if device == "cuda": 61 | torch.cuda.synchronize() 62 | elif device == "mps": 63 | # best-effort; MPS ops often synchronize implicitly on tensor access 64 | _ = z.cpu() 65 | times.append(time.time() - t0) 66 | 67 | print( 68 | { 69 | "device": device, 70 | "size": N, 71 | "repeats": args.repeats, 72 | "ms_mean": round(1000 * sum(times) / len(times), 2), 73 | "ms_min": round(1000 * min(times), 2), 74 | } 75 | ) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /code/gen_ch12_lcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Draw an LCS alignment sketch for ROUGE-L intuition. 9 | 10 | Writes figures/ch12-lcs.svg with two token rows and highlighted matches. 11 | Falls back to simple SVG so the book always builds. 
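Worked example with the toy rows drawn below, hypothesis "the cat sat on the mat"
vs. reference "the cat is on the mat": the LCS is "the cat on the mat" (length 5),
so ROUGE-L precision = recall = 5/6, roughly 0.83.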
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path) -> None: 21 | w, h = 680, 180 22 | pad = 24 23 | cell = 18 24 | hyp = ["the", "cat", "sat", "on", "the", "mat"] 25 | ref = ["the", "cat", "is", "on", "the", "mat"] 26 | svg = [ 27 | f'', 28 | '', 29 | 'ROUGE-L via LCS' 30 | ] 31 | # token rows 32 | x0 = pad + 120 33 | for i, tok in enumerate(ref): 34 | x = x0 + i * (cell + 10) 35 | svg.append(f'' ) 36 | svg.append(f'{tok}') 37 | for i, tok in enumerate(hyp): 38 | x = x0 + i * (cell + 10) 39 | svg.append(f'' ) 40 | svg.append(f'{tok}') 41 | # highlight LCS edges (the, cat, on, the, mat) 42 | match_idx = [(0,0),(1,1),(3,3),(4,4),(5,5)] 43 | for hi, ri in match_idx: 44 | xh = x0 + hi * (cell + 10) + cell/2 45 | xr = x0 + ri * (cell + 10) + cell/2 46 | svg.append(f'') 47 | svg.append('') 48 | out.write_text("\n".join(svg)) 49 | 50 | 51 | def main() -> None: 52 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 53 | fig_dir.mkdir(parents=True, exist_ok=True) 54 | out = fig_dir / "ch12-lcs.svg" 55 | try: 56 | import matplotlib.pyplot as plt 57 | plt.style.use('seaborn-v0_8') 58 | # fallback is sufficient visually; keep matplotlib path minimal 59 | fallback_svg(out) 60 | except Exception: 61 | fallback_svg(out) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | 67 | -------------------------------------------------------------------------------- /code/bench_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Measure sampling tokens/sec for a tiny GPT. 
9 | 10 | Usage: 11 | python code/bench_sampling.py --device auto --max-new-tokens 200 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | from pathlib import Path 19 | import sys 20 | import time 21 | import torch 22 | 23 | sys.path.append(str(Path(__file__).resolve().parent)) 24 | from ch09_gpt import GPT, GPTConfig # type: ignore 25 | from ch11_sampling import sample # type: ignore 26 | 27 | 28 | def auto_device() -> str: 29 | if torch.cuda.is_available(): 30 | return "cuda" 31 | mps = getattr(torch.backends, "mps", None) 32 | if mps and torch.backends.mps.is_available(): 33 | return "mps" 34 | return "cpu" 35 | 36 | 37 | def main() -> None: 38 | p = argparse.ArgumentParser() 39 | p.add_argument("--device", default="auto") 40 | p.add_argument("--block", type=int, default=128) 41 | p.add_argument("--vocab", type=int, default=256) 42 | p.add_argument("--max-new-tokens", type=int, default=200) 43 | p.add_argument("--temperature", type=float, default=0.9) 44 | p.add_argument("--top-k", type=int, default=0) 45 | p.add_argument("--top-p", type=float, default=0.0) 46 | args = p.parse_args() 47 | 48 | # Choose device automatically unless explicitly set 49 | device = auto_device() if args.device == "auto" else args.device 50 | cfg = GPTConfig(vocab_size=args.vocab, block_size=args.block) 51 | # Construct a tiny model and prompt to isolate sampling speed 52 | model = GPT(cfg).to(device).eval() 53 | prompt = torch.randint( 54 | 0, cfg.vocab_size, (1, min(8, args.block)), device=device 55 | ) 56 | 57 | t0 = time.time() 58 | out = sample( 59 | model, 60 | prompt, 61 | max_new_tokens=args.max_new_tokens, 62 | temperature=args.temperature, 63 | top_k=(args.top_k or None), 64 | top_p=(args.top_p or None), 65 | ) 66 | if device == "cuda": 67 | torch.cuda.synchronize() 68 | dt = time.time() - t0 69 | gen = out.size(1) - prompt.size(1) 70 | print( 71 | { 72 | "device": device, 73 | "gen_tokens": int(gen), 74 | "tokens_per_sec": round(gen / dt), 75 | } 76 | ) 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /code/venv_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Virtual environment helpers used in Chapter 2. 9 | 10 | Usage: 11 | python -m code.venv_tools info 12 | python -m code.venv_tools create .venv 13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | import os 18 | import shutil 19 | import site 20 | import subprocess 21 | import sys 22 | from pathlib import Path 23 | 24 | 25 | def info() -> None: 26 | print("== Python & Environment ==") 27 | print("Executable:", sys.executable) 28 | print("Prefix:", sys.prefix) 29 | venv = os.environ.get("VIRTUAL_ENV") or (".venv" if ".venv" in sys.executable else "") 30 | print("VIRTUAL_ENV:", venv or "(not active)") 31 | site_dirs = site.getsitepackages() 32 | sp = ", ".join(p for p in site_dirs if ".venv" in p) or ", ".join(site_dirs) 33 | print("site-packages:", sp) 34 | 35 | 36 | def create(path: str = ".venv") -> None: 37 | """Create a virtual environment at `path` if it doesn't exist. 38 | 39 | This is a convenience wrapper around: `python -m venv `. 
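Illustrative shell session (macOS/Linux activation shown; on Windows the function
prints the PowerShell activation command instead):

    python -m code.venv_tools create .venv
    source .venv/bin/activate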
40 | """ 41 | p = Path(path) 42 | if p.exists(): 43 | print(f"Environment already exists at {p}") 44 | return 45 | print("Creating venv:", p) 46 | subprocess.check_call([sys.executable, "-m", "venv", str(p)]) 47 | print("Created. To activate:") 48 | if os.name == "nt": 49 | print(rf" .\{p}\Scripts\Activate.ps1 # PowerShell") 50 | else: 51 | print(f" source {p}/bin/activate") 52 | 53 | 54 | def remove(path: str = ".venv") -> None: 55 | p = Path(path) 56 | if not p.exists(): 57 | print("No such environment:", p) 58 | return 59 | print("Removing venv:", p) 60 | shutil.rmtree(p) 61 | print("Removed.") 62 | 63 | 64 | def main(argv: list[str] | None = None) -> None: 65 | argv = list(sys.argv[1:] if argv is None else argv) 66 | if not argv or argv[0] in {"-h", "--help", "help"}: 67 | print("Usage: python -m code.venv_tools [info|create|remove] [path]") 68 | return 69 | cmd = argv.pop(0) 70 | if cmd == "info": 71 | info() 72 | elif cmd == "create": 73 | create(argv[0] if argv else ".venv") 74 | elif cmd == "remove": 75 | remove(argv[0] if argv else ".venv") 76 | else: 77 | print("Unknown command:", cmd) 78 | sys.exit(2) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /code/gen_ch13_accum.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Illustrate gradient accumulation: k micro-batches per optimizer step. 9 | 10 | Writes figures/ch13-accum.svg as a simple, dependency-free SVG. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def main() -> None: 20 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 21 | fig_dir.mkdir(parents=True, exist_ok=True) 22 | out = fig_dir / "ch13-accum.svg" 23 | 24 | w, h = 700, 220 25 | pad = 24 26 | cell_w, cell_h = 110, 44 27 | gap = 18 28 | 29 | def batch(x, y, label, color="#B5D0F5"): 30 | return [ 31 | f'', 33 | f'{label}', 34 | ] 35 | 36 | items = [ 37 | '' % (w, h), 38 | '', 39 | 'Gradient accumulation (k micro-batches per step)', 40 | ] 41 | 42 | x0 = pad + 20 43 | y0 = 60 44 | k = 4 45 | for i in range(k): 46 | x = x0 + i * (cell_w + gap) 47 | items += batch(x, y0, f"micro-batch {i+1}") 48 | # plus sign between micro-batches 49 | if i < k - 1: 50 | items.append(f'+') 51 | 52 | # Arrow to optimizer step box 53 | x_end = x0 + (k-1) * (cell_w + gap) + cell_w + 40 54 | y_mid = y0 + cell_h/2 55 | items.append(f'') 56 | 57 | # Optimizer step box 58 | step_x, step_y = x_end, y0 59 | items += batch(step_x, step_y, "optimizer step", color="#9EC5F8") 60 | 61 | # Define arrow marker 62 | items.insert(1, ( 63 | '' 64 | '' 65 | )) 66 | 67 | items.append('') 68 | out.write_text("\n".join(items)) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | 74 | -------------------------------------------------------------------------------- /code/gen_ch11_temp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Visualize the effect of temperature on a toy logit vector. 9 | 10 | Writes figures/ch11-temp.svg. Matplotlib if available, else fallback SVG. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def fallback_svg(out: Path) -> None: 20 | w, h = 520, 220 21 | pad = 28 22 | bars = [0.55, 0.25, 0.1, 0.06, 0.04] 23 | cols = ["#0A66C2", "#5491D6", "#7FADE5", "#A5C5EE", "#C9DCF7"] 24 | def bar(x, y, w_, h_, c): 25 | return (f'') 27 | svg = [ 28 | f'', 29 | '', 30 | f'Temperature' 31 | ] 32 | # draw three panels (T=0.7, 1.0, 1.3) with simple bars 33 | panel_w = (w - 2*pad) / 3 34 | for i, t in enumerate([0.7, 1.0, 1.3]): 35 | x0 = pad + i * panel_w 36 | svg.append(f'T={t}') 37 | maxh = h - 80 38 | for j, p in enumerate(bars): 39 | height = maxh * (p ** (1.0 if t==1.0 else (1.2 if t<1 else 0.8))) 40 | svg.append(bar(x0 + 16 + j*24, h-30-height, 18, height, cols[j])) 41 | svg.append('') 42 | out.write_text("\n".join(svg)) 43 | 44 | 45 | def main() -> None: 46 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 47 | fig_dir.mkdir(parents=True, exist_ok=True) 48 | out = fig_dir / "ch11-temp.svg" 49 | try: 50 | import matplotlib.pyplot as plt 51 | import numpy as np 52 | plt.style.use('seaborn-v0_8') 53 | logits = np.array([2.0, 1.0, 0.0, -0.5, -1.0]) 54 | Ts = [0.7, 1.0, 1.3] 55 | fig, axes = plt.subplots(1, 3, figsize=(6.4, 2.2), constrained_layout=True) 56 | for ax, T in zip(axes, Ts): 57 | p = np.exp(logits / T); p = p / p.sum() 58 | ax.bar(range(len(p)), p, color="#0A66C2") 59 | ax.set_title(f"T={T}") 60 | ax.set_ylim(0, 1.0) 61 | ax.set_xticks([]); ax.set_yticks([]) 62 | fig.suptitle("Temperature") 63 | fig.savefig(out, format='svg') 64 | except Exception: 65 | fallback_svg(out) 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | 71 | -------------------------------------------------------------------------------- /code/gen_ch13_cosine.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Draw warmup + cosine LR schedule used in Chapter 13. 9 | 10 | Writes figures/ch13-lr-cosine.svg. Falls back to minimal SVG if Matplotlib 11 | is unavailable. 
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path) -> None: 21 | w, h = 560, 220 22 | pad = 32 23 | warmup, total, minr = 100, 1000, 0.1 24 | xs = list(range(total)) 25 | ys = [] 26 | import math 27 | for s in xs: 28 | s1 = s + 1 29 | if s1 <= warmup: 30 | ys.append(s1 / warmup) 31 | else: 32 | t = s1 - warmup 33 | frac = t / (total - warmup) 34 | cos = 0.5 * (1 + math.cos(math.pi * frac)) 35 | ys.append(minr + (1 - minr) * cos) 36 | def mapx(x): return pad + (w - 2*pad) * (x / (total-1)) 37 | def mapy(y): return h - pad - (h - 2*pad) * y 38 | path = "M " + " ".join(f"{mapx(x):.1f},{mapy(y):.1f}" for x,y in zip(xs,ys)) 39 | svg = [ 40 | f'', 41 | '', 42 | 'Warmup + Cosine LR', 43 | f'', 44 | '' 45 | ] 46 | out.write_text("\n".join(svg)) 47 | 48 | 49 | def main() -> None: 50 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 51 | fig_dir.mkdir(parents=True, exist_ok=True) 52 | out = fig_dir / "ch13-lr-cosine.svg" 53 | try: 54 | import matplotlib.pyplot as plt 55 | import numpy as np 56 | plt.style.use('seaborn-v0_8') 57 | warmup, total, minr = 100, 1000, 0.1 58 | xs = np.arange(total) 59 | ys = [] 60 | for s in xs: 61 | s1 = s + 1 62 | if s1 <= warmup: 63 | ys.append(s1 / warmup) 64 | else: 65 | t = s1 - warmup 66 | frac = t / (total - warmup) 67 | ys.append(minr + (1 - minr) * 0.5 * (1 + np.cos(np.pi * frac))) 68 | ys = np.array(ys) 69 | fig, ax = plt.subplots(figsize=(6.4, 2.2)) 70 | ax.plot(xs, ys, color="#0A66C2") 71 | ax.set_title("Warmup + Cosine LR") 72 | ax.set_xlabel("step"); ax.set_ylabel("scale") 73 | fig.tight_layout(); fig.savefig(out, format='svg') 74 | except Exception: 75 | fallback_svg(out) 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | 81 | -------------------------------------------------------------------------------- /code/ch14_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | LoRA: Low-rank adapters for Linear layers (teaching version). 9 | 10 | This module provides a small, readable `LoRALinear` that adds a trainable 11 | low-rank delta to a frozen base weight: 12 | 13 | y = x @ W^T + scale * x @ (B @ A)^T 14 | 15 | where A ∈ R^{r×d_in}, B ∈ R^{d_out×r}, and `scale = alpha / r`. 16 | """ 17 | 18 | from __future__ import annotations 19 | 20 | 21 | import torch 22 | import torch.nn as nn 23 | 24 | 25 | class LoRALinear(nn.Module): 26 | def __init__( 27 | self, 28 | d_in: int, 29 | d_out: int, 30 | r: int = 8, 31 | alpha: float = 16.0, 32 | bias: bool = False, 33 | ) -> None: 34 | """Create a Linear with LoRA adapters. 
35 | 36 | - d_in, d_out: base dimensions 37 | - r: adapter rank (small) 38 | - alpha: scaling factor (effective scale = alpha / r) 39 | - bias: include bias term on the base layer 40 | """ 41 | super().__init__() 42 | self.base = nn.Linear(d_in, d_out, bias=bias) 43 | self.r = int(r) 44 | self.alpha = float(alpha) 45 | self.scale = self.alpha / max(1, self.r) 46 | # LoRA adapters (A: r×d_in, B: d_out×r) 47 | if self.r > 0: 48 | self.A = nn.Linear(d_in, self.r, bias=False) 49 | self.B = nn.Linear(self.r, d_out, bias=False) 50 | # Init: A small, B zero so start as identity (delta≈0) 51 | nn.init.kaiming_uniform_(self.A.weight, a=2**0.5) 52 | nn.init.zeros_(self.B.weight) 53 | # Freeze base 54 | for p in self.base.parameters(): 55 | p.requires_grad = False 56 | else: 57 | self.A = None 58 | self.B = None 59 | self.merged = False 60 | 61 | def forward(self, x: torch.Tensor) -> torch.Tensor: 62 | y = self.base(x) 63 | if self.r > 0 and not self.merged: 64 | y = y + self.scale * self.B(self.A(x)) 65 | return y 66 | 67 | @torch.no_grad() 68 | def merge(self) -> None: 69 | """Fold the LoRA delta into the base weight for inference. 70 | 71 | After merging, adapters are disabled and the module acts like a 72 | standard Linear layer with updated weights. 73 | """ 74 | if self.r == 0 or self.merged: 75 | self.merged = True 76 | return 77 | # W' = W + scale * (B @ A) 78 | delta = (self.B.weight @ self.A.weight) * self.scale 79 | self.base.weight += delta 80 | self.merged = True 81 | 82 | 83 | __all__ = ["LoRALinear"] 84 | 85 | -------------------------------------------------------------------------------- /code/gen_ch11_nucleus.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Plot cumulative probability and nucleus threshold p. 9 | 10 | Writes figures/ch11-nucleus.svg. Uses Matplotlib if available; otherwise 11 | falls back to a minimal SVG line/area plot to ensure the book builds. 
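Worked example with the toy probabilities used below (0.40, 0.25, 0.12, 0.08,
0.06, ...): the cumulative sums are 0.40, 0.65, 0.77, 0.85, 0.91, ..., so with
p = 0.9 the nucleus is the first five tokens, the smallest prefix whose mass
reaches the threshold.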
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path) -> None: 21 | w, h = 540, 220 22 | pad = 32 23 | # Toy sorted probabilities 24 | probs = [0.4, 0.25, 0.12, 0.08, 0.06, 0.04, 0.03, 0.02] 25 | cum = [] 26 | s = 0.0 27 | for p in probs: 28 | s += p 29 | cum.append(s) 30 | pthr = 0.9 31 | # Map to svg coords 32 | def mapx(i: int) -> float: 33 | return pad + (w - 2 * pad) * (i / (len(cum) - 1)) 34 | def mapy(y: float) -> float: 35 | return h - pad - (h - 2 * pad) * y 36 | path = "M " + " ".join(f"{mapx(i):.1f},{mapy(y):.1f}" for i, y in enumerate(cum)) 37 | ythr = mapy(pthr) 38 | svg = [ 39 | f'', 40 | '', 41 | f'Nucleus threshold', 42 | f'', 43 | f'', 45 | f'p=0.9', 46 | '' 47 | ] 48 | out.write_text("\n".join(svg)) 49 | 50 | 51 | def main() -> None: 52 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 53 | fig_dir.mkdir(parents=True, exist_ok=True) 54 | out = fig_dir / "ch11-nucleus.svg" 55 | try: 56 | import matplotlib.pyplot as plt 57 | import numpy as np 58 | plt.style.use('seaborn-v0_8') 59 | probs = np.array([0.4, 0.25, 0.12, 0.08, 0.06, 0.04, 0.03, 0.02]) 60 | cum = np.cumsum(probs) 61 | fig, ax = plt.subplots(figsize=(6.0, 2.2)) 62 | ax.plot(cum, color="#0A66C2", lw=2) 63 | p = 0.9 64 | ax.axhline(p, color="#DD4444", ls='--') 65 | ax.text(len(cum)-1, p + 0.03, f"p={p}", color="#DD4444", 66 | ha='right', va='bottom') 67 | ax.set_xlim(0, len(cum)-1) 68 | ax.set_ylim(0, 1.0) 69 | ax.set_xticks([]); ax.set_yticks([]) 70 | ax.set_title("Nucleus threshold") 71 | fig.tight_layout() 72 | fig.savefig(out, format='svg') 73 | except Exception: 74 | fallback_svg(out) 75 | 76 | 77 | if __name__ == '__main__': 78 | main() 79 | 80 | -------------------------------------------------------------------------------- /code/ch12_eval_corpus.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Quick corpus evaluator for Chapter 12. 9 | 10 | Reads references and hypotheses from files (one example per line). The 11 | references file supports multiple references per example by separating them 12 | with the delimiter " ||| ". Tokenization defaults to whitespace with optional 13 | lowercasing. 14 | 15 | Outputs BLEU (corpus), ROUGE-L, METEOR (simplified), and distinct-1/2. 
16 | """ 17 | 18 | from __future__ import annotations 19 | 20 | 21 | import argparse 22 | from pathlib import Path 23 | from typing import List, Sequence 24 | 25 | from code.ch12_metrics_text import ( 26 | bleu_corpus, 27 | rouge_l, 28 | meteor_simple, 29 | distinct_n, 30 | ) 31 | 32 | 33 | def parse_lines(path: str, lowercase: bool) -> List[str]: 34 | text = Path(path).read_text(encoding="utf-8").splitlines() 35 | return [t.lower() if lowercase else t for t in text] 36 | 37 | 38 | def to_refs(lines: List[str]) -> List[List[Sequence[str]]]: 39 | """Split each line on ' ||| ' to allow multiple references per example.""" 40 | out: List[List[Sequence[str]]] = [] 41 | for line in lines: 42 | refs = [seg.strip().split() for seg in line.split(" ||| ")] 43 | out.append(refs) 44 | return out 45 | 46 | 47 | def to_hyps(lines: List[str]) -> List[Sequence[str]]: 48 | return [ln.split() for ln in lines] 49 | 50 | 51 | def main() -> None: 52 | p = argparse.ArgumentParser(description="Evaluate text outputs against references") 53 | p.add_argument("--refs", required=True, help="path to references.txt") 54 | p.add_argument("--hyps", required=True, help="path to hypotheses.txt") 55 | p.add_argument("--lower", action="store_true", help="lowercase before tokenizing") 56 | p.add_argument("--max-n", type=int, default=4, help="max n-gram for BLEU") 57 | args = p.parse_args() 58 | 59 | ref_lines = parse_lines(args.refs, args.lower) 60 | hyp_lines = parse_lines(args.hyps, args.lower) 61 | if len(ref_lines) != len(hyp_lines): 62 | raise SystemExit("refs and hyps must have the same number of lines") 63 | 64 | references = to_refs(ref_lines) 65 | hypotheses = to_hyps(hyp_lines) 66 | 67 | bleu = bleu_corpus(references, hypotheses, max_n=args.max_n, smooth=True) 68 | rlg = rouge_l(references, hypotheses) 69 | met = meteor_simple(references, hypotheses) 70 | d1 = distinct_n(hypotheses, 1) 71 | d2 = distinct_n(hypotheses, 2) 72 | 73 | print("Examples:", len(hypotheses)) 74 | print(f"BLEU_{args.max_n}: {bleu:.3f}") 75 | print(f"ROUGE_L: {rlg:.3f}") 76 | print(f"METEOR*: {met:.3f} (simplified)") 77 | print(f"distinct-1: {d1:.3f}") 78 | print(f"distinct-2: {d2:.3f}") 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | 84 | -------------------------------------------------------------------------------- /code/gen_ch11_filters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Illustrate top-k and top-p filtering on a toy distribution. 9 | 10 | Writes figures/ch11-topfilt.svg. Matplotlib if available, else fallback SVG. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def fallback_svg(out: Path) -> None: 20 | w, h = 600, 220 21 | pad = 28 22 | base = [0.40, 0.25, 0.12, 0.08, 0.05, 0.04, 0.03, 0.03] 23 | cols = ["#0A66C2"] * len(base) 24 | def panel(x0, title, mask): 25 | svg = [f'{title}'] 26 | x = x0 + 16 27 | for p, m in zip(base, mask): 28 | height = (h - 80) * (p if not m else 0.02) 29 | color = "#0A66C2" if not m else "#DCE6F8" 30 | svg.append( 31 | f'' 33 | ) 34 | x += 22 35 | return "\n".join(svg) 36 | topk_mask = [False, False, False, True, True, True, True, True] # keep 3 37 | topp_mask = [False, False, False, False, True, True, True, True] # keep to ~0.85 38 | svg = [ 39 | f'', 40 | '', 41 | f'Top-k vs Top-p', 42 | panel(20, 'Top-k (k=3)', topk_mask), 43 | panel(320, 'Top-p (p≈0.85)', topp_mask), 44 | '' 45 | ] 46 | out.write_text("\n".join(svg)) 47 | 48 | 49 | def main() -> None: 50 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 51 | fig_dir.mkdir(parents=True, exist_ok=True) 52 | out = fig_dir / "ch11-topfilt.svg" 53 | try: 54 | import matplotlib.pyplot as plt 55 | import numpy as np 56 | plt.style.use('seaborn-v0_8') 57 | base = np.array([0.40, 0.25, 0.12, 0.08, 0.05, 0.04, 0.03, 0.03]) 58 | topk_mask = np.array([False, False, False, True, True, True, True, True]) 59 | topp_mask = np.array([False, False, False, False, True, True, True, True]) 60 | fig, axes = plt.subplots(1, 2, figsize=(6.4, 2.2), constrained_layout=True) 61 | axes[0].bar(range(len(base)), np.where(topk_mask, 0.02, base), color="#0A66C2") 62 | axes[0].set_title("Top-k (k=3)") 63 | axes[1].bar(range(len(base)), np.where(topp_mask, 0.02, base), color="#0A66C2") 64 | axes[1].set_title("Top-p (p≈0.85)") 65 | for ax in axes: 66 | ax.set_ylim(0, 0.5); ax.set_xticks([]); ax.set_yticks([]) 67 | fig.suptitle("Top-k vs Top-p") 68 | fig.savefig(out, format='svg') 69 | except Exception: 70 | fallback_svg(out) 71 | 72 | 73 | if __name__ == '__main__': 74 | main() 75 | 76 | -------------------------------------------------------------------------------- /code/ch5_linreg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Minimal linear regression training in PyTorch (Chapter 5). 
9 | 10 | Run: 11 | python code/ch5_linreg.py --device auto --epochs 400 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | import argparse 17 | from dataclasses import dataclass 18 | 19 | import torch 20 | 21 | 22 | def pick_device() -> torch.device: 23 | if torch.cuda.is_available(): 24 | return torch.device("cuda") 25 | mps = getattr(torch.backends, "mps", None) 26 | if mps and torch.backends.mps.is_available(): 27 | return torch.device("mps") 28 | return torch.device("cpu") 29 | 30 | 31 | @dataclass 32 | class Config: 33 | epochs: int = 400 34 | lr: float = 3e-2 35 | n: int = 128 36 | seed: int = 42 37 | device: str = "auto" # cpu|cuda|mps|auto 38 | 39 | 40 | def make_data( 41 | cfg: Config, device: torch.device 42 | ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 43 | # Fix RNG for reproducibility across devices 44 | g = torch.Generator(device="cpu").manual_seed(cfg.seed) 45 | w_true = torch.tensor([2.0, -3.5]) 46 | b_true = torch.tensor(0.5) 47 | # Draw features and small Gaussian noise on target 48 | X = torch.randn(cfg.n, 2, generator=g).to(device) 49 | noise = 0.1 * torch.randn(cfg.n, generator=g).to(device) 50 | y = (X @ w_true.to(device)) + b_true.to(device) + noise 51 | return X, y, w_true.to(device), b_true.to(device) 52 | 53 | 54 | def train(cfg: Config) -> None: 55 | # Pick device lazily to match user selection 56 | device = pick_device() if cfg.device == "auto" else torch.device(cfg.device) 57 | X, y, w_true, b_true = make_data(cfg, device) 58 | 59 | model = torch.nn.Linear(2, 1).to(device) 60 | opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr) 61 | loss_fn = torch.nn.MSELoss() 62 | 63 | for step in range(cfg.epochs + 1): 64 | # Usual gradient-descent step: zero, forward, loss, backward, update 65 | opt.zero_grad() 66 | pred = model(X).squeeze(-1) 67 | loss = loss_fn(pred, y) 68 | loss.backward() 69 | opt.step() 70 | if step % 100 == 0: 71 | print(f"step={step:04d} loss={loss.item():.4f}") 72 | 73 | w_learned = model.weight.detach().squeeze(0) 74 | b_learned = model.bias.detach().squeeze(0) 75 | print("true w:", w_true.cpu().tolist(), " b:", float(b_true)) 76 | print("learn w:", w_learned.cpu().tolist(), " b:", float(b_learned)) 77 | 78 | 79 | def parse_args() -> Config: 80 | ap = argparse.ArgumentParser() 81 | ap.add_argument("--epochs", type=int, default=400) 82 | ap.add_argument("--lr", type=float, default=3e-2) 83 | ap.add_argument("--device", default="auto") 84 | ns = ap.parse_args() 85 | return Config(epochs=ns.epochs, lr=ns.lr, device=ns.device) 86 | 87 | 88 | if __name__ == "__main__": 89 | cfg = parse_args() 90 | train(cfg) 91 | -------------------------------------------------------------------------------- /code/ch15_fastapi_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | FastAPI app serving a minimal /generate endpoint (Chapter 15). 
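A minimal way to run and query it locally (the module path assumes you start
uvicorn from the repository root; adjust it to your layout):

    uvicorn code.ch15_fastapi_app:app --host 127.0.0.1 --port 8000

    curl -X POST http://127.0.0.1:8000/generate \
         -H "Content-Type: application/json" \
         -d '{"prompt": "Hello", "max_new_tokens": 40}'

The startup hook looks for model_bundle.pt in the current working directory;
without it the endpoint returns an error message instead of generated text.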
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | 14 | import sys 15 | from pathlib import Path 16 | from typing import Optional 17 | 18 | import torch 19 | from fastapi import FastAPI 20 | from pydantic import BaseModel 21 | 22 | # Import from code/ 23 | sys.path.append(str(Path(__file__).resolve().parent)) 24 | from ch09_gpt import GPT, GPTConfig # type: ignore 25 | from ch11_sampling import sample # type: ignore 26 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 27 | 28 | 29 | class GenerateReq(BaseModel): 30 | prompt: str 31 | max_new_tokens: int = 80 32 | temperature: float = 0.9 33 | top_k: int = 0 34 | top_p: float = 0.95 35 | 36 | 37 | def load_bundle(path: str): 38 | b = torch.load(path, map_location="cpu") 39 | cfg = GPTConfig(**b["config"]) # type: ignore 40 | model = GPT(cfg).eval() 41 | model.load_state_dict(b["model_state"]) # type: ignore 42 | meta = b.get("tokenizer") 43 | tok = None 44 | if meta and meta.get("id_to_token"): 45 | id_to_token = list(meta["id_to_token"]) # ensure list 46 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 47 | vocab = Vocab( 48 | token_to_id=token_to_id, 49 | id_to_token=id_to_token, 50 | pad=int(meta.get("pad_id", 0)), 51 | unk=int(meta.get("unk_id", 1)), 52 | ) 53 | tok = SimpleTokenizer(vocab=vocab, level=meta.get("level", "char")) 54 | return model, tok 55 | 56 | 57 | app = FastAPI(title="Mini‑GPT") 58 | MODEL, TOK = None, None 59 | 60 | 61 | @app.on_event("startup") 62 | def _startup(): 63 | global MODEL, TOK 64 | bundle = Path("model_bundle.pt") 65 | if bundle.exists(): 66 | MODEL, TOK = load_bundle(str(bundle)) 67 | 68 | 69 | @app.post("/generate") 70 | def generate(req: GenerateReq): 71 | model = MODEL 72 | tok = TOK 73 | if model is None: 74 | return {"error": "model not loaded; place model_bundle.pt next to the app"} 75 | if tok is None: 76 | ids = torch.tensor([[c for c in req.prompt.encode("utf-8")]], dtype=torch.long) 77 | out = sample( 78 | model, 79 | ids, 80 | max_new_tokens=req.max_new_tokens, 81 | temperature=req.temperature, 82 | top_k=(req.top_k or None), 83 | top_p=(req.top_p or None), 84 | ) 85 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 86 | else: 87 | ids = torch.tensor([tok.encode(req.prompt)], dtype=torch.long) 88 | out = sample( 89 | model, 90 | ids, 91 | max_new_tokens=req.max_new_tokens, 92 | temperature=req.temperature, 93 | top_k=(req.top_k or None), 94 | top_p=(req.top_p or None), 95 | ) 96 | text = tok.decode(out[0].tolist()) 97 | return {"text": text} 98 | 99 | -------------------------------------------------------------------------------- /code/ch15_streamlit_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Streamlit app for sampling from an exported GPT bundle (Chapter 15). 
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | 14 | import sys 15 | from pathlib import Path 16 | import streamlit as st 17 | import torch 18 | 19 | # Allow importing modules from code/ 20 | sys.path.append(str(Path(__file__).resolve().parent)) 21 | from ch09_gpt import GPT, GPTConfig # type: ignore 22 | from ch11_sampling import sample # type: ignore 23 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 24 | 25 | 26 | @st.cache_resource 27 | def load_bundle(path: str): 28 | b = torch.load(path, map_location="cpu") 29 | cfg = GPTConfig(**b["config"]) # type: ignore 30 | model = GPT(cfg).eval() 31 | model.load_state_dict(b["model_state"]) # type: ignore 32 | meta = b.get("tokenizer") 33 | tok = None 34 | if meta and meta.get("id_to_token"): 35 | id_to_token = list(meta["id_to_token"]) # ensure list 36 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 37 | vocab = Vocab( 38 | token_to_id=token_to_id, 39 | id_to_token=id_to_token, 40 | pad=int(meta.get("pad_id", 0)), 41 | unk=int(meta.get("unk_id", 1)), 42 | ) 43 | tok = SimpleTokenizer(vocab=vocab, level=meta.get("level", "char")) 44 | return model, tok 45 | 46 | 47 | st.title("Mini‑GPT Sampler") 48 | bundle_path = st.text_input("Bundle path", "model_bundle.pt") 49 | prompt = st.text_area("Prompt", "Hello") 50 | col1, col2, col3 = st.columns(3) 51 | with col1: 52 | max_new = st.number_input("Max new tokens", 1, 512, 80) 53 | with col2: 54 | temp = st.slider("Temperature", 0.0, 1.5, 0.9, 0.05) 55 | with col3: 56 | top_p = st.slider("Top‑p", 0.0, 1.0, 0.95, 0.05) 57 | top_k = st.slider("Top‑k (0=off)", 0, 200, 0, 5) 58 | 59 | if st.button("Generate"): 60 | try: 61 | model, tok = load_bundle(bundle_path) 62 | except Exception as e: 63 | st.error(f"Failed to load bundle: {e}") 64 | else: 65 | if tok is None: 66 | ids = torch.tensor([[c for c in prompt.encode("utf-8")]], dtype=torch.long) 67 | out = sample( 68 | model, 69 | ids, 70 | max_new_tokens=int(max_new), 71 | temperature=float(temp), 72 | top_k=(int(top_k) or None), 73 | top_p=(float(top_p) or None), 74 | ) 75 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 76 | else: 77 | ids = torch.tensor([tok.encode(prompt)], dtype=torch.long) 78 | out = sample( 79 | model, 80 | ids, 81 | max_new_tokens=int(max_new), 82 | temperature=float(temp), 83 | top_k=(int(top_k) or None), 84 | top_p=(float(top_p) or None), 85 | ) 86 | text = tok.decode(out[0].tolist()) 87 | st.subheader("Output") 88 | st.write(text) 89 | 90 | -------------------------------------------------------------------------------- /code/ch11_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Tiny sampling helpers (preview for Chapter 11). 9 | 10 | Functions here keep dependencies minimal and work directly with the GPT model 11 | from Chapter 9. They operate on integer token ids and return extended ids. 
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from typing import Optional 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | 22 | 23 | def _top_k_filter(logits: torch.Tensor, k: int) -> torch.Tensor: 24 | if k <= 0: 25 | return logits 26 | v, _ = torch.topk(logits, k) 27 | thresh = v[:, [-1]] 28 | return torch.where(logits < thresh, torch.tensor(-1e9, device=logits.device), logits) 29 | 30 | 31 | def _top_p_filter(logits: torch.Tensor, p: float) -> torch.Tensor: 32 | if p <= 0 or p >= 1: 33 | return logits 34 | # sort descending and keep smallest set whose cumulative prob >= p 35 | sorted_logits, sorted_idx = torch.sort(logits, descending=True) 36 | probs = torch.softmax(sorted_logits, dim=-1) 37 | cum = torch.cumsum(probs, dim=-1) 38 | mask = cum > p 39 | # always keep the first token 40 | mask[..., 0] = False 41 | filtered = sorted_logits.masked_fill(mask, -1e9) 42 | # unsort back to original order 43 | unsorted = torch.empty_like(filtered).scatter_(1, sorted_idx, filtered) 44 | return unsorted 45 | 46 | 47 | @torch.no_grad() 48 | def sample( 49 | model, 50 | input_ids: torch.Tensor, 51 | max_new_tokens: int = 50, 52 | temperature: float = 1.0, 53 | top_k: Optional[int] = None, 54 | top_p: Optional[float] = None, 55 | eos_id: Optional[int] = None, 56 | ) -> torch.Tensor: 57 | """Generate tokens autoregressively. 58 | 59 | - temperature: 0 → greedy (argmax); >0 → softmax sampling 60 | - top_k: keep only the top‑k logits at each step (optional) 61 | - eos_id: if set, stop when generated 62 | """ 63 | model.eval() 64 | x = input_ids 65 | device = next(model.parameters()).device 66 | x = x.to(device) 67 | 68 | for _ in range(max_new_tokens): 69 | # Forward pass on the last block_size tokens 70 | T = x.size(1) 71 | block_size = getattr(model.cfg, "block_size", T) 72 | x_cond = x[:, -block_size:] 73 | logits, _ = model(x_cond) 74 | logits = logits[:, -1, :] # last position 75 | 76 | if temperature <= 0: 77 | # Greedy 78 | next_id = torch.argmax(logits, dim=-1, keepdim=True) 79 | else: 80 | logits = logits / temperature 81 | if top_k is not None and top_k > 0: 82 | logits = _top_k_filter(logits, top_k) 83 | if top_p is not None: 84 | logits = _top_p_filter(logits, float(top_p)) 85 | probs = F.softmax(logits, dim=-1) 86 | next_id = torch.multinomial(probs, num_samples=1) 87 | 88 | x = torch.cat([x, next_id], dim=1) 89 | if eos_id is not None and int(next_id[0, 0].item()) == int(eos_id): 90 | break 91 | return x 92 | 93 | 94 | __all__ = ["sample", "_top_k_filter", "_top_p_filter"] 95 | -------------------------------------------------------------------------------- /code/gen_ch10_windows.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a sliding-window schematic for Chapter 10. 9 | 10 | Writes figures/ch10-windows.svg. Uses Matplotlib if available; otherwise 11 | falls back to a small hand-written SVG so the figure is always present. 
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | 20 | def fallback_svg(out: Path, N: int = 24, T: int = 8) -> None: 21 | cell = 16 22 | pad = 18 23 | h = pad * 2 + cell * 3 24 | w = pad * 2 + cell * N 25 | y_ids = pad 26 | y_x = y_ids + cell 27 | y_y = y_x + cell 28 | # colors 29 | col_ids = "#DCE6F8" 30 | col_x = "#B5D0F5" 31 | col_y = "#9EC5F8" 32 | stroke = "#2b2b2b" 33 | style = ( 34 | '' 35 | ) 36 | svg = [ 37 | f'', 38 | style, 39 | f'ids', 40 | f'x = ids[i:i+T]', 41 | f'y = ids[i+1:i+T+1]', 42 | ] 43 | # ids row 44 | for j in range(N): 45 | svg.append( 46 | f'' 48 | ) 49 | # x window from j0..j0+T-1 50 | j0 = 4 51 | for j in range(T): 52 | xj = pad + (j0 + j) * cell 53 | svg.append( 54 | f'' 56 | ) 57 | # y window shifted by 1 58 | for j in range(T): 59 | xj = pad + (j0 + 1 + j) * cell 60 | svg.append( 61 | f'' 63 | ) 64 | svg.append('') 65 | out.write_text("\n".join(svg)) 66 | 67 | 68 | def main() -> None: 69 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 70 | fig_dir.mkdir(parents=True, exist_ok=True) 71 | out = fig_dir / "ch10-windows.svg" 72 | try: 73 | import matplotlib.pyplot as plt 74 | import numpy as np 75 | 76 | plt.style.use("seaborn-v0_8") 77 | N, T = 24, 8 78 | fig, ax = plt.subplots(figsize=(8.0, 1.8)) 79 | ax.axis('off') 80 | y0 = 0 81 | # ids 82 | for j in range(N): 83 | ax.add_patch(plt.Rectangle((j, y0+0.8), 1, 0.8, fc="#DCE6F8", ec="#2b2b2b", lw=0.6)) 84 | ax.text(-1.1, y0+1.3, 'ids', ha='right', va='center') 85 | # x 86 | j0 = 4 87 | for j in range(T): 88 | ax.add_patch(plt.Rectangle((j0+j, y0-0.2), 1, 0.8, fc="#B5D0F5", ec="#2b2b2b", lw=0.6)) 89 | ax.text(-1.1, y0+0.2, 'x = ids[i:i+T]', ha='right', va='center') 90 | # y (shifted) 91 | for j in range(T): 92 | ax.add_patch(plt.Rectangle((j0+1+j, y0-1.2), 1, 0.8, fc="#9EC5F8", ec="#2b2b2b", lw=0.6)) 93 | ax.text(-1.1, y0-0.8, 'y = ids[i+1:i+T+1]', ha='right', va='center') 94 | ax.set_xlim(-2, N+1); ax.set_ylim(-2, 3) 95 | fig.savefig(out, format='svg', bbox_inches='tight') 96 | except Exception: 97 | fallback_svg(out) 98 | 99 | 100 | if __name__ == "__main__": 101 | main() 102 | -------------------------------------------------------------------------------- /code/ch15_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Minimal sampling CLI over an exported bundle (Chapter 15). 
9 | 10 | Usage: 11 | python code/ch15_cli.py --bundle model_bundle.pt --prompt "Hello" 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | import sys 19 | from pathlib import Path 20 | 21 | import torch 22 | 23 | # Import code/ modules directly when run as a script 24 | sys.path.append(str(Path(__file__).resolve().parent)) 25 | from ch09_gpt import GPT, GPTConfig # type: ignore 26 | from ch11_sampling import sample # type: ignore 27 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 28 | 29 | 30 | def auto_device() -> str: 31 | if torch.cuda.is_available(): 32 | return "cuda" 33 | mps = getattr(torch.backends, "mps", None) 34 | if mps and torch.backends.mps.is_available(): 35 | return "mps" 36 | return "cpu" 37 | 38 | 39 | def build_tokenizer(meta: dict | None): 40 | if not meta: 41 | return None 42 | try: 43 | id_to_token = list(meta["id_to_token"]) # ensure list 44 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 45 | pad_id = int(meta.get("pad_id", 0)) 46 | unk_id = int(meta.get("unk_id", 1)) 47 | vocab = Vocab(token_to_id=token_to_id, id_to_token=id_to_token, pad=pad_id, unk=unk_id) 48 | return SimpleTokenizer(vocab=vocab, level=meta.get("level", "char")) 49 | except Exception: 50 | return None 51 | 52 | 53 | def main() -> None: 54 | p = argparse.ArgumentParser(description="Sample from a GPT bundle") 55 | p.add_argument("--bundle", required=True, help="bundle .pt from ch15_export") 56 | p.add_argument("--prompt", required=True, help="prompt string") 57 | p.add_argument("--max-new-tokens", type=int, default=80) 58 | p.add_argument("--temperature", type=float, default=0.9) 59 | p.add_argument("--top-k", type=int, default=0) 60 | p.add_argument("--top-p", type=float, default=0.0) 61 | p.add_argument("--device", default="auto") 62 | p.add_argument("--seed", type=int, default=0) 63 | args = p.parse_args() 64 | 65 | torch.manual_seed(args.seed) 66 | device = auto_device() if args.device == "auto" else args.device 67 | print({"device": device, "seed": args.seed}) 68 | b = torch.load(args.bundle, map_location=device) 69 | cfg = GPTConfig(**b["config"]) # type: ignore 70 | model = GPT(cfg).to(device) 71 | model.load_state_dict(b["model_state"]) # type: ignore 72 | model.eval() 73 | 74 | tok = build_tokenizer(b.get("tokenizer")) 75 | if tok is None: # fall back to byte-level 76 | ids = torch.tensor([[c for c in args.prompt.encode("utf-8")]], dtype=torch.long, device=device) 77 | out = sample( 78 | model, 79 | ids, 80 | max_new_tokens=args.max_new_tokens, 81 | temperature=args.temperature, 82 | top_k=(args.top_k or None), 83 | top_p=(args.top_p or None), 84 | ) 85 | print(bytes(out[0].tolist()).decode("utf-8", errors="ignore")) 86 | else: 87 | ids = torch.tensor([tok.encode(args.prompt)], dtype=torch.long, device=device) 88 | out = sample( 89 | model, 90 | ids, 91 | max_new_tokens=args.max_new_tokens, 92 | temperature=args.temperature, 93 | top_k=(args.top_k or None), 94 | top_p=(args.top_p or None), 95 | ) 96 | print(tok.decode(out[0].tolist())) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /code/check_bundle.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 
7 | 8 | Validate a model bundle by loading it and sampling once. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | 14 | import argparse 15 | from pathlib import Path 16 | import sys 17 | import torch 18 | 19 | sys.path.append(str(Path(__file__).resolve().parent)) 20 | from ch09_gpt import GPT, GPTConfig # type: ignore 21 | from ch11_sampling import sample # type: ignore 22 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 23 | 24 | 25 | def auto_device() -> str: 26 | if torch.cuda.is_available(): 27 | return "cuda" 28 | mps = getattr(torch.backends, "mps", None) 29 | if mps and torch.backends.mps.is_available(): 30 | return "mps" 31 | return "cpu" 32 | 33 | 34 | def main() -> None: 35 | p = argparse.ArgumentParser(description="Bundle smoke-test: load and sample") 36 | p.add_argument("--bundle", required=True, help="path to model_bundle.pt") 37 | p.add_argument("--prompt", default="Hello", help="prompt string") 38 | p.add_argument("--max-new-tokens", type=int, default=32) 39 | p.add_argument("--temperature", type=float, default=0.9) 40 | p.add_argument("--top-p", type=float, default=0.95) 41 | p.add_argument("--top-k", type=int, default=0) 42 | p.add_argument("--device", default="auto", help="cpu|cuda|mps|auto") 43 | p.add_argument("--seed", type=int, default=0) 44 | args = p.parse_args() 45 | 46 | # Make sampling deterministic and pick a device 47 | torch.manual_seed(args.seed) 48 | device = auto_device() if args.device == "auto" else args.device 49 | print({"device": device, "seed": args.seed}) 50 | 51 | # Load bundle, restore model and optional tokenizer 52 | b = torch.load(args.bundle, map_location=device) 53 | cfg = GPTConfig(**b["config"]) # type: ignore 54 | model = GPT(cfg).to(device).eval() 55 | model.load_state_dict(b["model_state"]) # type: ignore 56 | meta = b.get("tokenizer") 57 | tok = None 58 | if meta and meta.get("id_to_token"): 59 | id_to_token = list(meta["id_to_token"]) # ensure list 60 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 61 | vocab = Vocab( 62 | token_to_id=token_to_id, 63 | id_to_token=id_to_token, 64 | pad=int(meta.get("pad_id", 0)), 65 | unk=int(meta.get("unk_id", 1)), 66 | ) 67 | tok = SimpleTokenizer(vocab=vocab, level=meta.get("level", "char")) 68 | 69 | if tok is None: 70 | # Fallback to byte-level prompt if no tokenizer metadata exists 71 | ids = torch.tensor( 72 | [[c for c in args.prompt.encode("utf-8")]], 73 | dtype=torch.long, 74 | device=device, 75 | ) 76 | out = sample( 77 | model, ids, 78 | max_new_tokens=args.max_new_tokens, 79 | temperature=args.temperature, 80 | top_k=(args.top_k or None), 81 | top_p=(args.top_p or None), 82 | ) 83 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 84 | else: 85 | ids = torch.tensor( 86 | [tok.encode(args.prompt)], dtype=torch.long, device=device 87 | ) 88 | out = sample( 89 | model, ids, 90 | max_new_tokens=args.max_new_tokens, 91 | temperature=args.temperature, 92 | top_k=(args.top_k or None), 93 | top_p=(args.top_p or None), 94 | ) 95 | text = tok.decode(out[0].tolist()) 96 | print("OK — model loaded and sampled.\n", text) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /code/gen_ch14_lora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 
7 | 8 | Generate a simple LoRA diagram: base Linear plus low-rank delta. 9 | 10 | Writes figures/ch14-lora.svg without external dependencies. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | 18 | 19 | def main() -> None: 20 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 21 | fig_dir.mkdir(parents=True, exist_ok=True) 22 | out = fig_dir / "ch14-lora.svg" 23 | 24 | w, h = 760, 260 25 | pad = 30 26 | items = [ 27 | f'', 28 | '', 29 | '\n' 30 | ' \n' 32 | ' \n' 33 | ' \n' 34 | '', 35 | ] 36 | # Base linear block (center) 37 | x0, y0 = pad + 80, 100 38 | items.append( 39 | f'' 40 | ) 41 | items.append( 42 | f'Base Linear W' 43 | ) 44 | # A and B blocks (adapter branch) 45 | ax, ay = x0 + 250, y0 - 50 46 | items.append( 47 | f'' 48 | ) 49 | items.append( 50 | f'A (r × d_in)' 51 | ) 52 | bx, by = ax, y0 + 76 53 | items.append( 54 | f'' 55 | ) 56 | items.append( 57 | f'B (d_out × r)' 58 | ) 59 | # Input x arrow into base and into A (branch) 60 | items.append( 61 | f'' 62 | ) 63 | items.append(f'x') 64 | items.append( 65 | f'' 66 | ) 67 | # A to B, B to sum 68 | items.append( 69 | f'' 70 | ) 71 | sumx, sumy = x0 + 470, y0 + 30 72 | items.append(f'') 73 | items.append(f'+') 74 | items.append( 75 | f'' 76 | ) 77 | items.append( 78 | f'' 79 | ) 80 | # Scale label α/r on the adapter path 81 | items.append(f'scale: α/r') 82 | # Sum to output 83 | outx = sumx + 160 84 | items.append( 85 | f'' 86 | ) 87 | items.append(f'output') 88 | # Annotations 89 | items.append( 90 | f'x' 91 | ) 92 | items.append(f'ΔW = B @ A') 93 | items.append('') 94 | out.write_text("\n".join(items)) 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /code/gen_masks_heatmap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate causal and combined (padding x causal) mask heatmaps for Ch. 9. 9 | 10 | Always writes `figures/ch09-masks.svg`. Uses Matplotlib if available; otherwise 11 | falls back to a minimal hand-written SVG so the book build never misses it. 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | from pathlib import Path 18 | 19 | import torch 20 | 21 | 22 | def build_masks(T: int, pad_positions: list[int] | None = None): 23 | causal = torch.tril(torch.ones(T, T)) # [T, T] 24 | if not pad_positions: 25 | return causal, causal # combined==causal in this trivial case 26 | pad = torch.ones(T) 27 | for p in pad_positions: 28 | if 0 <= p < T: 29 | pad[p] = 0 30 | pad_bt = pad[None, :] 31 | combined = pad_bt[:, None, :] * causal # [1, T, T] 32 | return causal, combined.squeeze(0) 33 | 34 | 35 | def render_svg_simple(causal: torch.Tensor, combined: torch.Tensor, out: Path) -> None: 36 | """Write a simple 2-panel SVG without external deps. 37 | 38 | Blue squares (1) vs white squares (0). Titles above each panel. 
39 | """ 40 | T = causal.size(0) 41 | cell = 16 42 | pad = 24 43 | gap = 40 44 | width = pad * 2 + cell * T * 2 + gap 45 | height = pad * 2 + cell * T + 28 # extra for titles 46 | def rects(mat: torch.Tensor, x0: int, y0: int) -> str: 47 | parts = [] 48 | for i in range(T): # rows (queries) 49 | for j in range(T): # cols (keys) 50 | v = float(mat[i, j]) 51 | color = "#0A66C2" if v > 0.5 else "#FFFFFF" 52 | parts.append( 53 | f'' 55 | ) 56 | return "\n".join(parts) 57 | x1 = pad 58 | x2 = pad + cell * T + gap 59 | y = pad + 24 60 | style = ( 61 | '' 62 | ) 63 | svg = [ 64 | f'', 65 | style, 66 | f'Causal mask [T,T]', 67 | f'Padding x causal [T,T]', 68 | rects(causal, x1, y), 69 | rects(combined, x2, y), 70 | '', 71 | ] 72 | out.write_text("\n".join(svg)) 73 | 74 | 75 | def main() -> None: 76 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 77 | fig_dir.mkdir(parents=True, exist_ok=True) 78 | out = fig_dir / "ch09-masks.svg" 79 | causal, combined = build_masks(T=12, pad_positions=[9, 10, 11]) 80 | try: 81 | import matplotlib.pyplot as plt 82 | 83 | plt.style.use("seaborn-v0_8") 84 | fig, axes = plt.subplots(1, 2, figsize=(7.2, 2.8), constrained_layout=True) 85 | im0 = axes[0].imshow(causal, cmap="Blues", vmin=0, vmax=1) 86 | axes[0].set_title("Causal mask [T,T]") 87 | axes[0].set_xlabel("keys") 88 | axes[0].set_ylabel("queries") 89 | im1 = axes[1].imshow(combined, cmap="Blues", vmin=0, vmax=1) 90 | axes[1].set_title("Padding x causal [T,T]") 91 | axes[1].set_xlabel("keys") 92 | for ax in axes: 93 | ax.set_xticks([]); ax.set_yticks([]) 94 | fig.savefig(out, format="svg") 95 | print("Wrote:", out) 96 | except Exception: 97 | # Fallback: hand-written SVG so the book can include the figure 98 | render_svg_simple(causal, combined, out) 99 | print("Wrote (fallback SVG):", out) 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /code/ch6_tokenize.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 
7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from collections import Counter 13 | from dataclasses import dataclass 14 | from pathlib import Path 15 | from typing import Iterable, List, Dict 16 | 17 | 18 | @dataclass 19 | class Vocab: 20 | token_to_id: Dict[str, int] 21 | id_to_token: List[str] 22 | pad: int 23 | unk: int 24 | 25 | @classmethod 26 | def build( 27 | cls, 28 | tokens: Iterable[str], 29 | min_freq: int = 1, 30 | specials: Iterable[str] = ("", ""), 31 | ) -> "Vocab": 32 | # Count incoming tokens and prepend special ids 33 | counter = Counter(tokens) 34 | id_to_token = list(specials) 35 | for tok, freq in counter.most_common(): 36 | if freq >= min_freq and tok not in id_to_token: 37 | id_to_token.append(tok) 38 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 39 | pad = token_to_id[specials[0]] 40 | unk = token_to_id[specials[1]] 41 | return cls(token_to_id, id_to_token, pad, unk) 42 | 43 | def __len__(self) -> int: 44 | return len(self.id_to_token) 45 | 46 | 47 | class SimpleTokenizer: 48 | """Tiny tokenizer for chapter 6 (char or word level).""" 49 | 50 | def __init__(self, vocab: Vocab, level: str = "char") -> None: 51 | assert level in {"char", "word"} 52 | self.vocab = vocab 53 | self.level = level 54 | self.pad = vocab.pad 55 | self.unk = vocab.unk 56 | 57 | @staticmethod 58 | def _split(text: str, level: str) -> List[str]: 59 | if level == "char": 60 | return list(text) 61 | # simple whitespace/punct split for demo purposes 62 | out: List[str] = [] 63 | token = [] 64 | for ch in text: 65 | if ch.isalnum(): 66 | token.append(ch.lower()) 67 | else: 68 | if token: 69 | out.append("".join(token)) 70 | token = [] 71 | if ch.strip(): # keep punctuation as its own token 72 | out.append(ch) 73 | if token: 74 | out.append("".join(token)) 75 | return out 76 | 77 | @classmethod 78 | def from_file( 79 | cls, path: str | Path, level: str = "char", min_freq: int = 1 80 | ) -> "SimpleTokenizer": 81 | # Load raw text and construct vocab directly 82 | text = Path(path).read_text(encoding="utf-8") 83 | tokens = cls._split(text, level) 84 | vocab = Vocab.build(tokens, min_freq=min_freq) 85 | return cls(vocab=vocab, level=level) 86 | 87 | def encode(self, text: str) -> List[int]: 88 | # Map tokens to ids with unk fallback 89 | ids: List[int] = [] 90 | for tok in self._split(text, self.level): 91 | ids.append(self.vocab.token_to_id.get(tok, self.unk)) 92 | return ids 93 | 94 | def decode(self, ids: Iterable[int]) -> str: 95 | # Convert back to tokens while skipping padding tokens 96 | toks: List[str] = [] 97 | for i in ids: 98 | if 0 <= i < len(self.vocab.id_to_token): 99 | tok = self.vocab.id_to_token[i] 100 | if tok not in {"", ""}: 101 | toks.append(tok) 102 | else: 103 | toks.append("") 104 | if self.level == "char": 105 | return "".join(toks) 106 | # naive word join: put space before alphanumerics only 107 | out: List[str] = [] 108 | for t in toks: 109 | if not out: 110 | out.append(t) 111 | elif t.isalnum(): 112 | out.append(" " + t) 113 | else: 114 | out.append(t) 115 | return "".join(out) 116 | -------------------------------------------------------------------------------- /code/gen_ch14_scaling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Generate a synthetic scaling law figure with axes and annotations. 
9 | 10 | Writes figures/ch14-scaling.svg (simple SVG; no external deps). 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | 16 | from pathlib import Path 17 | import math 18 | 19 | 20 | def main() -> None: 21 | fig_dir = Path(__file__).resolve().parents[1] / "figures" 22 | fig_dir.mkdir(parents=True, exist_ok=True) 23 | out = fig_dir / "ch14-scaling.svg" 24 | 25 | w, h = 680, 280 26 | pad = 46 27 | # Synthetic: loss = a * N^{-b} + c in log space (draw line with slight noise) 28 | xs = [10 ** (i / 10) for i in range(3, 33)] # ~1e0..1e3 29 | a, b, c = 1.0, 0.3, 0.2 30 | ys = [a * (x ** (-b)) + c for x in xs] 31 | # Map to log10 for plotting 32 | lx = [math.log10(x) for x in xs] 33 | ly = [math.log10(y) for y in ys] 34 | minx, maxx = min(lx), max(lx) 35 | miny, maxy = min(ly), max(ly) 36 | def mapx(x): 37 | return pad + (w - 2*pad) * ((x - minx) / (maxx - minx)) 38 | def mapy(y): 39 | return h - pad - (h - 2*pad) * ((y - miny) / (maxy - miny)) 40 | path = "M " + " ".join( 41 | f"{mapx(x):.1f},{mapy(y):.1f}" for x, y in zip(lx, ly) 42 | ) 43 | 44 | # Horizontal line for irreducible error c (approx last y value) 45 | y_c = mapy(min(ly) + 0.02) 46 | 47 | # Slope annotation segment around the middle 48 | mid = len(lx) // 2 49 | x1, y1 = mapx(lx[mid] - 0.3), mapy(ly[mid] + 0.08) 50 | x2, y2 = mapx(lx[mid] + 0.3), mapy(ly[mid] - 0.08) 51 | 52 | # Axis label positions centered along their axes to avoid overlaps 53 | x_axis_mid_x = w / 2 54 | x_axis_label_y = h - pad + 24 55 | y_axis_mid_y = (h - pad + pad) / 2 56 | y_axis_label_x = pad - 34 57 | 58 | svg = [ 59 | f'', 60 | '', 61 | '\n' 62 | ' \n' 63 | ' \n' 64 | ' \n' 65 | '', 66 | # Title 67 | f'Scaling law: log loss vs log scale', 68 | # Axes 69 | f'', 70 | f'', 71 | # Centered axis labels 72 | f'log10(N)', 73 | f'log10(loss)', 74 | # qualitative end labels 75 | f'small', 76 | f'large', 77 | f'low', 78 | f'high', 79 | # Curve 80 | f'', 81 | # Irreducible error line c with centered annotation 82 | f'', 83 | f'irreducible error c', 84 | # Slope annotation 85 | f'', 86 | f'slope ≈ −b', 87 | '' 88 | ] 89 | out.write_text("\n".join(svg)) 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /notebooks/ch16_discussion_conclusion_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "346ba8e7", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "023ac5fa", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 16 — Discussion & Conclusion\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "622fc26f", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Synthesize insights gathered throughout the book into actionable principles.\n", 29 | "- Capture retrospective metrics and notes to inform your next project iteration.\n", 30 | "- Plan follow-up experiments and knowledge deep dives based on open questions." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "c53e561f", 36 | "metadata": {}, 37 | "source": [ 38 | "### Key Reflections\n", 39 | "\n", 40 | "Use this section to distill what mattered most. The prompts below nudge you to convert experience into reusable knowledge." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "56317a1d", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from datetime import date\n", 51 | "\n", 52 | "reflection_template = f\"\"\"## Post-Project Reflection ({date.today().isoformat()})\n", 53 | "\n", 54 | "### Wins\n", 55 | "- \\\n", 56 | "- \\\n", 57 | "\n", 58 | "### Challenges\n", 59 | "- \\\n", 60 | "- \\\n", 61 | "\n", 62 | "### Decisions to Revisit\n", 63 | "- \\\n", 64 | "\n", 65 | "### Next Experiments\n", 66 | "- \\\n", 67 | "\"\"\"\n", 68 | "print(reflection_template)\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "cf1d5dd9", 74 | "metadata": {}, 75 | "source": [ 76 | "### Metric Review\n", 77 | "\n", 78 | "Gather the metrics that define success for your use case. This might include loss curves, evaluation scores, or deployment latency measurements." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "0b8befa5", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "import json\n", 89 | "\n", 90 | "# Replace the sample metrics with your actual results.\n", 91 | "metrics = {\n", 92 | " \"final_train_loss\": 1.87,\n", 93 | " \"validation_perplexity\": 16.2,\n", 94 | " \"bleu_score\": None,\n", 95 | " \"deployment_latency_ms\": None,\n", 96 | "}\n", 97 | "print(json.dumps(metrics, indent=2))\n" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "f0f4bf93", 103 | "metadata": {}, 104 | "source": [ 105 | "### Knowledge Transfer\n", 106 | "\n", 107 | "Summarize the playbooks, scripts, and lessons that you want to carry into your next project. Treat this as the beginning of your internal wiki." 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "a5b1c38c", 113 | "metadata": {}, 114 | "source": [ 115 | "## Exercises\n", 116 | "\n", 117 | "- Write a retrospective memo that highlights one architectural decision you would change next time.\n", 118 | "- Collect three resources (papers, blog posts, repos) that will help you deepen a weak area.\n", 119 | "- Draft an action plan for deploying attoLLM in a realistic environment of your choice." 
120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "3ac6cb70", 125 | "metadata": {}, 126 | "source": [ 127 | "" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "colab": { 133 | "name": "Chapter 16 · Discussion" 134 | }, 135 | "kernelspec": { 136 | "display_name": "Python 3", 137 | "language": "python", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "name": "python", 142 | "version": "3.10" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 5 147 | } 148 | -------------------------------------------------------------------------------- /code/ch10_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Small data helpers for Chapter 10: building a token id stream and slicing 9 | into (input, target) chunks for next-token prediction. 10 | 11 | We keep this self-contained and friendly: 12 | - A minimal dataset that returns (x, y) where y is x shifted by one. 13 | - Helpers to build ids from raw text using either a provided tokenizer or a 14 | byte-level fallback (0-255). 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | 20 | from dataclasses import dataclass 21 | from pathlib import Path 22 | from typing import Iterable, List, Sequence 23 | 24 | import torch 25 | from torch.utils.data import Dataset 26 | 27 | 28 | @dataclass 29 | class TextIds: 30 | """Container for a 1-D id stream and tokenizer metadata. 31 | 32 | - ids: concatenated token ids (1-D) 33 | - vocab_size: size of the token vocabulary 34 | - pad_id: optional pad index (for CE ignore_index) 35 | - unk_id: optional unknown-token index 36 | - level: optional tokenization level ('byte'|'char'|'word') 37 | - id_to_token: optional list of tokens by index for reconstruction/decoding 38 | """ 39 | ids: torch.Tensor 40 | vocab_size: int 41 | pad_id: int | None = None 42 | unk_id: int | None = None 43 | level: str | None = None 44 | id_to_token: list[str] | None = None 45 | 46 | 47 | def load_texts(paths: Sequence[str] | None) -> str: 48 | if not paths: 49 | return "Hello world. Hello vectors.\n" 50 | texts: List[str] = [] 51 | for p in paths: 52 | # Read each file as UTF-8 and concatenate with newlines 53 | data = Path(p).read_text(encoding="utf-8") 54 | texts.append(data) 55 | return "\n".join(texts) 56 | 57 | 58 | def build_ids_byte_level(text: str) -> TextIds: 59 | # Encode to bytes and map each byte directly to an id 60 | data = text.encode("utf-8", errors="ignore") 61 | ids = torch.tensor(list(data), dtype=torch.long) 62 | return TextIds( 63 | ids=ids, 64 | vocab_size=256, 65 | pad_id=None, 66 | unk_id=None, 67 | level="byte", 68 | id_to_token=None, 69 | ) 70 | 71 | 72 | def build_ids_with_tokenizer(text: str, level: str = "char") -> TextIds: 73 | """Use the SimpleTokenizer from Chapter 6 if available; else byte-level. 74 | 75 | We try local imports first so running `python code/...` works without 76 | installing `code/` as a package. 
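For example, building char-level ids from an in-memory string:

    info = build_ids_with_tokenizer("hello world", level="char")
    info.ids.shape, info.vocab_size, info.level   # 1-D LongTensor, small vocab, 'char'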
77 | """ 78 | try: 79 | # when executing scripts under code/, neighbors are importable 80 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 81 | except Exception: 82 | try: 83 | from code.ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 84 | except Exception: 85 | return build_ids_byte_level(text) 86 | if hasattr(SimpleTokenizer, "from_text"): 87 | tok = SimpleTokenizer.from_text( # type: ignore[attr-defined] 88 | text, level=level 89 | ) 90 | else: 91 | # Build from raw text using the module's helpers 92 | tokens = SimpleTokenizer._split(text, level) 93 | vocab = Vocab.build(tokens) 94 | tok = SimpleTokenizer(vocab=vocab, level=level) 95 | ids = torch.tensor(tok.encode(text), dtype=torch.long) 96 | return TextIds( 97 | ids=ids, 98 | vocab_size=len(tok.vocab), 99 | pad_id=tok.pad, 100 | unk_id=tok.unk, 101 | level=level, 102 | id_to_token=list(tok.vocab.id_to_token), 103 | ) 104 | 105 | 106 | class LMSequenceDataset(Dataset[tuple[torch.Tensor, torch.Tensor]]): 107 | """Slice a long id stream into overlapping (x,y) chunks of length T. 108 | 109 | x is ids[i : i+T], y is ids[i+1 : i+T+1]. The number of samples is 110 | len(ids) - T. 111 | """ 112 | 113 | def __init__(self, ids: torch.Tensor, block_size: int): 114 | assert ids.ndim == 1 and ids.dtype == torch.long 115 | self.ids = ids 116 | self.T = int(block_size) 117 | 118 | def __len__(self) -> int: 119 | return max(0, self.ids.numel() - self.T) 120 | 121 | def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: 122 | i = int(idx) 123 | x = self.ids[i : i + self.T] 124 | y = self.ids[i + 1 : i + self.T + 1] 125 | return x, y 126 | 127 | 128 | __all__ = [ 129 | "TextIds", 130 | "load_texts", 131 | "build_ids_byte_level", 132 | "build_ids_with_tokenizer", 133 | "LMSequenceDataset", 134 | ] 135 | -------------------------------------------------------------------------------- /notebooks/ch02_shell_cli_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a791c257", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "776201f5", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 2 — Working with the Shell & AI Assistants\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "e16f44d1", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Practice quick shell commands directly from Colab cells using the `!` prefix.\n", 29 | "- Automate repetitive tasks with small Python helpers that wrap shell operations.\n", 30 | "- Document how you collaborate with AI assistants so future you can reproduce the workflow." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "ddaf504c", 36 | "metadata": {}, 37 | "source": [ 38 | "### Why the Shell Matters\n", 39 | "\n", 40 | "Even in Colab, the shell lets you inspect files, run tests, and control experiments quickly. Combining shell commands with Python makes your workflow both flexible and repeatable." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "b4d50a4b", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Use bash commands inline with the `!` prefix in Colab.\n", 51 | "!pwd\n", 52 | "!ls -1 | head -n 5\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "210bc5a0", 58 | "metadata": {}, 59 | "source": [ 60 | "### Wrapping Shell Commands in Python\n", 61 | "\n", 62 | "Use `subprocess` when you need programmatic control (for example, checking exit codes or capturing output for logging)." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "02729ef7", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "import subprocess\n", 73 | "\n", 74 | "result = subprocess.run(['ls', '-a'], capture_output=True, text=True, check=True)\n", 75 | "lines = result.stdout.splitlines()\n", 76 | "print(f'Total entries: {len(lines)}')\n", 77 | "print('First 8 entries:')\n", 78 | "for entry in lines[:8]:\n", 79 | " print(entry)\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "f51120c5", 85 | "metadata": {}, 86 | "source": [ 87 | "### Collaborating with AI Assistants\n", 88 | "\n", 89 | "Take notes on the prompts you issue to code assistants and save their outputs when they produce something you adopt. Lightweight documentation now saves time when you need to defend a design decision later." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "c3de21f0", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Starter template for tracking assistant interactions.\n", 100 | "from datetime import datetime\n", 101 | "\n", 102 | "log_entry = {\n", 103 | " \"timestamp\": datetime.utcnow().isoformat() + \"Z\",\n", 104 | " \"tool\": \"Your assistant of choice\",\n", 105 | " \"prompt\": \"Summarize how to process text files for tokenization\",\n", 106 | " \"action_items\": [\n", 107 | " \"Research tokenization libraries\",\n", 108 | " \"Try out a simple whitespace tokenizer\",\n", 109 | " \"Compare to sentencepiece later\"\n", 110 | " ]\n", 111 | "}\n", 112 | "log_entry\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "d159960e", 118 | "metadata": {}, 119 | "source": [ 120 | "## Exercises\n", 121 | "\n", 122 | "- Execute at least three shell commands that help you inspect the repository; record what you learned.\n", 123 | "- Wrap a shell command with `subprocess.run` and capture both stdout and stderr in variables.\n", 124 | "- Draft a reusable prompt template for your AI assistant that sets context, goals, and constraints." 
125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "4e64be7b", 130 | "metadata": {}, 131 | "source": [ 132 | "" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "colab": { 138 | "name": "Chapter 02 · Shell & AI" 139 | }, 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "name": "python", 147 | "version": "3.10" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 5 152 | } 153 | -------------------------------------------------------------------------------- /code/example_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Self-contained example workspace and sample texts. 9 | 10 | Use this when readers don't have the book repo. It creates a temporary 11 | folder in the current working directory, fills it with small sample 12 | text files, and (optionally) cleans them up when done. 13 | 14 | Usage (CLI): 15 | python -m code.example_data create --defaults [--keep] 16 | python -m code.example_data path # print last created path 17 | python -m code.example_data cleanup 18 | 19 | Usage (Python): 20 | from code.example_data import ExampleWorkspace 21 | with ExampleWorkspace().create_defaults() as ws: 22 | print(ws.root) # use ws.root / files inside 23 | ... 24 | """ 25 | 26 | from __future__ import annotations 27 | 28 | import argparse 29 | import shutil 30 | import time 31 | from dataclasses import dataclass 32 | from pathlib import Path 33 | 34 | _LAST_PATH_FILE = Path(".example_workspace_path") 35 | 36 | 37 | DEFAULT_TEXTS = { 38 | "philosophy.txt": ( 39 | "We are what we repeatedly do. Excellence, then, is not an act " 40 | "but a habit. Questions sharpen knowledge; curiosity sustains it.\n" 41 | ), 42 | "science.txt": ( 43 | "Science is a way of thinking much more than it is a body of facts. 
" 44 | "Small experiments illuminate large ideas.\n" 45 | ), 46 | "poetry.txt": ( 47 | "The model dreams in tokens and time,\n" 48 | "A lantern of vectors that learn to rhyme.\n" 49 | ), 50 | } 51 | 52 | 53 | @dataclass 54 | class ExampleWorkspace: 55 | base_dir: Path = Path.cwd() 56 | name: str | None = None 57 | cleanup_on_exit: bool = True 58 | 59 | def __post_init__(self) -> None: 60 | if self.name is None: 61 | stamp = time.strftime("%Y%m%d-%H%M%S") 62 | self.name = f"examples-{stamp}" 63 | self.root = self.base_dir / self.name # type: ignore[attr-defined] 64 | 65 | def create(self) -> "ExampleWorkspace": 66 | self.root.mkdir(parents=True, exist_ok=True) 67 | _LAST_PATH_FILE.write_text(str(self.root)) 68 | return self 69 | 70 | def create_defaults(self) -> "ExampleWorkspace": 71 | self.create() 72 | for fname, text in DEFAULT_TEXTS.items(): 73 | (self.root / fname).write_text(text) 74 | return self 75 | 76 | def add_text(self, filename: str, content: str) -> Path: 77 | p = self.root / filename 78 | p.write_text(content) 79 | return p 80 | 81 | def cleanup(self) -> None: 82 | if self.root.exists(): 83 | shutil.rmtree(self.root) 84 | if _LAST_PATH_FILE.exists(): 85 | _LAST_PATH_FILE.unlink() 86 | 87 | # Context manager API 88 | def __enter__(self) -> "ExampleWorkspace": 89 | return self 90 | 91 | def __exit__(self, exc_type, exc, tb) -> None: # noqa: ANN001 92 | if self.cleanup_on_exit: 93 | self.cleanup() 94 | 95 | 96 | def _cmd_create(args: argparse.Namespace) -> None: 97 | ws = ExampleWorkspace(cleanup_on_exit=not args.keep) 98 | (ws.create_defaults()) 99 | print(ws.root) 100 | if not args.keep: 101 | print("(Temporary; will be cleaned up when used via context manager or explicitly)") 102 | 103 | 104 | def _cmd_path(_: argparse.Namespace) -> None: 105 | if _LAST_PATH_FILE.exists(): 106 | print(_LAST_PATH_FILE.read_text()) 107 | else: 108 | print("No workspace recorded. 
Use 'create' first.") 109 | 110 | 111 | def _cmd_cleanup(args: argparse.Namespace) -> None: 112 | target = Path(args.path).resolve() 113 | if not target.exists(): 114 | print("No such path:", target) 115 | return 116 | shutil.rmtree(target) 117 | if _LAST_PATH_FILE.exists(): 118 | try: 119 | last = Path(_LAST_PATH_FILE.read_text().strip()) 120 | if last == target: 121 | _LAST_PATH_FILE.unlink() 122 | except Exception: 123 | _LAST_PATH_FILE.unlink(missing_ok=True) # type: ignore[attr-defined] 124 | print("Removed:", target) 125 | 126 | 127 | def main(argv: list[str] | None = None) -> None: 128 | p = argparse.ArgumentParser(prog="code.example_data", add_help=True) 129 | sub = p.add_subparsers(dest="cmd", required=True) 130 | 131 | c = sub.add_parser("create", help="create a workspace and write default texts") 132 | c.add_argument("--keep", action="store_true", help="do not auto-clean later") 133 | c.set_defaults(func=_cmd_create) 134 | 135 | sub.add_parser("path", help="print last workspace path").set_defaults(func=_cmd_path) 136 | 137 | d = sub.add_parser("cleanup", help="remove a workspace path") 138 | d.add_argument("path", help="path to workspace directory") 139 | d.set_defaults(func=_cmd_cleanup) 140 | 141 | ns = p.parse_args(argv) 142 | ns.func(ns) 143 | 144 | 145 | if __name__ == "__main__": 146 | main() 147 | 148 | -------------------------------------------------------------------------------- /code/ch10_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Chapter 10: a compact training script for the GPT model. 9 | 10 | This keeps options small and readable. It supports either a byte-level build 11 | of token ids or the SimpleTokenizer from Chapter 6 if available. 
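Example run (the data path is a placeholder; without --data a tiny built-in
sample text is used, which is only enough for a smoke test):

    python code/ch10_train.py --data data/tiny.txt --level char \
        --block-size 128 --batch-size 64 --steps 500 \
        --save checkpoints/ch10_gpt.pt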
12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | 17 | import argparse 18 | from dataclasses import asdict 19 | from pathlib import Path 20 | from time import time 21 | 22 | import torch 23 | import torch.nn as nn 24 | from torch.utils.data import DataLoader 25 | 26 | # Make "code/" directory importable when running as script 27 | import sys 28 | sys.path.append(str(Path(__file__).resolve().parent)) 29 | 30 | from ch09_gpt import GPT, GPTConfig # type: ignore 31 | from ch10_data import ( # type: ignore 32 | LMSequenceDataset, 33 | build_ids_byte_level, 34 | build_ids_with_tokenizer, 35 | load_texts, 36 | ) 37 | 38 | 39 | def auto_device() -> str: 40 | if torch.cuda.is_available(): 41 | return "cuda" 42 | mps = getattr(torch.backends, "mps", None) 43 | if mps and torch.backends.mps.is_available(): 44 | return "mps" 45 | return "cpu" 46 | 47 | 48 | def main() -> None: 49 | p = argparse.ArgumentParser(description="Train a tiny GPT (Chapter 10)") 50 | p.add_argument("--data", nargs="*", help="text file(s) to train on") 51 | p.add_argument("--level", default="char", choices=["char", "word", "byte"], 52 | help="token level when using SimpleTokenizer; 'byte' forces byte-level") 53 | p.add_argument("--block-size", type=int, default=128) 54 | p.add_argument("--batch-size", type=int, default=64) 55 | p.add_argument("--epochs", type=int, default=1) 56 | p.add_argument("--steps", type=int, default=500, 57 | help="max training steps (overrides epochs if set)") 58 | p.add_argument("--lr", type=float, default=3e-4) 59 | p.add_argument("--warmup-steps", type=int, default=0, 60 | help="linear LR warmup steps (0 to disable)") 61 | p.add_argument("--device", default="auto") 62 | p.add_argument("--seed", type=int, default=0) 63 | p.add_argument("--save", type=str, default="checkpoints/ch10_gpt.pt") 64 | args = p.parse_args() 65 | 66 | torch.manual_seed(args.seed) 67 | device = auto_device() if args.device == "auto" else args.device 68 | 69 | # Build ids 70 | text = load_texts(args.data) 71 | if args.level == "byte": 72 | ids_info = build_ids_byte_level(text) 73 | else: 74 | ids_info = build_ids_with_tokenizer(text, level=args.level) 75 | 76 | ds = LMSequenceDataset(ids_info.ids, block_size=args.block_size) 77 | dl = DataLoader(ds, batch_size=args.batch_size, shuffle=True, drop_last=True) 78 | 79 | # Model config & model 80 | cfg = GPTConfig( 81 | vocab_size=ids_info.vocab_size, 82 | block_size=args.block_size, 83 | d_model=256, 84 | n_head=4, 85 | n_layer=4, 86 | d_ff=1024, 87 | dropout=0.1, 88 | pos_type="learned", 89 | tie_weights=True, 90 | ) 91 | model = GPT(cfg).to(device) 92 | opt = torch.optim.AdamW(model.parameters(), lr=args.lr) 93 | scheduler = None 94 | if args.warmup_steps > 0: 95 | # Linear warmup from 0 -> 1 over warmup_steps 96 | def lr_lambda(step: int) -> float: 97 | return min(1.0, (step + 1) / float(args.warmup_steps)) 98 | scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda) 99 | 100 | print("Device:", device) 101 | print("Config:", asdict(cfg)) 102 | print("Dataset tokens:", ds.ids.numel()) 103 | 104 | step = 0 105 | t0 = time() 106 | model.train() 107 | for epoch in range(max(1, args.epochs)): 108 | for x, y in dl: 109 | if args.steps and step >= args.steps: 110 | break 111 | x = x.to(device) 112 | y = y.to(device) 113 | opt.zero_grad(set_to_none=True) 114 | logits, loss = model(x, targets=y, pad_id=ids_info.pad_id) 115 | assert loss is not None 116 | loss.backward() 117 | opt.step() 118 | if scheduler is not None: 119 | scheduler.step() 120 | if step % 50 == 0: 
121 | lr_now = opt.param_groups[0]["lr"] 122 | print( 123 | f"step {step:5d} lr {lr_now:.5f} " 124 | f"loss {loss.detach().item():.4f}" 125 | ) 126 | step += 1 127 | if args.steps and step >= args.steps: 128 | break 129 | 130 | dt = time() - t0 131 | print(f"Done. steps={step} time={dt:.1f}s") 132 | 133 | # Save checkpoint 134 | out = Path(args.save) 135 | out.parent.mkdir(parents=True, exist_ok=True) 136 | ckpt = { 137 | "config": asdict(cfg), 138 | "model_state": model.state_dict(), 139 | } 140 | # Save tokenizer metadata if available for easier sampling later 141 | if ids_info.id_to_token is not None: 142 | ckpt["tokenizer"] = { 143 | "level": ids_info.level, 144 | "id_to_token": ids_info.id_to_token, 145 | "pad_id": ids_info.pad_id, 146 | "unk_id": ids_info.unk_id, 147 | } 148 | torch.save(ckpt, out) 149 | print("Saved:", out) 150 | 151 | 152 | if __name__ == "__main__": 153 | main() 154 | -------------------------------------------------------------------------------- /code/sample_from_checkpoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Sample text from a trained checkpoint (Chapter 11 companion CLI). 9 | 10 | This script reconstructs the GPT model saved by Chapter 10's trainer and 11 | generates a short continuation from a prompt. It is byte-level by default so 12 | you can sample without extra tokenizer files. 13 | 14 | Examples 15 | -------- 16 | (.venv) $ python code/sample_from_checkpoint.py --ckpt checkpoints/ch10_gpt.pt --prompt "Philosophy is" --max-new-tokens 120 --temperature 0.9 --top-p 0.95 17 | 18 | Notes 19 | ----- 20 | - If you trained with a custom tokenizer from Chapter 6, pass a prompt that is 21 | compatible with your vocabulary or adapt this script to encode/decode with 22 | that tokenizer. 
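- If the checkpoint lacks embedded tokenizer metadata, pass --level char (or
  word) together with --ref-text pointing at the original training file(s) so
  the vocabulary can be rebuilt; this is best-effort, and the script warns when
  the rebuilt vocabulary size does not match the model's.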
23 | """ 24 | 25 | from __future__ import annotations 26 | 27 | 28 | import argparse 29 | import sys 30 | from typing import Optional 31 | 32 | import torch 33 | 34 | # Make "code/" importable when running as a script 35 | from pathlib import Path 36 | sys.path.append(str(Path(__file__).resolve().parent)) 37 | 38 | from ch09_gpt import GPT, GPTConfig # type: ignore 39 | from ch11_sampling import sample # type: ignore 40 | 41 | 42 | def auto_device() -> str: 43 | if torch.cuda.is_available(): 44 | return "cuda" 45 | mps = getattr(torch.backends, "mps", None) 46 | if mps and torch.backends.mps.is_available(): 47 | return "mps" 48 | return "cpu" 49 | 50 | 51 | def main(argv: Optional[list[str]] = None) -> None: 52 | p = argparse.ArgumentParser(description="Sample from a GPT checkpoint") 53 | p.add_argument("--ckpt", default="checkpoints/ch10_gpt.pt", help="path to .pt") 54 | p.add_argument("--prompt", default="Hello", help="Prompt text") 55 | p.add_argument("--max-new-tokens", type=int, default=120) 56 | p.add_argument("--temperature", type=float, default=0.9) 57 | p.add_argument("--top-k", type=int, default=0) 58 | p.add_argument("--top-p", type=float, default=0.0) 59 | p.add_argument("--device", default="auto") 60 | p.add_argument("--seed", type=int, default=0) 61 | p.add_argument("--level", default="auto", choices=["auto", "byte", "char", "word"], 62 | help="tokenization level for prompt/decoding") 63 | p.add_argument("--ref-text", nargs='*', default=[], 64 | help="text file(s) used to rebuild tokenizer for char/word levels") 65 | args = p.parse_args(argv) 66 | 67 | torch.manual_seed(args.seed) 68 | device = auto_device() if args.device == "auto" else args.device 69 | ckpt = torch.load(args.ckpt, map_location=device) 70 | cfg = GPTConfig(**ckpt["config"]) 71 | model = GPT(cfg).to(device) 72 | model.load_state_dict(ckpt["model_state"]) 73 | model.eval() 74 | 75 | # Determine tokenization/decoding strategy 76 | level = args.level 77 | if level == "auto": 78 | # Heuristic: byte-level models typically have vocab_size==256 79 | level = "byte" if cfg.vocab_size == 256 else "char" 80 | 81 | if level == "byte": 82 | prompt_bytes = args.prompt.encode("utf-8") 83 | input_ids = torch.tensor([list(prompt_bytes)], dtype=torch.long, device=device) 84 | out = sample( 85 | model, 86 | input_ids, 87 | max_new_tokens=args.max_new_tokens, 88 | temperature=args.temperature, 89 | top_k=(args.top_k if args.top_k > 0 else None), 90 | top_p=(args.top_p if args.top_p > 0 else None), 91 | ) 92 | text = bytes(out[0].tolist()).decode("utf-8", errors="ignore") 93 | print(text) 94 | else: 95 | # Prefer tokenizer embedded in checkpoint; otherwise rebuild from refs. 
96 | from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore 97 | tok = None 98 | if "tokenizer" in ckpt: 99 | meta = ckpt["tokenizer"] 100 | if meta.get("level") == level and meta.get("id_to_token"): 101 | id_to_token = list(meta["id_to_token"]) # ensure list 102 | token_to_id = {t: i for i, t in enumerate(id_to_token)} 103 | pad_id = int(meta.get("pad_id", 0)) 104 | unk_id = int(meta.get("unk_id", 1)) 105 | vocab = Vocab(token_to_id=token_to_id, id_to_token=id_to_token, pad=pad_id, unk=unk_id) 106 | tok = SimpleTokenizer(vocab=vocab, level=level) 107 | if tok is None: 108 | if not args.ref_text: 109 | print("ERROR: provide --ref-text files to rebuild tokenizer for level=char/word.") 110 | sys.exit(2) 111 | ref = "\n".join(Path(p).read_text(encoding="utf-8") for p in args.ref_text) 112 | tokens = SimpleTokenizer._split(ref, level) 113 | vocab = Vocab.build(tokens) 114 | tok = SimpleTokenizer(vocab=vocab, level=level) 115 | if len(vocab) != cfg.vocab_size: 116 | print(f"WARNING: tokenizer vocab {len(vocab)} != model vocab {cfg.vocab_size}; decoding may be off.") 117 | input_ids = torch.tensor([tok.encode(args.prompt)], dtype=torch.long, device=device) 118 | out = sample( 119 | model, 120 | input_ids, 121 | max_new_tokens=args.max_new_tokens, 122 | temperature=args.temperature, 123 | top_k=(args.top_k if args.top_k > 0 else None), 124 | top_p=(args.top_p if args.top_p > 0 else None), 125 | ) 126 | print(tok.decode(out[0].tolist())) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /notebooks/ch15_deployment_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8e6e35ea", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "f18acf81", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 15 — Deployment & Applications\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "494d5b47", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Package the trained model with lightweight serving infrastructure.\n", 29 | "- Design latency and throughput tests that match production realities.\n", 30 | "- Plan monitoring hooks to catch model drift and quality regressions." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "18e472fa", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We export the model, wire a minimal API, and simulate traffic patterns to validate performance before launch." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "06ae0dd1", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Document your deployment assumptions (hardware, concurrency, SLAs). These constraints shape every implementation choice." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "57b5f971", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Bundle structure (config + weights + tokenizer)\n", 61 | "import json, torch\n", 62 | "example = {\n", 63 | " 'config': {'vocab_size': 32, 'block_size': 8, 'd_model': 32,\n", 64 | " 'n_head': 4, 'n_layer': 2, 'd_ff': 64, 'dropout': 0.0},\n", 65 | " 'model_state': {},\n", 66 | " 'tokenizer': {'level': 'char', 'id_to_token': list(' _abcdefghijklmnopqrstuvwxyz'),\n", 67 | " 'pad_id': 0, 'unk_id': 1}\n", 68 | "}\n", 69 | "list(example.keys())\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "3c11263e", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Load + sample from a real bundle (requires repo present and a bundle path)\n", 80 | "import sys, pathlib\n", 81 | "sys.path.append(str(pathlib.Path('code').resolve()))\n", 82 | "from ch09_gpt import GPT, GPTConfig # type: ignore\n", 83 | "from ch11_sampling import sample # type: ignore\n", 84 | "from ch6_tokenize import SimpleTokenizer, Vocab # type: ignore\n", 85 | "\n", 86 | "def load_bundle(path: str):\n", 87 | " b = torch.load(path, map_location='cpu')\n", 88 | " cfg = GPTConfig(**b['config'])\n", 89 | " model = GPT(cfg).eval(); model.load_state_dict(b['model_state'])\n", 90 | " meta = b.get('tokenizer'); tok = None\n", 91 | " if meta and meta.get('id_to_token'):\n", 92 | " id_to_token = list(meta['id_to_token'])\n", 93 | " token_to_id = {t:i for i,t in enumerate(id_to_token)}\n", 94 | " vocab = Vocab(token_to_id=token_to_id, id_to_token=id_to_token,\n", 95 | " pad=int(meta.get('pad_id',0)), unk=int(meta.get('unk_id',1)))\n", 96 | " tok = SimpleTokenizer(vocab=vocab, level=meta.get('level','char'))\n", 97 | " return model, tok\n", 98 | "\n", 99 | "def sample_bundle(bundle_path: str, prompt: str,\n", 100 | " max_new=80, temperature=0.9, top_p=0.95, top_k=0):\n", 101 | " model, tok = load_bundle(bundle_path)\n", 102 | " if tok is None:\n", 103 | " ids = torch.tensor([[c for c in prompt.encode('utf-8')]], dtype=torch.long)\n", 104 | " out = sample(model, ids, max_new_tokens=max_new, temperature=temperature,\n", 105 | " top_k=(top_k or None), top_p=(top_p or None))\n", 106 | " return bytes(out[0].tolist()).decode('utf-8', errors='ignore')\n", 107 | " ids = torch.tensor([tok.encode(prompt)], dtype=torch.long)\n", 108 | " out = sample(model, ids, max_new_tokens=max_new, temperature=temperature,\n", 109 | " top_k=(top_k or None), top_p=(top_p or None))\n", 110 | " return 
tok.decode(out[0].tolist())\n", 111 | "\n", 112 | "# Example (requires a real bundle):\n", 113 | "# sample_bundle('model_bundle.pt', 'Hello')\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "f03b3efd", 119 | "metadata": {}, 120 | "source": [ 121 | "## Exercises\n", 122 | "\n", 123 | "- Build a FastAPI or Flask microservice that serves one of the trained checkpoints.\n", 124 | "- Design a canary analysis comparing the deployed model to a previous baseline.\n", 125 | "- Draft a monitoring plan that includes automated alerts and qualitative spot checks." 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "66e1a39f", 131 | "metadata": {}, 132 | "source": [ 133 | "" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "name": "python", 145 | "version": "3.10" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 5 150 | } 151 | -------------------------------------------------------------------------------- /notebooks/ch01_intro_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a4ad2d66", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "417e62fd", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 1 — Introduction\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "ddd151a2", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Orient yourself in the project structure used throughout the book.\n", 29 | "- Run quick environment diagnostics to ensure Colab is ready for later chapters.\n", 30 | "- Capture personal learning goals so you can revisit them after reading." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "53cb70df", 36 | "metadata": {}, 37 | "source": [ 38 | "### Learning Objectives\n", 39 | "\n", 40 | "- Understand how the book resources are organized.\n", 41 | "- Identify the core assets needed to build and train attoLLM.\n", 42 | "- Plan how you will iterate through the practical chapters." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "2c455e5b", 48 | "metadata": {}, 49 | "source": [ 50 | "### Project Tour\n", 51 | "\n", 52 | "The repository follows a predictable structure so you can navigate quickly:\n", 53 | "\n", 54 | "- `chapters/` holds the AsciiDoc sources for the printed book.\n", 55 | "- `notebooks/` mirrors the chapters with runnable, Colab-optimized notebooks.\n", 56 | "- `code/` contains reusable modules that the notebooks import.\n", 57 | "\n", 58 | "Spend a minute opening each directory in Colab's file browser so you know where to find things later." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "92ee6ac5", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from pathlib import Path\n", 69 | "import textwrap\n", 70 | "\n", 71 | "root = Path('.')\n", 72 | "folders = ['chapters', 'notebooks', 'code', 'data']\n", 73 | "for folder in folders:\n", 74 | " path = root / folder\n", 75 | " print(f\"{folder:>10}: {'found' if path.exists() else 'missing'}\")\n", 76 | "\n", 77 | "message = textwrap.dedent(\n", 78 | " \"\"\"Tip: In Colab you can double-click folders in the left sidebar to expand them.\n", 79 | "Make sure the directories you expect are present before moving on.\"\"\"\n", 80 | ")\n", 81 | "print()\n", 82 | "print(message)\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "524b9c41", 88 | "metadata": {}, 89 | "source": [ 90 | "### Environment Check\n", 91 | "\n", 92 | "Even this introductory chapter benefits from confirming that Colab offers the right tooling. Run the cell below and capture the output in case you need to debug later." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "5d792061", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import platform\n", 103 | "import psutil\n", 104 | "\n", 105 | "print(f\"Python version : {platform.python_version()}\")\n", 106 | "print(f\"Kernel : {platform.release()}\")\n", 107 | "print(f\"Processor : {platform.processor() or 'N/A'}\")\n", 108 | "print(f\"Logical cores : {psutil.cpu_count(logical=True)}\")\n", 109 | "print(f\"Memory (GB) : {psutil.virtual_memory().total / 1e9:.2f}\")\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "696a0174", 115 | "metadata": {}, 116 | "source": [ 117 | "### Capture Your Intentions\n", 118 | "\n", 119 | "Use the template below to jot down what you plan to build and what you hope to learn. Revisit this at the end of the book to measure progress." 
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "eec82bf4", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "from datetime import date\n", 130 | "\n", 131 | "template = f\"\"\"# Learning Journal ({date.today().isoformat()})\n", 132 | "\n", 133 | "- Why I am reading this book: \n", 134 | "- My baseline understanding of LLMs: \n", 135 | "- Tools I am most comfortable with today: \n", 136 | "- Success will look like: \n", 137 | "\"\"\"\n", 138 | "print(template)\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "d98dbbed", 144 | "metadata": {}, 145 | "source": [ 146 | "## Exercises\n", 147 | "\n", 148 | "- List three specific questions you want answered while working through the book.\n", 149 | "- Note which chapters you plan to tackle in the next study session and why.\n", 150 | "- Sketch a simple schedule for revisiting your learning journal every few chapters." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "6d018303", 156 | "metadata": {}, 157 | "source": [ 158 | "" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "colab": { 164 | "name": "Chapter 01 · Introduction" 165 | }, 166 | "kernelspec": { 167 | "display_name": "Python 3", 168 | "language": "python", 169 | "name": "python3" 170 | }, 171 | "language_info": { 172 | "name": "python", 173 | "version": "3.10" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 5 178 | } 179 | -------------------------------------------------------------------------------- /notebooks/ch04_hardware_software_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d720f282", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "0ff8ea2e", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 4 — Hardware Platforms & Software Setup\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "df0eca6b", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Inspect the hardware Colab assigned (CPU, GPU, RAM) and log the results.\n", 29 | "- Benchmark lightweight operations to estimate the runtime of later experiments.\n", 30 | "- Decide when to scale up or down based on the metrics you collect here." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "696cebf7", 36 | "metadata": {}, 37 | "source": [ 38 | "### Hardware Discovery\n", 39 | "\n", 40 | "Understanding what resources you have prevents surprises when training longer experiments. The cells below gather GPU, CPU, and RAM details." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "1d413986", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import platform\n", 51 | "import psutil\n", 52 | "\n", 53 | "print(f\"Python : {platform.python_version()}\")\n", 54 | "print(f\"Machine : {platform.machine()}\")\n", 55 | "print(f\"Processor : {platform.processor() or 'N/A'}\")\n", 56 | "print(f\"Logical CPU : {psutil.cpu_count(logical=True)}\")\n", 57 | "print(f\"Physical CPU: {psutil.cpu_count(logical=False)}\")\n", 58 | "print(f\"Total RAM : {psutil.virtual_memory().total / 1e9:.2f} GB\")\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "d5f06e3c", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# GPU diagnostics (works when a GPU runtime is enabled in Colab)\n", 69 | "try:\n", 70 | " import torch\n", 71 | " if torch.cuda.is_available():\n", 72 | " gpu_index = 0\n", 73 | " props = torch.cuda.get_device_properties(gpu_index)\n", 74 | " print(f\"Device : {props.name}\")\n", 75 | " print(f\"Memory (GB) : {props.total_memory / 1e9:.2f}\")\n", 76 | " print(f\"SM count : {props.multi_processor_count}\")\n", 77 | " else:\n", 78 | " print(\"No CUDA-capable GPU detected. Switch to a GPU runtime if needed.\")\n", 79 | "except Exception as exc:\n", 80 | " print(f\"Torch not available or failed to query CUDA: {exc}\")\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "f5432d2c", 86 | "metadata": {}, 87 | "source": [ 88 | "### Quick Benchmark\n", 89 | "\n", 90 | "Run a tiny matmul benchmark. The absolute number is less important than the comparison you can make when switching runtimes." 
91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "0fb72ff8", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "import time\n", 101 | "import torch\n", 102 | "\n", 103 | "def benchmark_matmul(dim: int = 2048, repeats: int = 5):\n", 104 | " device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", 105 | " x = torch.randn(dim, dim, device=device)\n", 106 | " y = torch.randn(dim, dim, device=device)\n", 107 | " if device == 'cuda':\n", 108 | " torch.cuda.synchronize()\n", 109 | " timings = []\n", 110 | " for _ in range(repeats):\n", 111 | " start = time.time()\n", 112 | " _ = x @ y\n", 113 | " if device == 'cuda':\n", 114 | " torch.cuda.synchronize()\n", 115 | " timings.append(time.time() - start)\n", 116 | " return device, sum(timings) / len(timings)\n", 117 | "\n", 118 | "device, avg = benchmark_matmul()\n", 119 | "print(f\"Average matmul time on {device.upper()}: {avg:.4f} seconds\")\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "c6777f19", 125 | "metadata": {}, 126 | "source": [ 127 | "### Interpreting the Results\n", 128 | "\n", 129 | "Compare the metrics above with the recommendations in the chapter. Record whether your current environment is sufficient or if you should upgrade for compute-heavy chapters." 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "id": "6965f0d2", 135 | "metadata": {}, 136 | "source": [ 137 | "## Exercises\n", 138 | "\n", 139 | "- Measure the impact of different batch sizes on the benchmark timing above.\n", 140 | "- Record GPU memory usage before and after running the benchmark using `torch.cuda.memory_allocated()`.\n", 141 | "- Summarize in a short paragraph what hardware is ideal for full training runs versus experimentation." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "09f87ed5", 147 | "metadata": {}, 148 | "source": [ 149 | "" 150 | ] 151 | } 152 | ], 153 | "metadata": { 154 | "colab": { 155 | "name": "Chapter 04 · Hardware & Software" 156 | }, 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "name": "python", 164 | "version": "3.10" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 5 169 | } 170 | -------------------------------------------------------------------------------- /code/ch08_transformer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | import math 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | 17 | 18 | def sinusoidal_positions( 19 | T: int, 20 | d_model: int, 21 | device: torch.device | None = None, 22 | ) -> torch.Tensor: 23 | """Return [T, d_model] sinusoidal position encodings (sin/cos pairs). 
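
For position pos and dimension index i the angle grid is
pos / 10000 ** (2 * (i // 2) / d_model); even dimensions take the sine of the
angle and odd dimensions take the cosine, exactly as computed below.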
24 | 25 | - T: sequence length (time steps) 26 | - d_model: embedding dimension (even recommended) 27 | """ 28 | # positions [T,1] and index grid [1,D] 29 | pos = torch.arange(T, device=device).float()[:, None] 30 | i = torch.arange(d_model, device=device).float()[None, :] 31 | # frequency grid [T,D] 32 | angle = pos / (10000 ** (2 * (i // 2) / d_model)) 33 | enc = torch.zeros(T, d_model, device=device) 34 | enc[:, 0::2] = torch.sin(angle[:, 0::2]) # even dims = sin 35 | enc[:, 1::2] = torch.cos(angle[:, 1::2]) # odd dims = cos 36 | return enc 37 | 38 | 39 | class MultiHeadAttention(nn.Module): 40 | """Multi‑head self‑attention (single module, H heads). 41 | 42 | d_model = H * Dh, where Dh is per‑head dim. We project to Q,K,V, split into 43 | heads, apply scaled dot‑product attention per head, then concat and project out. 44 | """ 45 | 46 | def __init__(self, d_model: int, num_heads: int, dropout: float = 0.0): 47 | super().__init__() 48 | assert d_model % num_heads == 0, "d_model must be divisible by num_heads" 49 | self.h = num_heads # number of heads 50 | self.d = d_model // num_heads # per‑head dim 51 | self.qkv = nn.Linear(d_model, 3 * d_model, bias=False) # shared proj 52 | self.out = nn.Linear(d_model, d_model, bias=False) # output proj 53 | self.drop = nn.Dropout(dropout) 54 | 55 | def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor: 56 | B, T, Dm = x.shape # batch, time, model dim 57 | qkv = self.qkv(x) # [B, T, 3*Dm] 58 | q, k, v = qkv.chunk(3, dim=-1) # each [B, T, Dm] 59 | 60 | # Split heads: [B,T,Dm] -> [B,H,T,Dh], 61 | # then put heads dimension before time. 62 | def split(t: torch.Tensor) -> torch.Tensor: 63 | return t.view(B, T, self.h, self.d).transpose(1, 2) 64 | 65 | q, k, v = map(split, (q, k, v)) # [B, H, T, Dh] 66 | 67 | # Build [B, H, T, T] boolean mask (True = disallowed) 68 | mask_bool = None 69 | if mask is not None: 70 | if mask.dim() == 2: 71 | mask_bool = ( 72 | (mask == 0) 73 | .bool()[None, None, :, :] 74 | .expand(B, self.h, T, T) 75 | ) 76 | elif mask.dim() == 3: 77 | mask_bool = ( 78 | (mask == 0) 79 | .bool() 80 | .unsqueeze(1) 81 | .expand(B, self.h, T, T) 82 | ) 83 | elif mask.dim() == 4: 84 | if mask.size(1) == 1: 85 | mask_bool = ( 86 | (mask == 0).bool().expand(B, self.h, T, T) 87 | ) 88 | else: 89 | mask_bool = (mask == 0).bool() 90 | 91 | # Manual scaled dot‑product attention for portability (MPS-safe) 92 | Dh = self.d 93 | scores = (q @ k.transpose(-2, -1)) / (Dh ** 0.5) # [B,H,T,T] 94 | if mask_bool is not None: 95 | scores = scores.masked_fill(mask_bool, float(-1e9)) 96 | w = torch.softmax(scores, dim=-1) 97 | attn = w @ v # [B,H,T,Dh] 98 | attn = self.drop(attn) 99 | 100 | # Concatenate heads back: [B, H, T, Dh] -> [B, T, Dm] 101 | y = ( 102 | attn.transpose(1, 2) 103 | .contiguous() 104 | .view(B, T, Dm) 105 | ) 106 | return self.out(y) 107 | 108 | 109 | class FeedForward(nn.Module): 110 | """Position‑wise MLP with GELU and dropout.""" 111 | 112 | def __init__(self, d_model: int, d_ff: int, dropout: float = 0.0): 113 | super().__init__() 114 | self.net = nn.Sequential( 115 | nn.Linear(d_model, d_ff), # expand 116 | nn.GELU(), # nonlinearity 117 | nn.Dropout(dropout), 118 | nn.Linear(d_ff, d_model), # project back 119 | nn.Dropout(dropout), 120 | ) 121 | 122 | def forward(self, x: torch.Tensor) -> torch.Tensor: 123 | return self.net(x) 124 | 125 | 126 | class Residual(nn.Module): 127 | """Pre‑norm residual wrapper: x + sublayer(LN(x)).""" 128 | 129 | def __init__(self, d_model: int): 130 | super().__init__() 131 
| self.norm = nn.LayerNorm(d_model) 132 | 133 | def forward(self, x: torch.Tensor, sublayer: nn.Module, *args, **kwargs) -> torch.Tensor: 134 | return x + sublayer(self.norm(x), *args, **kwargs) 135 | 136 | 137 | class TransformerBlock(nn.Module): 138 | """One pre‑norm transformer block: MHA + FFN with residuals.""" 139 | 140 | def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.0): 141 | super().__init__() 142 | self.mha = MultiHeadAttention(d_model, num_heads, dropout) 143 | self.ffn = FeedForward(d_model, d_ff, dropout) 144 | self.res1 = Residual(d_model) 145 | self.res2 = Residual(d_model) 146 | 147 | def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor: 148 | x = self.res1(x, self.mha, mask) # attend + residual 149 | x = self.res2(x, self.ffn) # think (FFN) + residual 150 | return x 151 | 152 | 153 | __all__ = [ 154 | "sinusoidal_positions", 155 | "MultiHeadAttention", 156 | "FeedForward", 157 | "Residual", 158 | "TransformerBlock", 159 | ] 160 | -------------------------------------------------------------------------------- /notebooks/ch03_setup_project_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "89036b8d", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c4b76bea", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 3 — Setting Up the Project\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "dc91896e", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Clone or sync the repository into your Colab workspace in a reproducible way.\n", 29 | "- Install Python dependencies with pinned versions for deterministic runs.\n", 30 | "- Validate the setup by running a smoke test that imports key modules." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "15dd1003", 36 | "metadata": {}, 37 | "source": [ 38 | "### Repository Setup Options\n", 39 | "\n", 40 | "There are two common paths in Colab:\n", 41 | "\n", 42 | "1. Mount a cloud drive (Google Drive, Dropbox, etc.) and access the synchronized folder.\n", 43 | "2. Clone the repository directly with `git clone`.\n", 44 | "\n", 45 | "Pick the approach that gives you the most reliable storage for persistent artifacts." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "ec4c7e2c", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Example: cloning via Git (replace the URL with your fork if needed)\n", 56 | "import os\n", 57 | "import subprocess\n", 58 | "import sys\n", 59 | "\n", 60 | "repo_url = \"https://github.com/your-org/atto-llm-book.git\"\n", 61 | "if os.environ.get('COLAB_RELEASE_TAG'):\n", 62 | " subprocess.run(['git', 'clone', '--depth=1', repo_url, 'project_repo'], check=True)\n", 63 | "else:\n", 64 | " print('Skipping git clone outside Colab. Run this cell manually when you have network access.')\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "4c3c9d62", 70 | "metadata": {}, 71 | "source": [ 72 | "### Dependency Management\n", 73 | "\n", 74 | "Use a requirements file whenever possible. That way your future training runs or deployments start from the same baseline environment." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "04cb23c5", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# Use pip to install only what you need for this chapter.\n", 85 | "import os\n", 86 | "import subprocess\n", 87 | "import sys\n", 88 | "\n", 89 | "if os.environ.get('COLAB_RELEASE_TAG'):\n", 90 | " subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'], check=True)\n", 91 | "else:\n", 92 | " print('Skipping pip install outside Colab to keep validation fast. Run this in Colab when needed.')\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "bebf4752", 98 | "metadata": {}, 99 | "source": [ 100 | "### Smoke Test the Environment\n", 101 | "\n", 102 | "Import the utilities that later notebooks expect. If this cell fails, resolve the issue before moving forward." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "bbac02ab", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "try:\n", 113 | " from code.attollm.data import dataset\n", 114 | " from code.attollm.model import transformer\n", 115 | " print(\"Modules imported successfully.\")\n", 116 | "except ModuleNotFoundError as exc:\n", 117 | " print(f\"Import failed: {exc}\")\n", 118 | " print(\"Adjust PYTHONPATH or install missing packages before continuing.\")\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "50a2eecd", 124 | "metadata": {}, 125 | "source": [ 126 | "### Configuration Snapshot\n", 127 | "\n", 128 | "Capture the versions of critical libraries so your future self (or collaborators) knows what environment produced your results." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "fd41e738", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import json\n", 139 | "import pkg_resources\n", 140 | "\n", 141 | "packages_of_interest = [\"torch\", \"numpy\", \"pandas\", \"transformers\"]\n", 142 | "versions = {}\n", 143 | "for pkg in packages_of_interest:\n", 144 | " try:\n", 145 | " versions[pkg] = pkg_resources.get_distribution(pkg).version\n", 146 | " except pkg_resources.DistributionNotFound:\n", 147 | " versions[pkg] = \"not installed\"\n", 148 | "print(json.dumps(versions, indent=2))\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "12814a21", 154 | "metadata": {}, 155 | "source": [ 156 | "## Exercises\n", 157 | "\n", 158 | "- Decide where you will store checkpoints and large artifacts; document the path in your notes.\n", 159 | "- Create a new Python virtual environment (locally or in Colab) and list the exact activation steps.\n", 160 | "- Write a short shell script that provisions the environment from scratch using the commands above." 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "0ea7b832", 166 | "metadata": {}, 167 | "source": [ 168 | "" 169 | ] 170 | } 171 | ], 172 | "metadata": { 173 | "colab": { 174 | "name": "Chapter 03 · Project Setup" 175 | }, 176 | "kernelspec": { 177 | "display_name": "Python 3", 178 | "language": "python", 179 | "name": "python3" 180 | }, 181 | "language_info": { 182 | "name": "python", 183 | "version": "3.10" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 5 188 | } 189 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Building a Large Language Model from Scratch - Book Companion Code 4 | 5 | This repository contains the executable code assets that accompany the book: 6 | 7 | Building a Large Language Model from Scratch — A Step‑by‑Step Guide Using Python and PyTorch 8 | 9 | It is designed as a hands‑on companion: pair the book manuscript with this repo to run the same scripts and notebooks locally, inspect results, and extend the examples. 10 | 11 | ## What’s Inside 12 | 13 | - `code/` — Python modules and scripts per chapter, plus small utilities. 14 | - `notebooks/` — Jupyter notebooks that mirror the book’s walkthroughs. 15 | - `figures/` — Static figures generated by the code (used in the book). 16 | - `tools/` — Validation helpers to run scripts/notebooks in batch. 17 | - `requirements.txt` — Minimal packages used across chapters. 18 | 19 | The project keeps dependencies intentionally light. 
Install PyTorch separately following the official instructions for your platform. 20 | 21 | ## Setup: Local Virtual Environment 22 | 23 | Below are reliable, copy‑pasteable steps to get a clean `.venv` and install requirements. 24 | 25 | Unix/macOS (bash/zsh) 26 | 27 | ``` 28 | python3 -m venv .venv 29 | source .venv/bin/activate 30 | python -m pip install --upgrade pip wheel 31 | pip install -r requirements.txt 32 | 33 | # Install PyTorch for your platform (examples): 34 | # CPU only: 35 | # pip install --index-url https://download.pytorch.org/whl/cpu torch torchvision torchaudio 36 | # Apple Silicon (MPS): 37 | # pip install torch torchvision torchaudio 38 | # CUDA (choose the right CUDA version from pytorch.org): 39 | # pip install --index-url https://download.pytorch.org/whl/cu121 torch torchvision torchaudio 40 | 41 | # Helpful links: 42 | # Apple Silicon (MPS) guide: https://pytorch.org/get-started/locally/ (select macOS + pip + MPS) 43 | # CUDA wheels index (e.g., cu121): https://download.pytorch.org/whl/cu121 44 | # CPU wheels index: https://download.pytorch.org/whl/cpu 45 | 46 | # Optional: for notebook execution helper 47 | pip install nbclient nbformat 48 | ``` 49 | 50 | Windows (PowerShell) 51 | 52 | ``` 53 | py -m venv .venv 54 | .\.venv\Scripts\Activate.ps1 55 | python -m pip install --upgrade pip wheel 56 | pip install -r requirements.txt 57 | 58 | # Install PyTorch from https://pytorch.org/get-started/locally/ 59 | # Example (CPU only): 60 | # pip install --index-url https://download.pytorch.org/whl/cpu torch torchvision torchaudio 61 | 62 | # Helpful links: 63 | # Apple Silicon (MPS) guide: https://pytorch.org/get-started/locally/ (select macOS + pip + MPS) 64 | # CUDA wheels index (e.g., cu121): https://download.pytorch.org/whl/cu121 65 | # CPU wheels index: https://download.pytorch.org/whl/cpu 66 | 67 | # Optional: for notebook execution helper 68 | pip install nbclient nbformat 69 | ``` 70 | 71 | ## Python Version 72 | 73 | - Recommended: Python 3.11. This repo is tested with 3.11 and avoids edge cases seen with newer versions. 74 | - If you encounter errors like: `AttributeError: module 'code' has no attribute 'InteractiveConsole'` (often on Python 3.13), use Python 3.11 for now. The project’s `code/` package name can shadow the stdlib module named `code` under some setups. 
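A quick way to check both the interpreter version and which `code` module Python picks up when run from the repository root (the printed paths will differ on your machine):

```
python -c "import sys, code; print(sys.version.split()[0]); print(code.__file__)"
```

If the second line points into this repository's `code/` directory rather than the standard library, you are seeing the shadowing described above.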
75 | 76 | Quick ways to create a 3.11 venv 77 | 78 | - macOS (Homebrew): 79 | 80 | ``` 81 | brew install python@3.11 82 | /opt/homebrew/bin/python3.11 -m venv .venv 83 | source .venv/bin/activate 84 | pip install -r requirements.txt 85 | ``` 86 | 87 | - pyenv: 88 | 89 | ``` 90 | pyenv install 3.11.9 91 | ~/.pyenv/versions/3.11.9/bin/python -m venv .venv 92 | source .venv/bin/activate 93 | pip install -r requirements.txt 94 | ``` 95 | 96 | ## Validating Everything Runs 97 | 98 | - Validate all Python scripts under `code/`: 99 | 100 | ``` 101 | python tools/validate_scripts.py 102 | ``` 103 | 104 | - Validate all notebooks under `notebooks/` (executes each notebook): 105 | 106 | ``` 107 | python tools/validate_notebooks.py 108 | # or force a specific kernel 109 | python tools/validate_notebooks.py --kernel python3 110 | ``` 111 | 112 | Both validators print structured, timed output for Structure / Imports / Execute and end with a summary like: 113 | 114 | ``` 115 | [25/26] Validating notebooks/ch25_reproducible_research_playbook.ipynb 116 | • Structure: OK (1.3ms) 117 | • Imports: OK (1.6ms scan, 0.0ms probe) 118 | • Execute: OK (1.06s) 119 | • Total: 1.07s 120 | ``` 121 | 122 | ## Requirements 123 | 124 | The minimal shared requirements are captured in `requirements.txt`: 125 | 126 | ``` 127 | numpy>=1.24 128 | tqdm>=4.66 129 | tensorboard>=2.13 130 | graphviz>=0.20 131 | typing-extensions>=4.8 132 | 133 | # IMPORTANT: Install PyTorch following instructions for your OS/GPU: 134 | # https://pytorch.org/get-started/locally/ 135 | ``` 136 | 137 | Some chapters may suggest optional extras (e.g., `fastapi`, `streamlit`) in the code comments; install those ad‑hoc when exploring those parts. 138 | 139 | ## Tips 140 | 141 | - Keep edit→run loops tight by working in a small terminal + editor split and running individual scripts first, then notebooks. 142 | - Use `code/env_check.py` and `code/check_backends.py` to confirm your Python/torch environment. 143 | - If notebook execution is slow, start with scripts to validate logic, then run the notebook cell‑by‑cell. 144 | 145 | ## Disclaimer 146 | 147 | This repository is provided "as is" for educational purposes. No warranties, guarantees, or representations of any kind are made regarding correctness, completeness, fitness for a particular purpose, or non‑infringement. Code and examples are intended for illustration and learning; they may omit production concerns (error handling, security, performance, robustness). Use at your own risk. Dependencies evolve over time; version changes can affect behavior or break examples. 148 | 149 | ## Credits and License 150 | 151 | © Dr. Yves J. Hilpisch (The Python Quants GmbH). All rights reserved unless otherwise noted. See source files for per‑file notices where applicable. 152 | 153 | 154 | This repository is a companion to the book “Building a Large Language Model from Scratch — A Step‑by‑Step Guide Using Python and PyTorch”. 
155 | 156 | 157 | -------------------------------------------------------------------------------- /notebooks/ch07_attention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0232c872", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "77846170", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 7 — Attention & Self-Attention Mechanism\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "4f58d442", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Implement scaled dot-product attention step by step before abstracting it away.\n", 29 | "- Inspect attention weights on curated toy sequences to build intuition.\n", 30 | "- Connect the math to code by tracing shapes and broadcasting rules carefully." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "5b73ad6d", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We derive attention scores, apply masking, and then batch the operation so it scales to transformer blocks." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "e2858b3c", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Print intermediate tensors as you go. Seeing the score matrices and masks makes it easier to reason about what each line of code accomplishes." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "2eca1602", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Ensure torch is available (Colab friendly)\n", 61 | "try:\n", 62 | " import torch # noqa\n", 63 | " print('torch:', torch.__version__)\n", 64 | "except Exception:\n", 65 | " import os\n", 66 | " gpu = os.system('nvidia-smi > /dev/null 2>&1') == 0\n", 67 | " index = 'https://download.pytorch.org/whl/cu121' if gpu else 'https://download.pytorch.org/whl/cpu'\n", 68 | " get_ipython().run_line_magic('pip', f'install -q torch --index-url {index}')\n", 69 | " import torch\n", 70 | " print('torch:', torch.__version__)\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "258ac315", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "import matplotlib.pyplot as plt\n", 81 | "plt.style.use('seaborn-v0_8')\n", 82 | "%config InlineBackend.figure_format = 'svg'\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "13a671dd", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from torch import Tensor\n", 93 | "def scaled_dot_product_attention(q: Tensor, k: Tensor, v: Tensor, mask: Tensor | None = None) -> Tensor:\n", 94 | " d = q.size(-1)\n", 95 | " scores = (q @ k.transpose(-2, -1)) / (d ** 0.5)\n", 96 | " if mask is not None:\n", 97 | " scores = scores.masked_fill(mask == 0, float('-inf'))\n", 98 | " w = torch.softmax(scores, dim=-1)\n", 99 | " return w @ v\n", 100 | "def causal_mask(batch: int, time: int, device=None):\n", 101 | " base = torch.tril(torch.ones(time, time, device=device))\n", 102 | " return base.unsqueeze(0).expand(batch, -1, -1)\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "8b88571d", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# Set random seed\n", 113 | "torch.manual_seed(0)\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "93f39e6f", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Define shapes\n", 124 | "B, T, D = 1, 6, 4\n", 125 | "(B, T, D)\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "cb6c3e8a", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# Create a toy input\n", 136 | "x = torch.randn(B, T, D)\n", 137 | "x\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": 
"32236ebd", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Build a causal mask\n", 148 | "mask = causal_mask(B, T)\n", 149 | "mask\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "a8820239", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# Apply attention\n", 160 | "y = scaled_dot_product_attention(x, x, x, mask)\n", 161 | "y\n" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "f1121ec2", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# Visualize a row of attention weights\n", 172 | "with torch.no_grad():\n", 173 | " d = x.size(-1)\n", 174 | " scores = (x @ x.transpose(-2, -1)) / (d ** 0.5)\n", 175 | " scores = scores.masked_fill(mask == 0, float('-inf'))\n", 176 | " w = torch.softmax(scores, dim=-1)[0] # [T, T]\n", 177 | "plt.figure(figsize=(4, 3))\n", 178 | "plt.imshow(w, cmap='viridis', aspect='auto')\n", 179 | "plt.colorbar(label='weight')\n", 180 | "plt.xlabel('key\\npositions')\n", 181 | "plt.ylabel('query positions')\n", 182 | "plt.title('Causal attention weights (toy)')\n", 183 | "plt.tight_layout()\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "43d21080", 189 | "metadata": {}, 190 | "source": [ 191 | "## Exercises\n", 192 | "\n", 193 | "- Implement additive attention and compare its behavior with scaled dot-product attention.\n", 194 | "- Visualize attention maps for sequences with padding to confirm masking works as expected.\n", 195 | "- Modify the notebook to support multi-head attention and measure the parameter count increase." 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "id": "acded7d4", 201 | "metadata": {}, 202 | "source": [ 203 | "" 204 | ] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "name": "python", 215 | "version": "3.10" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 5 220 | } 221 | -------------------------------------------------------------------------------- /notebooks/attollm_colab_starter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2cf14c08", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c2c4bacb", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Project Companion — attoLLM Colab Starter\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "7b132250", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Sanity-check GPU, disk, and Python versions before diving into model training.\n", 29 | "- Mount or sync the storage location you plan to use for datasets and checkpoints.\n", 30 | "- Bookmark the helper utilities you expect to reuse across multiple chapters." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "cd6be911", 36 | "metadata": {}, 37 | "source": [ 38 | "### Starter Checklist\n", 39 | "\n", 40 | "Use the diagnostics cells here before every new Colab session. Consistent environments remove a whole class of hard-to-track bugs." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "93519a54", 46 | "metadata": {}, 47 | "source": [ 48 | "### Collaboration Notes\n", 49 | "\n", 50 | "Share a copy of this notebook with teammates so everyone runs the same validation steps before kicking off workloads." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "67ace6cb", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!nvidia-smi || echo 'No NVIDIA GPU available'" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "d18d930c", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Install a recent CUDA wheel when GPU is present; fallback to CPU wheel\n", 71 | "import os\n", 72 | "import subprocess\n", 73 | "import sys\n", 74 | "\n", 75 | "\n", 76 | "def ensure_torch():\n", 77 | " try:\n", 78 | " import torch # type: ignore\n", 79 | " return torch\n", 80 | " except Exception as exc: # pragma: no cover - diagnostics\n", 81 | " print('torch import failed:', exc)\n", 82 | " if os.environ.get('COLAB_RELEASE_TAG'):\n", 83 | " gpu_available = os.system('nvidia-smi > /dev/null 2>&1') == 0\n", 84 | " index = 'https://download.pytorch.org/whl/cu121' if gpu_available else 'https://download.pytorch.org/whl/cpu'\n", 85 | " subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'torch', '--index-url', index], check=True)\n", 86 | " import torch # type: ignore\n", 87 | " return torch\n", 88 | " return None\n", 89 | "\n", 90 | "\n", 91 | "torch = ensure_torch()\n", 92 | "if torch is None:\n", 93 | " print('Install torch manually when you have network access (skipped for validation).')\n", 94 | "else:\n", 95 | " print('torch:', torch.__version__, 'cuda?', torch.cuda.is_available())\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "06d1e430", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Scaffold attoLLM in /content/attollm (Colab) or a local folder otherwise\n", 106 | "import os\n", 107 | "import pathlib\n", 108 | "import subprocess\n", 109 | "import sys\n", 110 | "import textwrap\n", 111 | "\n", 112 | "root = pathlib.Path('/content/attollm') if os.environ.get('COLAB_RELEASE_TAG') else pathlib.Path.cwd() / 'attollm_scaffold'\n", 113 | "\n", 114 | "(root / 'src/attollm').mkdir(parents=True, exist_ok=True)\n", 115 | "for d in ['scripts', 'configs', 'data/raw', 'data/processed', 'data/cache', 'checkpoints', 'tests']:\n", 116 | " (root / d).mkdir(parents=True, exist_ok=True)\n", 117 | "\n", 118 | "(root / 'README.md').write_text('# attoLLM (Colab)\\n')\n", 119 | "\n", 120 | "gitignore_text = '\\n'.join([\n", 121 | " '__pycache__/',\n", 122 | " '*.pyc',\n", 123 | " 'data/cache/',\n", 124 | " 'checkpoints/',\n", 125 | " 'runs/',\n", 
126 | " '*.pt',\n", 127 | " '*.pth',\n", 128 | "]) + '\\n'\n", 129 | "(root / '.gitignore').write_text(gitignore_text)\n", 130 | "\n", 131 | "requirements_text = 'numpy>=1.24\\n' 'tqdm>=4.66\\n'\n", 132 | "(root / 'requirements.txt').write_text(requirements_text)\n", 133 | "\n", 134 | "pyproject_text = textwrap.dedent(\"\"\"\n", 135 | " [build-system]\n", 136 | " requires = [\"setuptools>=68\", \"wheel\"]\n", 137 | " build-backend = \"setuptools.build_meta\"\n", 138 | "\n", 139 | " [project]\n", 140 | " name = \"attollm\"\n", 141 | " version = \"0.0.1\"\n", 142 | " requires-python = \">=3.10\"\n", 143 | " dependencies = []\n", 144 | "\n", 145 | " [tool.setuptools]\n", 146 | " package-dir = {\"\" = \"src\"}\n", 147 | "\n", 148 | " [tool.setuptools.packages.find]\n", 149 | " where = [\"src\"]\n", 150 | "\"\"\")\n", 151 | "(root / 'pyproject.toml').write_text(pyproject_text)\n", 152 | "\n", 153 | "(root / 'src/attollm/__init__.py').write_text('__all__ = [\"hello\"]\\n')\n", 154 | "\n", 155 | "hello_py = textwrap.dedent(\"\"\"\n", 156 | " def main():\n", 157 | " print(\"Hello from attoLLM (Colab)!\")\n", 158 | "\n", 159 | " if __name__ == \"__main__\":\n", 160 | " main()\n", 161 | "\"\"\")\n", 162 | "(root / 'src/attollm/hello.py').write_text(hello_py)\n", 163 | "\n", 164 | "print('Scaffolded at:', root)\n", 165 | "\n", 166 | "if os.environ.get('COLAB_RELEASE_TAG'):\n", 167 | " subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '-e', str(root)], check=True)\n", 168 | " subprocess.run([sys.executable, '-m', 'attollm.hello'], check=True)\n", 169 | "else:\n", 170 | " print('Running outside Colab; editable install skipped to keep validation fast.')\n", 171 | " print('Inspect the scaffolded package locally at', root)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "e2b12701", 177 | "metadata": {}, 178 | "source": [ 179 | "## Exercises\n", 180 | "\n", 181 | "- Adapt the environment check to alert you when CUDA is missing or mismatched.\n", 182 | "- Mount your preferred cloud storage (Drive, S3 via fsspec, etc.) and verify read/write access.\n", 183 | "- Create a short markdown playbook describing how you spin up a fresh Colab runtime for this project." 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "5f075006", 189 | "metadata": {}, 190 | "source": [ 191 | "" 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "name": "python", 203 | "version": "3.10" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 5 208 | } 209 | -------------------------------------------------------------------------------- /code/ch12_metrics_text.py: -------------------------------------------------------------------------------- 1 | """ 2 | Building a Large Language Model from Scratch 3 | — A Step-by-Step Guide Using Python and PyTorch 4 | 5 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 6 | AI-Powered by GPT-5. 7 | 8 | Educational text metrics for Chapter 12. 9 | 10 | This module implements small, dependency‑free versions of common metrics: 11 | 12 | - BLEU (corpus): n‑gram precision with brevity penalty, optional Add‑1 smoothing. 13 | - ROUGE‑L: F‑measure based on the longest common subsequence (LCS). 14 | - METEOR (simplified): unigram precision/recall F‑mean with a fragmentation 15 | penalty estimated from contiguous matching chunks (no stemming/synonyms). 16 | - Diversity helpers: distinct‑1 / distinct‑2. 
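
As a quick sanity check of the BLEU helper, a hypothesis that exactly matches
its single four-token reference scores 1.0 (the tokens are purely illustrative):

    bleu_corpus([[["the", "cat", "sat", "down"]]], [["the", "cat", "sat", "down"]])  # -> 1.0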
17 | 18 | Inputs are tokenized sequences (lists of strings or ints). We keep the 19 | implementations compact and readable for teaching; they are not drop‑in 20 | replacements for official packages, but align with the main ideas. 21 | """ 22 | 23 | from __future__ import annotations 24 | 25 | 26 | from typing import Iterable, List, Sequence, Tuple 27 | from collections import Counter 28 | 29 | 30 | def _ngram_counts(tokens: Sequence, n: int) -> Counter: 31 | return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)) 32 | 33 | 34 | def bleu_corpus( 35 | references: List[List[Sequence]], 36 | hypotheses: List[Sequence], 37 | max_n: int = 4, 38 | smooth: bool = True, 39 | ) -> float: 40 | """Compute a simple corpus BLEU. 41 | 42 | Args: 43 | references: list of lists of reference token sequences (per hypothesis) 44 | hypotheses: list of hypothesis token sequences 45 | max_n: highest n‑gram order (default 4) 46 | smooth: Add‑1 smoothing for counts 47 | Returns: 48 | BLEU score in [0, 1] 49 | """ 50 | assert len(references) == len(hypotheses) 51 | 52 | # Modified n‑gram precisions 53 | num = [0] * max_n 54 | den = [0] * max_n 55 | 56 | ref_len = 0 57 | hyp_len = 0 58 | 59 | for refs, hyp in zip(references, hypotheses): 60 | hyp_len += len(hyp) 61 | # reference length closest to hypothesis (brevity penalty) 62 | ref_lengths = [len(r) for r in refs] 63 | ref_len += min(ref_lengths, key=lambda rl: (abs(rl - len(hyp)), rl)) 64 | 65 | for n in range(1, max_n + 1): 66 | hyp_counts = _ngram_counts(hyp, n) 67 | max_ref_counts: Counter = Counter() 68 | for r in refs: 69 | max_ref_counts |= _ngram_counts(r, n) 70 | # clipped counts 71 | overlap = { 72 | g: min(c, max_ref_counts.get(g, 0)) for g, c in hyp_counts.items() 73 | } 74 | num[n - 1] += sum(overlap.values()) 75 | den[n - 1] += max(1, sum(hyp_counts.values())) 76 | 77 | # Smoothed precisions 78 | precisions = [] 79 | for i in range(max_n): 80 | if smooth: 81 | precisions.append((num[i] + 1) / (den[i] + 1)) 82 | else: 83 | precisions.append(0.0 if den[i] == 0 else num[i] / den[i]) 84 | 85 | # Brevity penalty 86 | import math 87 | 88 | if hyp_len == 0: 89 | return 0.0 90 | if hyp_len > ref_len: 91 | bp = 1.0 92 | else: 93 | bp = math.exp(1 - ref_len / max(1, hyp_len)) 94 | 95 | # Geometric mean of precisions 96 | gm = math.exp(sum((1 / max_n) * math.log(max(p, 1e-16)) for p in precisions)) 97 | return bp * gm 98 | 99 | 100 | def _lcs_length(a: Sequence, b: Sequence) -> int: 101 | # Classic DP for LCS length (O(len(a)*len(b))) 102 | la, lb = len(a), len(b) 103 | dp = [0] * (lb + 1) 104 | for i in range(1, la + 1): 105 | prev = 0 106 | for j in range(1, lb + 1): 107 | tmp = dp[j] 108 | if a[i - 1] == b[j - 1]: 109 | dp[j] = prev + 1 110 | else: 111 | dp[j] = max(dp[j], dp[j - 1]) 112 | prev = tmp 113 | return dp[lb] 114 | 115 | 116 | def rouge_l( 117 | references: List[List[Sequence]], 118 | hypotheses: List[Sequence], 119 | beta: float = 1.2, 120 | ) -> float: 121 | """Compute ROUGE‑L F‑measure averaged across examples. 122 | 123 | For each hypothesis we take the best reference by LCS F‑measure. 
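    Worked example: with reference "the cat is on the mat" and hypothesis
    "the cat sat on the mat", the LCS has length 5, so precision = recall = 5/6
    and the F-measure is about 0.833 (when P == R the F-measure equals P,
    independent of beta).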
124 | """ 125 | import math 126 | 127 | scores: List[float] = [] 128 | for refs, hyp in zip(references, hypotheses): 129 | best = 0.0 130 | for r in refs: 131 | lcs = _lcs_length(r, hyp) 132 | if lcs == 0: 133 | continue 134 | prec = lcs / max(1, len(hyp)) 135 | rec = lcs / max(1, len(r)) 136 | if prec == 0 and rec == 0: 137 | f = 0.0 138 | else: 139 | beta2 = beta * beta 140 | f = (1 + beta2) * prec * rec / max(beta2 * prec + rec, 1e-12) 141 | best = max(best, f) 142 | scores.append(best) 143 | return sum(scores) / max(1, len(scores)) 144 | 145 | 146 | def _matching_chunks(h: Sequence, r: Sequence) -> Tuple[int, int]: 147 | """Return (matches, chunks) for contiguous exact matches between h and r. 148 | 149 | Used for a simplified METEOR chunk penalty. 150 | """ 151 | # Build index of tokens in r 152 | from collections import defaultdict 153 | 154 | pos = defaultdict(list) 155 | for j, tok in enumerate(r): 156 | pos[tok].append(j) 157 | 158 | matches = 0 159 | chunks = 0 160 | prev_j = None 161 | for tok in h: 162 | if not pos[tok]: 163 | continue 164 | j = pos[tok].pop(0) # greedy match leftmost 165 | matches += 1 166 | if prev_j is None or j != prev_j + 1: 167 | chunks += 1 168 | prev_j = j 169 | return matches, chunks 170 | 171 | 172 | def meteor_simple( 173 | references: List[List[Sequence]], 174 | hypotheses: List[Sequence], 175 | alpha: float = 0.9, 176 | beta: float = 3.0, 177 | gamma: float = 0.5, 178 | ) -> float: 179 | """Simplified METEOR. 180 | 181 | For each (refs, hyp): 182 | - Compute unigram precision P and recall R against each ref (exact match). 183 | - F_mean = (P*R) / (alpha*P + (1 - alpha)*R) 184 | - Compute chunk penalty: Pen = gamma * (chunks / matches) ** beta 185 | - Score = F_mean * (1 - Pen) 186 | We take the max score over references and average over the corpus. 187 | """ 188 | import math 189 | 190 | scores: List[float] = [] 191 | for refs, hyp in zip(references, hypotheses): 192 | best = 0.0 193 | for r in refs: 194 | # unigram matches 195 | hyp_counts = Counter(hyp) 196 | ref_counts = Counter(r) 197 | overlap = sum(min(hyp_counts[t], ref_counts[t]) for t in hyp_counts) 198 | P = overlap / max(1, len(hyp)) 199 | R = overlap / max(1, len(r)) 200 | if P == 0 or R == 0: 201 | cand = 0.0 202 | else: 203 | Fm = (P * R) / max(alpha * P + (1 - alpha) * R, 1e-12) 204 | m, ch = _matching_chunks(hyp, r) 205 | if m == 0: 206 | penalty = 0.0 207 | else: 208 | penalty = gamma * ((ch / m) ** beta) 209 | cand = Fm * (1 - penalty) 210 | best = max(best, cand) 211 | scores.append(best) 212 | return sum(scores) / max(1, len(scores)) 213 | 214 | 215 | def distinct_n(hypotheses: List[Sequence], n: int = 1) -> float: 216 | """Proportion of distinct n‑grams across all hypotheses (diversity).""" 217 | grams = Counter() 218 | total = 0 219 | for h in hypotheses: 220 | c = _ngram_counts(h, n) 221 | grams.update(c) 222 | total += sum(c.values()) 223 | if total == 0: 224 | return 0.0 225 | return len(grams) / total 226 | 227 | 228 | __all__ = [ 229 | "bleu_corpus", 230 | "rouge_l", 231 | "meteor_simple", 232 | "distinct_n", 233 | ] 234 | 235 | -------------------------------------------------------------------------------- /code/ch09_gpt.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | """ 4 | Building a Large Language Model from Scratch 5 | — A Step-by-Step Guide Using Python and PyTorch 6 | 7 | (c) Dr. Yves J. Hilpisch (The Python Quants GmbH) 8 | AI-Powered by GPT-5. 
9 | 10 | GPT assembly: token/position embeddings, a stack of Transformer blocks, 11 | and a language‑model head. Kept small and readable to align with the book’s 12 | step‑by‑step narrative. 13 | 14 | Key choices 15 | ----------- 16 | - Pre‑norm blocks (LayerNorm before sublayers) as in Chapter 8. 17 | - Learned positional embeddings by default; optional sinusoidal positions. 18 | - Optional weight tying between token embeddings and LM head. 19 | - Causal mask is always applied; padding mask is optionally combined when 20 | a `pad_id` is provided (or an explicit `attention_mask`). 21 | 22 | Shapes 23 | ------ 24 | Inputs are token ids of shape [B, T]; hidden/state tensors stay [B, T, D]. 25 | Logits are [B, T, V]. 26 | """ 27 | 28 | from dataclasses import dataclass 29 | from typing import Optional, Tuple 30 | 31 | import torch 32 | import torch.nn as nn 33 | import torch.nn.functional as F 34 | 35 | # Support running scripts from repo root by importing neighbor modules directly. 36 | try: 37 | from ch08_transformer import ( # type: ignore 38 | TransformerBlock, 39 | sinusoidal_positions, 40 | ) 41 | except Exception: 42 | from code.ch08_transformer import ( # type: ignore 43 | TransformerBlock, 44 | sinusoidal_positions, 45 | ) 46 | 47 | 48 | @dataclass 49 | class GPTConfig: 50 | """Hyperparameters for a small, readable GPT. 51 | 52 | - vocab_size: number of tokens in the vocabulary 53 | - block_size: maximum sequence length (context window) 54 | - d_model: model (embedding) dimension 55 | - n_head: number of attention heads per block 56 | - n_layer: number of transformer blocks 57 | - d_ff: feed‑forward hidden dimension (often 4 * d_model) 58 | - dropout: dropout rate in MHA/FFN 59 | - pos_type: 'learned' (GPT‑style) or 'sinusoidal' (chapter 8 option) 60 | - tie_weights: whether to tie LM head weight with token embeddings 61 | """ 62 | 63 | vocab_size: int 64 | block_size: int 65 | d_model: int = 128 66 | n_head: int = 4 67 | n_layer: int = 2 68 | d_ff: int = 512 69 | dropout: float = 0.1 70 | pos_type: str = "learned" # or "sinusoidal" 71 | tie_weights: bool = True 72 | 73 | 74 | class GPT(nn.Module): 75 | """A compact GPT‑style language model composed from Chapter 8 blocks. 76 | 77 | Forward signature: 78 | logits, loss = model(input_ids, targets=None, attention_mask=None, pad_id=None) 79 | """ 80 | 81 | def __init__(self, cfg: GPTConfig): 82 | super().__init__() 83 | self.cfg = cfg 84 | 85 | V, Tm, D = cfg.vocab_size, cfg.block_size, cfg.d_model 86 | 87 | # Embeddings: tokens and positions 88 | self.tok_emb = nn.Embedding(V, D) 89 | if cfg.pos_type == "learned": 90 | self.pos_emb = nn.Embedding(Tm, D) 91 | else: 92 | self.pos_emb = None # we'll add sinusoidal positions on the fly 93 | 94 | self.drop = nn.Dropout(cfg.dropout) 95 | 96 | # Transformer blocks 97 | self.blocks = nn.ModuleList( 98 | [ 99 | TransformerBlock( 100 | D, 101 | cfg.n_head, 102 | cfg.d_ff, 103 | cfg.dropout, 104 | ) 105 | for _ in range(cfg.n_layer) 106 | ] 107 | ) 108 | 109 | # Final normalization and LM head 110 | self.norm_f = nn.LayerNorm(D) 111 | self.lm_head = nn.Linear(D, V, bias=False) 112 | 113 | # Optional weight tying: share weights between embedding and LM head 114 | if cfg.tie_weights: 115 | self.lm_head.weight = self.tok_emb.weight 116 | 117 | self.apply(self._init_weights) 118 | 119 | def _init_weights(self, module: nn.Module) -> None: 120 | """Small‑norm initialization consistent with readable training. 121 | 122 | GPT‑2 uses ~N(0, 0.02) for embeddings and projection weights. 
We follow 123 | a similar pattern here for stability at small scales. 124 | """ 125 | if isinstance(module, (nn.Linear, nn.Embedding)): 126 | nn.init.normal_(module.weight, mean=0.0, std=0.02) 127 | if isinstance(module, nn.Linear) and module.bias is not None: 128 | nn.init.zeros_(module.bias) 129 | 130 | def _build_mask( 131 | self, 132 | input_ids: torch.Tensor, 133 | attention_mask: Optional[torch.Tensor], 134 | pad_id: Optional[int], 135 | ) -> torch.Tensor: 136 | """Combine causal and optional padding masks into [B, T, T]. 137 | 138 | - causal: lower‑triangular ones 139 | - padding: 1 for tokens, 0 for pads (derived from input or provided) 140 | """ 141 | B, T = input_ids.size(0), input_ids.size(1) 142 | device = input_ids.device 143 | causal = torch.tril(torch.ones(T, T, device=device)) # [T, T] 144 | 145 | pad_mask_bt: Optional[torch.Tensor] = None 146 | if attention_mask is not None: 147 | pad_mask_bt = attention_mask.float() # [B, T] 148 | elif pad_id is not None: 149 | # Derive from token ids: 1 for tokens, 0 for PAD 150 | pad_mask_bt = (input_ids != pad_id).float() # [B, T] 151 | 152 | if pad_mask_bt is None: 153 | # No padding info: return causal broadcasted to [B, T, T] 154 | return causal.unsqueeze(0).expand(B, -1, -1) 155 | else: 156 | # Broadcast multiply: [B, 1, T] * [T, T] -> [B, T, T] 157 | return pad_mask_bt[:, None, :] * causal 158 | 159 | def forward( 160 | self, 161 | input_ids: torch.Tensor, 162 | targets: Optional[torch.Tensor] = None, 163 | attention_mask: Optional[torch.Tensor] = None, 164 | pad_id: Optional[int] = None, 165 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: 166 | """Compute logits (and optional loss) for a batch of token ids. 167 | 168 | Args: 169 | input_ids: LongTensor [B, T], token ids (T <= block_size) 170 | targets: optional LongTensor [B, T] for next‑token loss 171 | attention_mask: optional Float/Bool [B, T], 1 for tokens, 0 for pads 172 | pad_id: optional int, used only to set ignore_index in loss 173 | 174 | Returns: 175 | logits: [B, T, V] 176 | loss: scalar tensor or None 177 | """ 178 | B, T = input_ids.size() 179 | assert T <= self.cfg.block_size, ( 180 | f"sequence length {T} exceeds block_size {self.cfg.block_size}. " 181 | f"Slice prompts to the last {self.cfg.block_size} tokens." 
182 | ) 183 | 184 | device = input_ids.device 185 | 186 | # Token + positional embeddings 187 | x = self.tok_emb(input_ids) # [B, T, D] 188 | 189 | if self.cfg.pos_type == "learned": 190 | positions = torch.arange(T, device=device)[None, :] # [1, T] 191 | x = x + self.pos_emb(positions) # [B, T, D] 192 | else: 193 | pe = sinusoidal_positions( 194 | T, 195 | self.cfg.d_model, 196 | device=device, 197 | ) # [T, D] 198 | x = x + pe[None, :, :] # [B, T, D] 199 | 200 | x = self.drop(x) 201 | 202 | # Build causal (and optional padding) mask once 203 | mask_btt = self._build_mask( 204 | input_ids, 205 | attention_mask, 206 | pad_id, 207 | ) # [B, T, T] 208 | 209 | # Pass through stacked Transformer blocks 210 | for block in self.blocks: 211 | x = block(x, mask_btt) 212 | 213 | # Final norm and LM head projection 214 | x = self.norm_f(x) 215 | logits = self.lm_head(x) # [B, T, V] 216 | 217 | loss = None 218 | if targets is not None: 219 | # Flatten [B, T, V] -> [B*T, V] and [B, T] -> [B*T] 220 | logits_flat = logits.reshape(B * T, -1) 221 | targets_flat = targets.reshape(B * T) 222 | ignore = pad_id if pad_id is not None else -100 223 | loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=ignore) 224 | 225 | return logits, loss 226 | 227 | 228 | __all__ = ["GPTConfig", "GPT"] 229 | -------------------------------------------------------------------------------- /notebooks/ch13_improvements_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fd6c7a4d", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "2d78c8e6", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 13 — Improvements & Extensions\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "73dcb0fc", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Prototype improvement ideas such as LoRA, adapters, or data augmentation.\n", 29 | "- Measure the trade-offs between accuracy gains and computational costs.\n", 30 | "- Document experiments thoroughly so you can reproduce winners later." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "8bd6b6e6", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We experiment with several upgrade paths, each isolated so you can evaluate impact independently." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "2b976c8a", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Change one variable at a time. Rapid iteration is tempting, but disciplined ablations reveal what truly matters." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "4bcdd880", 56 | "metadata": {}, 57 | "source": [ 58 | "This notebook demonstrates training improvements from Chapter 13: \n", 59 | "- Mixed precision (AMP) on CUDA for speed\n", 60 | "- Gradient clipping for stability\n", 61 | "- Warmup + cosine learning-rate schedule\n", 62 | "- Gradient accumulation to emulate larger batches\n", 63 | "Each cell creates one object and shows it immediately to match the book's \n", 64 | "creation rule. Comments explain why each step matters.\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "96f4a16f", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Torch setup\n", 75 | "import sys, subprocess\n", 76 | "import contextlib\n", 77 | "try:\n", 78 | " import torch # noqa: F401\n", 79 | "except Exception:\n", 80 | " idx = 'https://download.pytorch.org/whl/cpu'\n", 81 | " subprocess.check_call([sys.executable, '-m', 'pip', 'install',\n", 82 | " '--index-url', idx, 'torch'])\n", 83 | " import torch # noqa: F401\n", 84 | "torch.manual_seed(0); device = ('cuda' if torch.cuda.is_available() else\n", 85 | " 'cpu'); device\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "563cb5a2", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# Tiny language model: embedding + linear head.\n", 96 | "# Used to demonstrate mechanics without heavy compute.\n", 97 | "class TinyLM(torch.nn.Module):\n", 98 | " def __init__(self, V=64, D=64):\n", 99 | " \"\"\"Create a minimal LM with vocabulary V and hidden size D.\n", 100 | " Embedding maps ids->vectors; linear head maps vectors->logits.\n", 101 | " \"\"\"\n", 102 | " super().__init__(); self.emb = torch.nn.Embedding(V, D)\n", 103 | " self.lin = torch.nn.Linear(D, V)\n", 104 | " def forward(self, x, y=None):\n", 105 | " \"\"\"Return (logits, loss). 
Loss is CE if targets y are given.\n", 106 | " \"\"\"\n", 107 | " h = self.emb(x); logits = self.lin(h)\n", 108 | " loss = None\n", 109 | " if y is not None:\n", 110 | " B,T,V = logits.shape\n", 111 | " loss = torch.nn.functional.cross_entropy(\n", 112 | " logits.reshape(B*T, V), y.reshape(B*T))\n", 113 | " return logits, loss\n", 114 | "TinyLM()\n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "053fe05f", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Data: random ids to exercise the loop\n", 125 | "V, T, B = 64, 32, 64\n", 126 | "ids = torch.randint(0, V, (B, T))\n", 127 | "ids.shape\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "05663fb1", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "# Warmup + cosine schedule: scale base LR in [minr, 1].\n", 138 | "# Warmup ramps 0->1; cosine glides 1->minr over remaining steps.\n", 139 | "import math\n", 140 | "def warmup_cosine_lambda(warmup, total, minr=0.1):\n", 141 | " \"\"\"Return a LambdaLR-compatible function.\n", 142 | " warmup: warmup steps; total: total steps; minr: floor ratio.\n", 143 | " \"\"\"\n", 144 | " def f(step):\n", 145 | " s = step + 1\n", 146 | " if s <= warmup: return s/float(warmup)\n", 147 | " t = s - warmup; frac = t/max(1,total-warmup)\n", 148 | " return minr + (1-minr)*0.5*(1+math.cos(math.pi*frac))\n", 149 | " return f\n", 150 | "warmup_cosine_lambda(10, 100)(0)\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "b98b50f1", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "import contextlib\n", 161 | "# Train with AMP (CUDA), clipping, accumulation, and schedule.\n", 162 | "# On CPU/MPS, AMP is disabled and training runs in full precision.\n", 163 | "model = TinyLM().to(device)\n", 164 | "opt = torch.optim.AdamW(model.parameters(), lr=3e-4)\n", 165 | "sched = torch.optim.lr_scheduler.LambdaLR(\n", 166 | " opt, warmup_cosine_lambda(10, 100, 0.1))\n", 167 | "try:\n", 168 | " scaler = torch.amp.GradScaler('cuda', enabled=(device == 'cuda'))\n", 169 | "except Exception:\n", 170 | " scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))\n", 171 | "accum, clip = 4, 1.0\n", 172 | "hist = []\n", 173 | "model.train()\n", 174 | "opt.zero_grad(set_to_none=True)\n", 175 | "for step in range(100):\n", 176 | " x = ids.to(device)\n", 177 | " y = ids.to(device)\n", 178 | " autocast_ctx = torch.amp.autocast('cuda', dtype=torch.float16) if device == 'cuda' else contextlib.nullcontext()\n", 179 | " with autocast_ctx:\n", 180 | " _, loss = model(x, y)\n", 181 | " if device == 'cuda':\n", 182 | " scaler.scale(loss).backward()\n", 183 | " else:\n", 184 | " loss.backward()\n", 185 | " if (step + 1) % accum == 0:\n", 186 | " if device == 'cuda':\n", 187 | " scaler.unscale_(opt)\n", 188 | " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n", 189 | " if device == 'cuda':\n", 190 | " scaler.step(opt)\n", 191 | " scaler.update()\n", 192 | " else:\n", 193 | " opt.step()\n", 194 | " opt.zero_grad(set_to_none=True)\n", 195 | " sched.step()\n", 196 | " if step % 10 == 0:\n", 197 | " hist.append(float(loss.detach().cpu().item()))\n", 198 | "hist[:5]\n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "b9384616", 204 | "metadata": {}, 205 | "source": [ 206 | "## Exercises\n", 207 | "\n", 208 | "- Implement LoRA for one transformer layer and compare training speed.\n", 209 | "- Try a data augmentation technique 
and report its effect on validation metrics.\n", 210 | "- Create a decision matrix that scores each extension by cost, complexity, and expected impact." 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "78d12000", 216 | "metadata": {}, 217 | "source": [ 218 | "" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "name": "python", 230 | "version": "3.10" 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 5 235 | } 236 | -------------------------------------------------------------------------------- /notebooks/ch11_sampling_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "80e4cff1", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "d1da8bfe", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 11 — Testing & Sampling\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "5438995f", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Compare sampling strategies such as greedy, top-k, and nucleus sampling.\n", 29 | "- Evaluate qualitative outputs with checklists anchored to your use case.\n", 30 | "- Instrument temperature sweeps to understand controllability." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "3e684d84", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We load a trained checkpoint, generate continuations with multiple decoding schemes, and analyze the trade-offs each introduces." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "a47ed00b", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Save representative generations for later review. Side-by-side comparisons are invaluable during stakeholder discussions." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "207f0121", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Torch + plotting setup\n", 61 | "import sys, subprocess\n", 62 | "try:\n", 63 | " import torch # noqa: F401\n", 64 | "except Exception:\n", 65 | " idx = 'https://download.pytorch.org/whl/cpu'\n", 66 | " subprocess.check_call([sys.executable, '-m', 'pip', 'install',\n", 67 | " '--index-url', idx, 'torch', 'torchvision',\n", 68 | " 'torchaudio'])\n", 69 | " import torch # noqa: F401\n", 70 | "import matplotlib.pyplot as plt\n", 71 | "plt.style.use('seaborn-v0_8')\n", 72 | "%config InlineBackend.figure_format = 'svg'\n", 73 | "torch.manual_seed(0); 'ok'\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "4081ca91", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# Probability at temperature T for toy logits.\n", 84 | "logits = torch.tensor([[2.0, 1.0, 0.2, -1.0]])\n", 85 | "\n", 86 | "def probs_at_T(T):\n", 87 | " \"\"\"Return softmax(logits/T) for a single toy row.\n", 88 | " Lower T sharpens, higher T flattens.\n", 89 | " \"\"\"\n", 90 | " p = torch.softmax(logits / T, dim=-1)\n", 91 | " return p\n", 92 | "probs_at_T(1.0)\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "9421af5a", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Plot temperature effects\n", 103 | "Ts = [0.7, 1.0, 1.3]\n", 104 | "fig, axes = plt.subplots(1, 3, figsize=(6.0, 2.2), constrained_layout=True)\n", 105 | "for ax, T in zip(axes, Ts):\n", 106 | " p = probs_at_T(T)[0]\n", 107 | " ax.bar(range(len(p)), p, color='#0A66C2')\n", 108 | " ax.set_title(f'T={T}')\n", 109 | " ax.set_ylim(0, 1.0); ax.set_xticks([]); ax.set_yticks([])\n", 110 | "fig.suptitle('Temperature'); fig\n" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "90ba5747", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# Top-k and top-p filters: set low-prob tokens to a large negative\n", 121 | "# logit so softmax effectively assigns zero probability.\n", 122 | "def top_k_filter(logits, k):\n", 123 | " \"\"\"Keep only the k largest logits per row.\n", 124 | " Others are set to a very negative number.\n", 125 | " \"\"\"\n", 126 | " if k <= 0: return logits\n", 127 | " v, _ = torch.topk(logits, k)\n", 128 | " thr = v[:, [-1]]\n", 129 | " return torch.where(logits < thr, torch.tensor(-1e9), logits)\n", 130 | "def top_p_filter(logits, p):\n", 131 | " 
\"\"\"Keep the smallest set of tokens whose cumulative\n", 132 | " probability exceeds p. Works row-wise.\n", 133 | " \"\"\"\n", 134 | " if p <= 0 or p >= 1: return logits\n", 135 | " s, idx = torch.sort(logits, dim=-1, descending=True)\n", 136 | " pr = torch.softmax(s, dim=-1)\n", 137 | " cum = torch.cumsum(pr, dim=-1)\n", 138 | " mask = cum > p; mask[..., 0] = False\n", 139 | " s = s.masked_fill(mask, -1e9)\n", 140 | " out = torch.empty_like(s).scatter_(1, idx, s)\n", 141 | " return out\n", 142 | "top_k_filter(logits, 3), top_p_filter(logits, 0.9)\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "fe19648c", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# One sampling step on toy logits: apply temperature and optional\n", 153 | "# top-k/top-p, then draw the next token (or greedy if T<=0).\n", 154 | "def step_sample(logits, T=1.0, k=None, p=None):\n", 155 | " \"\"\"Return next token ids for a single step.\n", 156 | " \"\"\"\n", 157 | " x = logits / T if T > 0 else logits\n", 158 | " if k is not None: x = top_k_filter(x, k)\n", 159 | " if p is not None: x = top_p_filter(x, p)\n", 160 | " if T <= 0: return torch.argmax(x, dim=-1, keepdim=True)\n", 161 | " pr = torch.softmax(x, dim=-1)\n", 162 | " return torch.multinomial(pr, num_samples=1)\n", 163 | "step_sample(logits, T=0.8, k=3, p=0.9)\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "25dbd184", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# Simple dummy language model for a quick perplexity demo.\n", 174 | "class DummyLM(torch.nn.Module):\n", 175 | " def __init__(self, V):\n", 176 | " super().__init__(); self.V = V\n", 177 | " def forward(self, x, targets=None):\n", 178 | " B, T = x.size(); logits = torch.zeros(B, T, self.V)\n", 179 | " loss = None\n", 180 | " if targets is not None:\n", 181 | " loss = torch.nn.functional.cross_entropy(\n", 182 | " logits.reshape(B*T, self.V), targets.reshape(B*T)\n", 183 | " )\n", 184 | " return logits, loss\n", 185 | "def perplexity(model, loader):\n", 186 | " \"\"\"Compute (H, exp(H)) over a loader of (x,y) pairs.\n", 187 | " \"\"\"\n", 188 | " total, tokens = 0.0, 0\n", 189 | " for x, y in loader:\n", 190 | " _, loss = model(x, targets=y)\n", 191 | " total += float(loss.detach().item()) * y.numel()\n", 192 | " tokens += int(y.numel())\n", 193 | " H = total / max(1, tokens)\n", 194 | " import math; return H, math.exp(H)\n", 195 | "V = 16; model = DummyLM(V)\n", 196 | "ids = torch.randint(0, V, (1, 128))\n", 197 | "class DS(torch.utils.data.Dataset):\n", 198 | " def __len__(self): return 64\n", 199 | " def __getitem__(self, i):\n", 200 | " x = ids[0, i:i+32]; y = ids[0, i+1:i+33]; return x, y\n", 201 | "dl = torch.utils.data.DataLoader(DS(), batch_size=16, drop_last=True)\n", 202 | "perplexity(model, dl)\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "81391417", 208 | "metadata": {}, 209 | "source": [ 210 | "## Exercises\n", 211 | "\n", 212 | "- Implement beam search and compare its outputs against nucleus sampling.\n", 213 | "- Add automated toxicity or bias checks using an available open-source detector.\n", 214 | "- Create a table summarizing how temperature and top-k interact across several prompts." 
215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "id": "0842cb66", 220 | "metadata": {}, 221 | "source": [ 222 | "" 223 | ] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "name": "python", 234 | "version": "3.10" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 5 239 | } 240 | -------------------------------------------------------------------------------- /notebooks/ch12_evaluation_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5af6d8c0", 6 | "metadata": {}, 7 | "source": [ 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "01bbcf1d", 14 | "metadata": {}, 15 | "source": [ 16 | "# Building a Large Language Model from Scratch — A Step-by-Step Guide Using Python and PyTorch\n", 17 | "## Chapter 12 — Evaluation Metrics Beyond Perplexity\n", 18 | "**© Dr. Yves J. Hilpisch**
AI-Powered by GPT-5." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "b72ebf49", 24 | "metadata": {}, 25 | "source": [ 26 | "## How to Use This Notebook\n", 27 | "\n", 28 | "- Compute both automatic and human-in-the-loop metrics for your model outputs.\n", 29 | "- Design lightweight evaluation datasets aligned with your deployment scenario.\n", 30 | "- Visualize metric trends to spot regressions early." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "76409cd8", 36 | "metadata": {}, 37 | "source": [ 38 | "### Roadmap\n", 39 | "\n", 40 | "We calculate perplexity, calibration metrics, and task-specific scores, then consolidate findings into a concise report." 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "573952d2", 46 | "metadata": {}, 47 | "source": [ 48 | "### Study Tips\n", 49 | "\n", 50 | "Treat metrics as decision-making tools. Note which ones actually influence your go/no-go criteria." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "bdfe12f0", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Imports and style\n", 61 | "import math\n", 62 | "from collections import Counter\n", 63 | "import matplotlib.pyplot as plt\n", 64 | "plt.style.use('seaborn-v0_8')\n", 65 | "%config InlineBackend.figure_format = 'svg'\n", 66 | "'ok'\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "c26bba4e", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# Tiny BLEU (corpus)\n", 77 | "def ngram_counts(tokens, n):\n", 78 | " return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1))\n", 79 | "def bleu_corpus(references, hypotheses, max_n=4, smooth=True):\n", 80 | " \"\"\"Educational BLEU with Add-1 smoothing and brevity penalty.\n", 81 | " references: list of list of token lists; hypotheses: list of token lists.\n", 82 | " Returns BLEU in [0,1].\n", 83 | " \"\"\"\n", 84 | " num, den = [0]*max_n, [0]*max_n; ref_len = 0; hyp_len = 0\n", 85 | " for refs, hyp in zip(references, hypotheses):\n", 86 | " hyp_len += len(hyp)\n", 87 | " rl = [len(r) for r in refs]\n", 88 | " ref_len += min(rl, key=lambda L: (abs(L-len(hyp)), L))\n", 89 | " for n in range(1, max_n+1):\n", 90 | " h = ngram_counts(hyp, n)\n", 91 | " m = Counter();\n", 92 | " for r in refs: m |= ngram_counts(r, n)\n", 93 | " overlap = {g: min(c, m.get(g,0)) for g,c in h.items()}\n", 94 | " num[n-1] += sum(overlap.values()); den[n-1] += max(1, sum(h.values()))\n", 95 | " prec = [((num[i]+1)/(den[i]+1) if smooth else (0 if den[i]==0 else num[i]/den[i]))\n", 96 | " for i in range(max_n)]\n", 97 | " gm = math.exp(sum((1/max_n)*math.log(max(p,1e-16)) for p in prec))\n", 98 | " bp = 1.0 if hyp_len > ref_len else math.exp(1 - ref_len/max(1,hyp_len))\n", 99 | " return bp*gm\n", 100 | "refs = [[\"the cat is on the mat\".split()]]\n", 101 | "hyp = [\"the cat sat on the mat\".split()]\n", 102 | "round(bleu_corpus(refs, hyp), 3)\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "af1380d8", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# ROUGE-L via LCS\n", 113 | "def lcs_length(a, b):\n", 114 | " dp = [0]*(len(b)+1)\n", 115 | " for i in range(1, len(a)+1):\n", 116 | " prev = 0\n", 117 | " for j in range(1, len(b)+1):\n", 118 | " tmp = dp[j]\n", 119 | " dp[j] = prev + 1 if a[i-1]==b[j-1] else max(dp[j], dp[j-1])\n", 120 | " prev = tmp\n", 121 | " return dp[len(b)]\n", 122 | "def rouge_l(references, hypotheses, beta=1.2):\n", 123 | 
" \"\"\"Average F-measure over best reference per example.\n", 124 | " \"\"\"\n", 125 | " scores = []\n", 126 | " for refs, hyp in zip(references, hypotheses):\n", 127 | " best = 0.0\n", 128 | " for r in refs:\n", 129 | " l = lcs_length(r, hyp)\n", 130 | " if l==0: continue\n", 131 | " p = l/max(1,len(hyp)); q = l/max(1,len(r))\n", 132 | " b2 = beta*beta; f = (1+b2)*p*q/max(b2*p+q,1e-12)\n", 133 | " best = max(best, f)\n", 134 | " scores.append(best)\n", 135 | " return sum(scores)/max(1,len(scores))\n", 136 | "refs = [[\"the cat is on the mat\".split()]]\n", 137 | "hyp = [\"the cat sat on the mat\".split()]\n", 138 | "round(rouge_l(refs, hyp), 3)\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "6143dad7", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Simplified METEOR: unigram F-mean with chunk penalty\n", 149 | "def matching_chunks(h, r):\n", 150 | " pos = {}\n", 151 | " for j,t in enumerate(r): pos.setdefault(t, []).append(j)\n", 152 | " matches, chunks, prev = 0, 0, None\n", 153 | " for t in h:\n", 154 | " if not pos.get(t): continue\n", 155 | " j = pos[t].pop(0); matches += 1\n", 156 | " if prev is None or j != prev+1: chunks += 1\n", 157 | " prev = j\n", 158 | " return matches, chunks\n", 159 | "def meteor_simple(references, hypotheses, alpha=0.9, beta=3.0, gamma=0.5):\n", 160 | " scores = []\n", 161 | " for refs, hyp in zip(references, hypotheses):\n", 162 | " best = 0.0\n", 163 | " for r in refs:\n", 164 | " hc, rc = Counter(hyp), Counter(r)\n", 165 | " overlap = sum(min(hc[t], rc[t]) for t in hc)\n", 166 | " P = overlap/max(1,len(hyp)); R = overlap/max(1,len(r))\n", 167 | " if P==0 or R==0: cand = 0.0\n", 168 | " else:\n", 169 | " Fm = (P*R)/max(alpha*P+(1-alpha)*R, 1e-12)\n", 170 | " m, ch = matching_chunks(hyp, r)\n", 171 | " pen = 0.0 if m==0 else gamma*((ch/m)**beta)\n", 172 | " cand = Fm*(1-pen)\n", 173 | " best = max(best, cand)\n", 174 | " scores.append(best)\n", 175 | " return sum(scores)/max(1,len(scores))\n", 176 | "refs = [[\"the cat is on the mat\".split()]]\n", 177 | "hyp = [\"the cat sat on the mat\".split()]\n", 178 | "round(meteor_simple(refs, hyp), 3)\n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "be9c3ccd", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# Diversity: distinct-1 / distinct-2\n", 189 | "def distinct_n(hyps, n=1):\n", 190 | " grams, total = Counter(), 0\n", 191 | " for h in hyps:\n", 192 | " c = Counter(tuple(h[i:i+n]) for i in range(len(h)-n+1))\n", 193 | " grams.update(c); total += sum(c.values())\n", 194 | " return 0.0 if total==0 else len(grams)/total\n", 195 | "hyps = [\"the cat sat on the mat\".split(),\n", 196 | " \"the cat sat on the mat\".split()]\n", 197 | "round(distinct_n(hyps,1),3), round(distinct_n(hyps,2),3)\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "eb58b9f8", 203 | "metadata": {}, 204 | "source": [ 205 | "## Exercises\n", 206 | "\n", 207 | "- Add a calibration plot (reliability diagram) for your model’s probability outputs.\n", 208 | "- Design a qualitative evaluation rubric and capture results from two reviewers.\n", 209 | "- Implement a simple regression test that fails when a critical metric drops below a threshold." 
210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "917d3b6d", 215 | "metadata": {}, 216 | "source": [ 217 | "" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "name": "python", 229 | "version": "3.10" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 5 234 | } 235 | --------------------------------------------------------------------------------