├── .gitignore
├── 01_intro
└── 01_introduction.pdf
├── 02_data
├── 02.ipynb
├── figures
│ ├── 01.png
│ ├── 02.png
│ ├── 03.png
│ ├── 04.png
│ ├── 05.png
│ ├── 06.png
│ ├── 07.png
│ ├── 08.png
│ ├── 09.png
│ ├── 10.png
│ ├── 11.png
│ ├── 12.png
│ ├── 13.png
│ ├── 14.png
│ └── 15.png
├── supplementary.py
└── the-verdict.txt
├── 03_architecture
├── 03.ipynb
├── figures
│ ├── 01.png
│ ├── 02.png
│ ├── 03.png
│ ├── 04.png
│ ├── 05.png
│ ├── 06.png
│ ├── 07.png
│ ├── 08.png
│ ├── 09.png
│ └── 10.png
└── supplementary.py
├── 04_pretraining
├── 04.ipynb
├── figures
│ ├── 01.png
│ ├── 02.png
│ ├── 03.png
│ └── 04.png
├── supplementary.py
└── the-verdict.txt
├── 05_weightloading
├── 05_part-1.ipynb
├── 05_part-2.ipynb
├── figures
│ ├── 01.png
│ └── 02.png
├── gpt_download.py
└── supplementary.py
├── 06_finetuning
├── 06_part-1.ipynb
├── 06_part-2.ipynb
├── 06_part-3.ipynb
├── 06_part-4.ipynb
├── figures
│ ├── 01.png
│ ├── 02.png
│ ├── 03.png
│ ├── 04.png
│ ├── 05.png
│ ├── 06.png
│ ├── 07.png
│ ├── 08.png
│ ├── 09.png
│ ├── 10.png
│ ├── 11.png
│ ├── 12.png
│ ├── 13.png
│ ├── 14.png
│ └── 15.png
└── instruction-data.json
├── 07_outro
└── 07_conclusion.pdf
├── LICENSE.txt
├── README.md
├── requirements.txt
└── setup
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Slide files
2 | *.key
3 |
4 | # Solution files
5 | *_solution*
6 | loss-plot.pdf
7 | test_response_before.json
8 | test_response_before_after.json
9 | test.json
10 | train.json
11 |
12 | # Model weights
13 | 124M/*
14 | model.pth
15 | checkpoint
16 | encoder.json
17 | hparams.json
18 | model.ckpt.data-00000-of-00001
19 | model.ckpt.index
20 | model.ckpt.meta
21 | vocab.bpe
22 |
23 | # OS and temporary files
24 | .DS_Store
25 | *.pyc
26 |
27 | # Byte-compiled / optimized / DLL files
28 | __pycache__/
29 | *.py[cod]
30 | *$py.class
31 |
32 | # C extensions
33 | *.so
34 |
35 | # Distribution / packaging
36 | .Python
37 | build/
38 | develop-eggs/
39 | dist/
40 | downloads/
41 | eggs/
42 | .eggs/
43 | lib/
44 | lib64/
45 | parts/
46 | sdist/
47 | var/
48 | wheels/
49 | share/python-wheels/
50 | *.egg-info/
51 | .installed.cfg
52 | *.egg
53 | MANIFEST
54 |
55 | # PyInstaller
56 | # Usually these files are written by a python script from a template
57 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
58 | *.manifest
59 | *.spec
60 |
61 | # Installer logs
62 | pip-log.txt
63 | pip-delete-this-directory.txt
64 |
65 | # Unit test / coverage reports
66 | htmlcov/
67 | .tox/
68 | .nox/
69 | .coverage
70 | .coverage.*
71 | .cache
72 | nosetests.xml
73 | coverage.xml
74 | *.cover
75 | *.py,cover
76 | .hypothesis/
77 | .pytest_cache/
78 | cover/
79 |
80 | # Translations
81 | *.mo
82 | *.pot
83 |
84 | # Django stuff:
85 | *.log
86 | local_settings.py
87 | db.sqlite3
88 | db.sqlite3-journal
89 |
90 | # Flask stuff:
91 | instance/
92 | .webassets-cache
93 |
94 | # Scrapy stuff:
95 | .scrapy
96 |
97 | # Sphinx documentation
98 | docs/_build/
99 |
100 | # PyBuilder
101 | .pybuilder/
102 | target/
103 |
104 | # Jupyter Notebook
105 | .ipynb_checkpoints
106 |
107 | # IPython
108 | profile_default/
109 | ipython_config.py
110 |
111 | # pyenv
112 | # For a library or package, you might want to ignore these files since the code is
113 | # intended to run in multiple environments; otherwise, check them in:
114 | # .python-version
115 |
116 | # pipenv
117 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
118 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
119 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
120 | # install all needed dependencies.
121 | #Pipfile.lock
122 |
123 | # poetry
124 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
125 | # This is especially recommended for binary packages to ensure reproducibility, and is more
126 | # commonly ignored for libraries.
127 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
128 | #poetry.lock
129 |
130 | # pdm
131 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
132 | #pdm.lock
133 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
134 | # in version control.
135 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
136 | .pdm.toml
137 | .pdm-python
138 | .pdm-build/
139 |
140 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
141 | __pypackages__/
142 |
143 | # Celery stuff
144 | celerybeat-schedule
145 | celerybeat.pid
146 |
147 | # SageMath parsed files
148 | *.sage.py
149 |
150 | # Environments
151 | .env
152 | .venv
153 | env/
154 | venv/
155 | ENV/
156 | env.bak/
157 | venv.bak/
158 |
159 | # Spyder project settings
160 | .spyderproject
161 | .spyproject
162 |
163 | # Rope project settings
164 | .ropeproject
165 |
166 | # mkdocs documentation
167 | /site
168 |
169 | # mypy
170 | .mypy_cache/
171 | .dmypy.json
172 | dmypy.json
173 |
174 | # Pyre type checker
175 | .pyre/
176 |
177 | # pytype static type analyzer
178 | .pytype/
179 |
180 | # Cython debug symbols
181 | cython_debug/
182 |
183 | # PyCharm
184 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
185 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
186 | # and can be added to the global gitignore or merged into this file. For a more nuclear
187 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
188 | #.idea/
189 |
--------------------------------------------------------------------------------
/01_intro/01_introduction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/01_intro/01_introduction.pdf
--------------------------------------------------------------------------------
/02_data/02.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "d95f841a-63c9-41d4-aea1-496b3d2024dd",
6 | "metadata": {},
7 | "source": [
8 | "**LLM Workshop 2024 by Sebastian Raschka**\n",
9 | "\n",
10 | "This code is based on *Build a Large Language Model (From Scratch)*, [https://github.com/rasbt/LLMs-from-scratch](https://github.com/rasbt/LLMs-from-scratch)"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "25aa40e3-5109-433f-9153-f5770531fe94",
16 | "metadata": {},
17 | "source": [
18 | "<br>\n",
19 | "<br>\n",
20 | "<br>\n",
21 | "<br>\n",
22 | "\n",
23 | "# 2) Understanding LLM Input Data"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "76d5d2c0-cba8-404e-9bf3-71a218cae3cf",
29 | "metadata": {},
30 | "source": [
31 | "Packages that are being used in this notebook:"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "4d1305cf-12d5-46fe-a2c9-36fb71c5b3d3",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from importlib.metadata import version\n",
42 | "\n",
43 | "\n",
44 | "print(\"torch version:\", version(\"torch\"))\n",
45 | "print(\"tiktoken version:\", version(\"tiktoken\"))"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "5a42fbfd-e3c2-43c2-bc12-f5f870a0b10a",
51 | "metadata": {},
52 | "source": [
53 | "- This notebook provides a brief overview of the data preparation and sampling procedures to get input data \"ready\" for an LLM\n",
54 | "- Understanding what the input data looks like is a great first step towards understanding how LLMs work"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "id": "628b2922-594d-4ff9-bd82-04f1ebdf41f5",
60 | "metadata": {},
61 | "source": [
62 | ""
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "id": "eddbb984-8d23-40c5-bbfa-c3c379e7eec3",
68 | "metadata": {},
69 | "source": [
70 | "<br>\n",
71 | "<br>\n",
72 | "<br>\n",
73 | "<br>\n",
74 | "\n",
75 | "# 2.1 Tokenizing text"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "id": "f9c90731-7dc9-4cd3-8c4a-488e33b48e80",
81 | "metadata": {},
82 | "source": [
83 | "- In this section, we tokenize text, which means breaking text into smaller units, such as individual words and punctuation characters"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "id": "09872fdb-9d4e-40c4-949d-52a01a43ec4b",
89 | "metadata": {},
90 | "source": [
91 | ""
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "id": "8cceaa18-833d-46b6-b211-b20c53902805",
97 | "metadata": {},
98 | "source": [
99 | "- Load raw text we want to work with\n",
100 | "- [The Verdict by Edith Wharton](https://en.wikisource.org/wiki/The_Verdict) is a public domain short story"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "id": "8a769e87-470a-48b9-8bdb-12841b416198",
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
111 | " raw_text = f.read()\n",
112 | " \n",
113 | "print(\"Total number of character:\", len(raw_text))\n",
114 | "print(raw_text[:99])"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "id": "9b971a46-ac03-4368-88ae-3f20279e8f4e",
120 | "metadata": {},
121 | "source": [
122 | "- The goal is to tokenize and embed this text for an LLM\n",
123 | "- Let's develop a simple tokenizer based on some simple sample text that we can then later apply to the text above"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "id": "6cbe9330-b587-4262-be9f-497a84ec0e8a",
129 | "metadata": {},
130 | "source": [
131 | "
"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "id": "3daa1687-2c08-485a-87cc-a93c2f9586d7",
137 | "metadata": {},
138 | "source": [
139 | "- The following regular expression will split on whitespaces and punctuation"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "id": "737dd5b0-9dbb-4a97-9ae4-3482c8c04be7",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "import re\n",
150 | "\n",
151 | "preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', raw_text)\n",
152 | "preprocessed = [item for item in preprocessed if item]\n",
153 | "print(preprocessed[:38])"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "id": "35db7b5e-510b-4c45-995f-f5ad64a8e19c",
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "print(\"Number of tokens:\", len(preprocessed))"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "id": "0b5ce8fe-3a07-4f2a-90f1-a0321ce3a231",
169 | "metadata": {},
170 | "source": [
171 | "<br>\n",
172 | "<br>\n",
173 | "<br>\n",
174 | "<br>\n",
175 | "\n",
176 | "# 2.2 Converting tokens into token IDs"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "id": "a5204973-f414-4c0d-87b0-cfec1f06e6ff",
182 | "metadata": {},
183 | "source": [
184 | "- Next, we convert the text tokens into token IDs that we can process via embedding layers later\n",
185 | "- For this we first need to build a vocabulary"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "id": "177b041d-f739-43b8-bd81-0443ae3a7f8d",
191 | "metadata": {},
192 | "source": [
193 | ""
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "id": "8eeade64-037b-4b59-9039-d3b000ef8886",
199 | "metadata": {},
200 | "source": [
201 | "- The vocabulary contains the unique words in the input text"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "id": "7fdf0533-5ab6-42a5-83fa-a3b045de6396",
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "all_words = sorted(set(preprocessed))\n",
212 | "vocab_size = len(all_words)\n",
213 | "\n",
214 | "print(vocab_size)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "id": "77d00d96-881f-4691-bb03-84fec2a75a26",
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "vocab = {token:integer for integer,token in enumerate(all_words)}"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "id": "75bd1f81-3a8f-4dd9-9dd6-e75f32dacbe3",
230 | "metadata": {},
231 | "source": [
232 | "- Below are the first 50 entries in this vocabulary:"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "id": "e1c5de4a-aa4e-4aec-b532-10bb364039d6",
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "for i, item in enumerate(vocab.items()):\n",
243 | " print(item)\n",
244 | " if i >= 50:\n",
245 | " break"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "id": "3b1dc314-351b-476a-9459-0ec9ddc29b19",
251 | "metadata": {},
252 | "source": [
253 | "- Below, we illustrate the tokenization of a short sample text using a small vocabulary:"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "id": "67407a9f-0202-4e7c-9ed7-1b3154191ebc",
259 | "metadata": {},
260 | "source": [
261 | "
"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "id": "4e569647-2589-4c9d-9a5c-aef1c88a0a9a",
267 | "metadata": {},
268 | "source": [
269 | "- Let's now put it all together into a tokenizer class"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "id": "f531bf46-7c25-4ef8-bff8-0d27518676d5",
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "class SimpleTokenizerV1:\n",
280 | " def __init__(self, vocab):\n",
281 | " self.str_to_int = vocab\n",
282 | " self.int_to_str = {i:s for s,i in vocab.items()}\n",
283 | " \n",
284 | " def encode(self, text):\n",
285 | " preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n",
286 | " preprocessed = [\n",
287 | " item.strip() for item in preprocessed if item.strip()\n",
288 | " ]\n",
289 | " ids = [self.str_to_int[s] for s in preprocessed]\n",
290 | " return ids\n",
291 | " \n",
292 | " def decode(self, ids):\n",
293 | " text = \" \".join([self.int_to_str[i] for i in ids])\n",
294 | " # Replace spaces before the specified punctuations\n",
295 | " text = re.sub(r'\\s+([,.?!\"()\\'])', r'\\1', text)\n",
296 | " return text"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "id": "dee7a1e5-b54f-4ca1-87ef-3d663c4ee1e7",
302 | "metadata": {},
303 | "source": [
304 | "- The `encode` function turns text into token IDs\n",
305 | "- The `decode` function turns token IDs back into text"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "id": "cc21d347-ec03-4823-b3d4-9d686e495617",
311 | "metadata": {},
312 | "source": [
313 | "
"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "id": "c2950a94-6b0d-474e-8ed0-66d0c3c1a95c",
319 | "metadata": {},
320 | "source": [
321 | "- We can use the tokenizer to encode (that is, tokenize) texts into integers\n",
322 | "- These integers can then be embedded (later) as input of/for the LLM"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "id": "647364ec-7995-4654-9b4a-7607ccf5f1e4",
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "tokenizer = SimpleTokenizerV1(vocab)\n",
333 | "\n",
334 | "text = \"\"\"\"It's the last he painted, you know,\" \n",
335 | " Mrs. Gisburn said with pardonable pride.\"\"\"\n",
336 | "ids = tokenizer.encode(text)\n",
337 | "print(ids)"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "id": "3201706e-a487-4b60-b99d-5765865f29a0",
343 | "metadata": {},
344 | "source": [
345 | "- We can decode the integers back into text"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "id": "01d8c8fb-432d-4a49-b332-99f23b233746",
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "tokenizer.decode(ids)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "id": "54f6aa8b-9827-412e-9035-e827296ab0fe",
362 | "metadata": {},
363 | "outputs": [],
364 | "source": [
365 | "tokenizer.decode(tokenizer.encode(text))"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "id": "5c4ba34b-170f-4e71-939b-77aabb776f14",
371 | "metadata": {},
372 | "source": [
373 | "<br>\n",
374 | "<br>\n",
375 | "<br>\n",
376 | "<br>\n",
377 | "\n",
378 | "# 2.3 BytePair encoding"
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "id": "2309494c-79cf-4a2d-bc28-a94d602f050e",
384 | "metadata": {},
385 | "source": [
386 | "- GPT-2 used BytePair encoding (BPE) as its tokenizer\n",
387 | "- It allows the model to break down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words\n",
388 | "- For instance, if GPT-2's vocabulary doesn't have the word \"unfamiliarword,\" it might tokenize it as [\"unfam\", \"iliar\", \"word\"] or some other subword breakdown, depending on its trained BPE merges\n",
389 | "- The original BPE tokenizer can be found here: [https://github.com/openai/gpt-2/blob/master/src/encoder.py](https://github.com/openai/gpt-2/blob/master/src/encoder.py)\n",
390 | "- In this lecture, we are using the BPE tokenizer from OpenAI's open-source [tiktoken](https://github.com/openai/tiktoken) library, which implements its core algorithms in Rust to improve computational performance\n",
391 | "- (Based on an analysis [here](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb), I found that `tiktoken` is approx. 3x faster than the original tokenizer and 6x faster than an equivalent tokenizer in Hugging Face)"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "id": "ede1d41f-934b-4bf4-8184-54394a257a94",
398 | "metadata": {},
399 | "outputs": [],
400 | "source": [
401 | "# pip install tiktoken"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "id": "48967a77-7d17-42bf-9e92-fc619d63a59e",
408 | "metadata": {},
409 | "outputs": [],
410 | "source": [
411 | "import importlib\n",
412 | "import tiktoken\n",
413 | "\n",
414 | "print(\"tiktoken version:\", importlib.metadata.version(\"tiktoken\"))"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128",
421 | "metadata": {},
422 | "outputs": [],
423 | "source": [
424 | "tokenizer = tiktoken.get_encoding(\"gpt2\")"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": null,
430 | "id": "5ff2cd85-7cfb-4325-b390-219938589428",
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "text = (\n",
435 | " \"Hello, do you like tea? <|endoftext|> In the sunlit terraces\"\n",
436 | " \"of someunknownPlace.\"\n",
437 | ")\n",
438 | "\n",
439 | "integers = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
440 | "\n",
441 | "print(integers)"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab",
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "strings = tokenizer.decode(integers)\n",
452 | "\n",
453 | "print(strings)"
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "id": "e8c2e7b4-6a22-42aa-8e4d-901f06378d4a",
459 | "metadata": {},
460 | "source": [
461 | "- BPE tokenizers break down unknown words into subwords and individual characters:"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "id": "c082d41f-33d7-4827-97d8-993d5a84bb3c",
467 | "metadata": {},
468 | "source": [
469 | ""
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": null,
475 | "id": "0beb27ee-1156-457c-839e-eebb48d94d0e",
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "tokenizer.encode(\"Akwirw ier\", allowed_special={\"<|endoftext|>\"})"
480 | ]
481 | },
482 | {
483 | "cell_type": "markdown",
484 | "id": "abbd7c0d-70f8-4386-a114-907e96c950b0",
485 | "metadata": {},
486 | "source": [
487 | "<br>\n",
488 | "<br>\n",
489 | "<br>\n",
490 | "<br>\n",
491 | "\n",
492 | "# 2.4 Data sampling with a sliding window"
493 | ]
494 | },
495 | {
496 | "cell_type": "markdown",
497 | "id": "509d9826-6384-462e-aa8a-a7c73cd6aad0",
498 | "metadata": {},
499 | "source": [
500 | "- Above, we took care of the tokenization (converting text into word tokens represented as token ID numbers)\n",
501 | "- Now, let's talk about how we create the data loading for LLMs\n",
502 | "- We train LLMs to generate one word at a time, so we want to prepare the training data accordingly where the next word in a sequence represents the target to predict"
503 | ]
504 | },
505 | {
506 | "cell_type": "markdown",
507 | "id": "39fb44f4-0c43-4a6a-9c2f-9cf31452354c",
508 | "metadata": {},
509 | "source": [
510 | ""
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "id": "0c9a3d50-885b-49bc-b791-9f5cc8bc7b7c",
516 | "metadata": {},
517 | "source": [
518 | "- For this, we use a sliding window approach, changing the position by +1:\n",
519 | "\n",
520 | "
"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "id": "b006212f-de45-468d-bdee-5806216d1679",
526 | "metadata": {},
527 | "source": [
528 | "- Note that in practice it's best to set the stride equal to the context length so that we don't have overlaps between the inputs (the targets are still shifted by +1 always)"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "id": "9cb467e0-bdcd-4dda-b9b0-a738c5d33ac3",
534 | "metadata": {},
535 | "source": [
536 | "
"
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": null,
542 | "id": "fb55f51a",
543 | "metadata": {},
544 | "outputs": [],
545 | "source": [
546 | "from supplementary import create_dataloader_v1\n",
547 | "\n",
548 | "\n",
549 | "dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)\n",
550 | "\n",
551 | "data_iter = iter(dataloader)\n",
552 | "inputs, targets = next(data_iter)\n",
553 | "print(\"Inputs:\\n\", inputs)\n",
554 | "print(\"\\nTargets:\\n\", targets)"
555 | ]
556 | },
557 | {
558 | "cell_type": "markdown",
559 | "id": "2dc671fb-6945-4594-b33f-8b462a69720d",
560 | "metadata": {},
561 | "source": [
562 | "<br>\n",
563 | "<br>\n",
564 | "<br>\n",
565 | "<br>\n",
566 | "\n",
567 | "# Exercise: Prepare your own favorite text dataset"
568 | ]
569 | }
570 | ],
571 | "metadata": {
572 | "kernelspec": {
573 | "display_name": "Python 3 (ipykernel)",
574 | "language": "python",
575 | "name": "python3"
576 | },
577 | "language_info": {
578 | "codemirror_mode": {
579 | "name": "ipython",
580 | "version": 3
581 | },
582 | "file_extension": ".py",
583 | "mimetype": "text/x-python",
584 | "name": "python",
585 | "nbconvert_exporter": "python",
586 | "pygments_lexer": "ipython3",
587 | "version": "3.10.10"
588 | }
589 | },
590 | "nbformat": 4,
591 | "nbformat_minor": 5
592 | }
593 |
--------------------------------------------------------------------------------
/02_data/figures/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/01.png
--------------------------------------------------------------------------------
/02_data/figures/02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/02.png
--------------------------------------------------------------------------------
/02_data/figures/03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/03.png
--------------------------------------------------------------------------------
/02_data/figures/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/04.png
--------------------------------------------------------------------------------
/02_data/figures/05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/05.png
--------------------------------------------------------------------------------
/02_data/figures/06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/06.png
--------------------------------------------------------------------------------
/02_data/figures/07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/07.png
--------------------------------------------------------------------------------
/02_data/figures/08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/08.png
--------------------------------------------------------------------------------
/02_data/figures/09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/09.png
--------------------------------------------------------------------------------
/02_data/figures/10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/10.png
--------------------------------------------------------------------------------
/02_data/figures/11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/11.png
--------------------------------------------------------------------------------
/02_data/figures/12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/12.png
--------------------------------------------------------------------------------
/02_data/figures/13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/13.png
--------------------------------------------------------------------------------
/02_data/figures/14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/14.png
--------------------------------------------------------------------------------
/02_data/figures/15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/02_data/figures/15.png
--------------------------------------------------------------------------------
/02_data/supplementary.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
2 | # Source for "Build a Large Language Model From Scratch"
3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch
4 | # Code: https://github.com/rasbt/LLMs-from-scratch
5 |
6 | import torch
7 | import tiktoken
8 | from torch.utils.data import Dataset, DataLoader
9 |
10 |
class GPTDatasetV1(Dataset):
    """Dataset of (input, target) token-ID windows cut from one text.

    The text is tokenized once, then sliced into windows of ``max_length``
    token IDs whose start positions advance by ``stride``. Each target
    window is the matching input window shifted one position to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``
            that returns a sequence of token IDs.
        max_length: Number of token IDs in every input/target window.
        stride: Distance, in tokens, between consecutive window starts.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the whole text in one pass; "<|endoftext|>" is allowed
        # through as a special token.
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A start position is only valid if one extra token is available
        # past the window for the shifted target, hence the upper bound
        # of len(encoded) - max_length.
        starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[begin:begin + max_length])
            for begin in starts
        ]
        self.target_ids = [
            torch.tensor(encoded[begin + 1:begin + max_length + 1])
            for begin in starts
        ]

    def __len__(self):
        """Return the number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets
31 |
32 |
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` over sliding-window token chunks of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``, which yields (input, target) token-ID windows.

    Args:
        txt: Raw text to tokenize and chunk.
        batch_size: Passed through to ``DataLoader``.
        max_length: Number of token IDs per window.
        stride: Distance, in tokens, between consecutive window starts.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the resulting ``GPTDatasetV1``.
    """
    # The dataset tokenizes the full text and slices it into windows.
    windows = GPTDatasetV1(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )

    # Batching options are forwarded to DataLoader unchanged.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)
--------------------------------------------------------------------------------
/02_data/the-verdict.txt:
--------------------------------------------------------------------------------
1 | I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)
2 |
3 | "The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?
4 |
5 | Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him. Among his own sex fewer regrets were heard, and in his own trade hardly a murmur. Professional jealousy? Perhaps. If it were, the honour of the craft was vindicated by little Claude Nutley, who, in all good faith, brought out in the Burlington a very handsome "obituary" on Jack--one of those showy articles stocked with random technicalities that I have heard (I won't say by whom) compared to Gisburn's painting. And so--his resolve being apparently irrevocable--the discussion gradually died out, and, as Mrs. Thwing had predicted, the price of "Gisburns" went up.
6 |
7 | It was not till three years later that, in the course of a few weeks' idling on the Riviera, it suddenly occurred to me to wonder why Gisburn had given up his painting. On reflection, it really was a tempting problem. To accuse his wife would have been too easy--his fair sitters had been denied the solace of saying that Mrs. Gisburn had "dragged him down." For Mrs. Gisburn--as such--had not existed till nearly a year after Jack's resolve had been taken. It might be that he had married her--since he liked his ease--because he didn't want to go on painting; but it would have been hard to prove that he had given up his painting because he had married her.
8 |
9 | Of course, if she had not dragged him down, she had equally, as Miss Croft contended, failed to "lift him up"--she had not led him back to the easel. To put the brush into his hand again--what a vocation for a wife! But Mrs. Gisburn appeared to have disdained it--and I felt it might be interesting to find out why.
10 |
11 | The desultory life of the Riviera lends itself to such purely academic speculations; and having, on my way to Monte Carlo, caught a glimpse of Jack's balustraded terraces between the pines, I had myself borne thither the next day.
12 |
13 | I found the couple at tea beneath their palm-trees; and Mrs. Gisburn's welcome was so genial that, in the ensuing weeks, I claimed it frequently. It was not that my hostess was "interesting": on that point I could have given Miss Croft the fullest reassurance. It was just because she was _not_ interesting--if I may be pardoned the bull--that I found her so. For Jack, all his life, had been surrounded by interesting women: they had fostered his art, it had been reared in the hot-house of their adulation. And it was therefore instructive to note what effect the "deadening atmosphere of mediocrity" (I quote Miss Croft) was having on him.
14 |
15 | I have mentioned that Mrs. Gisburn was rich; and it was immediately perceptible that her husband was extracting from this circumstance a delicate but substantial satisfaction. It is, as a rule, the people who scorn money who get most out of it; and Jack's elegant disdain of his wife's big balance enabled him, with an appearance of perfect good-breeding, to transmute it into objects of art and luxury. To the latter, I must add, he remained relatively indifferent; but he was buying Renaissance bronzes and eighteenth-century pictures with a discrimination that bespoke the amplest resources.
16 |
17 | "Money's only excuse is to put beauty into circulation," was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed luncheon-table, when, on a later day, I had again run over from Monte Carlo; and Mrs. Gisburn, beaming on him, added for my enlightenment: "Jack is so morbidly sensitive to every form of beauty."
18 |
19 | Poor Jack! It had always been his fate to have women say such things of him: the fact should be set down in extenuation. What struck me now was that, for the first time, he resented the tone. I had seen him, so often, basking under similar tributes--was it the conjugal note that robbed them of their savour? No--for, oddly enough, it became apparent that he was fond of Mrs. Gisburn--fond enough not to see her absurdity. It was his own absurdity he seemed to be wincing under--his own attitude as an object for garlands and incense.
20 |
21 | "My dear, since I've chucked painting people don't say that stuff about me--they say it about Victor Grindle," was his only protest, as he rose from the table and strolled out onto the sunlit terrace.
22 |
23 | I glanced after him, struck by his last word. Victor Grindle was, in fact, becoming the man of the moment--as Jack himself, one might put it, had been the man of the hour. The younger artist was said to have formed himself at my friend's feet, and I wondered if a tinge of jealousy underlay the latter's mysterious abdication. But no--for it was not till after that event that the _rose Dubarry_ drawing-rooms had begun to display their "Grindles."
24 |
25 | I turned to Mrs. Gisburn, who had lingered to give a lump of sugar to her spaniel in the dining-room.
26 |
27 | "Why _has_ he chucked painting?" I asked abruptly.
28 |
29 | She raised her eyebrows with a hint of good-humoured surprise.
30 |
31 | "Oh, he doesn't _have_ to now, you know; and I want him to enjoy himself," she said quite simply.
32 |
33 | I looked about the spacious white-panelled room, with its _famille-verte_ vases repeating the tones of the pale damask curtains, and its eighteenth-century pastels in delicate faded frames.
34 |
35 | "Has he chucked his pictures too? I haven't seen a single one in the house."
36 |
37 | A slight shade of constraint crossed Mrs. Gisburn's open countenance. "It's his ridiculous modesty, you know. He says they're not fit to have about; he's sent them all away except one--my portrait--and that I have to keep upstairs."
38 |
39 | His ridiculous modesty--Jack's modesty about his pictures? My curiosity was growing like the bean-stalk. I said persuasively to my hostess: "I must really see your portrait, you know."
40 |
41 | She glanced out almost timorously at the terrace where her husband, lounging in a hooded chair, had lit a cigar and drawn the Russian deerhound's head between his knees.
42 |
43 | "Well, come while he's not looking," she said, with a laugh that tried to hide her nervousness; and I followed her between the marble Emperors of the hall, and up the wide stairs with terra-cotta nymphs poised among flowers at each landing.
44 |
45 | In the dimmest corner of her boudoir, amid a profusion of delicate and distinguished objects, hung one of the familiar oval canvases, in the inevitable garlanded frame. The mere outline of the frame called up all Gisburn's past!
46 |
47 | Mrs. Gisburn drew back the window-curtains, moved aside a _jardiniere_ full of pink azaleas, pushed an arm-chair away, and said: "If you stand here you can just manage to see it. I had it over the mantel-piece, but he wouldn't let it stay."
48 |
49 | Yes--I could just manage to see it--the first portrait of Jack's I had ever had to strain my eyes over! Usually they had the place of honour--say the central panel in a pale yellow or _rose Dubarry_ drawing-room, or a monumental easel placed so that it took the light through curtains of old Venetian point. The more modest place became the picture better; yet, as my eyes grew accustomed to the half-light, all the characteristic qualities came out--all the hesitations disguised as audacities, the tricks of prestidigitation by which, with such consummate skill, he managed to divert attention from the real business of the picture to some pretty irrelevance of detail. Mrs. Gisburn, presenting a neutral surface to work on--forming, as it were, so inevitably the background of her own picture--had lent herself in an unusual degree to the display of this false virtuosity. The picture was one of Jack's "strongest," as his admirers would have put it--it represented, on his part, a swelling of muscles, a congesting of veins, a balancing, straddling and straining, that reminded one of the circus-clown's ironic efforts to lift a feather. It met, in short, at every point the demand of lovely woman to be painted "strongly" because she was tired of being painted "sweetly"--and yet not to lose an atom of the sweetness.
50 |
51 | "It's the last he painted, you know," Mrs. Gisburn said with pardonable pride. "The last but one," she corrected herself--"but the other doesn't count, because he destroyed it."
52 |
53 | "Destroyed it?" I was about to follow up this clue when I heard a footstep and saw Jack himself on the threshold.
54 |
55 | As he stood there, his hands in the pockets of his velveteen coat, the thin brown waves of hair pushed back from his white forehead, his lean sunburnt cheeks furrowed by a smile that lifted the tips of a self-confident moustache, I felt to what a degree he had the same quality as his pictures--the quality of looking cleverer than he was.
56 |
57 | His wife glanced at him deprecatingly, but his eyes travelled past her to the portrait.
58 |
59 | "Mr. Rickham wanted to see it," she began, as if excusing herself. He shrugged his shoulders, still smiling.
60 |
61 | "Oh, Rickham found me out long ago," he said lightly; then, passing his arm through mine: "Come and see the rest of the house."
62 |
63 | He showed it to me with a kind of naive suburban pride: the bath-rooms, the speaking-tubes, the dress-closets, the trouser-presses--all the complex simplifications of the millionaire's domestic economy. And whenever my wonder paid the expected tribute he said, throwing out his chest a little: "Yes, I really don't see how people manage to live without that."
64 |
65 | Well--it was just the end one might have foreseen for him. Only he was, through it all and in spite of it all--as he had been through, and in spite of, his pictures--so handsome, so charming, so disarming, that one longed to cry out: "Be dissatisfied with your leisure!" as once one had longed to say: "Be dissatisfied with your work!"
66 |
67 | But, with the cry on my lips, my diagnosis suffered an unexpected check.
68 |
69 | "This is my own lair," he said, leading me into a dark plain room at the end of the florid vista. It was square and brown and leathery: no "effects"; no bric-a-brac, none of the air of posing for reproduction in a picture weekly--above all, no least sign of ever having been used as a studio.
70 |
71 | The fact brought home to me the absolute finality of Jack's break with his old life.
72 |
73 | "Don't you ever dabble with paint any more?" I asked, still looking about for a trace of such activity.
74 |
75 | "Never," he said briefly.
76 |
77 | "Or water-colour--or etching?"
78 |
79 | His confident eyes grew dim, and his cheeks paled a little under their handsome sunburn.
80 |
81 | "Never think of it, my dear fellow--any more than if I'd never touched a brush."
82 |
83 | And his tone told me in a flash that he never thought of anything else.
84 |
85 | I moved away, instinctively embarrassed by my unexpected discovery; and as I turned, my eye fell on a small picture above the mantel-piece--the only object breaking the plain oak panelling of the room.
86 |
87 | "Oh, by Jove!" I said.
88 |
89 | It was a sketch of a donkey--an old tired donkey, standing in the rain under a wall.
90 |
91 | "By Jove--a Stroud!" I cried.
92 |
93 | He was silent; but I felt him close behind me, breathing a little quickly.
94 |
95 | "What a wonder! Made with a dozen lines--but on everlasting foundations. You lucky chap, where did you get it?"
96 |
97 | He answered slowly: "Mrs. Stroud gave it to me."
98 |
99 | "Ah--I didn't know you even knew the Strouds. He was such an inflexible hermit."
100 |
101 | "I didn't--till after. . . . She sent for me to paint him when he was dead."
102 |
103 | "When he was dead? You?"
104 |
105 | I must have let a little too much amazement escape through my surprise, for he answered with a deprecating laugh: "Yes--she's an awful simpleton, you know, Mrs. Stroud. Her only idea was to have him done by a fashionable painter--ah, poor Stroud! She thought it the surest way of proclaiming his greatness--of forcing it on a purblind public. And at the moment I was _the_ fashionable painter."
106 |
107 | "Ah, poor Stroud--as you say. Was _that_ his history?"
108 |
109 | "That was his history. She believed in him, gloried in him--or thought she did. But she couldn't bear not to have all the drawing-rooms with her. She couldn't bear the fact that, on varnishing days, one could always get near enough to see his pictures. Poor woman! She's just a fragment groping for other fragments. Stroud is the only whole I ever knew."
110 |
111 | "You ever knew? But you just said--"
112 |
113 | Gisburn had a curious smile in his eyes.
114 |
115 | "Oh, I knew him, and he knew me--only it happened after he was dead."
116 |
117 | I dropped my voice instinctively. "When she sent for you?"
118 |
119 | "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"
120 |
121 | He laughed again, and threw back his head to look up at the sketch of the donkey. "There were days when I couldn't look at that thing--couldn't face it. But I forced myself to put it here; and now it's cured me--cured me. That's the reason why I don't dabble any more, my dear Rickham; or rather Stroud himself is the reason."
122 |
123 | For the first time my idle curiosity about my companion turned into a serious desire to understand him better.
124 |
125 | "I wish you'd tell me how it happened," I said.
126 |
127 | He stood looking up at the sketch, and twirling between his fingers a cigarette he had forgotten to light. Suddenly he turned toward me.
128 |
129 | "I'd rather like to tell you--because I've always suspected you of loathing my work."
130 |
131 | I made a deprecating gesture, which he negatived with a good-humoured shrug.
132 |
133 | "Oh, I didn't care a straw when I believed in myself--and now it's an added tie between us!"
134 |
135 | He laughed slightly, without bitterness, and pushed one of the deep arm-chairs forward. "There: make yourself comfortable--and here are the cigars you like."
136 |
137 | He placed them at my elbow and continued to wander up and down the room, stopping now and then beneath the picture.
138 |
139 | "How it happened? I can tell you in five minutes--and it didn't take much longer to happen. . . . I can remember now how surprised and pleased I was when I got Mrs. Stroud's note. Of course, deep down, I had always _felt_ there was no one like him--only I had gone with the stream, echoed the usual platitudes about him, till I half got to think he was a failure, one of the kind that are left behind. By Jove, and he _was_ left behind--because he had come to stay! The rest of us had to let ourselves be swept along or go under, but he was high above the current--on everlasting foundations, as you say.
140 |
141 | "Well, I went off to the house in my most egregious mood--rather moved, Lord forgive me, at the pathos of poor Stroud's career of failure being crowned by the glory of my painting him! Of course I meant to do the picture for nothing--I told Mrs. Stroud so when she began to stammer something about her poverty. I remember getting off a prodigious phrase about the honour being _mine_--oh, I was princely, my dear Rickham! I was posing to myself like one of my own sitters.
142 |
143 | "Then I was taken up and left alone with him. I had sent all my traps in advance, and I had only to set up the easel and get to work. He had been dead only twenty-four hours, and he died suddenly, of heart disease, so that there had been no preliminary work of destruction--his face was clear and untouched. I had met him once or twice, years before, and thought him insignificant and dingy. Now I saw that he was superb.
144 |
145 | "I was glad at first, with a merely aesthetic satisfaction: glad to have my hand on such a 'subject.' Then his strange life-likeness began to affect me queerly--as I blocked the head in I felt as if he were watching me do it. The sensation was followed by the thought: if he _were_ watching me, what would he say to my way of working? My strokes began to go a little wild--I felt nervous and uncertain.
146 |
147 | "Once, when I looked up, I seemed to see a smile behind his close grayish beard--as if he had the secret, and were amusing himself by holding it back from me. That exasperated me still more. The secret? Why, I had a secret worth twenty of his! I dashed at the canvas furiously, and tried some of my bravura tricks. But they failed me, they crumbled. I saw that he wasn't watching the showy bits--I couldn't distract his attention; he just kept his eyes on the hard passages between. Those were the ones I had always shirked, or covered up with some lying paint. And how he saw through my lies!
148 |
149 | "I looked up again, and caught sight of that sketch of the donkey hanging on the wall near his bed. His wife told me afterward it was the last thing he had done--just a note taken with a shaking hand, when he was down in Devonshire recovering from a previous heart attack. Just a note! But it tells his whole history. There are years of patient scornful persistence in every line. A man who had swum with the current could never have learned that mighty up-stream stroke. . . .
150 |
151 | "I turned back to my work, and went on groping and muddling; then I looked at the donkey again. I saw that, when Stroud laid in the first stroke, he knew just what the end would be. He had possessed his subject, absorbed it, recreated it. When had I done that with any of my things? They hadn't been born of me--I had just adopted them. . . .
152 |
153 | "Hang it, Rickham, with that face watching me I couldn't do another stroke. The plain truth was, I didn't know where to put it--_I had never known_. Only, with my sitters and my public, a showy splash of colour covered up the fact--I just threw paint into their faces. . . . Well, paint was the one medium those dead eyes could see through--see straight to the tottering foundations underneath. Don't you know how, in talking a foreign language, even fluently, one says half the time not what one wants to but what one can? Well--that was the way I painted; and as he lay there and watched me, the thing they called my 'technique' collapsed like a house of cards. He didn't sneer, you understand, poor Stroud--he just lay there quietly watching, and on his lips, through the gray beard, I seemed to hear the question: 'Are you sure you know where you're coming out?'
154 |
155 | "If I could have painted that face, with that question on it, I should have done a great thing. The next greatest thing was to see that I couldn't--and that grace was given me. But, oh, at that minute, Rickham, was there anything on earth I wouldn't have given to have Stroud alive before me, and to hear him say: 'It's not too late--I'll show you how'?
156 |
157 | "It _was_ too late--it would have been, even if he'd been alive. I packed up my traps, and went down and told Mrs. Stroud. Of course I didn't tell her _that_--it would have been Greek to her. I simply said I couldn't paint him, that I was too moved. She rather liked the idea--she's so romantic! It was that that made her give me the donkey. But she was terribly upset at not getting the portrait--she did so want him 'done' by some one showy! At first I was afraid she wouldn't let me off--and at my wits' end I suggested Grindle. Yes, it was I who started Grindle: I told Mrs. Stroud he was the 'coming' man, and she told somebody else, and so it got to be true. . . . And he painted Stroud without wincing; and she hung the picture among her husband's things. . . ."
158 |
159 | He flung himself down in the arm-chair near mine, laid back his head, and clasping his arms beneath it, looked up at the picture above the chimney-piece.
160 |
161 | "I like to fancy that Stroud himself would have given it to me, if he'd been able to say what he thought that day."
162 |
163 | And, in answer to a question I put half-mechanically--"Begin again?" he flashed out. "When the one thing that brings me anywhere near him is that I knew enough to leave off?"
164 |
165 | He stood up and laid his hand on my shoulder with a laugh. "Only the irony of it is that I _am_ still painting--since Grindle's doing it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."
--------------------------------------------------------------------------------
/03_architecture/03.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "08f4321d-d32a-4a90-bfc7-e923f316b2f8",
6 | "metadata": {},
7 | "source": [
8 | "**LLM Workshop 2024 by Sebastian Raschka**\n",
9 | "\n",
10 | "This code is based on *Build a Large Language Model (From Scratch)*, [https://github.com/rasbt/LLMs-from-scratch](https://github.com/rasbt/LLMs-from-scratch)"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "ce9295b2-182b-490b-8325-83a67c4a001d",
16 | "metadata": {},
17 | "source": [
18 | "<br>\n",
19 | "<br>\n",
20 | "<br>\n",
21 | "<br>\n",
22 | "\n",
23 | "# 3) Coding an LLM architecture"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "id": "f9eac223-a125-40f7-bacc-bd0d890450c7",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "from importlib.metadata import version\n",
34 | "\n",
35 | "\n",
36 | "print(\"torch version:\", version(\"torch\"))\n",
37 | "print(\"tiktoken version:\", version(\"tiktoken\"))"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "id": "e7da97ed-e02f-4d7f-b68e-a0eba3716e02",
43 | "metadata": {},
44 | "source": [
45 | "- In this notebook, we implement a GPT-like LLM architecture; the next notebook will focus on training this LLM"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "7d4f11e0-4434-4979-9dee-e1207df0eb01",
51 | "metadata": {},
52 | "source": [
53 | ""
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "id": "53fe99ab-0bcf-4778-a6b5-6db81fb826ef",
59 | "metadata": {},
60 | "source": [
61 | "<br>\n",
62 | "<br>\n",
63 | "<br>\n",
64 | "<br>\n",
65 | "\n",
66 | "# 3.1 Coding an LLM architecture"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "id": "ad72d1ff-d82d-4e33-a88e-3c1a8831797b",
72 | "metadata": {},
73 | "source": [
74 | "- Models like GPT, Gemma, Phi, Mistral, Llama etc. generate words sequentially and are based on the decoder part of the original transformer architecture\n",
75 | "- Therefore, these LLMs are often referred to as \"decoder-like\" LLMs\n",
76 | "- Compared to conventional deep learning models, LLMs are larger, mainly due to their vast number of parameters, not the amount of code\n",
77 | "- We'll see that many elements are repeated in an LLM's architecture"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "id": "5c5213e9-bd1c-437e-aee8-f5e8fb717251",
83 | "metadata": {},
84 | "source": [
85 | ""
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "id": "0d43f5e2-fb51-434a-b9be-abeef6b98d99",
91 | "metadata": {},
92 | "source": [
93 | "- In the previous notebook, we used small embedding dimensions for token inputs and outputs for ease of illustration, ensuring they neatly fit on the screen\n",
94 | "- In this notebook, we consider embedding and model sizes akin to a small GPT-2 model\n",
95 | "- We'll specifically code the architecture of the smallest GPT-2 model (124 million parameters), as outlined in Radford et al.'s [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) (note that the initial report lists it as 117M parameters, but this was later corrected in the model weight repository)\n"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "id": "3b3fc01b-3d69-4b74-bc89-c9ab472842ea",
101 | "metadata": {},
102 | "source": [
103 | "\n",
104 | "
\n",
105 | "\n",
106 | "- A later notebook (`05_weightloading`) will show how to load pretrained weights into our implementation, which will also be compatible with the larger GPT-2 model sizes of 345, 762, and 1542 million parameters\n",
107 | "- Models like Llama and others are very similar to this model, since they are all based on the same core concepts\n",
108 | "\n",
109 | "
"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "id": "21baa14d-24b8-4820-8191-a2808f7fbabc",
115 | "metadata": {},
116 | "source": [
117 | "- Configuration details for the 124 million parameter GPT-2 model (GPT-2 \"small\") include:"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "id": "5ed66875-1f24-445d-add6-006aae3c5707",
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "GPT_CONFIG_124M = {\n",
128 | " \"vocab_size\": 50257, # Vocabulary size\n",
129 | " \"context_length\": 1024, # Context length\n",
130 | " \"emb_dim\": 768, # Embedding dimension\n",
131 | " \"n_heads\": 12, # Number of attention heads\n",
132 | " \"n_layers\": 12, # Number of layers\n",
133 | " \"drop_rate\": 0.0, # Dropout rate\n",
134 | " \"qkv_bias\": False # Query-Key-Value bias\n",
135 | "}"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "id": "46618527-15ac-4c32-ad85-6cfea83e006e",
141 | "metadata": {},
142 | "source": [
143 | "<br>\n",
144 | "<br>\n",
145 | "<br>\n",
146 | "<br>\n",
147 | "\n",
148 | "\n",
149 | "\n",
150 | "# 3.2 Coding the GPT model"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "id": "dec7d03d-9ff3-4ca3-ad67-01b67c2f5457",
156 | "metadata": {},
157 | "source": [
158 | "- We are almost there: now let's plug the transformer block (imported from `supplementary.py`) into the overall GPT architecture so that we obtain a usable GPT model\n",
159 | "- Note that the transformer block is repeated multiple times; in the case of the smallest 124M GPT-2 model, we repeat it 12 times:"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "id": "9b7b362d-f8c5-48d2-8ebd-722480ac5073",
165 | "metadata": {},
166 | "source": [
167 | ""
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "324e4b5d-ed89-4fdf-9a52-67deee0593bc",
173 | "metadata": {},
174 | "source": [
175 | "- The corresponding code implementation, where `cfg[\"n_layers\"] = 12`:"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "id": "c61de39c-d03c-4a32-8b57-f49ac3834857",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "import torch.nn as nn\n",
186 | "from supplementary import TransformerBlock, LayerNorm\n",
187 | "\n",
188 | "\n",
189 | "class GPTModel(nn.Module):\n",
190 | " def __init__(self, cfg):\n",
191 | " super().__init__()\n",
192 | " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
193 | " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
194 | " self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
195 | " \n",
196 | " self.trf_blocks = nn.Sequential(\n",
197 | " *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n",
198 | " \n",
199 | " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n",
200 | " self.out_head = nn.Linear(\n",
201 | " cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False\n",
202 | " )\n",
203 | "\n",
204 | " def forward(self, in_idx):\n",
205 | " batch_size, seq_len = in_idx.shape\n",
206 | " tok_embeds = self.tok_emb(in_idx)\n",
207 | " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
208 | " x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n",
209 | " x = self.drop_emb(x)\n",
210 | " x = self.trf_blocks(x)\n",
211 | " x = self.final_norm(x)\n",
212 | " logits = self.out_head(x)\n",
213 | " return logits"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "id": "2750270f-c45d-4410-8767-a6adbd05d5c3",
219 | "metadata": {},
220 | "source": [
221 | "- Using the configuration of the 124M parameter model, we can now instantiate this GPT model with random initial weights as follows:"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "id": "9bf6abb6",
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "import torch\n",
232 | "import tiktoken\n",
233 | "\n",
234 | "tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
235 | "\n",
236 | "batch = []\n",
237 | "\n",
238 | "txt1 = \"Every effort moves you\"\n",
239 | "txt2 = \"Every day holds a\"\n",
240 | "\n",
241 | "batch.append(torch.tensor(tokenizer.encode(txt1)))\n",
242 | "batch.append(torch.tensor(tokenizer.encode(txt2)))\n",
243 | "batch = torch.stack(batch, dim=0)\n",
244 | "print(batch)"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "id": "ef94fd9c-4e9d-470d-8f8e-dd23d1bb1f64",
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "torch.manual_seed(123)\n",
255 | "model = GPTModel(GPT_CONFIG_124M)\n",
256 | "\n",
257 | "out = model(batch)\n",
258 | "print(\"Input batch:\\n\", batch)\n",
259 | "print(\"\\nOutput shape:\", out.shape)\n",
260 | "print(out)"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "id": "44a1bb67-be42-431d-87d0-00c005f4a520",
266 | "metadata": {},
267 | "source": [
268 | "- We will train this model in the next notebook"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "id": "da5d9bc0-95ab-45d4-9378-417628d86e35",
274 | "metadata": {},
275 | "source": [
276 | "<br>\n",
277 | "<br>\n",
278 | "<br>\n",
279 | "<br>\n",
280 | "\n",
281 | "\n",
282 | "\n",
283 | "# 3.3 Generating text"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "id": "48da5deb-6ee0-4b9b-8dd2-abed7ed65172",
289 | "metadata": {},
290 | "source": [
291 | "- LLMs like the GPT model we implemented above are used to generate one word at a time"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "id": "caade12a-fe97-480f-939c-87d24044edff",
297 | "metadata": {},
298 | "source": [
299 | ""
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "id": "a7061524-a3bd-4803-ade6-2e3b7b79ac13",
305 | "metadata": {},
306 | "source": [
307 | "- The following `generate_text_simple` function implements greedy decoding, which is a simple and fast method to generate text\n",
308 | "- In greedy decoding, at each step, the model chooses the word (or token) with the highest probability as its next output (the highest logit corresponds to the highest probability, so we technically wouldn't even have to compute the softmax function explicitly)\n",
309 | "- The figure below depicts how the GPT model, given an input context, generates the next word token"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "id": "7ee0f32c-c18c-445e-b294-a879de2aa187",
315 | "metadata": {},
316 | "source": [
317 | "
"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "id": "c9b428a9-8764-4b36-80cd-7d4e00595ba6",
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "def generate_text_simple(model, idx, max_new_tokens, context_size):\n",
328 | " # idx is (batch, n_tokens) array of indices in the current context\n",
329 | " for _ in range(max_new_tokens):\n",
330 | " \n",
331 | " # Crop current context if it exceeds the supported context size\n",
332 | " # E.g., if LLM supports only 5 tokens, and the context size is 10\n",
333 | " # then only the last 5 tokens are used as context\n",
334 | " idx_cond = idx[:, -context_size:]\n",
335 | " \n",
336 | " # Get the predictions\n",
337 | " with torch.no_grad():\n",
338 | " logits = model(idx_cond)\n",
339 | " \n",
340 | " # Focus only on the last time step\n",
341 | " # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)\n",
342 | " logits = logits[:, -1, :] \n",
343 | "\n",
344 | " # Apply softmax to get probabilities\n",
345 | " probas = torch.softmax(logits, dim=-1) # (batch, vocab_size)\n",
346 | "\n",
347 | " # Get the idx of the vocab entry with the highest probability value\n",
348 | " idx_next = torch.argmax(probas, dim=-1, keepdim=True) # (batch, 1)\n",
349 | "\n",
350 | " # Append sampled index to the running sequence\n",
351 | " idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1)\n",
352 | "\n",
353 | " return idx"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "id": "6515f2c1-3cc7-421c-8d58-cc2f563b7030",
359 | "metadata": {},
360 | "source": [
361 | "- The `generate_text_simple` function above implements an iterative process in which it generates one token at a time\n",
362 | "\n",
363 | "
"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "id": "b0fa8b2c-4d97-4259-a8da-8ffb6bb088be",
369 | "metadata": {},
370 | "source": [
371 | "<br>\n",
372 | "<br>\n",
373 | "<br>\n",
374 | "<br>\n",
375 | "\n",
376 | "\n",
377 | "\n",
378 | "# Exercise: Generate some text"
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "id": "f682eac4-f9bd-438b-9dec-6b1cc7bc05ce",
384 | "metadata": {},
385 | "source": [
386 | "1. Use the `tokenizer.encode` method to prepare some input text\n",
387 | "2. Then, convert the resulting token IDs into a PyTorch tensor (via `torch.tensor`)\n",
388 | "3. Add a batch dimension via `.unsqueeze(0)`\n",
389 | "4. Use the `generate_text_simple` function to have the GPT generate some text based on your prepared input text\n",
390 | "5. The output from step 4 will be token IDs, convert them back into text via the `tokenizer.decode` method"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": null,
396 | "id": "2286f6de-5222-46f8-ad0d-d1f380a36636",
397 | "metadata": {},
398 | "outputs": [],
399 | "source": [
400 | "model.eval(); # disable dropout"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "id": "02fa7ae0-f30d-454c-a92a-a75894ea68d2",
406 | "metadata": {},
407 | "source": [
408 | "
\n",
409 | "
\n",
410 | "
\n",
411 | "
\n",
412 | "\n",
413 | "\n",
414 | "\n",
415 | "# Solution"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "id": "fdc23e58-dd3f-48d8-a767-944c1b6e030f",
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "start_context = \"Hello, I am\"\n",
426 | "\n",
427 | "encoded = tokenizer.encode(start_context)\n",
428 | "print(\"encoded:\", encoded)\n",
429 | "\n",
430 | "encoded_tensor = torch.tensor(encoded).unsqueeze(0)\n",
431 | "print(\"encoded_tensor.shape:\", encoded_tensor.shape)"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "id": "599b0821-9755-4cf1-8da4-a1c0fec448b1",
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "out = generate_text_simple(\n",
442 | " model=model,\n",
443 | " idx=encoded_tensor, \n",
444 | " max_new_tokens=6, \n",
445 | " context_size=GPT_CONFIG_124M[\"context_length\"]\n",
446 | ")\n",
447 | "\n",
448 | "print(\"Output:\", out)\n",
449 | "print(\"Output length:\", len(out[0]))"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "id": "6de4f875-f967-4089-8410-b5cd2c200de8",
455 | "metadata": {},
456 | "source": [
457 | "- Remove batch dimension and convert back into text:"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "id": "74c8d848-8ac1-41d4-b229-72ba7698297c",
464 | "metadata": {},
465 | "outputs": [],
466 | "source": [
467 | "decoded_text = tokenizer.decode(out.squeeze(0).tolist())\n",
468 | "print(decoded_text)"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "id": "8c538bcd-a209-4273-9527-60d6fef1f6ab",
474 | "metadata": {},
475 | "source": [
476 | "- Note that the model is untrained; hence the random output texts above\n",
477 | "- We will train the model in the next notebook"
478 | ]
479 | }
480 | ],
481 | "metadata": {
482 | "kernelspec": {
483 | "display_name": "Python 3 (ipykernel)",
484 | "language": "python",
485 | "name": "python3"
486 | },
487 | "language_info": {
488 | "codemirror_mode": {
489 | "name": "ipython",
490 | "version": 3
491 | },
492 | "file_extension": ".py",
493 | "mimetype": "text/x-python",
494 | "name": "python",
495 | "nbconvert_exporter": "python",
496 | "pygments_lexer": "ipython3",
497 | "version": "3.10.10"
498 | }
499 | },
500 | "nbformat": 4,
501 | "nbformat_minor": 5
502 | }
503 |
--------------------------------------------------------------------------------
/03_architecture/figures/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/01.png
--------------------------------------------------------------------------------
/03_architecture/figures/02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/02.png
--------------------------------------------------------------------------------
/03_architecture/figures/03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/03.png
--------------------------------------------------------------------------------
/03_architecture/figures/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/04.png
--------------------------------------------------------------------------------
/03_architecture/figures/05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/05.png
--------------------------------------------------------------------------------
/03_architecture/figures/06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/06.png
--------------------------------------------------------------------------------
/03_architecture/figures/07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/07.png
--------------------------------------------------------------------------------
/03_architecture/figures/08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/08.png
--------------------------------------------------------------------------------
/03_architecture/figures/09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/09.png
--------------------------------------------------------------------------------
/03_architecture/figures/10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/03_architecture/figures/10.png
--------------------------------------------------------------------------------
/03_architecture/supplementary.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
2 | # Source for "Build a Large Language Model From Scratch"
3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch
4 | # Code: https://github.com/rasbt/LLMs-from-scratch
5 |
6 | import tiktoken
7 | import torch
8 | import torch.nn as nn
9 | from torch.utils.data import Dataset, DataLoader
10 |
11 |
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once with ``tokenizer.encode`` (allowing the
    ``<|endoftext|>`` special token). Windows of ``max_length`` token IDs are
    taken every ``stride`` positions; each target window is its input window
    shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Start positions stop early enough that the shifted target window
        # still lies completely inside ``ids``.
        window_starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
32 |
33 |
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over ``GPTDatasetV1`` windows of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding. ``max_length`` and
    ``stride`` are passed to ``GPTDatasetV1``; the remaining keyword arguments
    are passed to ``DataLoader`` unchanged.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
47 |
48 |
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are produced by three linear layers of width
    ``d_out`` and split into ``num_heads`` heads of ``d_out // num_heads``
    features each. Scores for positions above the diagonal are set to ``-inf``
    before the softmax, dropout is applied to the attention weights, and the
    concatenated head outputs go through a final linear layer.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # width of a single head

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated heads
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        upper = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', upper)

    def forward(self, x):
        """Map ``x`` of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        def split_heads(t):
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            return t.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        keys = split_heads(self.W_key(x))
        queries = split_heads(self.W_query(x))
        values = split_heads(self.W_value(x))

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)

        # Crop the stored mask to the current sequence length and hide
        # the masked positions before normalising.
        hidden = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(hidden, -torch.inf)

        attn_weights = self.dropout(
            torch.softmax(attn_scores / self.head_dim ** 0.5, dim=-1)
        )

        # Back to (b, num_tokens, num_heads, head_dim), then merge the heads.
        merged = (attn_weights @ values).transpose(1, 2)
        merged = merged.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)
103 |
104 |
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before taking the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Biased variance (unbiased=False), i.e. divided by the feature count.
        spread = torch.sqrt(x.var(dim=-1, keepdim=True, unbiased=False) + self.eps)
        return self.scale * (centered / spread) + self.shift
117 |
118 |
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation."""

    def forward(self, x):
        """Apply ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`` element-wise."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
128 |
129 |
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x ``cfg["emb_dim"]``, apply GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        self.layers = nn.Sequential(
            nn.Linear(width, hidden),
            GELU(),
            nn.Linear(hidden, width),
        )

    def forward(self, x):
        """Run ``x`` through the expand / activate / project stack."""
        return self.layers(x)
141 |
142 |
class TransformerBlock(nn.Module):
    """Transformer block with two shortcut-wrapped sub-layers.

    Each sub-layer (multi-head attention, then feed-forward) is applied as
    ``LayerNorm -> sub-layer -> dropout`` and its result is added to the
    sub-layer's input. All sizes and rates are read from ``cfg``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers; the shape of ``x`` is kept."""
        # Attention sub-layer plus shortcut (the right-hand ``x`` is the input).
        x = self.drop_shortcut(self.att(self.norm1(x))) + x
        # Feed-forward sub-layer plus shortcut.
        x = self.drop_shortcut(self.ff(self.norm2(x))) + x
        return x
--------------------------------------------------------------------------------
/04_pretraining/figures/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/04_pretraining/figures/01.png
--------------------------------------------------------------------------------
/04_pretraining/figures/02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/04_pretraining/figures/02.png
--------------------------------------------------------------------------------
/04_pretraining/figures/03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/04_pretraining/figures/03.png
--------------------------------------------------------------------------------
/04_pretraining/figures/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/LLM-workshop-2024/cf92941293bbb0bebc7397baf7dfbad8a10806bc/04_pretraining/figures/04.png
--------------------------------------------------------------------------------
/04_pretraining/supplementary.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
2 | # Source for "Build a Large Language Model From Scratch"
3 | # - https://www.manning.com/books/build-a-large-language-model-from-scratch
4 | # Code: https://github.com/rasbt/LLMs-from-scratch
5 |
6 | import matplotlib.pyplot as plt
7 | from matplotlib.ticker import MaxNLocator
8 | import tiktoken
9 | import torch
10 | import torch.nn as nn
11 | from torch.utils.data import Dataset, DataLoader
12 |
13 |
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once with ``tokenizer.encode`` (allowing the
    ``<|endoftext|>`` special token). Windows of ``max_length`` token IDs are
    taken every ``stride`` positions; each target window is its input window
    shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Start positions stop early enough that the shifted target window
        # still lies completely inside ``ids``.
        window_starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
34 |
35 |
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over ``GPTDatasetV1`` windows of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding. ``max_length`` and
    ``stride`` are passed to ``GPTDatasetV1``; the remaining keyword arguments
    are passed to ``DataLoader`` unchanged.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
49 |
50 |
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are produced by three linear layers of width
    ``d_out`` and split into ``num_heads`` heads of ``d_out // num_heads``
    features each. Scores for positions above the diagonal are set to ``-inf``
    before the softmax, dropout is applied to the attention weights, and the
    concatenated head outputs go through a final linear layer.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # width of a single head

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated heads
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        upper = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', upper)

    def forward(self, x):
        """Map ``x`` of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        def split_heads(t):
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            return t.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        keys = split_heads(self.W_key(x))
        queries = split_heads(self.W_query(x))
        values = split_heads(self.W_value(x))

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)

        # Crop the stored mask to the current sequence length and hide
        # the masked positions before normalising.
        hidden = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(hidden, -torch.inf)

        attn_weights = self.dropout(
            torch.softmax(attn_scores / self.head_dim ** 0.5, dim=-1)
        )

        # Back to (b, num_tokens, num_heads, head_dim), then merge the heads.
        merged = (attn_weights @ values).transpose(1, 2)
        merged = merged.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)
105 |
106 |
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before taking the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Biased variance (unbiased=False), i.e. divided by the feature count.
        spread = torch.sqrt(x.var(dim=-1, keepdim=True, unbiased=False) + self.eps)
        return self.scale * (centered / spread) + self.shift
119 |
120 |
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation."""

    def forward(self, x):
        """Apply ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`` element-wise."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
130 |
131 |
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x ``cfg["emb_dim"]``, apply GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        self.layers = nn.Sequential(
            nn.Linear(width, hidden),
            GELU(),
            nn.Linear(hidden, width),
        )

    def forward(self, x):
        """Run ``x`` through the expand / activate / project stack."""
        return self.layers(x)
143 |
144 |
class TransformerBlock(nn.Module):
    """Transformer block with two shortcut-wrapped sub-layers.

    Each sub-layer (multi-head attention, then feed-forward) is applied as
    ``LayerNorm -> sub-layer -> dropout`` and its result is added to the
    sub-layer's input. All sizes and rates are read from ``cfg``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers; the shape of ``x`` is kept."""
        # Attention sub-layer plus shortcut (the right-hand ``x`` is the input).
        x = self.drop_shortcut(self.att(self.norm1(x))) + x
        # Feed-forward sub-layer plus shortcut.
        x = self.drop_shortcut(self.ff(self.norm2(x))) + x
        return x
176 |
177 |
class GPTModel(nn.Module):
    """GPT-style model mapping token IDs to per-position vocabulary logits.

    Token and positional embeddings are summed and passed through dropout,
    ``cfg["n_layers"]`` ``TransformerBlock`` modules, a final ``LayerNorm`` and
    a bias-free linear head of width ``cfg["vocab_size"]``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits of shape (batch, seq_len, vocab_size) for 2-D ``in_idx``."""
        _, seq_len = in_idx.shape
        # Position indices are created on the same device as the input IDs.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        return self.out_head(self.final_norm(hidden))
203 |
204 |
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both batches are moved to ``device``. The logits are flattened over their
    first two dimensions and the targets are flattened completely before
    ``cross_entropy`` is applied.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    predictions = model(inputs)
    return torch.nn.functional.cross_entropy(
        predictions.flatten(0, 1), targets.flatten()
    )
210 |
211 |
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Forwarded to ``calc_loss_batch``.
        device: Forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` means all
            batches; larger values are capped at ``len(data_loader)``.

    Returns:
        The mean per-batch loss as a float, or ``float("nan")`` when there is
        nothing to average (empty loader, or ``num_batches`` <= 0).
    """
    from itertools import islice

    if len(data_loader) == 0:
        return float("nan")
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        # Never ask for more batches than the loader can provide.
        num_batches = min(num_batches, len(data_loader))
    if num_batches <= 0:
        # Avoid dividing by zero below; report "no loss" as for an empty loader.
        return float("nan")

    total_loss = 0.
    # islice stops after num_batches items without fetching an extra batch.
    for input_batch, target_batch in islice(data_loader, num_batches):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches
229 |
230 |
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` computed with ``calc_loss_loader``.

    The model is put into eval mode and gradients are disabled while at most
    ``eval_iter`` batches of each loader are evaluated; afterwards the model
    is put into train mode.
    """
    model.eval()
    with torch.no_grad():
        # Training loader first, then validation loader.
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss
238 |
239 |
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the IDs as a tensor with a leading batch dimension.

    The ``<|endoftext|>`` special token is allowed during encoding.
    """
    ids = torch.tensor(tokenizer.encode(text, allowed_special={'<|endoftext|>'}))
    return ids[None]  # prepend the batch dimension
244 |
245 |
def token_ids_to_text(token_ids, tokenizer):
    """Decode ``token_ids`` to text after squeezing out dimension 0 (the batch dimension)."""
    return tokenizer.decode(token_ids.squeeze(0).tolist())
249 |
250 |
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate 50 new tokens after ``start_context`` and print the decoded text.

    The model is put into eval mode for generation and into train mode
    afterwards. Newlines in the decoded text are replaced by spaces so the
    sample prints compactly.
    """
    model.eval()
    # The number of rows in the positional embedding table is used as the context size.
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )
    sample = token_ids_to_text(generated, tokenizer)
    print(sample.replace("\n", " "))
    model.train()
263 |
264 |
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training and validation loss, save the figure as ``loss-plot.pdf`` and show it.

    Both loss curves are drawn against ``epochs_seen`` on the bottom x-axis;
    a second, top x-axis is labelled with ``tokens_seen``.
    """
    fig, epoch_axis = plt.subplots(figsize=(5, 3))

    # Loss curves against epochs.
    curves = (
        (train_losses, {"label": "Training loss"}),
        (val_losses, {"linestyle": "-.", "label": "Validation loss"}),
    )
    for losses, line_kwargs in curves:
        epoch_axis.plot(epochs_seen, losses, **line_kwargs)
    epoch_axis.set_xlabel("Epochs")
    epoch_axis.set_ylabel("Loss")
    epoch_axis.legend(loc="upper right")
    # Integer tick labels only on the epoch axis.
    epoch_axis.xaxis.set_major_locator(MaxNLocator(integer=True))

    # Twin x-axis sharing the y-axis; the fully transparent line only
    # serves to set up the tick range for the token counts.
    token_axis = epoch_axis.twiny()
    token_axis.plot(tokens_seen, train_losses, alpha=0)
    token_axis.set_xlabel("Tokens seen")

    fig.tight_layout()
    plt.savefig("loss-plot.pdf")
    plt.show()
284 |
285 |
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` token IDs.

    ``idx`` is a (batch, n_tokens) tensor of token IDs. In every step the model
    sees at most the last ``context_size`` tokens, the highest-probability
    vocabulary entry at the final position is selected, and it is appended to
    the sequence. Returns the extended (batch, n_tokens + max_new_tokens) tensor.
    """
    for _step in range(max_new_tokens):
        # Keep only the most recent ``context_size`` tokens as model input.
        window = idx[:, -context_size:]

        with torch.no_grad():
            logits = model(window)

        # Last time step only: (batch, n_tokens, vocab_size) -> (batch, vocab_size)
        last_step_probas = torch.softmax(logits[:, -1, :], dim=-1)

        # Most probable vocabulary index per batch row: (batch, 1)
        next_token = torch.argmax(last_step_probas, dim=-1, keepdim=True)

        # Grow the running sequence by one column.
        idx = torch.cat((idx, next_token), dim=1)

    return idx
--------------------------------------------------------------------------------
/04_pretraining/the-verdict.txt:
--------------------------------------------------------------------------------
1 | I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)
2 |
3 | "The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?
4 |
5 | Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him. Among his own sex fewer regrets were heard, and in his own trade hardly a murmur. Professional jealousy? Perhaps. If it were, the honour of the craft was vindicated by little Claude Nutley, who, in all good faith, brought out in the Burlington a very handsome "obituary" on Jack--one of those showy articles stocked with random technicalities that I have heard (I won't say by whom) compared to Gisburn's painting. And so--his resolve being apparently irrevocable--the discussion gradually died out, and, as Mrs. Thwing had predicted, the price of "Gisburns" went up.
6 |
7 | It was not till three years later that, in the course of a few weeks' idling on the Riviera, it suddenly occurred to me to wonder why Gisburn had given up his painting. On reflection, it really was a tempting problem. To accuse his wife would have been too easy--his fair sitters had been denied the solace of saying that Mrs. Gisburn had "dragged him down." For Mrs. Gisburn--as such--had not existed till nearly a year after Jack's resolve had been taken. It might be that he had married her--since he liked his ease--because he didn't want to go on painting; but it would have been hard to prove that he had given up his painting because he had married her.
8 |
9 | Of course, if she had not dragged him down, she had equally, as Miss Croft contended, failed to "lift him up"--she had not led him back to the easel. To put the brush into his hand again--what a vocation for a wife! But Mrs. Gisburn appeared to have disdained it--and I felt it might be interesting to find out why.
10 |
11 | The desultory life of the Riviera lends itself to such purely academic speculations; and having, on my way to Monte Carlo, caught a glimpse of Jack's balustraded terraces between the pines, I had myself borne thither the next day.
12 |
13 | I found the couple at tea beneath their palm-trees; and Mrs. Gisburn's welcome was so genial that, in the ensuing weeks, I claimed it frequently. It was not that my hostess was "interesting": on that point I could have given Miss Croft the fullest reassurance. It was just because she was _not_ interesting--if I may be pardoned the bull--that I found her so. For Jack, all his life, had been surrounded by interesting women: they had fostered his art, it had been reared in the hot-house of their adulation. And it was therefore instructive to note what effect the "deadening atmosphere of mediocrity" (I quote Miss Croft) was having on him.
14 |
15 | I have mentioned that Mrs. Gisburn was rich; and it was immediately perceptible that her husband was extracting from this circumstance a delicate but substantial satisfaction. It is, as a rule, the people who scorn money who get most out of it; and Jack's elegant disdain of his wife's big balance enabled him, with an appearance of perfect good-breeding, to transmute it into objects of art and luxury. To the latter, I must add, he remained relatively indifferent; but he was buying Renaissance bronzes and eighteenth-century pictures with a discrimination that bespoke the amplest resources.
16 |
17 | "Money's only excuse is to put beauty into circulation," was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed luncheon-table, when, on a later day, I had again run over from Monte Carlo; and Mrs. Gisburn, beaming on him, added for my enlightenment: "Jack is so morbidly sensitive to every form of beauty."
18 |
19 | Poor Jack! It had always been his fate to have women say such things of him: the fact should be set down in extenuation. What struck me now was that, for the first time, he resented the tone. I had seen him, so often, basking under similar tributes--was it the conjugal note that robbed them of their savour? No--for, oddly enough, it became apparent that he was fond of Mrs. Gisburn--fond enough not to see her absurdity. It was his own absurdity he seemed to be wincing under--his own attitude as an object for garlands and incense.
20 |
21 | "My dear, since I've chucked painting people don't say that stuff about me--they say it about Victor Grindle," was his only protest, as he rose from the table and strolled out onto the sunlit terrace.
22 |
23 | I glanced after him, struck by his last word. Victor Grindle was, in fact, becoming the man of the moment--as Jack himself, one might put it, had been the man of the hour. The younger artist was said to have formed himself at my friend's feet, and I wondered if a tinge of jealousy underlay the latter's mysterious abdication. But no--for it was not till after that event that the _rose Dubarry_ drawing-rooms had begun to display their "Grindles."
24 |
25 | I turned to Mrs. Gisburn, who had lingered to give a lump of sugar to her spaniel in the dining-room.
26 |
27 | "Why _has_ he chucked painting?" I asked abruptly.
28 |
29 | She raised her eyebrows with a hint of good-humoured surprise.
30 |
31 | "Oh, he doesn't _have_ to now, you know; and I want him to enjoy himself," she said quite simply.
32 |
33 | I looked about the spacious white-panelled room, with its _famille-verte_ vases repeating the tones of the pale damask curtains, and its eighteenth-century pastels in delicate faded frames.
34 |
35 | "Has he chucked his pictures too? I haven't seen a single one in the house."
36 |
37 | A slight shade of constraint crossed Mrs. Gisburn's open countenance. "It's his ridiculous modesty, you know. He says they're not fit to have about; he's sent them all away except one--my portrait--and that I have to keep upstairs."
38 |
39 | His ridiculous modesty--Jack's modesty about his pictures? My curiosity was growing like the bean-stalk. I said persuasively to my hostess: "I must really see your portrait, you know."
40 |
41 | She glanced out almost timorously at the terrace where her husband, lounging in a hooded chair, had lit a cigar and drawn the Russian deerhound's head between his knees.
42 |
43 | "Well, come while he's not looking," she said, with a laugh that tried to hide her nervousness; and I followed her between the marble Emperors of the hall, and up the wide stairs with terra-cotta nymphs poised among flowers at each landing.
44 |
45 | In the dimmest corner of her boudoir, amid a profusion of delicate and distinguished objects, hung one of the familiar oval canvases, in the inevitable garlanded frame. The mere outline of the frame called up all Gisburn's past!
46 |
47 | Mrs. Gisburn drew back the window-curtains, moved aside a _jardiniere_ full of pink azaleas, pushed an arm-chair away, and said: "If you stand here you can just manage to see it. I had it over the mantel-piece, but he wouldn't let it stay."
48 |
49 | Yes--I could just manage to see it--the first portrait of Jack's I had ever had to strain my eyes over! Usually they had the place of honour--say the central panel in a pale yellow or _rose Dubarry_ drawing-room, or a monumental easel placed so that it took the light through curtains of old Venetian point. The more modest place became the picture better; yet, as my eyes grew accustomed to the half-light, all the characteristic qualities came out--all the hesitations disguised as audacities, the tricks of prestidigitation by which, with such consummate skill, he managed to divert attention from the real business of the picture to some pretty irrelevance of detail. Mrs. Gisburn, presenting a neutral surface to work on--forming, as it were, so inevitably the background of her own picture--had lent herself in an unusual degree to the display of this false virtuosity. The picture was one of Jack's "strongest," as his admirers would have put it--it represented, on his part, a swelling of muscles, a congesting of veins, a balancing, straddling and straining, that reminded one of the circus-clown's ironic efforts to lift a feather. It met, in short, at every point the demand of lovely woman to be painted "strongly" because she was tired of being painted "sweetly"--and yet not to lose an atom of the sweetness.
50 |
51 | "It's the last he painted, you know," Mrs. Gisburn said with pardonable pride. "The last but one," she corrected herself--"but the other doesn't count, because he destroyed it."
52 |
53 | "Destroyed it?" I was about to follow up this clue when I heard a footstep and saw Jack himself on the threshold.
54 |
55 | As he stood there, his hands in the pockets of his velveteen coat, the thin brown waves of hair pushed back from his white forehead, his lean sunburnt cheeks furrowed by a smile that lifted the tips of a self-confident moustache, I felt to what a degree he had the same quality as his pictures--the quality of looking cleverer than he was.
56 |
57 | His wife glanced at him deprecatingly, but his eyes travelled past her to the portrait.
58 |
59 | "Mr. Rickham wanted to see it," she began, as if excusing herself. He shrugged his shoulders, still smiling.
60 |
61 | "Oh, Rickham found me out long ago," he said lightly; then, passing his arm through mine: "Come and see the rest of the house."
62 |
63 | He showed it to me with a kind of naive suburban pride: the bath-rooms, the speaking-tubes, the dress-closets, the trouser-presses--all the complex simplifications of the millionaire's domestic economy. And whenever my wonder paid the expected tribute he said, throwing out his chest a little: "Yes, I really don't see how people manage to live without that."
64 |
65 | Well--it was just the end one might have foreseen for him. Only he was, through it all and in spite of it all--as he had been through, and in spite of, his pictures--so handsome, so charming, so disarming, that one longed to cry out: "Be dissatisfied with your leisure!" as once one had longed to say: "Be dissatisfied with your work!"
66 |
67 | But, with the cry on my lips, my diagnosis suffered an unexpected check.
68 |
69 | "This is my own lair," he said, leading me into a dark plain room at the end of the florid vista. It was square and brown and leathery: no "effects"; no bric-a-brac, none of the air of posing for reproduction in a picture weekly--above all, no least sign of ever having been used as a studio.
70 |
71 | The fact brought home to me the absolute finality of Jack's break with his old life.
72 |
73 | "Don't you ever dabble with paint any more?" I asked, still looking about for a trace of such activity.
74 |
75 | "Never," he said briefly.
76 |
77 | "Or water-colour--or etching?"
78 |
79 | His confident eyes grew dim, and his cheeks paled a little under their handsome sunburn.
80 |
81 | "Never think of it, my dear fellow--any more than if I'd never touched a brush."
82 |
83 | And his tone told me in a flash that he never thought of anything else.
84 |
85 | I moved away, instinctively embarrassed by my unexpected discovery; and as I turned, my eye fell on a small picture above the mantel-piece--the only object breaking the plain oak panelling of the room.
86 |
87 | "Oh, by Jove!" I said.
88 |
89 | It was a sketch of a donkey--an old tired donkey, standing in the rain under a wall.
90 |
91 | "By Jove--a Stroud!" I cried.
92 |
93 | He was silent; but I felt him close behind me, breathing a little quickly.
94 |
95 | "What a wonder! Made with a dozen lines--but on everlasting foundations. You lucky chap, where did you get it?"
96 |
97 | He answered slowly: "Mrs. Stroud gave it to me."
98 |
99 | "Ah--I didn't know you even knew the Strouds. He was such an inflexible hermit."
100 |
101 | "I didn't--till after. . . . She sent for me to paint him when he was dead."
102 |
103 | "When he was dead? You?"
104 |
105 | I must have let a little too much amazement escape through my surprise, for he answered with a deprecating laugh: "Yes--she's an awful simpleton, you know, Mrs. Stroud. Her only idea was to have him done by a fashionable painter--ah, poor Stroud! She thought it the surest way of proclaiming his greatness--of forcing it on a purblind public. And at the moment I was _the_ fashionable painter."
106 |
107 | "Ah, poor Stroud--as you say. Was _that_ his history?"
108 |
109 | "That was his history. She believed in him, gloried in him--or thought she did. But she couldn't bear not to have all the drawing-rooms with her. She couldn't bear the fact that, on varnishing days, one could always get near enough to see his pictures. Poor woman! She's just a fragment groping for other fragments. Stroud is the only whole I ever knew."
110 |
111 | "You ever knew? But you just said--"
112 |
113 | Gisburn had a curious smile in his eyes.
114 |
115 | "Oh, I knew him, and he knew me--only it happened after he was dead."
116 |
117 | I dropped my voice instinctively. "When she sent for you?"
118 |
119 | "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"
120 |
121 | He laughed again, and threw back his head to look up at the sketch of the donkey. "There were days when I couldn't look at that thing--couldn't face it. But I forced myself to put it here; and now it's cured me--cured me. That's the reason why I don't dabble any more, my dear Rickham; or rather Stroud himself is the reason."
122 |
123 | For the first time my idle curiosity about my companion turned into a serious desire to understand him better.
124 |
125 | "I wish you'd tell me how it happened," I said.
126 |
127 | He stood looking up at the sketch, and twirling between his fingers a cigarette he had forgotten to light. Suddenly he turned toward me.
128 |
129 | "I'd rather like to tell you--because I've always suspected you of loathing my work."
130 |
131 | I made a deprecating gesture, which he negatived with a good-humoured shrug.
132 |
133 | "Oh, I didn't care a straw when I believed in myself--and now it's an added tie between us!"
134 |
135 | He laughed slightly, without bitterness, and pushed one of the deep arm-chairs forward. "There: make yourself comfortable--and here are the cigars you like."
136 |
137 | He placed them at my elbow and continued to wander up and down the room, stopping now and then beneath the picture.
138 |
139 | "How it happened? I can tell you in five minutes--and it didn't take much longer to happen. . . . I can remember now how surprised and pleased I was when I got Mrs. Stroud's note. Of course, deep down, I had always _felt_ there was no one like him--only I had gone with the stream, echoed the usual platitudes about him, till I half got to think he was a failure, one of the kind that are left behind. By Jove, and he _was_ left behind--because he had come to stay! The rest of us had to let ourselves be swept along or go under, but he was high above the current--on everlasting foundations, as you say.
140 |
141 | "Well, I went off to the house in my most egregious mood--rather moved, Lord forgive me, at the pathos of poor Stroud's career of failure being crowned by the glory of my painting him! Of course I meant to do the picture for nothing--I told Mrs. Stroud so when she began to stammer something about her poverty. I remember getting off a prodigious phrase about the honour being _mine_--oh, I was princely, my dear Rickham! I was posing to myself like one of my own sitters.
142 |
143 | "Then I was taken up and left alone with him. I had sent all my traps in advance, and I had only to set up the easel and get to work. He had been dead only twenty-four hours, and he died suddenly, of heart disease, so that there had been no preliminary work of destruction--his face was clear and untouched. I had met him once or twice, years before, and thought him insignificant and dingy. Now I saw that he was superb.
144 |
145 | "I was glad at first, with a merely aesthetic satisfaction: glad to have my hand on such a 'subject.' Then his strange life-likeness began to affect me queerly--as I blocked the head in I felt as if he were watching me do it. The sensation was followed by the thought: if he _were_ watching me, what would he say to my way of working? My strokes began to go a little wild--I felt nervous and uncertain.
146 |
147 | "Once, when I looked up, I seemed to see a smile behind his close grayish beard--as if he had the secret, and were amusing himself by holding it back from me. That exasperated me still more. The secret? Why, I had a secret worth twenty of his! I dashed at the canvas furiously, and tried some of my bravura tricks. But they failed me, they crumbled. I saw that he wasn't watching the showy bits--I couldn't distract his attention; he just kept his eyes on the hard passages between. Those were the ones I had always shirked, or covered up with some lying paint. And how he saw through my lies!
148 |
149 | "I looked up again, and caught sight of that sketch of the donkey hanging on the wall near his bed. His wife told me afterward it was the last thing he had done--just a note taken with a shaking hand, when he was down in Devonshire recovering from a previous heart attack. Just a note! But it tells his whole history. There are years of patient scornful persistence in every line. A man who had swum with the current could never have learned that mighty up-stream stroke. . . .
150 |
151 | "I turned back to my work, and went on groping and muddling; then I looked at the donkey again. I saw that, when Stroud laid in the first stroke, he knew just what the end would be. He had possessed his subject, absorbed it, recreated it. When had I done that with any of my things? They hadn't been born of me--I had just adopted them. . . .
152 |
153 | "Hang it, Rickham, with that face watching me I couldn't do another stroke. The plain truth was, I didn't know where to put it--_I had never known_. Only, with my sitters and my public, a showy splash of colour covered up the fact--I just threw paint into their faces. . . . Well, paint was the one medium those dead eyes could see through--see straight to the tottering foundations underneath. Don't you know how, in talking a foreign language, even fluently, one says half the time not what one wants to but what one can? Well--that was the way I painted; and as he lay there and watched me, the thing they called my 'technique' collapsed like a house of cards. He didn't sneer, you understand, poor Stroud--he just lay there quietly watching, and on his lips, through the gray beard, I seemed to hear the question: 'Are you sure you know where you're coming out?'
154 |
155 | "If I could have painted that face, with that question on it, I should have done a great thing. The next greatest thing was to see that I couldn't--and that grace was given me. But, oh, at that minute, Rickham, was there anything on earth I wouldn't have given to have Stroud alive before me, and to hear him say: 'It's not too late--I'll show you how'?
156 |
157 | "It _was_ too late--it would have been, even if he'd been alive. I packed up my traps, and went down and told Mrs. Stroud. Of course I didn't tell her _that_--it would have been Greek to her. I simply said I couldn't paint him, that I was too moved. She rather liked the idea--she's so romantic! It was that that made her give me the donkey. But she was terribly upset at not getting the portrait--she did so want him 'done' by some one showy! At first I was afraid she wouldn't let me off--and at my wits' end I suggested Grindle. Yes, it was I who started Grindle: I told Mrs. Stroud he was the 'coming' man, and she told somebody else, and so it got to be true. . . . And he painted Stroud without wincing; and she hung the picture among her husband's things. . . ."
158 |
159 | He flung himself down in the arm-chair near mine, laid back his head, and clasping his arms beneath it, looked up at the picture above the chimney-piece.
160 |
161 | "I like to fancy that Stroud himself would have given it to me, if he'd been able to say what he thought that day."
162 |
163 | And, in answer to a question I put half-mechanically--"Begin again?" he flashed out. "When the one thing that brings me anywhere near him is that I knew enough to leave off?"
164 |
165 | He stood up and laid his hand on my shoulder with a laugh. "Only the irony of it is that I _am_ still painting--since Grindle's doing it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."
--------------------------------------------------------------------------------
/05_weightloading/05_part-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "45398736-7e89-4263-89c8-92153baff553",
6 | "metadata": {},
7 | "source": [
8 | "**LLM Workshop 2024 by Sebastian Raschka**\n",
9 | "\n",
10 | "This code is based on *Build a Large Language Model (From Scratch)*, [https://github.com/rasbt/LLMs-from-scratch](https://github.com/rasbt/LLMs-from-scratch)"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "66dd524e-864c-4012-b0a2-ccfc56e80024",
16 | "metadata": {
17 | "id": "66dd524e-864c-4012-b0a2-ccfc56e80024"
18 | },
19 | "source": [
20 | "
\n",
21 | "
\n",
22 | "
\n",
23 | "
\n",
24 | "\n",
25 | "# 5) Loading pretrained weights (part 1)"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "id": "07a57fb9-f69e-44ca-ab29-3537fa5c0157",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "from importlib.metadata import version\n",
36 | "\n",
37 | "pkgs = [\"matplotlib\", \n",
38 | " \"numpy\", \n",
39 | " \"tiktoken\", \n",
40 | " \"torch\",\n",
41 | " ]\n",
42 | "for p in pkgs:\n",
43 | " print(f\"{p} version: {version(p)}\")"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "83eb6c38-7278-40e0-bd9f-8a2b1feac3ec",
49 | "metadata": {},
50 | "source": [
51 | "- Previously, we only trained a small GPT-2 model using a very small short-story book for educational purposes\n",
52 | "- Fortunately, we don't have to spend tens to hundreds of thousands of dollars to pretrain the model on a large pretraining corpus but can load pretrained weights (we start with the GPT-2 weights provided by OpenAI)\n",
53 | "\n",
54 | ""
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "id": "75cab892-a165-4f43-9601-f517bc212ab6",
60 | "metadata": {},
61 | "source": [
62 | "- First, some boilerplate code to download the files from OpenAI and load the weights into Python\n",
63 | "- Since OpenAI used [TensorFlow](https://www.tensorflow.org/), we will have to install and use TensorFlow for loading the weights; [tqdm](https://github.com/tqdm/tqdm) is a progress bar library\n",
64 | "- Uncomment and run the next cell to install the required libraries"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "id": "fb9fdf02-972a-444e-bf65-8ffcaaf30ce8",
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# pip install tensorflow tqdm"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "id": "a0747edc-559c-44ef-a93f-079d60227e3f",
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "print(\"TensorFlow version:\", version(\"tensorflow\"))\n",
85 | "print(\"tqdm version:\", version(\"tqdm\"))"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "id": "c5bc89eb-4d39-4287-9b0c-e459ebe7f5ed",
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "# Import from the gpt_download.py module contained in this folder\n",
96 | "from gpt_download import download_and_load_gpt2"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "id": "ff76a736-6f9f-4328-872e-f89a7b70a2cc",
102 | "metadata": {},
103 | "source": [
104 | "- We can then download the model weights for the 124 million parameter model as follows:"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "id": "76271dd7-108d-4f5b-9c01-6ae0aac4b395",
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "settings, params = download_and_load_gpt2(model_size=\"124M\", models_dir=\"gpt2\")"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "id": "b1a31951-d971-4a6e-9c43-11ee1168ec6a",
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "print(\"Settings:\", settings)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "id": "857c8331-130e-46ba-921d-fa35d7a73cfe",
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "print(\"Parameter dictionary keys:\", params.keys())"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "c48dac94-8562-4a66-84ef-46c613cdc4cd",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "print(params[\"wte\"])\n",
145 | "print(\"Token embedding weight tensor dimensions:\", params[\"wte\"].shape)"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "id": "466e100c-294e-4afc-a70a-2f398ac4c104",
151 | "metadata": {},
152 | "source": [
153 | "- Alternatively, \"355M\", \"774M\", and \"1558M\" are also supported `model_size` arguments\n",
154 | "- The difference between these differently sized models is summarized in the figure below:"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "id": "20f19d32-5aae-4176-9f86-f391672c8f0d",
160 | "metadata": {},
161 | "source": [
162 | "
"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "id": "ea6e5076-f08d-41fc-bd8b-1cfe53538f41",
168 | "metadata": {},
169 | "source": [
170 | "- Above, we loaded the 124M GPT-2 model weights into Python; however, we still need to transfer them into our `GPTModel` instance\n",
171 | "- First, we initialize a new GPTModel instance\n",
172 | "- Note that the original GPT model initialized the linear layers for the query, key, and value matrices in the multi-head attention module with bias vectors, which is not required or recommended; however, to be able to load the weights correctly, we have to enable these in our implementation as well by setting `qkv_bias` to `True`\n",
173 | "- We are also using the `1024` token context length that was used by the original GPT-2 model(s)"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "id": "9fef90dd-0654-4667-844f-08e28339ef7d",
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "GPT_CONFIG_124M = {\n",
184 | " \"vocab_size\": 50257, # Vocabulary size\n",
185 | " \"context_length\": 256, # Shortened context length (orig: 1024)\n",
186 | " \"emb_dim\": 768, # Embedding dimension\n",
187 | " \"n_heads\": 12, # Number of attention heads\n",
188 | " \"n_layers\": 12, # Number of layers\n",
189 | " \"drop_rate\": 0.1, # Dropout rate\n",
190 | " \"qkv_bias\": False # Query-key-value bias\n",
191 | "}\n",
192 | "\n",
193 | "\n",
194 | "# Define model configurations in a dictionary for compactness\n",
195 | "model_configs = {\n",
196 | " \"gpt2-small (124M)\": {\"emb_dim\": 768, \"n_layers\": 12, \"n_heads\": 12},\n",
197 | " \"gpt2-medium (355M)\": {\"emb_dim\": 1024, \"n_layers\": 24, \"n_heads\": 16},\n",
198 | " \"gpt2-large (774M)\": {\"emb_dim\": 1280, \"n_layers\": 36, \"n_heads\": 20},\n",
199 | " \"gpt2-xl (1558M)\": {\"emb_dim\": 1600, \"n_layers\": 48, \"n_heads\": 25},\n",
200 | "}\n",
201 | "\n",
202 | "# Copy the base configuration and update with specific model settings\n",
203 | "model_name = \"gpt2-small (124M)\" # Example model name\n",
204 | "NEW_CONFIG = GPT_CONFIG_124M.copy()\n",
205 | "NEW_CONFIG.update(model_configs[model_name])\n",
206 | "NEW_CONFIG.update({\"context_length\": 1024, \"qkv_bias\": True})"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "id": "c20cdc14-684d-4f77-a786-6646f66bae81",
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "from supplementary import GPTModel\n",
217 | "\n",
218 | "gpt = GPTModel(NEW_CONFIG)\n",
219 | "gpt.eval();"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "id": "272f29ac-8342-4b3d-a57d-9b0166ced314",
225 | "metadata": {},
226 | "source": [
227 | "- The next task is to assign the OpenAI weights to the corresponding weight tensors in our `GPTModel` instance"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "id": "f9a92229-c002-49a6-8cfb-248297ad8296",
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "def assign(left, right):\n",
238 | " if left.shape != right.shape:\n",
239 | " raise ValueError(f\"Shape mismatch. Left: {left.shape}, Right: {right.shape}\")\n",
240 | " return torch.nn.Parameter(torch.tensor(right))"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "id": "f22d5d95-ca5a-425c-a9ec-fc432a12d4e9",
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "import torch\n",
251 | "import numpy as np\n",
252 | "\n",
253 | "def load_weights_into_gpt(gpt, params):\n",
254 | " gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])\n",
255 | " gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])\n",
256 | " \n",
257 | " for b in range(len(params[\"blocks\"])):\n",
258 | " q_w, k_w, v_w = np.split(\n",
259 | " (params[\"blocks\"][b][\"attn\"][\"c_attn\"])[\"w\"], 3, axis=-1)\n",
260 | " gpt.trf_blocks[b].att.W_query.weight = assign(\n",
261 | " gpt.trf_blocks[b].att.W_query.weight, q_w.T)\n",
262 | " gpt.trf_blocks[b].att.W_key.weight = assign(\n",
263 | " gpt.trf_blocks[b].att.W_key.weight, k_w.T)\n",
264 | " gpt.trf_blocks[b].att.W_value.weight = assign(\n",
265 | " gpt.trf_blocks[b].att.W_value.weight, v_w.T)\n",
266 | "\n",
267 | " q_b, k_b, v_b = np.split(\n",
268 | " (params[\"blocks\"][b][\"attn\"][\"c_attn\"])[\"b\"], 3, axis=-1)\n",
269 | " gpt.trf_blocks[b].att.W_query.bias = assign(\n",
270 | " gpt.trf_blocks[b].att.W_query.bias, q_b)\n",
271 | " gpt.trf_blocks[b].att.W_key.bias = assign(\n",
272 | " gpt.trf_blocks[b].att.W_key.bias, k_b)\n",
273 | " gpt.trf_blocks[b].att.W_value.bias = assign(\n",
274 | " gpt.trf_blocks[b].att.W_value.bias, v_b)\n",
275 | "\n",
276 | " gpt.trf_blocks[b].att.out_proj.weight = assign(\n",
277 | " gpt.trf_blocks[b].att.out_proj.weight, \n",
278 | " params[\"blocks\"][b][\"attn\"][\"c_proj\"][\"w\"].T)\n",
279 | " gpt.trf_blocks[b].att.out_proj.bias = assign(\n",
280 | " gpt.trf_blocks[b].att.out_proj.bias, \n",
281 | " params[\"blocks\"][b][\"attn\"][\"c_proj\"][\"b\"])\n",
282 | "\n",
283 | " gpt.trf_blocks[b].ff.layers[0].weight = assign(\n",
284 | " gpt.trf_blocks[b].ff.layers[0].weight, \n",
285 | " params[\"blocks\"][b][\"mlp\"][\"c_fc\"][\"w\"].T)\n",
286 | " gpt.trf_blocks[b].ff.layers[0].bias = assign(\n",
287 | " gpt.trf_blocks[b].ff.layers[0].bias, \n",
288 | " params[\"blocks\"][b][\"mlp\"][\"c_fc\"][\"b\"])\n",
289 | " gpt.trf_blocks[b].ff.layers[2].weight = assign(\n",
290 | " gpt.trf_blocks[b].ff.layers[2].weight, \n",
291 | " params[\"blocks\"][b][\"mlp\"][\"c_proj\"][\"w\"].T)\n",
292 | " gpt.trf_blocks[b].ff.layers[2].bias = assign(\n",
293 | " gpt.trf_blocks[b].ff.layers[2].bias, \n",
294 | " params[\"blocks\"][b][\"mlp\"][\"c_proj\"][\"b\"])\n",
295 | "\n",
296 | " gpt.trf_blocks[b].norm1.scale = assign(\n",
297 | " gpt.trf_blocks[b].norm1.scale, \n",
298 | " params[\"blocks\"][b][\"ln_1\"][\"g\"])\n",
299 | " gpt.trf_blocks[b].norm1.shift = assign(\n",
300 | " gpt.trf_blocks[b].norm1.shift, \n",
301 | " params[\"blocks\"][b][\"ln_1\"][\"b\"])\n",
302 | " gpt.trf_blocks[b].norm2.scale = assign(\n",
303 | " gpt.trf_blocks[b].norm2.scale, \n",
304 | " params[\"blocks\"][b][\"ln_2\"][\"g\"])\n",
305 | " gpt.trf_blocks[b].norm2.shift = assign(\n",
306 | " gpt.trf_blocks[b].norm2.shift, \n",
307 | " params[\"blocks\"][b][\"ln_2\"][\"b\"])\n",
308 | "\n",
309 | " gpt.final_norm.scale = assign(gpt.final_norm.scale, params[\"g\"])\n",
310 | " gpt.final_norm.shift = assign(gpt.final_norm.shift, params[\"b\"])\n",
311 | " gpt.out_head.weight = assign(gpt.out_head.weight, params[\"wte\"])\n",
312 | " \n",
313 | " \n",
314 | "load_weights_into_gpt(gpt, params)\n",
315 | "\n",
316 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
317 | "gpt.to(device);"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "id": "4f7472cb-54dc-4311-96d8-b2694f885cee",
323 | "metadata": {},
324 | "source": [
325 | "- If the model is loaded correctly, we can use it to generate new text using our previous `generate` function:"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "id": "1f690253-f845-4347-b7b6-43fabbd2affa",
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "import tiktoken\n",
336 | "from supplementary import (\n",
337 | " generate_text_simple,\n",
338 | " text_to_token_ids,\n",
339 | " token_ids_to_text\n",
340 | ")\n",
341 | "\n",
342 | "\n",
343 | "tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
344 | "\n",
345 | "torch.manual_seed(123)\n",
346 | "\n",
347 | "token_ids = generate_text_simple(\n",
348 | " model=gpt,\n",
349 | " idx=text_to_token_ids(\"Every effort moves you\", tokenizer).to(device),\n",
350 | " max_new_tokens=10,\n",
351 | " context_size=GPT_CONFIG_124M[\"context_length\"]\n",
352 | ")\n",
353 | "\n",
354 | "print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))"
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "id": "6d079f98-a7c4-462e-8416-5a64f670861c",
360 | "metadata": {},
361 | "source": [
362 | "- We know that we loaded the model weights correctly because the model can generate coherent text; if we made even a small mistake, the model would not be able to do that"
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "id": "1a30d071-30cd-43df-ba83-a6b162593b19",
368 | "metadata": {},
369 | "source": [
370 | "
\n",
371 | "
\n",
372 | "
\n",
373 | "
\n",
374 | "\n",
375 | "# Exercise 1: Trying larger LLMs"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "id": "9a459f5a-578e-4145-8bac-aad43915de0d",
381 | "metadata": {},
382 | "source": [
383 | "- Load one of the larger LLMs and see how the output quality compares\n",
384 | "- Ask it to answer specific instructions, for example to summarize text or correct the spelling of a sentence"
385 | ]
386 | }
387 | ],
388 | "metadata": {
389 | "accelerator": "GPU",
390 | "colab": {
391 | "gpuType": "A100",
392 | "machine_shape": "hm",
393 | "provenance": []
394 | },
395 | "kernelspec": {
396 | "display_name": "Python 3 (ipykernel)",
397 | "language": "python",
398 | "name": "python3"
399 | },
400 | "language_info": {
401 | "codemirror_mode": {
402 | "name": "ipython",
403 | "version": 3
404 | },
405 | "file_extension": ".py",
406 | "mimetype": "text/x-python",
407 | "name": "python",
408 | "nbconvert_exporter": "python",
409 | "pygments_lexer": "ipython3",
410 | "version": "3.10.10"
411 | }
412 | },
413 | "nbformat": 4,
414 | "nbformat_minor": 5
415 | }
416 |
--------------------------------------------------------------------------------
/05_weightloading/05_part-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "0599d57c-16e4-478c-954d-89dbbd193ced",
6 | "metadata": {},
7 | "source": [
8 | "**LLM Workshop 2024 by Sebastian Raschka**"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "14b82151-07c6-4867-b374-741258033b52",
14 | "metadata": {},
15 | "source": [
16 | "
\n",
17 | "
\n",
18 | "
\n",
19 | "
\n",
20 | "\n",
21 | "# 5) Loading pretrained weights (part 2; using LitGPT)"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "4d617b8f-8493-4afa-8c91-f3d1ab79795b",
27 | "metadata": {},
28 | "source": [
29 | "- Now, we are loading the weights using an open-source library called LitGPT\n",
30 | "- LitGPT is fundamentally similar to the LLM code we implemented previously, but it is much more sophisticated and supports more than 20 different LLMs (Mistral, Gemma, Llama, Phi, and more)\n",
31 | "\n",
32 | "# ⚡ LitGPT\n",
33 | "\n",
34 | "**20+ high-performance LLMs with recipes to pretrain, finetune, deploy at scale.**\n",
35 | "\n",
36 | "
\n", 37 | "✅ From scratch implementations ✅ No abstractions ✅ Beginner friendly \n", 38 | "✅ Flash attention ✅ FSDP ✅ LoRA, QLoRA, Adapter\n", 39 | "✅ Reduce GPU memory (fp4/8/16/32) ✅ 1-1000+ GPUs/TPUs ✅ 20+ LLMs \n", 40 | "\n", 41 | "\n", 42 | "## Basic usage:\n", 43 | "\n", 44 | "```\n", 45 | "# ligpt [action] [model]\n", 46 | "litgpt download meta-llama/Meta-Llama-3-8B-Instruct\n", 47 | "litgpt chat meta-llama/Meta-Llama-3-8B-Instruct\n", 48 | "litgpt evaluate meta-llama/Meta-Llama-3-8B-Instruct\n", 49 | "litgpt finetune meta-llama/Meta-Llama-3-8B-Instruct\n", 50 | "litgpt pretrain meta-llama/Meta-Llama-3-8B-Instruct\n", 51 | "litgpt serve meta-llama/Meta-Llama-3-8B-Instruct\n", 52 | "```\n", 53 | "\n", 54 | "\n", 55 | "- You can learn more about LitGPT in the [corresponding GitHub repository](https://github.com/Lightning-AI/litgpt), that contains many tutorials, use cases, and examples\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "b1f9508e", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# pip install litgpt" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "48cf71fa-af17-4c72-a6ab-f258a2b5a8ac", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from importlib.metadata import version\n", 76 | "\n", 77 | "pkgs = [\"litgpt\", \n", 78 | " \"torch\",\n", 79 | " ]\n", 80 | "for p in pkgs:\n", 81 | " print(f\"{p} version: {version(p)}\")" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "fe29baa9-c3b0-493d-94b4-eaa8146d6b3c", 87 | "metadata": {}, 88 | "source": [ 89 | "- First, let's see what LLMs are supported" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "0ae8df66-f391-4266-b437-a1f601a6ac40", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "!litgpt download list" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "2495037e-0068-49ad-9bed-0bcdc440727d", 105 | 
"metadata": {}, 106 | "source": [ 107 | "- We can then download an LLM via the following command" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "fb0c202d", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "!litgpt download microsoft/phi-2" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "6caf5be0-4aa1-498f-b08a-68ff234cbea5", 123 | "metadata": {}, 124 | "source": [ 125 | "- And there's also a Python API to use the model" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "e057edbf", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from litgpt import LLM\n", 136 | "\n", 137 | "llm = LLM.load(\"microsoft/phi-2\")\n", 138 | "\n", 139 | "llm.generate(\"What do Llamas eat?\")" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "fc775d4e", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "result = llm.generate(\"What do Llamas eat?\", stream=True, max_new_tokens=200)\n", 150 | "for e in result:\n", 151 | " print(e, end=\"\", flush=True)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "288158da", 157 | "metadata": {}, 158 | "source": [ 159 | "