├── .gitignore
├── 1. Genesis.md
├── 10. Token Embedding.md
├── 10. Token Embedding
└── demo.ipynb
├── 11. Positional Embedding
└── practical.ipynb
├── 11. Positional Encoding.md
├── 12. Data Preprocessing
├── preprocess.ipynb
└── the-verdict.txt
├── 12. Data Processing.md
├── 12. Self Attention
└── notebook.ipynb
├── 13. Intro to Attention Mechanism.md
├── 14. Simplified Attention.md
├── 15. Self Attention.md
├── 16. Causal Attention.md
├── 16. Causal Attention
└── notebook.ipynb
├── 17. Multihead Attention.md
├── 17. Multihead Attention
└── notebook.ipynb
├── 18. Multihead Attention.md
├── 18. Multihead Attention
└── notebook.ipynb
├── 19. Birds Eye View of LLM.md
├── 19. LLM Architecture
└── notebook.ipynb
├── 2. LLM Basics.md
├── 3. Stages of Building LLM.md
├── 4. Basic Transformer.md
├── 5. GPT working.md
├── 6. Stages.md
├── 7. Tokenization
├── 7. Tokenization from Scratch.ipynb
├── 7. Tokenization.md
└── the-verdict.txt
├── 8. Byte Pair Encoding.md
├── 8. Byte Pair Encoding
├── .gitkeep
└── Hands on with GPT Tokenizer BPE.ipynb
├── 9. Input Target Pair.md
├── 9. Input Target Pairs
├── .gitkeep
├── 9. Input Target Pair.ipynb
├── Untitled.ipynb
└── the-verdict.txt
├── LICENSE
├── README.md
├── assets
├── 1. Genesis
│ ├── market.jpg
│ └── open-close.png
├── 10. Embedding
│ ├── nn.png
│ ├── size.png
│ ├── summary.png
│ └── vector.png
├── 11. Positional Encoding
│ ├── abs.png
│ └── pos.png
├── 13. Intro Attention
│ ├── att.png
│ ├── attwt.png
│ ├── endec.png
│ ├── im.gif
│ ├── jump.gif
│ └── trans.png
├── 14. SimAttention
│ ├── dotprod.gif
│ ├── final.png
│ └── sim.png
├── 15. Self Attention
│ ├── multipl.png
│ ├── only.gif
│ ├── selfatt.png
│ └── var.gif
├── 16. Causal Attention
│ ├── dropout.png
│ ├── mask.png
│ └── strategy.png
├── 17. Multihead Attention
│ ├── change.png
│ ├── dim.png
│ ├── flip.png
│ ├── flip2.png
│ ├── multi.png
│ ├── out.png
│ └── prev.png
├── 19. Birds Eye View
│ ├── sofar.png
│ ├── trans.png
│ └── view.png
├── 2. Basics
│ ├── parm2.jpg
│ ├── parms.jpg
│ ├── trans.gif
│ ├── trans2.webp
│ └── vs.png
├── 3. Stages
│ ├── cost.png
│ ├── data.png
│ ├── fine-hervy.png
│ ├── nomoney.gif
│ └── notice.png
├── 4. LLM Basic
│ ├── attention.png
│ └── word.png
├── 5. Working
│ ├── borrow.gif
│ └── gpt-work.png
├── 6. Steps
│ ├── stage.png
│ └── step.png
├── 8. BPE
│ └── paper.png
├── 9. Input Target Pairs
│ ├── dataloader.png
│ ├── inp.png
│ └── stride.png
└── screw.gif
└── image.png
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # UV
98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | #uv.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 |
110 | # pdm
111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | # in version control.
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 |
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 |
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 |
127 | # SageMath parsed files
128 | *.sage.py
129 |
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 |
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 |
143 | # Rope project settings
144 | .ropeproject
145 |
146 | # mkdocs documentation
147 | /site
148 |
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 |
154 | # Pyre type checker
155 | .pyre/
156 |
157 | # pytype static type analyzer
158 | .pytype/
159 |
160 | # Cython debug symbols
161 | cython_debug/
162 |
163 | # PyCharm
164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | # and can be added to the global gitignore or merged into this file. For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 |
170 | # PyPI configuration file
171 | .pypirc
172 |
--------------------------------------------------------------------------------
/1. Genesis.md:
--------------------------------------------------------------------------------
1 | ## First Chatbot
2 | [Eliza](https://psych.fullerton.edu/mbirnbaum/psych101/eliza.htm)
3 |
4 | ## Closed Source Vs Open Source models
5 |
6 |
7 | ## Job Market
8 |
9 |
10 |
11 | ### Qn) What is Langchain:
12 | #### Ans: Tools to Build LLM app
--------------------------------------------------------------------------------
/10. Token Embedding.md:
--------------------------------------------------------------------------------
1 | # Token Embedding
2 | ## Why token embeddings are needed?
3 |
4 | - Computers need a numerical representation of words
5 | - We used Token IDs
6 | - We cannot use Token IDs directly: cat and kitten are similar, but a `token ID doesn't capture semantic meaning`
7 |
8 | ### Why not One hot encoding
9 | "dog" --> [0,0,1,0,0,0,0,...]
10 | "puppy" --> [0,0,0,0,1,0,0,...]
11 |
12 | Again random number assignment
13 |
14 | ### Let's encode every word as a vector
15 | - What is dimension
16 | > On the basis of feature...
17 |
18 |
19 |
20 | - Can see which is near to each other and farther from each other.
21 |
22 | > From here vector embedding comes from.
23 |
24 | ## How do we come up with those values of features?
25 | - We can train Neural Network to Construct vector Embedding
26 |
27 |
28 |
29 | ## How are token embeddings created for LLMs?
30 | 1. Create a vocabulary — it maps each token to a token ID.
31 | 2. Every token id converted into vector.
32 |
33 | > What was vector Embedding dimension for training GPT2 and what was vocabulary size.
34 |
35 | > `Vector Embedding dimension`: 768 and 1600 for larger.
36 | > `Vocabulary size`: 50,257
37 |
38 | Weights: 768 * 50,257 [Embedding layer Weight matrix]
39 |
40 |
41 |
42 |
43 | Steps:
44 | - Initialise embedding weights with random values
45 | [ This is starting point for LLM learning process]
46 |
47 | - These weights are `optimised as part of LLM training process`.
48 | > Backpropagation is applied
49 |
50 | ## LLM Training
51 | - Weight of Embedding are optimised
52 | - Weights of next word prediction are also optimised
53 |
54 | ## Lookup operation
55 | The embedding layer is a lookup operation that retrieves rows from the embedding layer weight matrix using a token ID.
56 |
57 |
58 |
--------------------------------------------------------------------------------
/10. Token Embedding/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Playing with Embedding "
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# https://huggingface.co/fse/word2vec-google-news-300"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 3,
22 | "metadata": {},
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "[--------------------------------------------------] 1.8% 30.4/1662.8MB downloaded"
29 | ]
30 | },
31 | {
32 | "ename": "KeyboardInterrupt",
33 | "evalue": "",
34 | "output_type": "error",
35 | "traceback": [
36 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
37 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
38 | "Cell \u001b[1;32mIn[3], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mgensim\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdownloader\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mapi\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m\u001b[43mapi\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mword2vec-google-news-300\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
39 | "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\gensim\\downloader.py:496\u001b[0m, in \u001b[0;36mload\u001b[1;34m(name, return_path)\u001b[0m\n\u001b[0;32m 494\u001b[0m path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(folder_dir, file_name)\n\u001b[0;32m 495\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(folder_dir):\n\u001b[1;32m--> 496\u001b[0m \u001b[43m_download\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m return_path:\n\u001b[0;32m 499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m path\n",
40 | "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\gensim\\downloader.py:396\u001b[0m, in \u001b[0;36m_download\u001b[1;34m(name)\u001b[0m\n\u001b[0;32m 394\u001b[0m fname \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{fname}\u001b[39;00m\u001b[38;5;124m.gz\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(fname\u001b[38;5;241m=\u001b[39mname)\n\u001b[0;32m 395\u001b[0m dst_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(tmp_dir, fname)\n\u001b[1;32m--> 396\u001b[0m \u001b[43murllib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlretrieve\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdst_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreporthook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_progress\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 397\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _calculate_md5_checksum(dst_path) \u001b[38;5;241m==\u001b[39m _get_checksum(name):\n\u001b[0;32m 398\u001b[0m sys\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
41 | "File \u001b[1;32mc:\\Program Files\\Python311\\Lib\\urllib\\request.py:270\u001b[0m, in \u001b[0;36murlretrieve\u001b[1;34m(url, filename, reporthook, data)\u001b[0m\n\u001b[0;32m 267\u001b[0m reporthook(blocknum, bs, size)\n\u001b[0;32m 269\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m--> 270\u001b[0m block \u001b[38;5;241m=\u001b[39m \u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 271\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m block:\n\u001b[0;32m 272\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
42 | "File \u001b[1;32mc:\\Program Files\\Python311\\Lib\\http\\client.py:466\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[1;34m(self, amt)\u001b[0m\n\u001b[0;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength:\n\u001b[0;32m 464\u001b[0m \u001b[38;5;66;03m# clip the read to the \"end of response\"\u001b[39;00m\n\u001b[0;32m 465\u001b[0m amt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength\n\u001b[1;32m--> 466\u001b[0m s \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfp\u001b[38;5;241m.\u001b[39mread(amt)\n\u001b[0;32m 467\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m s \u001b[38;5;129;01mand\u001b[39;00m amt:\n\u001b[0;32m 468\u001b[0m \u001b[38;5;66;03m# Ideally, we would raise IncompleteRead if the content-length\u001b[39;00m\n\u001b[0;32m 469\u001b[0m \u001b[38;5;66;03m# wasn't satisfied, but it might break compatibility.\u001b[39;00m\n\u001b[0;32m 470\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_close_conn()\n",
43 | "File \u001b[1;32mc:\\Program Files\\Python311\\Lib\\socket.py:706\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 704\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m 705\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 706\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 707\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[0;32m 708\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
44 | "File \u001b[1;32mc:\\Program Files\\Python311\\Lib\\ssl.py:1311\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[1;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[0;32m 1307\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m flags \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 1308\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 1309\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m\n\u001b[0;32m 1310\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m)\n\u001b[1;32m-> 1311\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1312\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1313\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mrecv_into(buffer, nbytes, flags)\n",
45 | "File \u001b[1;32mc:\\Program Files\\Python311\\Lib\\ssl.py:1167\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[1;34m(self, len, buffer)\u001b[0m\n\u001b[0;32m 1165\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1166\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1167\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sslobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1168\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sslobj\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;28mlen\u001b[39m)\n",
46 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
47 | ]
48 | }
49 | ],
50 | "source": [
51 | "import gensim.downloader as api\n",
52 | "model =api.load(\"word2vec-google-news-300\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "word_vectors=model"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "ename": "NameError",
71 | "evalue": "name 'word_vectors' is not defined",
72 | "output_type": "error",
73 | "traceback": [
74 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
75 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
76 | "Cell \u001b[1;32mIn[4], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mword_vectors\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVictor\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n",
77 | "\u001b[1;31mNameError\u001b[0m: name 'word_vectors' is not defined"
78 | ]
79 | }
80 | ],
81 | "source": [
82 | "print(word_vectors[\"Victor\"])"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "print(word_vectors[\"Victor\"].shape)"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "## Similar Word"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "### King+Woman-Man=?"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "print(word_vectors.most_similar(positive=['king','woman'],negative=['man'],topn=10))"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "print(word_vectors.similarity('uncle','aunt'))\n",
124 | "print(word_vectors.similarity('paper','water'))"
125 | ]
126 | }
127 | ],
128 | "metadata": {
129 | "kernelspec": {
130 | "display_name": "Python 3",
131 | "language": "python",
132 | "name": "python3"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "3.11.5"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 2
149 | }
150 |
--------------------------------------------------------------------------------
/11. Positional Embedding/practical.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import torch"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 3,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "vocab_size=50257\n",
19 | "output_dim=256\n",
20 | "token_embedding_layer=torch.nn.Embedding(vocab_size,output_dim)"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# Get input from dataloader\n",
30 | "max_len=4\n",
31 | "\n",
32 | "dataloader=create_dataloader(raw_text,batch_size=8,max_length=max_len,stride=max_len,shuffle=False)\n",
33 | "\n",
34 | "data_iter=iter(dataloader)\n",
35 | "inputs,targets=next(data_iter)"
36 | ]
37 | }
38 | ],
39 | "metadata": {
40 | "kernelspec": {
41 | "display_name": "Python 3",
42 | "language": "python",
43 | "name": "python3"
44 | },
45 | "language_info": {
46 | "codemirror_mode": {
47 | "name": "ipython",
48 | "version": 3
49 | },
50 | "file_extension": ".py",
51 | "mimetype": "text/x-python",
52 | "name": "python",
53 | "nbconvert_exporter": "python",
54 | "pygments_lexer": "ipython3",
55 | "version": "3.11.5"
56 | }
57 | },
58 | "nbformat": 4,
59 | "nbformat_minor": 2
60 | }
61 |
--------------------------------------------------------------------------------
/11. Positional Encoding.md:
--------------------------------------------------------------------------------
1 | # Positional Encoding
2 | - Built on top of Token Encoding/Embedding.
3 |
4 | ## Why Positional Encoding Needed
5 | >The `cat` sat on the mat
6 | On the mat the `cat` sat
7 |
8 | Same vector for `cat` in different sentences.
9 |
10 |
11 |
12 | - It is helpful to inject additional position information into the LLM.
13 | - There are two types of positional embeddings.
14 |
15 | A. `Absolute`
16 | Absolute position is added
17 |
18 |
19 | Positional vectors have the same dimension as the original token embeddings.
20 |
21 | B. `Relative`
22 | Emphasis on relative position or distance between tokens.
23 |
24 | The model learns the relationship in terms of "how far apart" tokens are rather than their exact positions.
25 |
26 | **Advantage**: The model can generalise better to sequences of `varying length`, even if it has not seen such lengths during training.
27 |
28 | Both types of positional encoding are better than not using any positional encoding at all. They enable LLMs to `understand the order and relationships` between tokens.
29 |
30 | Absolute: Used when the fixed order of tokens is crucial, e.g. sequence generation.
31 | - GPT was trained on this and original Transformer too.
32 | - Used more commonly.
33 |
34 | Relative: Suitable for language modelling over long sequences, where the same phrase can repeat over and over again.
35 |
36 | #### Positional embedding also need to be optimised during training process. This optimization is part of model training itself.
37 |
38 |
--------------------------------------------------------------------------------
/12. Data Preprocessing/the-verdict.txt:
--------------------------------------------------------------------------------
1 | I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)
2 |
3 | "The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?
4 |
5 | Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him. Among his own sex fewer regrets were heard, and in his own trade hardly a murmur. Professional jealousy? Perhaps. If it were, the honour of the craft was vindicated by little Claude Nutley, who, in all good faith, brought out in the Burlington a very handsome "obituary" on Jack--one of those showy articles stocked with random technicalities that I have heard (I won't say by whom) compared to Gisburn's painting. And so--his resolve being apparently irrevocable--the discussion gradually died out, and, as Mrs. Thwing had predicted, the price of "Gisburns" went up.
6 |
7 | It was not till three years later that, in the course of a few weeks' idling on the Riviera, it suddenly occurred to me to wonder why Gisburn had given up his painting. On reflection, it really was a tempting problem. To accuse his wife would have been too easy--his fair sitters had been denied the solace of saying that Mrs. Gisburn had "dragged him down." For Mrs. Gisburn--as such--had not existed till nearly a year after Jack's resolve had been taken. It might be that he had married her--since he liked his ease--because he didn't want to go on painting; but it would have been hard to prove that he had given up his painting because he had married her.
8 |
9 | Of course, if she had not dragged him down, she had equally, as Miss Croft contended, failed to "lift him up"--she had not led him back to the easel. To put the brush into his hand again--what a vocation for a wife! But Mrs. Gisburn appeared to have disdained it--and I felt it might be interesting to find out why.
10 |
11 | The desultory life of the Riviera lends itself to such purely academic speculations; and having, on my way to Monte Carlo, caught a glimpse of Jack's balustraded terraces between the pines, I had myself borne thither the next day.
12 |
13 | I found the couple at tea beneath their palm-trees; and Mrs. Gisburn's welcome was so genial that, in the ensuing weeks, I claimed it frequently. It was not that my hostess was "interesting": on that point I could have given Miss Croft the fullest reassurance. It was just because she was _not_ interesting--if I may be pardoned the bull--that I found her so. For Jack, all his life, had been surrounded by interesting women: they had fostered his art, it had been reared in the hot-house of their adulation. And it was therefore instructive to note what effect the "deadening atmosphere of mediocrity" (I quote Miss Croft) was having on him.
14 |
15 | I have mentioned that Mrs. Gisburn was rich; and it was immediately perceptible that her husband was extracting from this circumstance a delicate but substantial satisfaction. It is, as a rule, the people who scorn money who get most out of it; and Jack's elegant disdain of his wife's big balance enabled him, with an appearance of perfect good-breeding, to transmute it into objects of art and luxury. To the latter, I must add, he remained relatively indifferent; but he was buying Renaissance bronzes and eighteenth-century pictures with a discrimination that bespoke the amplest resources.
16 |
17 | "Money's only excuse is to put beauty into circulation," was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed luncheon-table, when, on a later day, I had again run over from Monte Carlo; and Mrs. Gisburn, beaming on him, added for my enlightenment: "Jack is so morbidly sensitive to every form of beauty."
18 |
19 | Poor Jack! It had always been his fate to have women say such things of him: the fact should be set down in extenuation. What struck me now was that, for the first time, he resented the tone. I had seen him, so often, basking under similar tributes--was it the conjugal note that robbed them of their savour? No--for, oddly enough, it became apparent that he was fond of Mrs. Gisburn--fond enough not to see her absurdity. It was his own absurdity he seemed to be wincing under--his own attitude as an object for garlands and incense.
20 |
21 | "My dear, since I've chucked painting people don't say that stuff about me--they say it about Victor Grindle," was his only protest, as he rose from the table and strolled out onto the sunlit terrace.
22 |
23 | I glanced after him, struck by his last word. Victor Grindle was, in fact, becoming the man of the moment--as Jack himself, one might put it, had been the man of the hour. The younger artist was said to have formed himself at my friend's feet, and I wondered if a tinge of jealousy underlay the latter's mysterious abdication. But no--for it was not till after that event that the _rose Dubarry_ drawing-rooms had begun to display their "Grindles."
24 |
25 | I turned to Mrs. Gisburn, who had lingered to give a lump of sugar to her spaniel in the dining-room.
26 |
27 | "Why _has_ he chucked painting?" I asked abruptly.
28 |
29 | She raised her eyebrows with a hint of good-humoured surprise.
30 |
31 | "Oh, he doesn't _have_ to now, you know; and I want him to enjoy himself," she said quite simply.
32 |
33 | I looked about the spacious white-panelled room, with its _famille-verte_ vases repeating the tones of the pale damask curtains, and its eighteenth-century pastels in delicate faded frames.
34 |
35 | "Has he chucked his pictures too? I haven't seen a single one in the house."
36 |
37 | A slight shade of constraint crossed Mrs. Gisburn's open countenance. "It's his ridiculous modesty, you know. He says they're not fit to have about; he's sent them all away except one--my portrait--and that I have to keep upstairs."
38 |
39 | His ridiculous modesty--Jack's modesty about his pictures? My curiosity was growing like the bean-stalk. I said persuasively to my hostess: "I must really see your portrait, you know."
40 |
41 | She glanced out almost timorously at the terrace where her husband, lounging in a hooded chair, had lit a cigar and drawn the Russian deerhound's head between his knees.
42 |
43 | "Well, come while he's not looking," she said, with a laugh that tried to hide her nervousness; and I followed her between the marble Emperors of the hall, and up the wide stairs with terra-cotta nymphs poised among flowers at each landing.
44 |
45 | In the dimmest corner of her boudoir, amid a profusion of delicate and distinguished objects, hung one of the familiar oval canvases, in the inevitable garlanded frame. The mere outline of the frame called up all Gisburn's past!
46 |
47 | Mrs. Gisburn drew back the window-curtains, moved aside a _jardiniere_ full of pink azaleas, pushed an arm-chair away, and said: "If you stand here you can just manage to see it. I had it over the mantel-piece, but he wouldn't let it stay."
48 |
49 | Yes--I could just manage to see it--the first portrait of Jack's I had ever had to strain my eyes over! Usually they had the place of honour--say the central panel in a pale yellow or _rose Dubarry_ drawing-room, or a monumental easel placed so that it took the light through curtains of old Venetian point. The more modest place became the picture better; yet, as my eyes grew accustomed to the half-light, all the characteristic qualities came out--all the hesitations disguised as audacities, the tricks of prestidigitation by which, with such consummate skill, he managed to divert attention from the real business of the picture to some pretty irrelevance of detail. Mrs. Gisburn, presenting a neutral surface to work on--forming, as it were, so inevitably the background of her own picture--had lent herself in an unusual degree to the display of this false virtuosity. The picture was one of Jack's "strongest," as his admirers would have put it--it represented, on his part, a swelling of muscles, a congesting of veins, a balancing, straddling and straining, that reminded one of the circus-clown's ironic efforts to lift a feather. It met, in short, at every point the demand of lovely woman to be painted "strongly" because she was tired of being painted "sweetly"--and yet not to lose an atom of the sweetness.
50 |
51 | "It's the last he painted, you know," Mrs. Gisburn said with pardonable pride. "The last but one," she corrected herself--"but the other doesn't count, because he destroyed it."
52 |
53 | "Destroyed it?" I was about to follow up this clue when I heard a footstep and saw Jack himself on the threshold.
54 |
55 | As he stood there, his hands in the pockets of his velveteen coat, the thin brown waves of hair pushed back from his white forehead, his lean sunburnt cheeks furrowed by a smile that lifted the tips of a self-confident moustache, I felt to what a degree he had the same quality as his pictures--the quality of looking cleverer than he was.
56 |
57 | His wife glanced at him deprecatingly, but his eyes travelled past her to the portrait.
58 |
59 | "Mr. Rickham wanted to see it," she began, as if excusing herself. He shrugged his shoulders, still smiling.
60 |
61 | "Oh, Rickham found me out long ago," he said lightly; then, passing his arm through mine: "Come and see the rest of the house."
62 |
63 | He showed it to me with a kind of naive suburban pride: the bath-rooms, the speaking-tubes, the dress-closets, the trouser-presses--all the complex simplifications of the millionaire's domestic economy. And whenever my wonder paid the expected tribute he said, throwing out his chest a little: "Yes, I really don't see how people manage to live without that."
64 |
65 | Well--it was just the end one might have foreseen for him. Only he was, through it all and in spite of it all--as he had been through, and in spite of, his pictures--so handsome, so charming, so disarming, that one longed to cry out: "Be dissatisfied with your leisure!" as once one had longed to say: "Be dissatisfied with your work!"
66 |
67 | But, with the cry on my lips, my diagnosis suffered an unexpected check.
68 |
69 | "This is my own lair," he said, leading me into a dark plain room at the end of the florid vista. It was square and brown and leathery: no "effects"; no bric-a-brac, none of the air of posing for reproduction in a picture weekly--above all, no least sign of ever having been used as a studio.
70 |
71 | The fact brought home to me the absolute finality of Jack's break with his old life.
72 |
73 | "Don't you ever dabble with paint any more?" I asked, still looking about for a trace of such activity.
74 |
75 | "Never," he said briefly.
76 |
77 | "Or water-colour--or etching?"
78 |
79 | His confident eyes grew dim, and his cheeks paled a little under their handsome sunburn.
80 |
81 | "Never think of it, my dear fellow--any more than if I'd never touched a brush."
82 |
83 | And his tone told me in a flash that he never thought of anything else.
84 |
85 | I moved away, instinctively embarrassed by my unexpected discovery; and as I turned, my eye fell on a small picture above the mantel-piece--the only object breaking the plain oak panelling of the room.
86 |
87 | "Oh, by Jove!" I said.
88 |
89 | It was a sketch of a donkey--an old tired donkey, standing in the rain under a wall.
90 |
91 | "By Jove--a Stroud!" I cried.
92 |
93 | He was silent; but I felt him close behind me, breathing a little quickly.
94 |
95 | "What a wonder! Made with a dozen lines--but on everlasting foundations. You lucky chap, where did you get it?"
96 |
97 | He answered slowly: "Mrs. Stroud gave it to me."
98 |
99 | "Ah--I didn't know you even knew the Strouds. He was such an inflexible hermit."
100 |
101 | "I didn't--till after. . . . She sent for me to paint him when he was dead."
102 |
103 | "When he was dead? You?"
104 |
105 | I must have let a little too much amazement escape through my surprise, for he answered with a deprecating laugh: "Yes--she's an awful simpleton, you know, Mrs. Stroud. Her only idea was to have him done by a fashionable painter--ah, poor Stroud! She thought it the surest way of proclaiming his greatness--of forcing it on a purblind public. And at the moment I was _the_ fashionable painter."
106 |
107 | "Ah, poor Stroud--as you say. Was _that_ his history?"
108 |
109 | "That was his history. She believed in him, gloried in him--or thought she did. But she couldn't bear not to have all the drawing-rooms with her. She couldn't bear the fact that, on varnishing days, one could always get near enough to see his pictures. Poor woman! She's just a fragment groping for other fragments. Stroud is the only whole I ever knew."
110 |
111 | "You ever knew? But you just said--"
112 |
113 | Gisburn had a curious smile in his eyes.
114 |
115 | "Oh, I knew him, and he knew me--only it happened after he was dead."
116 |
117 | I dropped my voice instinctively. "When she sent for you?"
118 |
119 | "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"
120 |
121 | He laughed again, and threw back his head to look up at the sketch of the donkey. "There were days when I couldn't look at that thing--couldn't face it. But I forced myself to put it here; and now it's cured me--cured me. That's the reason why I don't dabble any more, my dear Rickham; or rather Stroud himself is the reason."
122 |
123 | For the first time my idle curiosity about my companion turned into a serious desire to understand him better.
124 |
125 | "I wish you'd tell me how it happened," I said.
126 |
127 | He stood looking up at the sketch, and twirling between his fingers a cigarette he had forgotten to light. Suddenly he turned toward me.
128 |
129 | "I'd rather like to tell you--because I've always suspected you of loathing my work."
130 |
131 | I made a deprecating gesture, which he negatived with a good-humoured shrug.
132 |
133 | "Oh, I didn't care a straw when I believed in myself--and now it's an added tie between us!"
134 |
135 | He laughed slightly, without bitterness, and pushed one of the deep arm-chairs forward. "There: make yourself comfortable--and here are the cigars you like."
136 |
137 | He placed them at my elbow and continued to wander up and down the room, stopping now and then beneath the picture.
138 |
139 | "How it happened? I can tell you in five minutes--and it didn't take much longer to happen. . . . I can remember now how surprised and pleased I was when I got Mrs. Stroud's note. Of course, deep down, I had always _felt_ there was no one like him--only I had gone with the stream, echoed the usual platitudes about him, till I half got to think he was a failure, one of the kind that are left behind. By Jove, and he _was_ left behind--because he had come to stay! The rest of us had to let ourselves be swept along or go under, but he was high above the current--on everlasting foundations, as you say.
140 |
141 | "Well, I went off to the house in my most egregious mood--rather moved, Lord forgive me, at the pathos of poor Stroud's career of failure being crowned by the glory of my painting him! Of course I meant to do the picture for nothing--I told Mrs. Stroud so when she began to stammer something about her poverty. I remember getting off a prodigious phrase about the honour being _mine_--oh, I was princely, my dear Rickham! I was posing to myself like one of my own sitters.
142 |
143 | "Then I was taken up and left alone with him. I had sent all my traps in advance, and I had only to set up the easel and get to work. He had been dead only twenty-four hours, and he died suddenly, of heart disease, so that there had been no preliminary work of destruction--his face was clear and untouched. I had met him once or twice, years before, and thought him insignificant and dingy. Now I saw that he was superb.
144 |
145 | "I was glad at first, with a merely aesthetic satisfaction: glad to have my hand on such a 'subject.' Then his strange life-likeness began to affect me queerly--as I blocked the head in I felt as if he were watching me do it. The sensation was followed by the thought: if he _were_ watching me, what would he say to my way of working? My strokes began to go a little wild--I felt nervous and uncertain.
146 |
147 | "Once, when I looked up, I seemed to see a smile behind his close grayish beard--as if he had the secret, and were amusing himself by holding it back from me. That exasperated me still more. The secret? Why, I had a secret worth twenty of his! I dashed at the canvas furiously, and tried some of my bravura tricks. But they failed me, they crumbled. I saw that he wasn't watching the showy bits--I couldn't distract his attention; he just kept his eyes on the hard passages between. Those were the ones I had always shirked, or covered up with some lying paint. And how he saw through my lies!
148 |
149 | "I looked up again, and caught sight of that sketch of the donkey hanging on the wall near his bed. His wife told me afterward it was the last thing he had done--just a note taken with a shaking hand, when he was down in Devonshire recovering from a previous heart attack. Just a note! But it tells his whole history. There are years of patient scornful persistence in every line. A man who had swum with the current could never have learned that mighty up-stream stroke. . . .
150 |
151 | "I turned back to my work, and went on groping and muddling; then I looked at the donkey again. I saw that, when Stroud laid in the first stroke, he knew just what the end would be. He had possessed his subject, absorbed it, recreated it. When had I done that with any of my things? They hadn't been born of me--I had just adopted them. . . .
152 |
153 | "Hang it, Rickham, with that face watching me I couldn't do another stroke. The plain truth was, I didn't know where to put it--_I had never known_. Only, with my sitters and my public, a showy splash of colour covered up the fact--I just threw paint into their faces. . . . Well, paint was the one medium those dead eyes could see through--see straight to the tottering foundations underneath. Don't you know how, in talking a foreign language, even fluently, one says half the time not what one wants to but what one can? Well--that was the way I painted; and as he lay there and watched me, the thing they called my 'technique' collapsed like a house of cards. He didn't sneer, you understand, poor Stroud--he just lay there quietly watching, and on his lips, through the gray beard, I seemed to hear the question: 'Are you sure you know where you're coming out?'
154 |
155 | "If I could have painted that face, with that question on it, I should have done a great thing. The next greatest thing was to see that I couldn't--and that grace was given me. But, oh, at that minute, Rickham, was there anything on earth I wouldn't have given to have Stroud alive before me, and to hear him say: 'It's not too late--I'll show you how'?
156 |
157 | "It _was_ too late--it would have been, even if he'd been alive. I packed up my traps, and went down and told Mrs. Stroud. Of course I didn't tell her _that_--it would have been Greek to her. I simply said I couldn't paint him, that I was too moved. She rather liked the idea--she's so romantic! It was that that made her give me the donkey. But she was terribly upset at not getting the portrait--she did so want him 'done' by some one showy! At first I was afraid she wouldn't let me off--and at my wits' end I suggested Grindle. Yes, it was I who started Grindle: I told Mrs. Stroud he was the 'coming' man, and she told somebody else, and so it got to be true. . . . And he painted Stroud without wincing; and she hung the picture among her husband's things. . . ."
158 |
159 | He flung himself down in the arm-chair near mine, laid back his head, and clasping his arms beneath it, looked up at the picture above the chimney-piece.
160 |
161 | "I like to fancy that Stroud himself would have given it to me, if he'd been able to say what he thought that day."
162 |
163 | And, in answer to a question I put half-mechanically--"Begin again?" he flashed out. "When the one thing that brings me anywhere near him is that I knew enough to leave off?"
164 |
165 | He stood up and laid his hand on my shoulder with a laugh. "Only the irony of it is that I _am_ still painting--since Grindle's doing it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."
--------------------------------------------------------------------------------
/12. Data Processing.md:
--------------------------------------------------------------------------------
1 | # LLM Data Pre-Processing
2 |
3 |
4 |
5 | 4 Steps:
6 | **I. Tokenization**
7 | - Word Based
8 | - Subword Based (BPE Tokenizer)
9 | - Character Based
10 |
11 |
12 |
13 | **II. Token Embeddings**
14 | Converting token IDs to vectors
15 |
16 | **III. Positional Embeddings**
17 | Encoding information about positions
18 |
19 | **IV. Input Embeddings = Token Embeddings + Positional Embeddings**
20 | ___
21 |
--------------------------------------------------------------------------------
/12. Self Attention/notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import torch\n",
10 | "\n",
11 | "inputs = torch.tensor(\n",
12 | " [[0.43, 0.15, 0.89], # Your\n",
13 | " [0.55, 0.87, 0.66], # journey\n",
14 | " [0.57, 0.85, 0.64], # starts\n",
15 | " [0.22, 0.58, 0.33], # with\n",
16 | " [0.77, 0.25, 0.10], # one\n",
17 | " [0.05, 0.80, 0.55]] # step\n",
18 | ")"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Initialise Query, Key and Value "
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "d_in=inputs.shape[1]\n",
35 | "d_out=2"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 5,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "torch.manual_seed(123)\n",
45 | "W_query=torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=False)\n",
46 | "W_key=torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=False)\n",
47 | "W_value=torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=False)\n"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 9,
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "Parameter containing:\n",
60 | "tensor([[0.2961, 0.5166],\n",
61 | " [0.2517, 0.6886],\n",
62 | " [0.0740, 0.8665]])\n",
63 | "Parameter containing:\n",
64 | "tensor([[0.1366, 0.1025],\n",
65 | " [0.1841, 0.7264],\n",
66 | " [0.3153, 0.6871]])\n",
67 | "Parameter containing:\n",
68 | "tensor([[0.0756, 0.1966],\n",
69 | " [0.3164, 0.4017],\n",
70 | " [0.1186, 0.8274]])\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "print(W_query)\n",
76 | "print(W_key)\n",
77 | "print(W_value)"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
 84 |     "### Get Query, Key and Value Matrices"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 10,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "keys=inputs@ W_key\n",
94 | "values=inputs@W_value\n",
95 | "queries=inputs@W_query"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 12,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "tensor([[0.3669, 0.7646],\n",
108 | " [0.4433, 1.1419],\n",
109 | " [0.4361, 1.1156],\n",
110 | " [0.2408, 0.6706],\n",
111 | " [0.1827, 0.3292],\n",
112 | " [0.3275, 0.9642]])\n",
113 | "tensor([[0.1855, 0.8812],\n",
114 | " [0.3951, 1.0037],\n",
115 | " [0.3879, 0.9831],\n",
116 | " [0.2393, 0.5493],\n",
117 | " [0.1492, 0.3346],\n",
118 | " [0.3221, 0.7863]])\n",
119 | "tensor([[0.2309, 1.0966],\n",
120 | " [0.4306, 1.4551],\n",
121 | " [0.4300, 1.4343],\n",
122 | " [0.2355, 0.7990],\n",
123 | " [0.2983, 0.6565],\n",
124 | " [0.2568, 1.0533]])\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "print(keys)\n",
130 | "print(values)\n",
131 | "print(queries)"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "### Compute Attention Score"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 16,
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "name": "stdout",
148 | "output_type": "stream",
149 | "text": [
150 | "tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],\n",
151 | " [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],\n",
152 | " [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],\n",
153 | " [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],\n",
154 | " [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],\n",
155 | " [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])\n"
156 | ]
157 | }
158 | ],
159 | "source": [
160 | "attn_scores=queries@keys.T\n",
161 | "print(attn_scores)"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "### Normalization"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "tensor([[0.1551, 0.2104, 0.2059, 0.1413, 0.1074, 0.1799],\n",
181 | " [0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820],\n",
182 | " [0.1503, 0.2256, 0.2192, 0.1315, 0.0914, 0.1819],\n",
183 | " [0.1591, 0.1994, 0.1962, 0.1477, 0.1206, 0.1769],\n",
184 | " [0.1610, 0.1949, 0.1923, 0.1501, 0.1265, 0.1752],\n",
185 | " [0.1557, 0.2092, 0.2048, 0.1419, 0.1089, 0.1794]])\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "d_k=keys.shape[-1]\n",
191 |     "attn_weights=torch.softmax(attn_scores/d_k**0.5,dim=-1) #Why dim=-1: softmax along the last dim so each row of weights sums to 1\n",
192 | "print(attn_weights)"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 18,
198 | "metadata": {},
199 | "outputs": [
200 | {
201 | "name": "stdout",
202 | "output_type": "stream",
203 | "text": [
204 | "tensor([[0.2996, 0.8053],\n",
205 | " [0.3061, 0.8210],\n",
206 | " [0.3058, 0.8203],\n",
207 | " [0.2948, 0.7939],\n",
208 | " [0.2927, 0.7891],\n",
209 | " [0.2990, 0.8040]])\n"
210 | ]
211 | }
212 | ],
213 | "source": [
214 | "context_vec=attn_weights@values\n",
215 | "print(context_vec)"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": []
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {},
226 | "source": [
227 | "## Overall"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 21,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "import torch.nn as nn\n",
237 | "class SelfAttention(nn.Module):\n",
238 | " def __init__(self, d_in, d_out):\n",
239 | " super().__init__()\n",
240 | " self.d_out = d_out\n",
241 | " self.W_query = nn.Parameter(torch.rand(d_in, d_out))\n",
242 | " self.W_key = nn.Parameter(torch.rand(d_in, d_out))\n",
243 | " self.W_value = nn.Parameter(torch.rand(d_in, d_out))\n",
244 | " \n",
245 | " def forward(self, x):\n",
246 | " keys = x @ self.W_key\n",
247 | " queries = x @ self.W_query\n",
248 | " values = x @ self.W_value\n",
249 | " attn_scores = queries @ keys.T # omega\n",
250 | " attn_weights = torch.softmax(\n",
251 | " attn_scores / keys.shape[-1]**0.5, dim=-1\n",
252 | " )\n",
253 | " context_vec = attn_weights @ values\n",
254 | " return context_vec"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 22,
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "name": "stdout",
264 | "output_type": "stream",
265 | "text": [
266 | "tensor([[0.2996, 0.8053],\n",
267 | " [0.3061, 0.8210],\n",
268 | " [0.3058, 0.8203],\n",
269 | " [0.2948, 0.7939],\n",
270 | " [0.2927, 0.7891],\n",
271 |     "        [0.2990, 0.8040]], grad_fn=<MmBackward0>)\n"
272 | ]
273 | }
274 | ],
275 | "source": [
276 | "torch.manual_seed(123)\n",
277 | "self_att=SelfAttention(d_in,d_out)\n",
278 | "print(self_att(inputs))"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "### Improvements"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | ">> Using nn.Linear"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "class SelfAttention_v2(nn.Module):\n",
302 | " def __init__(self, d_in, d_out, qkv_bias=False):\n",
303 | " super().__init__()\n",
304 | " self.d_out = d_out\n",
305 | " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
306 | " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
307 | " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
308 | " def forward(self, x):\n",
309 | " keys = self.W_key(x)\n",
310 | " queries = self.W_query(x)\n",
311 | " values = self.W_value(x)\n",
312 | " attn_scores = queries @ keys.T\n",
313 |     "        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n",
314 | " context_vec = attn_weights @ values\n",
315 | " return context_vec"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {},
321 | "source": []
322 | }
323 | ],
324 | "metadata": {
325 | "kernelspec": {
326 | "display_name": "Python 3",
327 | "language": "python",
328 | "name": "python3"
329 | },
330 | "language_info": {
331 | "codemirror_mode": {
332 | "name": "ipython",
333 | "version": 3
334 | },
335 | "file_extension": ".py",
336 | "mimetype": "text/x-python",
337 | "name": "python",
338 | "nbconvert_exporter": "python",
339 | "pygments_lexer": "ipython3",
340 | "version": "3.11.5"
341 | }
342 | },
343 | "nbformat": 4,
344 | "nbformat_minor": 2
345 | }
346 |
--------------------------------------------------------------------------------
/13. Intro to Attention Mechanism.md:
--------------------------------------------------------------------------------
1 | # Intro to Attention Mechanism
2 |
3 | ## Why Attention Mechanism?
4 | > "The `cat` that was `sitting` on the mat,
5 | which was next to the dog, `jumped.`"
6 |
7 | There are some words related to `cat` to which the LLM should pay attention in order to understand `cat`
8 |
9 |
10 |
11 | Neural Networks don't capture long-term dependencies
12 |
13 | ## 4 Types of Attention Mechanism
14 | **A.** Simplified Self Attention
15 |
16 | **B.** Self Attention:
17 | Trainable weights
18 |
19 | **C.** Causal Attention:
20 | Considers only previous and current inputs
21 |
22 | **D.** Multi-Head Attention:
23 | Enables the model to simultaneously attend to information from different representation subspaces.
24 |
25 | ## History
26 | ### Problem with modelling long sequences
27 |
28 |
29 | `
30 | Eg: Harry Potter went to platform number 9¾ ....
31 | `
32 | We need `memory` to know the station number
33 |
34 | To address this issue Encoder and Decoder was introduced
35 |
36 | `Encoder`: Receive and Read german text and pass to decoder
37 | `Decoder`: Translate to English Text
38 |
39 |
40 |
41 | `Fig :Encoder get german language and converts to **context vector** which contains meaning then sends to decoder. Decoder generate output`
42 |
43 |
44 |
45 | >Before Transformer, **RNN** were most popular encoder-decoder architecture for language translation.
46 |
47 | **Encoder Decoder in RNN**
48 |
49 | Input Text -> Encode Text -> Update `Hidden State` at each step -> Final `Hidden State`
50 |
51 |
52 | Why do we need attention then?
53 | Decoder have only access to final hidden state in RNN.
54 |
55 | It is tough for only one hidden state to have entire information
56 |
57 | ### 2014 Bahdanau Attention Mechanism
58 | [Source]()
59 |
60 |
61 | Modifies the encoder- decoder RNN such that the decoder can selectively access different parts of the input sequence at each decoding step.
62 | Access to all input words and allow how much importance to make.
63 |
64 | Importance is determined by the `attention weight`
65 |
66 |
67 |
68 | Some input tokens are more important than others for generating a given output token
69 |
70 | This importance is determined by the so called `attention weight`.
71 |
72 | ### 2017 Attention is all you Need
73 | Researchers found that RNN architecture are not required for building deep neural network for natural language processing and proposed the original transformer architecture.
74 |
75 | - With `self attention mechanism` inspired by Bahdhanau Attention Mechanism.
76 |
77 | Attention Mechanism:
78 | - `Dynamic Focus`(Selectively select which word to given more focus)
79 |
80 | 1980: RNN
81 | - Feature: Hidden Layer
82 | - Problem :
83 | Vanishing Gradient
84 |
85 | 1997: LSTM
86 | - Problem :
87 | Long Term Context
88 |
89 | 2014: Attention
90 | - Feature: Which information to give more attention.
91 |
92 | 2017: Transformer
93 | - Self Attention: Allows each position of the input sequence to attend to all positions in the same sequence.
94 |
95 |
96 |
--------------------------------------------------------------------------------
/14. Simplified Attention.md:
--------------------------------------------------------------------------------
1 | ## Context Vector
2 | How much each word is related to other words
3 |
4 | `Aim`: Convert Embedding Vector to Context Vector
5 |
6 |
7 |
8 | x1: Token embedding of 1
9 | z1: Context vector of 1
10 |
11 | `Query`: word for which we are finding context vector
12 |
13 | ## Step 1:
14 | (Compute Attention Scores)
15 | Calculate the attention score between each token and the specific query vector
16 |
17 | > Which mathematical operation is required to find the importance between 2 vectors?
18 |
19 | >Dot product between 2 vectors
20 | $A \cdot B = |A|\,|B|\cos\theta$
21 |
22 |
23 |
24 |
25 | ## Step 2:
26 | (Compute Attention Weights)
27 | Perform Normalization: Know % of each importance
28 |
29 | > Attention Score vs Attention Weight
30 | > Both represent same things, `Attention Weights` sum up to 1.
31 |
32 | There are 2 type of normalization:
33 | `Simply divide by sum`
34 | `Use softmax`: smaller value neglected (preferable)
35 |
36 | **Pytorch Softmax Implementation**
37 | Why -max in numerator? [INTERVIEW QUESTION]
38 | ```python
def own_softmax(self, x):
    """Numerically stable softmax over dim 1.

    Subtracting the row-wise max before exponentiating prevents overflow in
    torch.exp for large inputs; it does not change the result because the
    constant factor cancels in the normalization.
    """
    maxes = torch.max(x, 1, keepdim=True)[0]
    x_exp = torch.exp(x - maxes)
    x_exp_sum = torch.sum(x_exp, 1, keepdim=True)

    return x_exp / x_exp_sum
46 | ```
47 |
48 | ## Step 3:
49 | (Compute Context Vectors)
50 | `Scaling`
51 | Multiply input vector with each corresponding attention weight (normalization %)
52 |
53 | Find vector summation
54 |
55 |
56 |
--------------------------------------------------------------------------------
/15. Self Attention.md:
--------------------------------------------------------------------------------
1 | ## Self Attention with Trainable Weights
2 | or `Scaled dot product Attention`
3 |
4 | Previously we have done:
5 | 1. Compute Attention Scores (Dot Product)
6 | 2. Compute Attention Weights (Normalization - Applied Softmax)
7 | 3. Compute Context Vectors (Multiply Each Attention Score with Weights)
8 |
9 | ## So Need Trainable Weights
10 | 1. Introduce `weight matrices` that update during model
11 |
12 | > Three trainable weight matrices:
13 | -- Query
14 | -- Key
15 | -- Value
16 |
17 |
18 | 2. Compute Attention Score (Dot product between $Query$ with $Key^T$ )
19 | > But it is not interpretable; thus we need normalization, which will also help in backpropagation
20 |
21 | > **INTERVIEW QN**: Why Normalization of Attention Score Needed
22 |
23 |
24 | 3. Compute Attention Weights (Normalised)
25 | But Before Normalization we should scale by $\sqrt{dim(keys)}$.
26 | Then Apply Softmax(Normalization)
27 |
28 | > **INTERVIEW QN**: Why Scaling of Score with $\sqrt{dim(keys)}$ require?
29 |
30 | > Reason 1: If we apply softmax to large numbers, the output distribution becomes too peaked, and the model will say with overconfidence: yes! That is the next word, or: that is the only token relevant to me!
31 |
32 |
33 |
34 | > Reason 2: When we multiply Query and $Key^T$ resultant matrix's **Variance** also increases proportionally .
35 |
36 |
37 |
38 | Using **Square root** makes variance close to one
39 |
40 | Therefore it is called **Scaled Dot Product Attention**
41 |
42 | 4. Find Context Vector: Multiply Attention Weight with corresponding Values then sum them up.
43 |
44 | ## Overall
45 | ```python
46 | import torch.nn as nn
class SelfAttention(nn.Module):
    """Scaled dot-product self-attention with trainable Q/K/V weight matrices."""

    def __init__(self, d_in, d_out):
        super().__init__()
        # Step 1: trainable projections. Creation order (query, key, value)
        # is kept fixed so seeded runs stay reproducible.
        self.d_out = d_out
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Return one context vector per input token: (seq, d_in) -> (seq, d_out)."""
        # Step 2: project the inputs into query/key/value space.
        q = x @ self.W_query
        k = x @ self.W_key
        v = x @ self.W_value
        # Step 3: attention weights — scale by sqrt(d_k), softmax per row.
        scale = k.shape[-1] ** 0.5
        weights = torch.softmax((q @ k.T) / scale, dim=-1)
        # Step 4: weighted sum of values.
        return weights @ v
69 | ```
70 |
71 | ### Improvements: Can use nn.Linear, whose weight initialisation is better for model training
72 |
73 | ```python
class SelfAttention_v2(nn.Module):
    """Scaled dot-product self-attention using nn.Linear projections.

    nn.Linear uses a better weight initialisation than torch.rand, which
    helps training. Works with both unbatched (seq, d_in) and batched
    (batch, seq, d_in) inputs.
    """
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        # nn.Linear
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        # transpose only the last two dims: keys.T reverses ALL dims and
        # silently breaks for batched 3-D input.
        attn_scores = queries @ keys.transpose(-2, -1)
        # dim=-1 (not dim=1): normalize across key positions so every row of
        # weights sums to 1. Identical for 2-D input, correct for 3-D, and
        # consistent with SelfAttention above.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        context_vec = attn_weights @ values
        return context_vec
90 | ```
91 | ### Conclusion
92 |
93 |
94 | **Query**: Analogous to a search query in a database. It represents the current token the model is focusing on.
95 |
96 | **Key**: In Attention Mechanism each item in input sequence has a key. Keys are used to match with the query.
97 |
98 | **Value**: It represents the actual content or representation of the input items. Once the model determines which keys (which parts of the input) are most relevant to the query (the current focus item), it retrieves the corresponding values.
--------------------------------------------------------------------------------
/16. Causal Attention.md:
--------------------------------------------------------------------------------
1 | ## Causal Attention
2 |
3 | - Causal attention, also known as masked attention, is a special form of self-attention.
4 |
5 | - It restricts the model to only consider previous and current inputs in a sequence when processing any given token.
6 |
7 | - This is in contrast to the standard self-attention mechanism, which allows access to the entire input sequence at once.
8 |
9 | - When computing attention scores, the causal attention mechanism ensures that the model only factors in tokens that occur at or before the current token in the sequence.
10 |
11 | - To achieve this in GPT like LLMs for each token processed, we mask out the future tokens, which come after the current token in the input text.
12 |
13 |
14 |
15 | We mask out the attention weights above the diagonal, and we normalize the non-masked attention weights so that they sum up to 1 in each row.
16 |
17 | ## Apply Causal Attention Mask
18 | **Strategy**
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/16. Causal Attention/notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 13,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
 9 |     "import torch\nimport torch.nn as nn\n",
10 | "\n",
11 | "inputs = torch.tensor(\n",
12 | " [[0.43, 0.15, 0.89], # Your\n",
13 | " [0.55, 0.87, 0.66], # journey\n",
14 | " [0.57, 0.85, 0.64], # starts\n",
15 | " [0.22, 0.58, 0.33], # with\n",
16 | " [0.77, 0.25, 0.10], # one\n",
17 | " [0.05, 0.80, 0.55]] # step\n",
18 | ")"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 14,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "class SelfAttention_v2(nn.Module):\n",
28 | " def __init__(self, d_in, d_out, qkv_bias=False):\n",
29 | " super().__init__()\n",
30 | " self.d_out = d_out\n",
31 | " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
32 | " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
33 | " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
34 | " def forward(self, x):\n",
35 | " keys = self.W_key(x)\n",
36 | " queries = self.W_query(x)\n",
37 | " values = self.W_value(x)\n",
38 | " attn_scores = queries @ keys.T\n",
39 | " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=1)\n",
40 | " context_vec = attn_weights @ values\n",
41 | " return context_vec"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 15,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "tensor([[-0.5337, -0.1051],\n",
54 | " [-0.5323, -0.1080],\n",
55 | " [-0.5323, -0.1079],\n",
56 | " [-0.5297, -0.1076],\n",
57 | " [-0.5311, -0.1066],\n",
58 | " [-0.5299, -0.1081]], grad_fn=)\n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "d_in=inputs.shape[1]\n",
64 | "d_out=2\n",
65 | "torch.manual_seed(123)\n",
66 | "self_att=SelfAttention_v2(d_in,d_out)\n",
67 | "print(self_att(inputs))"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 16,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "tensor([[0.1717, 0.1762, 0.1761, 0.1555, 0.1627, 0.1579],\n",
80 | " [0.1636, 0.1749, 0.1746, 0.1612, 0.1605, 0.1652],\n",
81 | " [0.1637, 0.1749, 0.1746, 0.1611, 0.1606, 0.1651],\n",
82 | " [0.1636, 0.1704, 0.1702, 0.1652, 0.1632, 0.1674],\n",
83 | " [0.1667, 0.1722, 0.1721, 0.1618, 0.1633, 0.1639],\n",
84 | " [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],\n",
85 | " grad_fn=)\n"
86 | ]
87 | }
88 | ],
89 | "source": [
90 | "queries = self_att.W_query(inputs)\n",
91 | "keys = self_att.W_key(inputs)\n",
92 | "attn_scores = queries@keys.T\n",
93 | "attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=1)\n",
94 | "print(attn_weights)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 17,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "tensor([[1., 0., 0., 0., 0., 0.],\n",
107 | " [1., 1., 0., 0., 0., 0.],\n",
108 | " [1., 1., 1., 0., 0., 0.],\n",
109 | " [1., 1., 1., 1., 0., 0.],\n",
110 | " [1., 1., 1., 1., 1., 0.],\n",
111 | " [1., 1., 1., 1., 1., 1.]])\n"
112 | ]
113 | }
114 | ],
115 | "source": [
116 | "context_length = attn_scores.shape[0]\n",
117 | "mask_simple = torch.tril(torch.ones(context_length, context_length))\n",
118 | "print(mask_simple)"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 18,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "tensor([[1., 0., 0., 0., 0., 0.],\n",
131 | " [1., 1., 0., 0., 0., 0.],\n",
132 | " [1., 1., 1., 0., 0., 0.],\n",
133 | " [1., 1., 1., 1., 0., 0.],\n",
134 | " [1., 1., 1., 1., 1., 0.],\n",
135 | " [1., 1., 1., 1., 1., 1.]])\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "context_length = attn_scores.shape[0]\n",
141 | "mask_simple = torch.tril(torch.ones(context_length, context_length))\n",
142 | "print(mask_simple)"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 19,
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "name": "stdout",
152 | "output_type": "stream",
153 | "text": [
154 | "tensor([[0.1717, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
155 | " [0.1636, 0.1749, 0.0000, 0.0000, 0.0000, 0.0000],\n",
156 | " [0.1637, 0.1749, 0.1746, 0.0000, 0.0000, 0.0000],\n",
157 | " [0.1636, 0.1704, 0.1702, 0.1652, 0.0000, 0.0000],\n",
158 | " [0.1667, 0.1722, 0.1721, 0.1618, 0.1633, 0.0000],\n",
159 | " [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],\n",
160 | " grad_fn=)\n"
161 | ]
162 | }
163 | ],
164 | "source": [
165 | "masked_simple = attn_weights*mask_simple\n",
166 | "print(masked_simple)"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 20,
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "name": "stdout",
176 | "output_type": "stream",
177 | "text": [
178 | "tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
179 | " [0.4833, 0.5167, 0.0000, 0.0000, 0.0000, 0.0000],\n",
180 | " [0.3190, 0.3408, 0.3402, 0.0000, 0.0000, 0.0000],\n",
181 | " [0.2445, 0.2545, 0.2542, 0.2468, 0.0000, 0.0000],\n",
182 | " [0.1994, 0.2060, 0.2058, 0.1935, 0.1953, 0.0000],\n",
183 | " [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],\n",
184 | " grad_fn=)\n"
185 | ]
186 | }
187 | ],
188 | "source": [
189 | "row_sums = masked_simple.sum(dim=1, keepdim=True)\n",
190 | "masked_simple_norm = masked_simple / row_sums\n",
191 | "print(masked_simple_norm)"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "'''\n",
199 | "##### Still a problem! Although we applied the mask, softmax was already computed over all tokens, so masked positions still influenced the normalization. \n",
200 | "\n",
201 | "**DATA LEAKAGE**\n",
202 | "\n",
203 | "'''"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 21,
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "name": "stdout",
213 | "output_type": "stream",
214 | "text": [
215 | "tensor([[0.3111, 0.3479, 0.3471, 0.1714, 0.2350, 0.1928],\n",
216 | " [0.1655, 0.2602, 0.2576, 0.1445, 0.1384, 0.1790],\n",
217 | " [0.1667, 0.2602, 0.2577, 0.1443, 0.1391, 0.1784],\n",
218 | " [0.0510, 0.1080, 0.1064, 0.0643, 0.0476, 0.0835],\n",
219 | " [0.1415, 0.1875, 0.1863, 0.0987, 0.1121, 0.1174],\n",
220 | " [0.0476, 0.1192, 0.1171, 0.0731, 0.0477, 0.0966]],\n",
221 | " grad_fn=)\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "print(attn_scores)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 22,
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "name": "stdout",
236 | "output_type": "stream",
237 | "text": [
238 | "tensor([[0.3111, -inf, -inf, -inf, -inf, -inf],\n",
239 | " [0.1655, 0.2602, -inf, -inf, -inf, -inf],\n",
240 | " [0.1667, 0.2602, 0.2577, -inf, -inf, -inf],\n",
241 | " [0.0510, 0.1080, 0.1064, 0.0643, -inf, -inf],\n",
242 | " [0.1415, 0.1875, 0.1863, 0.0987, 0.1121, -inf],\n",
243 | " [0.0476, 0.1192, 0.1171, 0.0731, 0.0477, 0.0966]],\n",
244 | " grad_fn=)\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "mask=torch.triu(torch.ones(context_length,context_length),diagonal=1)\n",
250 | "masked=attn_scores.masked_fill(mask.bool(),-torch.inf)\n",
251 | "print(masked)"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "data": {
261 | "text/plain": [
262 | "tensor([[0., 1., 1., 1., 1., 1.],\n",
263 | " [0., 0., 1., 1., 1., 1.],\n",
264 | " [0., 0., 0., 1., 1., 1.],\n",
265 | " [0., 0., 0., 0., 1., 1.],\n",
266 | " [0., 0., 0., 0., 0., 1.],\n",
267 | " [0., 0., 0., 0., 0., 0.]])"
268 | ]
269 | },
270 | "execution_count": 25,
271 | "metadata": {},
272 | "output_type": "execute_result"
273 | }
274 | ],
275 | "source": [
276 | "## Quick Look\n",
277 | "torch.triu(torch.ones(context_length,context_length),diagonal=1)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 23,
283 | "metadata": {},
284 | "outputs": [
285 | {
286 | "name": "stdout",
287 | "output_type": "stream",
288 | "text": [
289 | "tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
290 | " [0.4833, 0.5167, 0.0000, 0.0000, 0.0000, 0.0000],\n",
291 | " [0.3190, 0.3408, 0.3402, 0.0000, 0.0000, 0.0000],\n",
292 | " [0.2445, 0.2545, 0.2542, 0.2468, 0.0000, 0.0000],\n",
293 | " [0.1994, 0.2060, 0.2058, 0.1935, 0.1953, 0.0000],\n",
294 | " [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],\n",
295 | " grad_fn=)\n"
296 | ]
297 | }
298 | ],
299 | "source": [
300 | "attn_weights = torch.softmax(masked / keys.shape[-1] ** 0.5, dim=1)\n",
301 | "print(attn_weights)"
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "'''\n",
309 | "**We can use DROPOUT**\n",
310 | "'''"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "name": "stdout",
320 | "output_type": "stream",
321 | "text": [
322 | "tensor([[2., 2., 2., 2., 2., 2.],\n",
323 | " [0., 2., 0., 0., 0., 0.],\n",
324 | " [0., 0., 2., 0., 2., 0.],\n",
325 | " [2., 2., 0., 0., 0., 2.],\n",
326 | " [2., 0., 0., 0., 0., 2.],\n",
327 | " [0., 2., 0., 0., 0., 0.]])\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "torch.manual_seed(123)\n",
333 | "dropout=torch.nn.Dropout(0.5)\n",
334 | "example=torch.ones(6,6)\n",
335 | "print(dropout(example))\n",
336 | "## We are getting 2 because of scaling"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 29,
342 | "metadata": {},
343 | "outputs": [
344 | {
345 | "name": "stdout",
346 | "output_type": "stream",
347 | "text": [
348 | "tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
349 | " [0.0000, 1.0335, 0.0000, 0.0000, 0.0000, 0.0000],\n",
350 | " [0.0000, 0.0000, 0.6804, 0.0000, 0.0000, 0.0000],\n",
351 | " [0.4889, 0.5090, 0.0000, 0.0000, 0.0000, 0.0000],\n",
352 | " [0.3988, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
353 | " [0.0000, 0.3418, 0.0000, 0.0000, 0.0000, 0.0000]],\n",
354 | " grad_fn=)\n"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "torch.manual_seed(123)\n",
360 | "print(dropout(attn_weights))"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "#### Implement Causal Attention Class"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 30,
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "name": "stdout",
377 | "output_type": "stream",
378 | "text": [
379 | "torch.Size([2, 6, 3])\n"
380 | ]
381 | }
382 | ],
383 | "source": [
384 | "batch=torch.stack((inputs,inputs),dim=0)\n",
385 | "print(batch.shape)\n"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 34,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "class CasualAttention(nn.Module):\n",
395 | " def __init__(self, d_in, d_out,context_length,dropout, qkv_bias=False):\n",
396 | " super().__init__()\n",
397 | " self.d_out = d_out\n",
398 | " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
399 | " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
400 | " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
401 | " self.dropout=nn.Dropout(dropout)\n",
402 | " self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))\n",
403 | "\n",
404 | " def forward(self, x):\n",
405 | " b,num_tokens,d_in=x.shape\n",
406 | " keys = self.W_key(x)\n",
407 | " queries = self.W_query(x)\n",
408 | " values = self.W_value(x)\n",
409 | "\n",
410 | " attn_scores = queries @ keys.transpose(1,2)\n",
411 | " attn_scores.masked_fill_(self.mask.bool()[:num_tokens,:num_tokens],-torch.inf)\n",
412 | " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n",
413 | " attn_weights =self.dropout(attn_weights)\n",
414 | "\n",
415 | " context_vec = attn_weights @ values\n",
416 | " return context_vec"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": 35,
422 | "metadata": {},
423 | "outputs": [
424 | {
425 | "name": "stdout",
426 | "output_type": "stream",
427 | "text": [
428 | "context_vecs.shape: torch.Size([2, 6, 2])\n"
429 | ]
430 | }
431 | ],
432 | "source": [
433 | "torch.manual_seed(123)\n",
434 | "context_length=batch.shape[1]\n",
435 | "ca=CasualAttention(d_in,d_out,context_length,0.0)\n",
436 | "context_vecs=ca(batch)\n",
437 | "print(\"context_vecs.shape:\",context_vecs.shape)"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "metadata": {},
444 | "outputs": [],
445 | "source": []
446 | }
447 | ],
448 | "metadata": {
449 | "kernelspec": {
450 | "display_name": "Python 3",
451 | "language": "python",
452 | "name": "python3"
453 | },
454 | "language_info": {
455 | "codemirror_mode": {
456 | "name": "ipython",
457 | "version": 3
458 | },
459 | "file_extension": ".py",
460 | "mimetype": "text/x-python",
461 | "name": "python",
462 | "nbconvert_exporter": "python",
463 | "pygments_lexer": "ipython3",
464 | "version": "3.11.5"
465 | }
466 | },
467 | "nbformat": 4,
468 | "nbformat_minor": 2
469 | }
470 |
--------------------------------------------------------------------------------
/17. Multihead Attention.md:
--------------------------------------------------------------------------------
1 | ## Multihead Attention
2 |
3 | 1. The term "multi-head" refers to dividing the attention mechanism into multiple heads
4 |
5 | **A.** Stacking multiple single head attention layers.
6 |
7 | **B.** Implementing multi-head attention involves creating multiple instances of the self-attention mechanism, each with its own weights, and then combining their outputs.
8 |
9 | 2. This can be computationally intensive, but it makes LLMs powerful at complex pattern-recognition tasks.
10 |
11 |
12 |
13 |
14 |
15 | ### Output
16 |
17 |
--------------------------------------------------------------------------------
/17. Multihead Attention/notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import torch\n",
10 | "\n",
11 | "inputs = torch.tensor(\n",
12 | " [[0.43, 0.15, 0.89], # Your\n",
13 | " [0.55, 0.87, 0.66], # journey\n",
14 | " [0.57, 0.85, 0.64], # starts\n",
15 | " [0.22, 0.58, 0.33], # with\n",
16 | " [0.77, 0.25, 0.10], # one\n",
17 | " [0.05, 0.80, 0.55]] # step\n",
18 | ")"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 7,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stdout",
28 | "output_type": "stream",
29 | "text": [
30 | "torch.Size([2, 6, 3])\n"
31 | ]
32 | }
33 | ],
34 | "source": [
35 | "\n",
36 | "batch=torch.stack((inputs,inputs),dim=0)\n",
37 | "print(batch.shape)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 9,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import torch.nn as nn\n",
47 | "class CasualAttention(nn.Module):\n",
48 | " def __init__(self, d_in, d_out,context_length,dropout, qkv_bias=False):\n",
49 | " super().__init__()\n",
50 | " self.d_out = d_out\n",
51 | " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
52 | " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
53 | " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
54 | " self.dropout=nn.Dropout(dropout)\n",
55 | " self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))\n",
56 | "\n",
57 | " def forward(self, x):\n",
58 | " b,num_tokens,d_in=x.shape\n",
59 | " keys = self.W_key(x)\n",
60 | " queries = self.W_query(x)\n",
61 | " values = self.W_value(x)\n",
62 | "\n",
63 | " attn_scores = queries @ keys.transpose(1,2)\n",
64 | " attn_scores.masked_fill_(self.mask.bool()[:num_tokens,:num_tokens],-torch.inf)\n",
65 | " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n",
66 | " attn_weights =self.dropout(attn_weights)\n",
67 | "\n",
68 | " context_vec = attn_weights @ values\n",
69 | " return context_vec"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "context_vecs.shape: torch.Size([2, 6, 2])\n"
82 | ]
83 | }
84 | ],
85 | "source": []
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 17,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "class MultiHeadAttentionWrapper(nn.Module):\n",
94 | " def __init__(self,d_in,d_out,context_length,dropout,num_heads,qkv_bias=False):\n",
95 | " super().__init__() \n",
96 | " self.heads=nn.ModuleList([CasualAttention(d_in,d_out,context_length,dropout, qkv_bias) for _ in range(num_heads)])\n",
97 | "\n",
98 | " def forward(self,x):\n",
99 | " return torch.cat([head(x) for head in self.heads],dim=-1)\n",
100 | " "
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 19,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "context_vecs.shape: torch.Size([2, 6, 4])\n",
113 | "tensor([[[-0.4519, 0.2216, 0.4772, 0.1063],\n",
114 | " [-0.5874, 0.0058, 0.5891, 0.3257],\n",
115 | " [-0.6300, -0.0632, 0.6202, 0.3860],\n",
116 | " [-0.5675, -0.0843, 0.5478, 0.3589],\n",
117 | " [-0.5526, -0.0981, 0.5321, 0.3428],\n",
118 | " [-0.5299, -0.1081, 0.5077, 0.3493]],\n",
119 | "\n",
120 | " [[-0.4519, 0.2216, 0.4772, 0.1063],\n",
121 | " [-0.5874, 0.0058, 0.5891, 0.3257],\n",
122 | " [-0.6300, -0.0632, 0.6202, 0.3860],\n",
123 | " [-0.5675, -0.0843, 0.5478, 0.3589],\n",
124 | " [-0.5526, -0.0981, 0.5321, 0.3428],\n",
125 | " [-0.5299, -0.1081, 0.5077, 0.3493]]], grad_fn=)\n"
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "d_in=inputs.shape[1]\n",
131 | "d_out=2\n",
132 | "\n",
133 | "torch.manual_seed(123)\n",
134 | "context_length=batch.shape[1]\n",
135 | "mha=MultiHeadAttentionWrapper(d_in,d_out,context_length,0.0,num_heads=2)\n",
136 | "context_vecs=mha(batch)\n",
137 | "print(\"context_vecs.shape:\",context_vecs.shape)\n",
138 | "print(context_vecs)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": []
147 | }
148 | ],
149 | "metadata": {
150 | "kernelspec": {
151 | "display_name": "Python 3",
152 | "language": "python",
153 | "name": "python3"
154 | },
155 | "language_info": {
156 | "codemirror_mode": {
157 | "name": "ipython",
158 | "version": 3
159 | },
160 | "file_extension": ".py",
161 | "mimetype": "text/x-python",
162 | "name": "python",
163 | "nbconvert_exporter": "python",
164 | "pygments_lexer": "ipython3",
165 | "version": "3.11.5"
166 | }
167 | },
168 | "nbformat": 4,
169 | "nbformat_minor": 2
170 | }
171 |
--------------------------------------------------------------------------------
/18. Multihead Attention.md:
--------------------------------------------------------------------------------
1 | # Implementing multihead attention with weight split
2 |
3 | GPT contains 96 multihead attention so $96 \times 3$ matrix will be needed which can be reduced
4 |
5 | Previously we have done:
6 |
7 |
8 |
9 | #### $head\_dimension = \frac{d_{out}}{n_{head}}$
10 |
11 | ## Step 1: Input X consist of 3 things
12 | - $b$: batch
13 | - $num_{tokens}$: number of tokens
14 | - $d_{in}$: input dimension
15 |
16 | $$\mathbf{X}=\begin{bmatrix}
17 | 1.0 & 2.0 & 3.0 & 4.0 & 5.0 & 6.0 \\
18 | 6.0 & 5.0 & 4.0 & 3.0 & 2.0 & 1.0 \\
19 | 1.0 & 1.0 & 1.0 & 1.0 & 1.0 & 1.0
20 | \end{bmatrix}$$
21 |
22 | - Consider 3 words with 6 dimension vector
23 |
24 | ## Step 2: Decide $d_{out}$ and $num_{heads}$
25 |
26 | $d_{out}=6$
27 | $num_{heads}=2$
28 |
29 | $head\_dimension = \frac{d_{out}}{n_{head}} = \frac{6}{2} =3$
30 |
31 | ## Step 3: Initialize Trainable Weights for key, query, value ($W_k$, $W_q$, $W_v$)
32 |
33 | - Since $d_{in} = d_{out} = 6$, each weight matrix is $(6 \times 6)$ to keep the dimensions compatible
34 |
35 | Here is Random Initialisation, used Linear Layer
36 |
37 | $$\mathbf{W_q} =
38 | \begin{bmatrix}
39 | 0.6323 & -0.2366 & 1.2455 & 0.3465 & 1.2458 & 0.3229 \\
40 | 0.6571 & -0.2378 & -0.5311 & -0.2610 & -1.4819 & -1.6418 \\
41 | -0.2990 & 0.4216 & 0.2114 & -0.0271 & -0.5682 & 0.6937 \\
42 | -1.1291 & -1.0102 & 0.6946 & 0.1094 & 0.5130 & -0.8669 \\
43 | 0.3480 & 0.2593 & 0.4412 & 1.0017 & -0.3913 & -0.2878 \\
44 | 0.2484 & 0.2846 & -0.3386 & -0.6164 & 1.2722 & 0.5754
45 | \end{bmatrix}
46 | $$
47 |
48 | $$\mathbf{W_k} =
49 | \begin{bmatrix}
50 | -0.3703 & 0.5431 & -0.0372 & -0.4406 & 0.4103 & -0.1773 \\
51 | 1.5993 & -0.2777 & -1.1989 & -0.4301 & 0.6927 & -1.3384 \\
52 | 1.2478 & -0.1872 & -8.1678 & 1.4382 & 1.2927 & 0.4822 \\
53 | -0.8984 & -0.8983 & 8.3334 & -0.6312 & 0.1022 & -1.0715 \\
54 | -0.7647 & 0.1734 & 0.6305 & 1.0155 & 0.8474 & 0.1454 \\
55 | -1.5085 & -0.4529 & 0.0997 & -0.1084 & 0.8046 & 0.3459
56 | \end{bmatrix}
57 | $$
58 |
59 | $$\mathbf{W_v} =
60 | \begin{bmatrix}
61 | 1.6395 & 1.1234 & -0.1001 & 0.5021 & -1.0590 & 0.1412 \\
62 | -8.4271 & 0.5681 & 0.4164 & -1.2534 & 1.3061 & 0.3610 \\
63 | -0.2824 & -0.4314 & 1.2358 & 0.1181 & -1.2467 & 0.1893 \\
64 | 1.3440 & 0.1487 & -0.6174 & 0.8890 & -0.3282 & 1.4662 \\
65 | 0.1814 & -0.4761 & -0.0402 & 0.7326 & 0.7654 & -0.1000 \\
66 | -8.8974 & 0.6786 & 8.5682 & -0.2443 & -8.4883 & 1.3996
67 | \end{bmatrix}
68 | $$
69 |
70 | ## Step 4: Calculate Keys, Queries and Value Matrix
71 |
72 | - $\mathbf{X} \times \mathbf{W_q}$
73 | - $\mathbf{X} \times \mathbf{W_k}$
74 | - $\mathbf{X} \times \mathbf{W_v}$
75 |
76 | $$
77 | \mathbf{X} \times \mathbf{W_q} =
78 | \begin{bmatrix}
79 | 4.2932 & -2.4849 & 2.4194 & 4.0865 & 1.5162 & -0.6586 \\
80 | -0.6586 & -2.4849 & 2.4194 & 4.0865 & 4.2932 & -2.4849 \\
81 | 1.4587 & 0.4809 & 1.7221 & 0.7082 & -0.1234 & 0.7995
82 | \end{bmatrix}
83 | $$
84 |
85 | $$\mathbf{X} \times \mathbf{W_k} =
86 | \begin{bmatrix}
87 | 3.5049 & -0.2332 & -13.0061 & 4.3009 & 7.2783 & -1.5149 \\
88 | -1.5149 & -0.2332 & -13.0061 & 4.3009 & 3.5049 & -0.2332 \\
89 | -0.3752 & -0.4537 & 1.1995 & 0.1121 & 1.2829 & -0.0216
90 | \end{bmatrix}
91 | $$
92 |
93 |
94 | $$\mathbf{X} \times \mathbf{W_v} =
95 | \begin{bmatrix}
96 | -3.7603 & 1.7251 & 15.5114 & -1.3864 & -5.8559 & 5.7465 \\
97 | 5.7465 & 1.7251 & 15.5114 & -1.3864 & -3.7603 & 1.7251 \\
98 | -0.5320 & 0.2449 & 1.8943 & -0.0822 & -0.9636 & 0.2034
99 | \end{bmatrix}
100 | $$
101 |
102 |
103 |
104 |
105 | - Each row corresponds to one token
106 | - Each token is a 6-dimensional vector.
107 |
108 | ## Step 5: Unroll last dimension of Keys, Queries and Values to include $num_{heads}$ and $head_{dim}$
109 |
110 | $head\_dimension = \frac{d_{out}}{n_{head}} = \frac{6}{2} =3$
111 |
112 |
113 |
114 | $$
115 | \mathbf{Q_{reshaped}} =
116 | \begin{bmatrix}
117 | \begin{bmatrix} -0.4888 & 0.2361 & 2.8463 \\
118 | -0.2184 & 5.4503 & -1.8915
119 | \end{bmatrix} \\
120 | \begin{bmatrix}
121 | -2.3531 & -0.7912 & 2.0534 \\
122 | -4.3369 & 3.2125 & 1.2578
123 | \end{bmatrix} \\
124 | \begin{bmatrix}
125 | -2.5745 & 0.2893 & 1.1454 \\
126 | 0.9021 & 1.5632 & 0.6930
127 | \end{bmatrix}
128 | \end{bmatrix}
129 | $$
130 |
131 | $$\mathbf{K_{reshaped}} =
132 | \begin{bmatrix}
133 | \begin{bmatrix}
134 | 0.4143 & -1.4023 & -2.7131 \\
135 | 3.4907 & -2.1993 & 0.2381
136 | \end{bmatrix} \\
137 | \begin{bmatrix}
138 | 1.1957 & 1.3712 & 0.6885 \\
139 | -1.5484 & 4.2152 & 2.1248
140 | \end{bmatrix} \\
141 | \begin{bmatrix}
142 | -0.1226 & 0.1155 & 0.4755 \\
143 | -0.0176 & 0.8339 & 0.7582
144 | \end{bmatrix}
145 | \end{bmatrix}$$
146 |
147 | $$
148 | \mathbf{V_{reshaped}} =
149 | \begin{bmatrix}
150 | \begin{bmatrix}
151 | -3.6194 & 2.0935 & 1.3879 \\
152 | 2.1231 & -1.2262 & -0.2556
153 | \end{bmatrix} \\
154 | \begin{bmatrix}
155 | 1.1106 & -0.4063 & -0.5588 \\
156 | 1.8222 & 1.8721 & 0.4929
157 | \end{bmatrix} \\
158 | \begin{bmatrix}
159 | -1.6594 & 0.1052 & -0.0468 \\
160 | 0.8916 & -1.4384 & -0.5651
161 | \end{bmatrix}
162 | \end{bmatrix}
163 | $$
164 |
165 | Each inner matrix corresponds to one token, and each token's vector is divided into 2 heads.
166 |
167 | Practically: 2 People paying attention to each token
168 |
169 | ## Step 6: Group matrices by number of head
170 | Why: For computation by each head
171 |
172 | We will get an attention score matrix for each head separately, then we will concatenate them
173 |
174 |
175 |
176 | New dimension: $(1,2,3,3)$
177 |
178 | $$\mathbf{Q_{transformed}} =
179 | \begin{bmatrix}
180 | \begin{bmatrix}
181 | -0.4888 & 0.2361 & 2.8463 \\
182 | -2.3531 & -0.7912 & 2.0534 \\
183 | -2.5745 & 0.2893 & 1.1454
184 | \end{bmatrix} \\
185 | \begin{bmatrix}
186 | -0.2184 & 5.4503 & -1.8915 \\
187 | -4.3369 & 3.2125 & 1.2578 \\
188 | 0.9021 & 1.5632 & 0.6930
189 | \end{bmatrix}
190 | \end{bmatrix}
191 | $$
192 |
193 | $$\mathbf{K_{transformed}} =
194 | \begin{bmatrix}
195 | \begin{bmatrix}
196 | 0.4143 & -1.4023 & -2.7131 \\
197 | 1.1957 & 1.3712 & 0.6885 \\
198 | -0.1226 & 0.1155 & 0.4755
199 | \end{bmatrix} \\
200 | \begin{bmatrix}
201 | 3.4907 & -2.1993 & 0.2381 \\
202 | -1.5484 & 4.2152 & 2.1248 \\
203 | -0.0176 & 0.8339 & 0.7582
204 | \end{bmatrix}
205 | \end{bmatrix}
206 | $$
207 |
208 | $$\mathbf{V_{transformed}} =
209 | \begin{bmatrix}
210 | \begin{bmatrix}
211 | -3.6194 & 2.0935 & 1.3879 \\
212 | 1.1106 & -0.4063 & -0.5588 \\
213 | -1.6594 & 0.1052 & -0.0468
214 | \end{bmatrix} \\
215 | \begin{bmatrix}
216 | 2.1231 & -1.2262 & -0.2556 \\
217 | 1.8222 & 1.8721 & 0.4929 \\
218 | 0.8916 & -1.4384 & -0.5651
219 | \end{bmatrix}
220 | \end{bmatrix}
221 | $$
222 |
223 | - Each inner matrix for each head computation and 3 rows inside it represent 3 token for each head
224 |
225 | ## Step 7: Find Attention Score
226 | $\mathbf{Attention\_Score} = \mathbf{Q} \times \mathbf{K}^T$
227 |
228 | $$\mathbf{K_{transformed}}^\top =
229 | \begin{bmatrix}
230 | \begin{bmatrix}
231 | 0.4143 & 1.1957 & -0.1226 \\
232 | -1.4023 & 1.3712 & 0.1155 \\
233 | -2.7131 & 0.6885 & 0.4755
234 | \end{bmatrix} \\
235 | \begin{bmatrix}
236 | 3.4907 & -1.5484 & -0.0176 \\
237 | -2.1993 & 4.2152 & 0.8339 \\
238 | 0.2381 & 2.1248 & 0.7582
239 | \end{bmatrix}
240 | \end{bmatrix}
241 | $$
242 |
243 |
244 | $$
245 | \mathbf{Q_{transformed} \times K_{transformed}^T} =
246 | \begin{bmatrix}
247 | \begin{bmatrix}
248 | -0.6347 & 0.0932 & -1.0212 \\
249 | -1.4575 & 3.5434 & 6.9408 \\
250 | -1.0186 & 3.9301 & 7.6656
251 | \end{bmatrix} \\
252 | \begin{bmatrix}
253 | -8.9021 & 23.1722 & 3.7513 \\
254 | -15.5508 & 28.7933 & 5.6098 \\
255 | 2.2056 & 5.6847 & 2.5589
256 | \end{bmatrix}
257 | \end{bmatrix}
258 | $$
259 |
260 | Dimension: $(b, num\_head, num\_tokens, num\_tokens)$
261 |
262 | We now have a matrix showing the attention score relating each word to every other word
263 |
264 | ## Step 8: Find Attention weights
265 | Mask attention scores to implement causal attention
266 |
267 | $$
268 | \mathbf{(Q_{transformed} \times K_{transformed}^T)_{masked}} =
269 | \begin{bmatrix}
270 | \begin{bmatrix}
271 | -0.6347 & -\infty & -\infty \\
272 | -1.4575 & 3.5434 & -\infty \\
273 | -1.0186 & 3.9301 & 7.6656
274 | \end{bmatrix} \\
275 | \begin{bmatrix}
276 | -8.9021 & -\infty & -\infty \\
277 | -15.5508 & 28.7933 & -\infty \\
278 | 2.2056 & 5.6847 & 2.5589
279 | \end{bmatrix}
280 | \end{bmatrix}
281 | $$
282 |
283 | - If we apply softmax, the $-\infty$ entries become 0 and each row sums up to 1
284 |
285 | - Before that we need to divide by $\sqrt {head_{dim}}$ = $\sqrt{\frac {6}{2}}$ = $\sqrt 3$
286 |
287 | $$
288 | \mathbf{\frac{(Q_{transformed} \times K_{transformed}^T)_{masked}}{\sqrt{3}}} =
289 | \begin{bmatrix}
290 | \begin{bmatrix}
291 | -0.3664 & -\infty & -\infty \\
292 | -0.8412 & 2.0459 & -\infty \\
293 | -0.5880 & 2.2686 & 4.4256
294 | \end{bmatrix} \\
295 | \begin{bmatrix}
296 | -5.1388 & -\infty & -\infty \\
297 | -8.9754 & 16.6212 & -\infty \\
298 | 1.2732 & 3.2819 & 1.4770
299 | \end{bmatrix}
300 | \end{bmatrix}
301 | $$
302 |
303 | - After applying softmax
304 |
305 | $$
306 | \mathbf{softmax\left(\frac{(Q_{transformed} \times K_{transformed}^T)_{masked}}{\sqrt{3}}\right)} =
307 | \begin{bmatrix}
308 | \begin{bmatrix}
309 | 1.0000 & 0.0000 & 0.0000 \\
310 | 0.0724 & 0.9276 & 0.0000 \\
311 | 0.0015 & 0.0878 & 0.9107
312 | \end{bmatrix} \\
313 | \begin{bmatrix}
314 | 1.0000 & 0.0000 & 0.0000 \\
315 | 0.0000 & 1.0000 & 0.0000 \\
316 | 0.0807 & 0.7843 & 0.1350
317 | \end{bmatrix}
318 | \end{bmatrix}
319 | $$
320 |
321 | - We can also implement dropout after this
322 |
323 | ## Step 9: Compute Context Vector
324 |
325 | $$
326 | \mathbf{Context\ Vector} = \mathbf{Attention\ Weights} \times \mathbf{Values}
327 | $$
328 |
329 | $$
330 | \mathbf{Context\ Vector} =
331 | \begin{bmatrix}
332 | \begin{bmatrix}
333 | 1.0000 & 0.0000 & 0.0000 \\
334 | 0.0724 & 0.9276 & 0.0000 \\
335 | 0.0015 & 0.0878 & 0.9107
336 | \end{bmatrix} \\
337 | \begin{bmatrix}
338 | 1.0000 & 0.0000 & 0.0000 \\
339 | 0.0000 & 1.0000 & 0.0000 \\
340 | 0.0807 & 0.7843 & 0.1350
341 | \end{bmatrix}
342 | \end{bmatrix}
343 | \times
344 | \begin{bmatrix}
345 | \begin{bmatrix}
346 | -3.6194 & 2.0935 & 1.3879 \\
347 | 1.1106 & -0.4063 & -0.5588 \\
348 | -1.6594 & 0.1052 & -0.0468
349 | \end{bmatrix} \\
350 | \begin{bmatrix}
351 | 2.1231 & -1.2262 & -0.2556 \\
352 | 1.8222 & 1.8721 & 0.4929 \\
353 | 0.8916 & -1.4384 & -0.5651
354 | \end{bmatrix}
355 | \end{bmatrix}
356 | $$
357 |
358 |
359 |
360 | $$
361 | \mathbf{Context\ Vector} =
362 | \begin{bmatrix}
363 | \begin{bmatrix}
364 | -3.6194 & 2.0935 & 1.3879 \\
365 | 0.8036 & -0.2926 & -0.4385 \\
366 | -1.5123 & 0.0901 & -0.0603
367 | \end{bmatrix} \\
368 | \begin{bmatrix}
369 | 2.1231 & -1.2262 & -0.2556 \\
370 | 1.8222 & 1.8721 & 0.4929 \\
371 | 1.7251 & 1.0674 & 0.3078
372 | \end{bmatrix}
373 | \end{bmatrix}
374 | $$
375 |
376 | Dimension:
377 | (b, num_head, num_tokens, num_tokens)
378 | ×
379 | (b, num_head, num_tokens, head_dim)
380 | →
381 | (b, num_head, num_tokens, head_dim)
382 |
383 |
384 | Each row represents the context vector for a particular token
385 |
386 | - We need to merge $num_{head}$ and $head_{dim}$ because resultant context vector matrix should have dimension of $d_{out}$
387 |
388 | ## Step 10 : Reformat context Vector
389 |
390 |
391 |
392 | This transpose brings each token's per-head outputs next to each other, so they can be merged easily
393 |
394 | $$
395 | \mathbf{Context\ Vector^\top } =
396 | \begin{bmatrix}
397 | \begin{bmatrix}
398 | -3.6194 & 2.0935 & 1.3879 \\
399 | 2.1231 & -1.2262 & -0.2556
400 | \end{bmatrix} \\
401 | \begin{bmatrix}
402 | 0.8036 & -0.2926 & -0.4385 \\
403 | 1.8222 & 1.8721 & 0.4929
404 | \end{bmatrix} \\
405 | \begin{bmatrix}
406 | -1.5123 & 0.0901 & -0.0603 \\
407 | 1.7251 & 1.0674 & 0.3078
408 | \end{bmatrix}
409 | \end{bmatrix}
410 | $$
411 |
412 | Each inner matrix represent each token
413 |
414 | ## Step 11: Combine or Flatten each token
415 |
416 |
417 | $$
418 | \mathbf{Context\ Vector^\top } =
419 | \begin{bmatrix}
420 | \begin{bmatrix} -3.6194 & 2.0935 & 1.3879 & 2.1231 & -1.2262 & -0.2556 \end{bmatrix} \\
421 | \begin{bmatrix} 0.8036 & -0.2926 & -0.4385 & 1.8222 & 1.8721 & 0.4929 \end{bmatrix} \\
422 | \begin{bmatrix} -1.5123 & 0.0901 & -0.0603 & 1.7251 & 1.0674 & 0.3078 \end{bmatrix}
423 | \end{bmatrix}
424 | $$
425 |
426 | - Rows represent context vector for each token
427 |
428 | Dimension : $(b,num_{token}, d_{out})$ = (1, 3, 6)
--------------------------------------------------------------------------------
/18. Multihead Attention/notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 11,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import torch\n",
10 | "import torch.nn as nn\n",
11 | "class MultiHeadAttention(nn.Module):\n",
12 | " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):\n",
13 | " super().__init__()\n",
14 | " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n",
15 | " self.d_out = d_out\n",
16 | " self.num_heads = num_heads\n",
17 | "\n",
18 | " #Step 2\n",
19 | " self.head_dim = d_out // num_heads \n",
20 | "\n",
21 | " #Step 3\n",
22 | " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
23 | " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
24 | " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
25 | " self.out_proj = nn.Linear(d_out, d_out) \n",
26 | "\n",
27 | " self.dropout = nn.Dropout(dropout)\n",
28 | " \n",
29 | " #...Step 8 \n",
30 | " self.register_buffer(\n",
31 | " 'mask',\n",
32 | " torch.triu(torch.ones(context_length, context_length), diagonal=1)\n",
33 | " )\n",
34 | " \n",
35 | " def forward(self, x):\n",
36 | " b, num_tokens, d_in = x.shape\n",
37 | "\n",
38 | " #Step 4\n",
39 | " keys = self.W_key(x) \n",
40 | " queries = self.W_query(x) \n",
41 | " values = self.W_value(x) \n",
42 | "\n",
43 | " #Step 5\n",
44 | " keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) \n",
45 | " queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)\n",
46 | " values = values.view(b, num_tokens, self.num_heads, self.head_dim) \n",
47 | "\n",
48 | " #Step 6\n",
49 | " keys = keys.transpose(1, 2) \n",
50 | " queries = queries.transpose(1, 2) \n",
51 | " values = values.transpose(1, 2) \n",
52 | "\n",
53 | " #Step 7\n",
54 | " attn_scores = queries @ keys.transpose(2, 3) \n",
55 | "\n",
56 | " #Step 8\n",
57 | " mask_bool = self.mask.bool()[:num_tokens, :num_tokens] \n",
58 | " attn_scores.masked_fill_(mask_bool, -torch.inf) \n",
59 | " attn_weights = torch.softmax(\n",
60 | " attn_scores / keys.shape[-1]**0.5, dim=-1\n",
61 | " )\n",
62 | " attn_weights = self.dropout(attn_weights)\n",
63 | "\n",
64 | " #Step 9 + Step 10\n",
65 | " context_vec = (attn_weights @ values).transpose(1, 2)\n",
66 | "\n",
67 | " #Step 10 + Step 11\n",
68 | " context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)\n",
69 | " context_vec = self.out_proj(context_vec)\n",
70 | " \n",
71 | " return context_vec"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 14,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "name": "stdout",
81 | "output_type": "stream",
82 | "text": [
83 | "torch.Size([2, 3, 4])\n",
84 | "tensor([[[ 0.6326, -0.4115],\n",
85 | " [ 0.7489, -0.2632],\n",
86 | " [ 0.7444, -0.2735]],\n",
87 | "\n",
88 | " [[ 0.6326, -0.4115],\n",
89 | " [ 0.7489, -0.2632],\n",
90 | "          [ 0.7444, -0.2735]]], grad_fn=<ViewBackward0>)\n",
91 | "context_vecs.shape: torch.Size([2, 3, 2])\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "#Step 1\n",
97 | "a = torch.tensor([[0.2745, 0.6584, 0.2775, 0.8573],\n",
98 | " [0.8993, 0.0390, 0.9268, 0.7388],\n",
99 | " [0.7179, 0.7058, 0.9156, 0.4340]],\n",
100 | " )\n",
101 | "\n",
102 | "torch.manual_seed(123)\n",
103 | "batch=torch.stack((a,a),dim=0)\n",
104 | "print(batch.shape)\n",
105 | "\n",
106 | "batch_size, context_length, d_in = batch.shape\n",
107 | "d_out = 2\n",
108 | "mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)\n",
109 | "context_vecs = mha(batch)\n",
110 | "print(context_vecs)\n",
111 | "print(\"context_vecs.shape:\", context_vecs.shape)"
112 | ]
113 | }
114 | ],
115 | "metadata": {
116 | "kernelspec": {
117 | "display_name": "Python 3",
118 | "language": "python",
119 | "name": "python3"
120 | },
121 | "language_info": {
122 | "codemirror_mode": {
123 | "name": "ipython",
124 | "version": 3
125 | },
126 | "file_extension": ".py",
127 | "mimetype": "text/x-python",
128 | "name": "python",
129 | "nbconvert_exporter": "python",
130 | "pygments_lexer": "ipython3",
131 | "version": "3.11.5"
132 | }
133 | },
134 | "nbformat": 4,
135 | "nbformat_minor": 2
136 | }
137 |
--------------------------------------------------------------------------------
/19. Birds Eye View of LLM.md:
--------------------------------------------------------------------------------
1 | ## Birds Eye View
2 |
3 | Now we will go through the LLM Architecture
4 |
5 | ### Coding LLM Architecture
6 |
7 |
8 |
9 | - Expanding Transformer Block
10 |
11 |
12 |
13 | Seen So far:
14 | a. Input Tokenization
15 | b. Embedding (Token + Positional)
16 | c. Masked Multi-Head Attention
17 |
18 |
19 |
20 | Will Look:
21 | a. Transformer Block
22 |
23 | - Upto the size of GPT 2 :-
24 | Params: 117M
25 | Layer : 12
26 | $d_{model}$ : 768
27 |
28 | - OpenAI has made GPT 2 weights public; GPT 3 and GPT 4 weights have not yet been made public.
29 |
30 | - Configuration:
31 |
32 | ```python
33 | GPT_CONFIG_124M = {
34 | "vocab_size": 50257, # Vocabulary size
35 | "context_length": 1024, # Context length
36 | "emb_dim": 768, # Embedding dimension
37 | "n_heads": 12, # Number of attention heads
38 | "n_layers": 12, # Number of layers
39 | "drop_rate": 0.1, # Dropout rate
40 | "qkv_bias": False # Query-Key-Value bias
41 | }
42 | ```
43 | Inside `forward method`:
44 | 1. "every efforts moves you" converted into `Token ID`.
45 | 2. Each `Token ID` get converted into `embedding vector` of 768 dimension.
46 | 3. Create Positional Embedding
47 | 4. Add Token Embedding and Positional Embedding
48 | 5. Implement Dropout
49 | 6. Pass through `Transformer Block` (We have 12 Transformer Block in GPT 2)
--------------------------------------------------------------------------------
/19. LLM Architecture/notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "tensor([[6109, 3626, 6100, 345],\n",
13 | " [6109, 1110, 6622, 257]])\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import tiktoken\n",
19 | "import torch\n",
20 | "tokenizer=tiktoken.get_encoding(\"gpt2\")\n",
21 | "batch=[]\n",
22 | "\n",
23 | "txt1=\"Every effort moves you\"\n",
24 | "txt2=\"Every day holds a\"\n",
25 | "batch.append(torch.tensor(tokenizer.encode(txt1)))\n",
26 | "batch.append(torch.tensor(tokenizer.encode(txt2)))\n",
27 | "\n",
28 | "batch=torch.stack(batch,dim=0)\n",
29 | "print(batch)"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "import torch\n",
39 | "import torch.nn as nn\n",
40 | "\n",
41 | "\n",
42 | "class GPTModel(nn.Module):\n",
43 | "\n",
44 | " def __init__(self,cfg):\n",
45 | " super().__init__()\n",
46 | " self.tok_emb=nn.Embedding(cfg[\"vocab_size\"],cfg[\"emb_dim\"])\n",
47 | " self.pos_emb=nn.Embedding(cfg[\"context_length\"],cfg[\"emb_dim\"])\n",
48 | "        self.drop_emb=nn.Dropout(cfg[\"drop_rate\"])\n",
49 | " self.out_head=nn.Linear(\n",
50 | " cfg[\"emb_dim\"],cfg[\"vocab_size\"],bias=False\n",
51 | " )\n",
52 | "\n",
53 | " def forward(self,in_idx):\n",
54 | " batch_size,seq_len=in_idx.shape\n",
55 | " tok_embeds=self.tok_emb(in_idx)\n",
56 | " pos_embeds=self.pos_emb(torch.arange(seq_len,device=in_idx.device))\n",
57 | " x=tok_embeds+pos_embeds\n",
58 | " x=self.drop_emb(x)\n",
59 | " x=self.trf_blocks(x)\n",
60 | " x=self.final_norm(x)\n",
61 | " logits=self.out_head(x)\n",
62 | " return logits"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": []
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python 3",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.11.5"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 2
94 | }
95 |
--------------------------------------------------------------------------------
/2. LLM Basics.md:
--------------------------------------------------------------------------------
1 | # Basics
2 | ## 1. What is Large Language Model
3 | Neural Network designed to `UNDERSTAND`, `GENERATE` and `RESPOND` to human like text.
4 |
5 | or
6 |
7 | Deep Neural Network trained on massive amount of text data.
8 |
9 | [Check Here](https://chatgpt.com/) 😁
10 |
11 | ## 2. What does `Large` mean in Large Language Model
12 | Models have billions of `parameters`!
13 | > Parameters: Number of `weights` and `biases`.
14 |
15 |
16 |
17 |
18 | `LANGUAGE MODELS`: Only deal with language/NLP tasks.
19 |
20 | ## 3. LLM vs Earlier NLP models
21 | NLP Models: Designed for Specific Tasks.
22 | LLM: Wide range of NLP tasks from same architecture.
23 |
24 | NLP: Could not write Emails.
25 | LLM: Can write Emails 😅.
26 |
27 | ## 4. What makes LLM so good
28 | `Transformers`
29 |
30 | Not this one 😁
31 |
32 | This one Instead
33 |
34 | Paper: [Attention is All You Need](https://arxiv.org/pdf/1706.03762)
35 |
36 | ## 5. LLM vs GenAI vs Deep Learning vs Machine Learning
37 |
38 |
39 |
40 | ```
41 | AI: Ruled based AI-Booking GAS using Whatsapp bot (not an ML)
42 | |_ML: Involve Neural Network and others fields too, like decision tree.
43 | |_DL: ANN, CNN, RNN, LSTM
44 | |_LLM: Doesn't deal with images
45 | ```
46 | `Generative AI`: LLM+DL (also deals with images, sound etc)
47 |
48 | ## 6. Application of LLMs
49 | 5 main category:
50 | - [Content Creation](https://chatgpt.com/share/6783dd73-43d4-800e-8f5b-64fcc0ece76b)
51 | - [Chatbots / Virtual Assistant](https://chatgpt.com/share/6783de08-5d90-800e-b8b2-5d786fa99686)
52 | - [Machine Translation](https://chatgpt.com/share/6783de5a-1728-800e-881a-719992cd51e7)
53 | - [Sentiment Analysis](https://chatgpt.com/share/6783deb4-cff4-800e-a501-5abacea9b143)
54 | - [Novel text generation](https://chatgpt.com/share/6783df62-8c48-800e-b655-f27afb5cccc7)
--------------------------------------------------------------------------------
/3. Stages of Building LLM.md:
--------------------------------------------------------------------------------
1 | # Two Stages: Pretraining + Finetuning
2 |
3 | ## 1. Pretraining
4 | Meaning: `Training` on large dataset
5 | Let (1 Token=1 Word)
6 | Locations where data collected for training GPT3.
7 | [SOURCE](https://arxiv.org/pdf/2005.14165)
8 |
9 | + [CommonCRAWL](https://commoncrawl.org/)
10 | + [Wikipedia](https://www.wikipedia.org/)
11 | + [WebText2](https://openwebtext2.readthedocs.io/en/latest/#download-raw-scrapes-version)
12 | + Books
13 |
14 | Initially it was trained for word completion
15 |
16 |
17 |
18 | [SOURCE](https://openai.com/index/language-unsupervised/)
19 | > Note: A pretrained model is also known as a `Foundational Model`.
20 |
21 | ## 2. Finetuning
22 | Meaning: `Refinement` by training on narrower dataset, specific to a particular task or domain.
23 |
24 |
25 | [SOURCE >>](https://openai.com/index/introducing-improvements-to-the-fine-tuning-api-and-expanding-our-custom-models-program/)
26 | [CHECK](https://www.harvey.ai/)
27 |
28 | ---
29 | Now I know source where they trained I will also train and create `FOUNDATIONAL MODEL`🥳
30 | But
31 |
32 |
33 |
34 |
35 | ---
36 | Steps of building LLM:
37 | 1. Training on large corpus of text data (RAW text: regular text without labelling info)
38 | 2. First Training Stage: `Pre-Training`
39 | Creating initial pretrained LLM, base/foundational model.
40 | 3. Further train on labelled data: `Fine Tuning`
41 | A. Instruction Finetuning `Dataset consist of Instruction - Answer`
42 | B. Finetuning for classification task `Dataset consists of Text - Associated Label`
43 |
--------------------------------------------------------------------------------
/4. Basic Transformer.md:
--------------------------------------------------------------------------------
1 | ## 1. Transformer Architecture
2 | - Most modern LLMs rely on the transformer architecture.
3 | Paper: [Attention is all you need](https://arxiv.org/pdf/1706.03762)
4 | - Initially this paper was published for the Translation Task (English to German and French)
5 |
6 |
7 |
8 | "This is an Example" -> Broken into Tokens -> `Encoder` (Captures semantic meaning) | Some Neural Networks are also trained for this step -> Vector Embedding
9 |
10 | "Das ist ein" -> Input text -> Preprocessing -> `Decoder` -> Output Layer -> "Das ist ein `Beispiel`"
11 |
12 | + `Encoder`: Encode input text into Vector
13 | + `Decoder`: Generates output text from encoded vectors
14 |
15 | > Note: In GPT there is no Encoder
16 |
17 | ## 2. Attention
18 | #### Key part of Transformer
19 |
20 |
21 | + Allow model to weigh importance of different words/ tokens relative to each other.
22 |
23 | + Enable model to capture long range dependencies
24 |
25 | ## 3. Later Variation of Transformer
26 | BERT: Bidirectional encoder representations from Transformers.
27 | + Receives input where words are randomly masked during training
28 |
29 | This is an ____ of how LLM _____ perform
30 | > This is an `example` of how LLM `can` perform.
31 |
32 | GPT: Generative Pretrained Transformers.
33 | + Learn to generate one word at a time
34 |
35 | This is an example of how LLM can perform.
36 | > This is an example of how LLM `can` perform.
37 |
38 | ## 4. Transformer VS LLM
39 | `Not all transformers are LLMs`
40 | + Transformers can also be used for Computer Vision. [Vision Transformer](https://viso.ai/deep-learning/vision-transformer-vit/)
41 |
42 | `Not all LLMs are Transformer`
43 | + LLMs can be based on RNN or CNN architecture as well
--------------------------------------------------------------------------------
/5. GPT working.md:
--------------------------------------------------------------------------------
1 | ## 1. Progression from Transformer to GPT
2 | 2017 Attention is all you need
3 | + Introducing self-attention mechanism
4 | + Introduce Transformer
5 |
6 | 2018 Generative Pre-Training
7 | + Unsupervised Pre-Training
8 |
9 | 2019 Language Models are Unsupervised Multitask Learners
10 | + 4 models used (GPT 2 architecture)
11 | + GPT-2 SMALL, GPT-2 MEDIUM, GPT-2 LARGE, GPT-2 EXTRA LARGE
12 |
13 | 2020 Language Models are Few-Shot Learners
14 | + 8 models used (GPT 3 architecture)
15 | + Can do numerous things
16 |
17 | 2022 GPT 3.5 (Commercially Viral)
18 |
19 | 2024 GPT 4
20 |
21 | > What is Difference between GPT and Transformer :
22 | > GPT borrow architecture from Transformer and don't have Encoder block
23 |
24 |
25 |
26 | ---
27 | ## 2. Zero Shot vs Few Shot Learning
28 |
29 | `Zero Shot Learning` : Model predict answer given any description
30 | > "Hey Convert this into French"
31 |
32 | `One Shot Learning` : Model sees single example of task then perform
33 | > "Here is translation of XYZ, Translate PQR into French"
34 |
35 | `Few Shot Learning` : Model sees few examples of task then perform
36 | + GPT 3 was few shot learner, although it was only trained for language translation.
37 |
38 | > GPT 4 is/was few shot learner, but can even do zero shot learning.
39 |
###### Zero shot learning is getting results without providing examples; few shot learning is getting results by providing some examples.
41 |
42 | ---
43 | `ALREADY COVERED`
44 |
45 | `Token` : Complex procedure to convert words into tokens called `Tokenization`
46 | OR
47 | Unit or word which models read
48 |
49 | + GPT3 trained on 300B token, Pretraining costs $4.6 Million.
50 | + Pre trained models are BASE/FOUNDATIONAL models which can be used for further finetuning.
51 | + Many pretrained LLMs are available as Open source models.
52 | + GPT consist of only DECODER block
53 | ---
54 | ## 3. Why GPT known as Auto Regressor model and why falls under Unsupervised Learning
55 |
56 | > GPT models are simply trained as next-word predictors
57 |
58 |
59 |
60 | `Unsupervised`: We use structure of data to create label itself
61 |
62 | `Auto Regressive`: Previous output used as input for future prediction
63 |
64 | ---
65 | + GPT architecture is simpler, have no encoder block
66 |
67 | Original Transformer: 6 encoder-decoder block
68 | GPT 3: 96 transformer layers, 175 billion parameters
69 |
70 |
71 | ---
72 | ## 4. Emergent Behaviour
73 | > Although GPT was only trained for the translation task, it performs other tasks too.
--------------------------------------------------------------------------------
/6. Stages.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Stage I | Building an LLM
4 | Understand basic Mechanism
5 | - Data preparation and Sampling
6 | - `Tokenization` : Break sentence into token
7 | - `Vector Embeddings` : Capture semantic meaning + Positional Encoding
8 | - Construct data into batches
9 | - Attention Mechanism
10 | - What does Mean and different types
11 | - LLM Architecture
12 |
13 | ## Stage II | Pretraining | Foundational Model
14 | Pretrain on unlabeled data
15 | - Training Loop
16 | - Model Evaluation
17 | - Load pretrained weights
18 |
19 |
20 |
21 | ## Stage III | Finetuning
22 | - Classifier
23 | - Personal Assistant
24 |
25 |
--------------------------------------------------------------------------------
/7. Tokenization/7. Tokenization.md:
--------------------------------------------------------------------------------
1 | ## Tokenization
2 | "Data Preparation and Sampling"
3 | Data needs to be preprocessed in a manner suitable for Pretraining.
4 |
5 | Breaking down sentences into specific words/tokens.
6 |
7 | ## How do you prepare input text for training LLM?
8 | Step 1: Splitting text into individual word and subword token
9 | Step 2: Convert token into token IDs
10 | Step 3: Encode token IDs into vector representation (Vector Embedding)
11 |
12 | ## Step 1 : Tokenization Text
13 |
--------------------------------------------------------------------------------
/7. Tokenization/the-verdict.txt:
--------------------------------------------------------------------------------
1 | I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)
2 |
3 | "The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?
4 |
5 | Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him. Among his own sex fewer regrets were heard, and in his own trade hardly a murmur. Professional jealousy? Perhaps. If it were, the honour of the craft was vindicated by little Claude Nutley, who, in all good faith, brought out in the Burlington a very handsome "obituary" on Jack--one of those showy articles stocked with random technicalities that I have heard (I won't say by whom) compared to Gisburn's painting. And so--his resolve being apparently irrevocable--the discussion gradually died out, and, as Mrs. Thwing had predicted, the price of "Gisburns" went up.
6 |
7 | It was not till three years later that, in the course of a few weeks' idling on the Riviera, it suddenly occurred to me to wonder why Gisburn had given up his painting. On reflection, it really was a tempting problem. To accuse his wife would have been too easy--his fair sitters had been denied the solace of saying that Mrs. Gisburn had "dragged him down." For Mrs. Gisburn--as such--had not existed till nearly a year after Jack's resolve had been taken. It might be that he had married her--since he liked his ease--because he didn't want to go on painting; but it would have been hard to prove that he had given up his painting because he had married her.
8 |
9 | Of course, if she had not dragged him down, she had equally, as Miss Croft contended, failed to "lift him up"--she had not led him back to the easel. To put the brush into his hand again--what a vocation for a wife! But Mrs. Gisburn appeared to have disdained it--and I felt it might be interesting to find out why.
10 |
11 | The desultory life of the Riviera lends itself to such purely academic speculations; and having, on my way to Monte Carlo, caught a glimpse of Jack's balustraded terraces between the pines, I had myself borne thither the next day.
12 |
13 | I found the couple at tea beneath their palm-trees; and Mrs. Gisburn's welcome was so genial that, in the ensuing weeks, I claimed it frequently. It was not that my hostess was "interesting": on that point I could have given Miss Croft the fullest reassurance. It was just because she was _not_ interesting--if I may be pardoned the bull--that I found her so. For Jack, all his life, had been surrounded by interesting women: they had fostered his art, it had been reared in the hot-house of their adulation. And it was therefore instructive to note what effect the "deadening atmosphere of mediocrity" (I quote Miss Croft) was having on him.
14 |
15 | I have mentioned that Mrs. Gisburn was rich; and it was immediately perceptible that her husband was extracting from this circumstance a delicate but substantial satisfaction. It is, as a rule, the people who scorn money who get most out of it; and Jack's elegant disdain of his wife's big balance enabled him, with an appearance of perfect good-breeding, to transmute it into objects of art and luxury. To the latter, I must add, he remained relatively indifferent; but he was buying Renaissance bronzes and eighteenth-century pictures with a discrimination that bespoke the amplest resources.
16 |
17 | "Money's only excuse is to put beauty into circulation," was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed luncheon-table, when, on a later day, I had again run over from Monte Carlo; and Mrs. Gisburn, beaming on him, added for my enlightenment: "Jack is so morbidly sensitive to every form of beauty."
18 |
19 | Poor Jack! It had always been his fate to have women say such things of him: the fact should be set down in extenuation. What struck me now was that, for the first time, he resented the tone. I had seen him, so often, basking under similar tributes--was it the conjugal note that robbed them of their savour? No--for, oddly enough, it became apparent that he was fond of Mrs. Gisburn--fond enough not to see her absurdity. It was his own absurdity he seemed to be wincing under--his own attitude as an object for garlands and incense.
20 |
21 | "My dear, since I've chucked painting people don't say that stuff about me--they say it about Victor Grindle," was his only protest, as he rose from the table and strolled out onto the sunlit terrace.
22 |
23 | I glanced after him, struck by his last word. Victor Grindle was, in fact, becoming the man of the moment--as Jack himself, one might put it, had been the man of the hour. The younger artist was said to have formed himself at my friend's feet, and I wondered if a tinge of jealousy underlay the latter's mysterious abdication. But no--for it was not till after that event that the _rose Dubarry_ drawing-rooms had begun to display their "Grindles."
24 |
25 | I turned to Mrs. Gisburn, who had lingered to give a lump of sugar to her spaniel in the dining-room.
26 |
27 | "Why _has_ he chucked painting?" I asked abruptly.
28 |
29 | She raised her eyebrows with a hint of good-humoured surprise.
30 |
31 | "Oh, he doesn't _have_ to now, you know; and I want him to enjoy himself," she said quite simply.
32 |
33 | I looked about the spacious white-panelled room, with its _famille-verte_ vases repeating the tones of the pale damask curtains, and its eighteenth-century pastels in delicate faded frames.
34 |
35 | "Has he chucked his pictures too? I haven't seen a single one in the house."
36 |
37 | A slight shade of constraint crossed Mrs. Gisburn's open countenance. "It's his ridiculous modesty, you know. He says they're not fit to have about; he's sent them all away except one--my portrait--and that I have to keep upstairs."
38 |
39 | His ridiculous modesty--Jack's modesty about his pictures? My curiosity was growing like the bean-stalk. I said persuasively to my hostess: "I must really see your portrait, you know."
40 |
41 | She glanced out almost timorously at the terrace where her husband, lounging in a hooded chair, had lit a cigar and drawn the Russian deerhound's head between his knees.
42 |
43 | "Well, come while he's not looking," she said, with a laugh that tried to hide her nervousness; and I followed her between the marble Emperors of the hall, and up the wide stairs with terra-cotta nymphs poised among flowers at each landing.
44 |
45 | In the dimmest corner of her boudoir, amid a profusion of delicate and distinguished objects, hung one of the familiar oval canvases, in the inevitable garlanded frame. The mere outline of the frame called up all Gisburn's past!
46 |
47 | Mrs. Gisburn drew back the window-curtains, moved aside a _jardiniere_ full of pink azaleas, pushed an arm-chair away, and said: "If you stand here you can just manage to see it. I had it over the mantel-piece, but he wouldn't let it stay."
48 |
49 | Yes--I could just manage to see it--the first portrait of Jack's I had ever had to strain my eyes over! Usually they had the place of honour--say the central panel in a pale yellow or _rose Dubarry_ drawing-room, or a monumental easel placed so that it took the light through curtains of old Venetian point. The more modest place became the picture better; yet, as my eyes grew accustomed to the half-light, all the characteristic qualities came out--all the hesitations disguised as audacities, the tricks of prestidigitation by which, with such consummate skill, he managed to divert attention from the real business of the picture to some pretty irrelevance of detail. Mrs. Gisburn, presenting a neutral surface to work on--forming, as it were, so inevitably the background of her own picture--had lent herself in an unusual degree to the display of this false virtuosity. The picture was one of Jack's "strongest," as his admirers would have put it--it represented, on his part, a swelling of muscles, a congesting of veins, a balancing, straddling and straining, that reminded one of the circus-clown's ironic efforts to lift a feather. It met, in short, at every point the demand of lovely woman to be painted "strongly" because she was tired of being painted "sweetly"--and yet not to lose an atom of the sweetness.
50 |
51 | "It's the last he painted, you know," Mrs. Gisburn said with pardonable pride. "The last but one," she corrected herself--"but the other doesn't count, because he destroyed it."
52 |
53 | "Destroyed it?" I was about to follow up this clue when I heard a footstep and saw Jack himself on the threshold.
54 |
55 | As he stood there, his hands in the pockets of his velveteen coat, the thin brown waves of hair pushed back from his white forehead, his lean sunburnt cheeks furrowed by a smile that lifted the tips of a self-confident moustache, I felt to what a degree he had the same quality as his pictures--the quality of looking cleverer than he was.
56 |
57 | His wife glanced at him deprecatingly, but his eyes travelled past her to the portrait.
58 |
59 | "Mr. Rickham wanted to see it," she began, as if excusing herself. He shrugged his shoulders, still smiling.
60 |
61 | "Oh, Rickham found me out long ago," he said lightly; then, passing his arm through mine: "Come and see the rest of the house."
62 |
63 | He showed it to me with a kind of naive suburban pride: the bath-rooms, the speaking-tubes, the dress-closets, the trouser-presses--all the complex simplifications of the millionaire's domestic economy. And whenever my wonder paid the expected tribute he said, throwing out his chest a little: "Yes, I really don't see how people manage to live without that."
64 |
65 | Well--it was just the end one might have foreseen for him. Only he was, through it all and in spite of it all--as he had been through, and in spite of, his pictures--so handsome, so charming, so disarming, that one longed to cry out: "Be dissatisfied with your leisure!" as once one had longed to say: "Be dissatisfied with your work!"
66 |
67 | But, with the cry on my lips, my diagnosis suffered an unexpected check.
68 |
69 | "This is my own lair," he said, leading me into a dark plain room at the end of the florid vista. It was square and brown and leathery: no "effects"; no bric-a-brac, none of the air of posing for reproduction in a picture weekly--above all, no least sign of ever having been used as a studio.
70 |
71 | The fact brought home to me the absolute finality of Jack's break with his old life.
72 |
73 | "Don't you ever dabble with paint any more?" I asked, still looking about for a trace of such activity.
74 |
75 | "Never," he said briefly.
76 |
77 | "Or water-colour--or etching?"
78 |
79 | His confident eyes grew dim, and his cheeks paled a little under their handsome sunburn.
80 |
81 | "Never think of it, my dear fellow--any more than if I'd never touched a brush."
82 |
83 | And his tone told me in a flash that he never thought of anything else.
84 |
85 | I moved away, instinctively embarrassed by my unexpected discovery; and as I turned, my eye fell on a small picture above the mantel-piece--the only object breaking the plain oak panelling of the room.
86 |
87 | "Oh, by Jove!" I said.
88 |
89 | It was a sketch of a donkey--an old tired donkey, standing in the rain under a wall.
90 |
91 | "By Jove--a Stroud!" I cried.
92 |
93 | He was silent; but I felt him close behind me, breathing a little quickly.
94 |
95 | "What a wonder! Made with a dozen lines--but on everlasting foundations. You lucky chap, where did you get it?"
96 |
97 | He answered slowly: "Mrs. Stroud gave it to me."
98 |
99 | "Ah--I didn't know you even knew the Strouds. He was such an inflexible hermit."
100 |
101 | "I didn't--till after. . . . She sent for me to paint him when he was dead."
102 |
103 | "When he was dead? You?"
104 |
105 | I must have let a little too much amazement escape through my surprise, for he answered with a deprecating laugh: "Yes--she's an awful simpleton, you know, Mrs. Stroud. Her only idea was to have him done by a fashionable painter--ah, poor Stroud! She thought it the surest way of proclaiming his greatness--of forcing it on a purblind public. And at the moment I was _the_ fashionable painter."
106 |
107 | "Ah, poor Stroud--as you say. Was _that_ his history?"
108 |
109 | "That was his history. She believed in him, gloried in him--or thought she did. But she couldn't bear not to have all the drawing-rooms with her. She couldn't bear the fact that, on varnishing days, one could always get near enough to see his pictures. Poor woman! She's just a fragment groping for other fragments. Stroud is the only whole I ever knew."
110 |
111 | "You ever knew? But you just said--"
112 |
113 | Gisburn had a curious smile in his eyes.
114 |
115 | "Oh, I knew him, and he knew me--only it happened after he was dead."
116 |
117 | I dropped my voice instinctively. "When she sent for you?"
118 |
119 | "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"
120 |
121 | He laughed again, and threw back his head to look up at the sketch of the donkey. "There were days when I couldn't look at that thing--couldn't face it. But I forced myself to put it here; and now it's cured me--cured me. That's the reason why I don't dabble any more, my dear Rickham; or rather Stroud himself is the reason."
122 |
123 | For the first time my idle curiosity about my companion turned into a serious desire to understand him better.
124 |
125 | "I wish you'd tell me how it happened," I said.
126 |
127 | He stood looking up at the sketch, and twirling between his fingers a cigarette he had forgotten to light. Suddenly he turned toward me.
128 |
129 | "I'd rather like to tell you--because I've always suspected you of loathing my work."
130 |
131 | I made a deprecating gesture, which he negatived with a good-humoured shrug.
132 |
133 | "Oh, I didn't care a straw when I believed in myself--and now it's an added tie between us!"
134 |
135 | He laughed slightly, without bitterness, and pushed one of the deep arm-chairs forward. "There: make yourself comfortable--and here are the cigars you like."
136 |
137 | He placed them at my elbow and continued to wander up and down the room, stopping now and then beneath the picture.
138 |
139 | "How it happened? I can tell you in five minutes--and it didn't take much longer to happen. . . . I can remember now how surprised and pleased I was when I got Mrs. Stroud's note. Of course, deep down, I had always _felt_ there was no one like him--only I had gone with the stream, echoed the usual platitudes about him, till I half got to think he was a failure, one of the kind that are left behind. By Jove, and he _was_ left behind--because he had come to stay! The rest of us had to let ourselves be swept along or go under, but he was high above the current--on everlasting foundations, as you say.
140 |
141 | "Well, I went off to the house in my most egregious mood--rather moved, Lord forgive me, at the pathos of poor Stroud's career of failure being crowned by the glory of my painting him! Of course I meant to do the picture for nothing--I told Mrs. Stroud so when she began to stammer something about her poverty. I remember getting off a prodigious phrase about the honour being _mine_--oh, I was princely, my dear Rickham! I was posing to myself like one of my own sitters.
142 |
143 | "Then I was taken up and left alone with him. I had sent all my traps in advance, and I had only to set up the easel and get to work. He had been dead only twenty-four hours, and he died suddenly, of heart disease, so that there had been no preliminary work of destruction--his face was clear and untouched. I had met him once or twice, years before, and thought him insignificant and dingy. Now I saw that he was superb.
144 |
145 | "I was glad at first, with a merely aesthetic satisfaction: glad to have my hand on such a 'subject.' Then his strange life-likeness began to affect me queerly--as I blocked the head in I felt as if he were watching me do it. The sensation was followed by the thought: if he _were_ watching me, what would he say to my way of working? My strokes began to go a little wild--I felt nervous and uncertain.
146 |
147 | "Once, when I looked up, I seemed to see a smile behind his close grayish beard--as if he had the secret, and were amusing himself by holding it back from me. That exasperated me still more. The secret? Why, I had a secret worth twenty of his! I dashed at the canvas furiously, and tried some of my bravura tricks. But they failed me, they crumbled. I saw that he wasn't watching the showy bits--I couldn't distract his attention; he just kept his eyes on the hard passages between. Those were the ones I had always shirked, or covered up with some lying paint. And how he saw through my lies!
148 |
149 | "I looked up again, and caught sight of that sketch of the donkey hanging on the wall near his bed. His wife told me afterward it was the last thing he had done--just a note taken with a shaking hand, when he was down in Devonshire recovering from a previous heart attack. Just a note! But it tells his whole history. There are years of patient scornful persistence in every line. A man who had swum with the current could never have learned that mighty up-stream stroke. . . .
150 |
151 | "I turned back to my work, and went on groping and muddling; then I looked at the donkey again. I saw that, when Stroud laid in the first stroke, he knew just what the end would be. He had possessed his subject, absorbed it, recreated it. When had I done that with any of my things? They hadn't been born of me--I had just adopted them. . . .
152 |
153 | "Hang it, Rickham, with that face watching me I couldn't do another stroke. The plain truth was, I didn't know where to put it--_I had never known_. Only, with my sitters and my public, a showy splash of colour covered up the fact--I just threw paint into their faces. . . . Well, paint was the one medium those dead eyes could see through--see straight to the tottering foundations underneath. Don't you know how, in talking a foreign language, even fluently, one says half the time not what one wants to but what one can? Well--that was the way I painted; and as he lay there and watched me, the thing they called my 'technique' collapsed like a house of cards. He didn't sneer, you understand, poor Stroud--he just lay there quietly watching, and on his lips, through the gray beard, I seemed to hear the question: 'Are you sure you know where you're coming out?'
154 |
155 | "If I could have painted that face, with that question on it, I should have done a great thing. The next greatest thing was to see that I couldn't--and that grace was given me. But, oh, at that minute, Rickham, was there anything on earth I wouldn't have given to have Stroud alive before me, and to hear him say: 'It's not too late--I'll show you how'?
156 |
157 | "It _was_ too late--it would have been, even if he'd been alive. I packed up my traps, and went down and told Mrs. Stroud. Of course I didn't tell her _that_--it would have been Greek to her. I simply said I couldn't paint him, that I was too moved. She rather liked the idea--she's so romantic! It was that that made her give me the donkey. But she was terribly upset at not getting the portrait--she did so want him 'done' by some one showy! At first I was afraid she wouldn't let me off--and at my wits' end I suggested Grindle. Yes, it was I who started Grindle: I told Mrs. Stroud he was the 'coming' man, and she told somebody else, and so it got to be true. . . . And he painted Stroud without wincing; and she hung the picture among her husband's things. . . ."
158 |
159 | He flung himself down in the arm-chair near mine, laid back his head, and clasping his arms beneath it, looked up at the picture above the chimney-piece.
160 |
161 | "I like to fancy that Stroud himself would have given it to me, if he'd been able to say what he thought that day."
162 |
163 | And, in answer to a question I put half-mechanically--"Begin again?" he flashed out. "When the one thing that brings me anywhere near him is that I knew enough to leave off?"
164 |
165 | He stood up and laid his hand on my shoulder with a laugh. "Only the irony of it is that I _am_ still painting--since Grindle's doing it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."
--------------------------------------------------------------------------------
/8. Byte Pair Encoding.md:
--------------------------------------------------------------------------------
1 | # Byte Pair Encoding
2 | - Much more sophisticated tokenization scheme.
3 | - It was used to train modern LLMs.
4 |
5 | ## Tokenization Algorithm
6 | 1. Word Based "My hobby is playing cricket" -> ['My' , 'hobby', 'is', 'playing', 'cricket']
7 | Problem:
8 | - What to do with Out of Vocabulary (OOV) words.
9 | - boy & boys are similar but the similarity is not captured.
10 |
11 | 2. **`Sub-word Based`**
12 | - It captures root words (Boy & Boys).
13 | There are 2 Rules:
14 |
15 | A. Do `not` split `frequently` used words into smaller subwords. [Feature from word level]
16 |
17 | B. `Split` the `rare words` into smaller, meaningful subwords. [Feature from character level]
18 |
19 | Features:
20 | - Subword splitting helps the model learn that different words with the same root word, like "token", "tokens" and "tokenizing", are `similar` in meaning.
21 | - It also helps the model learn that `"tokenization"` and `"modernization"` are made up of different root words but have the same suffix `"ization"` and are used in the same syntactic situations.
22 |
23 | 3. Character Based
24 | "My hobby is playing cricket" -> ['M','y','H','o',.....]
25 |
26 | Problem:
27 | - Small Vocabulary (every language has a fixed set of characters)
28 | - Meaning associated with the word is completely lost.
29 | - Tokenized sequence is much longer than initial raw text.
30 |
31 | ## BPE
32 | Byte Pair Encoding is a subword tokenization algorithm.
33 |
34 | History:
35 | - Introduced in 1994 as a data compression algorithm. [Paper](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)
36 |
37 |
38 | - `Most common` pair of `consecutive bytes` of data is `replaced` with a byte that does not occur in data.
39 |
40 | ## Practical Demonstration of BPE
41 | Original Data: aaabdaaabac
42 |
43 | - Most occurance: `'aa'`: 4 times
44 | - Replace it with Z
45 | Compressed data: **Z**abd**Z**abac
46 |
47 | - Next Common Byte pair: `'ab'`
48 | - Replace it with Y
49 | Compressed data: Z**Y**dZ**Y**ac
50 | Only `ac` byte pair left but appear only once.
51 |
52 | - We can also compress `ZY` replace it by `W`
53 | Compressed data: **W**d**W**ac
54 |
55 | ## How is BPE used with LLMs?
56 | Rule I: BPE ensures that most words in the vocabulary are represented as a `single token`, while `rare words` are `broken` down into two or more subword tokens.
57 |
58 | ## Practical Example
59 | - Let's consider below dataset of words:
60 | {"old":7,"older:3,"finest":9,"lowest":4}
61 |
62 | - Preprocessing : We need to add the end-of-word token `</w>` at the end of each word.
63 | {`"old</w>"`:7,`"older</w>"`:3,`"finest</w>"`:9,`"lowest</w>"`:4}
64 |
65 | - Split word into character and count their frequency.
66 |
67 | | Number | Token | Frequency |
68 | |--------|-------|-----------|
69 | | 1 | `` | 23 |
70 | | 2 | o | 14 |
71 | | 3 | l | 14 |
72 | | 4 | d | 10 |
73 | | 5 | e | 16 |
74 | | 6 | r | 3 |
75 | | 7 | f | 9 |
76 | | 8 | i | 9 |
77 | | 9 | n | 9 |
78 | | 10 | s | 13 |
79 | | 11 | t | 13 |
80 | | 12 | w | 4 |
81 |
82 | - Look for the most frequent pairing .
83 | Merge them and perform the iteration again and again until we reach the token limit or iteration limit.
84 |
85 | Iteration 1: Most appeared `e` pairing with `s`
86 |
87 | | Number | Token | Frequency |
88 | |--------|-------|------------|
89 | | 1 | `` | 23 |
90 | | 2 | o | 14 |
91 | | 3 | l | 14 |
92 | | 4 | d | 10 |
93 | | 5 | e | 16 - 13 = 3 |
94 | | 6 | r | 3 |
95 | | 7 | f | 9 |
96 | | 8 | i | 9 |
97 | | 9 | n | 9 |
98 | | 10 | s | 13 - 13 = 0 |
99 | | 11 | t | 13 |
100 | | 12 | w | 4 |
101 | | 13 | es | 9 + 4 = 13 |
102 |
103 | Iteration 2: Merge the token `es` and `t` as they appeared 13 times in our dataset
104 |
105 | | Number | Token | Frequency |
106 | |--------|-------|-----------------|
107 | | 1 | `` | 23 |
108 | | 2 | o | 14 |
109 | | 3 | l | 14 |
110 | | 4 | d | 10 |
111 | | 5 | e | 16 - 13 = 3 |
112 | | 6 | r | 3 |
113 | | 7 | f | 9 |
114 | | 8 | i | 9 |
115 | | 9 | n | 9 |
116 | | 10 | s | 13 - 13 = 0 |
117 | | 11 | t | 13 - 13 = 0 |
118 | | 12 | w | 4 |
119 | | 13 | es | 9 + 4 = 13 - 13 = 0 |
120 | | 14 | est | 13 |
121 |
122 | Now look into `est` with ``
123 |
124 | | Number | Token | Frequency |
125 | |--------|---------|---------------------|
126 | | 1 | `` | 23 - 13 = 10 |
127 | | 2 | o | 14 |
128 | | 3 | l | 14 |
129 | | 4 | d | 10 |
130 | | 5 | e | 16 - 13 = 3 |
131 | | 6 | r | 3 |
132 | | 7 | i | 9 |
133 | | 8 | n | 9 |
134 | | 9 | s | 13 - 13 = 0 |
135 | | 10 | t | 13 - 13 = 0 |
136 | | 11 | w | 4 |
137 | | 12 | es | 9 + 4 - 13 = 0 |
138 | | 13 | est`` | 13 |
139 |
140 | `est` to differentiate and know that it is ending word.
141 |
142 | Iteration 3: Merge `est` with ``
143 |
144 | | Number | Token | Frequency |
145 | |--------|-----------|--------------------|
146 | | 1 | `` | 10 (23 - 13 = 10) |
147 | | 2 | `o` | 14 |
148 | | 3 | `l` | 14 |
149 | | 4 | `d` | 10 |
150 | | 5 | `e` | 3 (16 - 13 = 3) |
151 | | 6 | `r` | 3 |
152 | | 7 | `f` | 9 |
153 | | 8 | `i` | 9 |
154 | | 9 | `n` | 9 |
155 | | 10 | `s` | 0 (13 - 13 = 0) |
156 | | 11 | `t` | 0 (13 - 13 = 0) |
157 | | 12 | `w` | 4 |
158 | | 13 | `es` | 0 (9 + 4 - 13 = 0) |
159 | | 14 | `est` | 0 (13 - 13 = 0) |
160 | | 15 | `est` | 13 |
161 |
162 | Iteration 4: `o` and `l` to be combined
163 |
164 | | Number | Token | Frequency |
165 | |--------|-----------|--------------------|
166 | | 1 | `` | 23 |
167 | | 2 | `o` | 4 (14 - 10 = 4) |
168 | | 3 | `l` | 4 (14 - 10 = 4) |
169 | | 4 | `d` | 10 |
170 | | 5 | `e` | 3 (16 - 13 = 3) |
171 | | 6 | `r` | 3 |
172 | | 7 | `f` | 9 |
173 | | 8 | `i` | 9 |
174 | | 9 | `n` | 9 |
175 | | 10 | `s` | 0 (13 - 13 = 0) |
176 | | 11 | `t` | 0 (13 - 13 = 0) |
177 | | 12 | `w` | 4 |
178 | | 13 | `es` | 0 (9 + 4 - 13 = 0) |
179 | | 14 | `est` | 13 |
180 | | 15 | `ol` | 10 (7 + 3 = 10) |
181 |
182 | Iteration 5: `ol` combine with `d`
183 |
184 | | Number | Token | Frequency |
185 | |--------|-----------|----------------------------|
186 | | 1 | `` | 10 (23 - 13 = 10) |
187 | | 2 | `o` | 4 (14 - 10 = 4) |
188 | | 3 | `l` | 4 (14 - 10 = 4) |
189 | | 4 | `d` | 0 (10 - 10 = 0) |
190 | | 5 | `e` | 3 (16 - 13 = 3) |
191 | | 6 | `r` | 3 |
192 | | 7 | `f` | 9 |
193 | | 8 | `i` | 9 |
194 | | 9 | `n` | 9 |
195 | | 10 | `s` | 0 (13 - 13 = 0) |
196 | | 11 | `t` | 0 (13 - 13 = 0) |
197 | | 12 | `w` | 4 |
198 | | 13 | `es` | 0 (9 + 4 - 13 - 13 = 0) |
199 | | 14 | `est` | 13 |
200 | | 15 | `est` | 13 |
201 | | 16 | `ol` | 0 (7 + 3 - 10 - 10 = 0) |
202 | | 17 | `old` | 10 (7 + 3 = 10) |
203 |
204 | `f`, `i` and `n` appear 9 times but we have just one word with these characters. so, we are not merging.
205 |
206 | Let's remove token with frequency with 0
207 |
208 | | Number | Token | Frequency |
209 | |--------|--------|-----------|
210 | | 1 | `` | 10 |
211 | | 2 | o | 4 |
212 | | 3 | l | 4 |
213 | | 4 | e | 3 |
214 | | 5 | r | 3 |
215 | | 6 | f | 9 |
216 | | 7 | i | 9 |
217 | | 8 | n | 9 |
218 | | 9 | w | 4 |
219 | | 10 | est``| 13 |
220 | | 11 | old | 10 |
221 |
222 | `Observation`: Captured the `root word` (old) and the `suffix` (est)
223 |
224 | Stopping Criteria while encoding:
225 | - Token count
226 | - Number of iteration
227 |
228 | When GPT-2 was trained, its vocabulary contained only 50,257 tokens thanks to BPE.
229 | It also solves the out-of-vocabulary problem.
--------------------------------------------------------------------------------
/8. Byte Pair Encoding/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/8. Byte Pair Encoding/.gitkeep
--------------------------------------------------------------------------------
/8. Byte Pair Encoding/Hands on with GPT Tokenizer BPE.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "6ca90705",
6 | "metadata": {
7 | "papermill": {
8 | "duration": 0.003256,
9 | "end_time": "2025-01-21T15:24:02.242695",
10 | "exception": false,
11 | "start_time": "2025-01-21T15:24:02.239439",
12 | "status": "completed"
13 | },
14 | "tags": []
15 | },
16 | "source": [
17 | "# Hands on With GPT Tokenizer"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "id": "8152e018",
24 | "metadata": {
25 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
26 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
27 | "execution": {
28 | "iopub.execute_input": "2025-01-21T15:24:02.250096Z",
29 | "iopub.status.busy": "2025-01-21T15:24:02.249603Z",
30 | "iopub.status.idle": "2025-01-21T15:24:07.662107Z",
31 | "shell.execute_reply": "2025-01-21T15:24:07.660675Z"
32 | },
33 | "papermill": {
34 | "duration": 5.418676,
35 | "end_time": "2025-01-21T15:24:07.664510",
36 | "exception": false,
37 | "start_time": "2025-01-21T15:24:02.245834",
38 | "status": "completed"
39 | },
40 | "tags": []
41 | },
42 | "outputs": [
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "Requirement already satisfied: tiktoken in /usr/local/lib/python3.10/dist-packages (0.8.0)\r\n",
48 | "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2024.11.6)\r\n",
49 | "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2.32.3)\r\n",
50 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (3.4.0)\r\n",
51 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (3.10)\r\n",
52 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (2.2.3)\r\n",
53 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken) (2024.12.14)\r\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "!pip install tiktoken"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 2,
64 | "id": "f9ea434e",
65 | "metadata": {
66 | "execution": {
67 | "iopub.execute_input": "2025-01-21T15:24:07.672067Z",
68 | "iopub.status.busy": "2025-01-21T15:24:07.671682Z",
69 | "iopub.status.idle": "2025-01-21T15:24:07.722464Z",
70 | "shell.execute_reply": "2025-01-21T15:24:07.721300Z"
71 | },
72 | "papermill": {
73 | "duration": 0.056873,
74 | "end_time": "2025-01-21T15:24:07.724542",
75 | "exception": false,
76 | "start_time": "2025-01-21T15:24:07.667669",
77 | "status": "completed"
78 | },
79 | "tags": []
80 | },
81 | "outputs": [],
82 | "source": [
83 | "import importlib\n",
84 | "import tiktoken"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 3,
90 | "id": "65d238f2",
91 | "metadata": {
92 | "execution": {
93 | "iopub.execute_input": "2025-01-21T15:24:07.732296Z",
94 | "iopub.status.busy": "2025-01-21T15:24:07.731914Z",
95 | "iopub.status.idle": "2025-01-21T15:24:07.739412Z",
96 | "shell.execute_reply": "2025-01-21T15:24:07.738291Z"
97 | },
98 | "papermill": {
99 | "duration": 0.013104,
100 | "end_time": "2025-01-21T15:24:07.740967",
101 | "exception": false,
102 | "start_time": "2025-01-21T15:24:07.727863",
103 | "status": "completed"
104 | },
105 | "tags": []
106 | },
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "Version: 0.8.0\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "print(\"Version: \",importlib.metadata.version(\"tiktoken\"))"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 4,
123 | "id": "24decc23",
124 | "metadata": {
125 | "execution": {
126 | "iopub.execute_input": "2025-01-21T15:24:07.748579Z",
127 | "iopub.status.busy": "2025-01-21T15:24:07.748247Z",
128 | "iopub.status.idle": "2025-01-21T15:24:09.467222Z",
129 | "shell.execute_reply": "2025-01-21T15:24:09.466027Z"
130 | },
131 | "papermill": {
132 | "duration": 1.725117,
133 | "end_time": "2025-01-21T15:24:09.469392",
134 | "exception": false,
135 | "start_time": "2025-01-21T15:24:07.744275",
136 | "status": "completed"
137 | },
138 | "tags": []
139 | },
140 | "outputs": [],
141 | "source": [
142 | "tokenizer=tiktoken.get_encoding(\"gpt2\")"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 5,
148 | "id": "868842c2",
149 | "metadata": {
150 | "execution": {
151 | "iopub.execute_input": "2025-01-21T15:24:09.476830Z",
152 | "iopub.status.busy": "2025-01-21T15:24:09.476478Z",
153 | "iopub.status.idle": "2025-01-21T15:24:09.480872Z",
154 | "shell.execute_reply": "2025-01-21T15:24:09.479886Z"
155 | },
156 | "papermill": {
157 | "duration": 0.010266,
158 | "end_time": "2025-01-21T15:24:09.482803",
159 | "exception": false,
160 | "start_time": "2025-01-21T15:24:09.472537",
161 | "status": "completed"
162 | },
163 | "tags": []
164 | },
165 | "outputs": [],
166 | "source": [
167 | "text=(\"Hello, do you like tea of coffee? <|endoftext|> Country is facing crucial economic crisisToday.\")"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 6,
173 | "id": "6a6a6603",
174 | "metadata": {
175 | "execution": {
176 | "iopub.execute_input": "2025-01-21T15:24:09.490326Z",
177 | "iopub.status.busy": "2025-01-21T15:24:09.489941Z",
178 | "iopub.status.idle": "2025-01-21T15:24:09.494807Z",
179 | "shell.execute_reply": "2025-01-21T15:24:09.493666Z"
180 | },
181 | "papermill": {
182 | "duration": 0.010268,
183 | "end_time": "2025-01-21T15:24:09.496411",
184 | "exception": false,
185 | "start_time": "2025-01-21T15:24:09.486143",
186 | "status": "completed"
187 | },
188 | "tags": []
189 | },
190 | "outputs": [],
191 | "source": [
192 | "integer=tokenizer.encode(text,allowed_special={\"<|endoftext|>\"})"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 7,
198 | "id": "86bcbf42",
199 | "metadata": {
200 | "execution": {
201 | "iopub.execute_input": "2025-01-21T15:24:09.503652Z",
202 | "iopub.status.busy": "2025-01-21T15:24:09.503329Z",
203 | "iopub.status.idle": "2025-01-21T15:24:09.508613Z",
204 | "shell.execute_reply": "2025-01-21T15:24:09.507205Z"
205 | },
206 | "papermill": {
207 | "duration": 0.010873,
208 | "end_time": "2025-01-21T15:24:09.510410",
209 | "exception": false,
210 | "start_time": "2025-01-21T15:24:09.499537",
211 | "status": "completed"
212 | },
213 | "tags": []
214 | },
215 | "outputs": [
216 | {
217 | "name": "stdout",
218 | "output_type": "stream",
219 | "text": [
220 | "[15496, 11, 466, 345, 588, 8887, 286, 6891, 30, 220, 50256, 12946, 318, 6476, 8780, 3034, 4902, 8888, 13]\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "print(integer)"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 8,
231 | "id": "56a9be73",
232 | "metadata": {
233 | "execution": {
234 | "iopub.execute_input": "2025-01-21T15:24:09.517608Z",
235 | "iopub.status.busy": "2025-01-21T15:24:09.517304Z",
236 | "iopub.status.idle": "2025-01-21T15:24:09.521151Z",
237 | "shell.execute_reply": "2025-01-21T15:24:09.520239Z"
238 | },
239 | "papermill": {
240 | "duration": 0.009325,
241 | "end_time": "2025-01-21T15:24:09.522752",
242 | "exception": false,
243 | "start_time": "2025-01-21T15:24:09.513427",
244 | "status": "completed"
245 | },
246 | "tags": []
247 | },
248 | "outputs": [],
249 | "source": [
250 | "string=tokenizer.decode(integer)"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 9,
256 | "id": "c104338a",
257 | "metadata": {
258 | "execution": {
259 | "iopub.execute_input": "2025-01-21T15:24:09.530254Z",
260 | "iopub.status.busy": "2025-01-21T15:24:09.529865Z",
261 | "iopub.status.idle": "2025-01-21T15:24:09.534683Z",
262 | "shell.execute_reply": "2025-01-21T15:24:09.533372Z"
263 | },
264 | "papermill": {
265 | "duration": 0.010533,
266 | "end_time": "2025-01-21T15:24:09.536428",
267 | "exception": false,
268 | "start_time": "2025-01-21T15:24:09.525895",
269 | "status": "completed"
270 | },
271 | "tags": []
272 | },
273 | "outputs": [
274 | {
275 | "name": "stdout",
276 | "output_type": "stream",
277 | "text": [
278 | "Hello, do you like tea of coffee? <|endoftext|> Country is facing crucial economic crisisToday.\n"
279 | ]
280 | }
281 | ],
282 | "source": [
283 | "print(string)"
284 | ]
285 | }
286 | ],
287 | "metadata": {
288 | "kaggle": {
289 | "accelerator": "none",
290 | "dataSources": [],
291 | "dockerImageVersionId": 30839,
292 | "isGpuEnabled": false,
293 | "isInternetEnabled": true,
294 | "language": "python",
295 | "sourceType": "notebook"
296 | },
297 | "kernelspec": {
298 | "display_name": "Python 3",
299 | "language": "python",
300 | "name": "python3"
301 | },
302 | "language_info": {
303 | "codemirror_mode": {
304 | "name": "ipython",
305 | "version": 3
306 | },
307 | "file_extension": ".py",
308 | "mimetype": "text/x-python",
309 | "name": "python",
310 | "nbconvert_exporter": "python",
311 | "pygments_lexer": "ipython3",
312 | "version": "3.10.12"
313 | },
314 | "papermill": {
315 | "default_parameters": {},
316 | "duration": 10.637654,
317 | "end_time": "2025-01-21T15:24:10.059015",
318 | "environment_variables": {},
319 | "exception": null,
320 | "input_path": "__notebook__.ipynb",
321 | "output_path": "__notebook__.ipynb",
322 | "parameters": {},
323 | "start_time": "2025-01-21T15:23:59.421361",
324 | "version": "2.6.0"
325 | }
326 | },
327 | "nbformat": 4,
328 | "nbformat_minor": 5
329 | }
330 |
--------------------------------------------------------------------------------
/9. Input Target Pair.md:
--------------------------------------------------------------------------------
1 | # Input Target Pair
2 | The last step before we create `vector embedding` is to create `input target pair`.
3 |
4 | What do these input target pairs look like?
5 |
6 |
7 |
8 | `Blue`: Input
9 | `Red`: Output
10 |
11 | - Self-Supervised Learning | Auto-Regressive
12 |
13 | ## Summary
14 | - Given text sample: Extracting `Input block`. &
15 | `LLM prediction task` during training is to `predict the next word` that follow input block.
16 | During training we mask out all words that are past the target
17 |
18 | ## Process
19 | - Create 2 variables `x` and `y`
20 | - `x` contain input token and
21 | - `y` contain target (input shifted by one)
22 |
23 | ### Eg:
24 | X=[1,2,3,4]
25 | y=[2,3,4,5]
26 |
27 | ### Context Size
28 | How many words are given to the model for prediction; in the notebook it is 4.
29 | - In LLM number of prediction task is as set in context_size
30 |
31 | ## Implementing into Dataloader
32 | We need to convert it into PyTorch tensors.
33 | We will implement a data loader that fetches input-output target pairs using a sliding window approach.
34 |
35 |
36 |
37 | To implement efficient dataloader, we collect inputs in a tensor x, where each row represent one input context. The second tensor y contains the corresponding prediction targets (next words), which are created by shifting the input by one position.
38 |
39 | #### Steps:
40 | 1. Initialize tokenizer
41 | 2. Create Dataset
42 | 3. drop_last=True
43 | To drop the last batch if it is shorter than the specified batch_size to prevent loss spikes during training.
44 | 4. The number of CPU processes to use for preprocessing
45 |
46 |
47 | - `Batch size`= Number of samples the model processes at once before updating parameters
48 | - `num_workers`= For parallel processing
49 |
50 | ### Stride
51 |
52 |
53 | - `Stride`: Gap of words between the inputs of different batches
54 | - A larger stride prevents repeated data from being fed during training and helps prevent overfitting.
55 | - We usually set `stride length`=`context length` in order not to miss any word.
--------------------------------------------------------------------------------
/9. Input Target Pairs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/9. Input Target Pairs/.gitkeep
--------------------------------------------------------------------------------
/9. Input Target Pairs/9. Input Target Pair.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "78e735f4-1f1f-419f-84c8-9d96b456daf8",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "Defaulting to user installation because normal site-packages is not writeable\n",
14 | "Collecting tiktoken\n",
15 | " Obtaining dependency information for tiktoken from https://files.pythonhosted.org/packages/1e/86/eea2309dc258fb86c7d9b10db536434fc16420feaa3b6113df18b23db7c2/tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata\n",
16 | " Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)\n",
17 | "Requirement already satisfied: regex>=2022.1.18 in c:\\users\\hp\\appdata\\roaming\\python\\python311\\site-packages (from tiktoken) (2023.10.3)\n",
18 | "Requirement already satisfied: requests>=2.26.0 in c:\\program files\\python311\\lib\\site-packages (from tiktoken) (2.31.0)\n",
19 | "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\program files\\python311\\lib\\site-packages (from requests>=2.26.0->tiktoken) (3.3.0)\n",
20 | "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\hp\\appdata\\roaming\\python\\python311\\site-packages (from requests>=2.26.0->tiktoken) (2.10)\n",
21 | "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\program files\\python311\\lib\\site-packages (from requests>=2.26.0->tiktoken) (2.0.6)\n",
22 | "Requirement already satisfied: certifi>=2017.4.17 in c:\\program files\\python311\\lib\\site-packages (from requests>=2.26.0->tiktoken) (2023.7.22)\n",
23 | "Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl (884 kB)\n",
24 | " ---------------------------------------- 0.0/884.5 kB ? eta -:--:--\n",
25 | " ---------------------------------------- 0.0/884.5 kB ? eta -:--:--\n",
26 | " ---------------------------------------- 10.2/884.5 kB ? eta -:--:--\n",
27 | " - ------------------------------------- 30.7/884.5 kB 435.7 kB/s eta 0:00:02\n",
28 | " -- ------------------------------------ 61.4/884.5 kB 465.5 kB/s eta 0:00:02\n",
29 | " ---- --------------------------------- 112.6/884.5 kB 652.2 kB/s eta 0:00:02\n",
30 | " ------- ------------------------------ 174.1/884.5 kB 748.1 kB/s eta 0:00:01\n",
31 | " ---------- --------------------------- 245.8/884.5 kB 942.1 kB/s eta 0:00:01\n",
32 | " ---------------- ----------------------- 368.6/884.5 kB 1.1 MB/s eta 0:00:01\n",
33 | " ----------------------- ---------------- 522.2/884.5 kB 1.5 MB/s eta 0:00:01\n",
34 | " --------------------------- ------------ 614.4/884.5 kB 1.5 MB/s eta 0:00:01\n",
35 | " -------------------------------------- - 860.2/884.5 kB 1.9 MB/s eta 0:00:01\n",
36 | " ---------------------------------------- 884.5/884.5 kB 1.9 MB/s eta 0:00:00\n",
37 | "Installing collected packages: tiktoken\n",
38 | "Successfully installed tiktoken-0.8.0\n"
39 | ]
40 | },
41 | {
42 | "name": "stderr",
43 | "output_type": "stream",
44 | "text": [
45 | "DEPRECATION: Loading egg at c:\\program files\\python311\\lib\\site-packages\\vboxapi-1.0-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..\n",
46 | "\n",
47 | "[notice] A new release of pip is available: 23.2.1 -> 24.3.1\n",
48 | "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
49 | ]
50 | }
51 | ],
52 | "source": [
53 | "!pip install tiktoken"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "id": "933cd27d-f5e1-4e6d-899d-dbeb6da4a62c",
59 | "metadata": {},
60 | "source": [
61 | "## Input Target Pairs"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 2,
67 | "id": "a48c3ce9-ef5f-4070-9221-e7c6a860fc4d",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "import tiktoken"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 3,
77 | "id": "acc25096-bf75-46c1-a8cc-3c0156127fda",
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "with open(\"the-verdict.txt\",\"r\",encoding=\"utf-8\") as f:\n",
82 | " raw_text=f.read()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 4,
88 | "id": "2be408fe-c99a-4714-bb13-c0ee6235d240",
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "tokenizer=tiktoken.get_encoding(\"gpt2\")"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "id": "06fe5989-120e-4348-84b1-cdb2ad26a633",
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "enc_text=tokenizer.encode(raw_text)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 7,
108 | "id": "7119824a-2c34-4320-bc97-e1823ee2cc81",
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "name": "stdout",
113 | "output_type": "stream",
114 | "text": [
115 | "5145\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "print(len(enc_text))"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 10,
126 | "id": "7f35af5a-d1d2-47f0-aabc-33b668d2751c",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "enc_sample=enc_text[50:]"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "id": "f26a6c88-6fe2-492e-89dc-a995f0dffe2a",
136 | "metadata": {},
137 | "source": [
138 | "## Context Size"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 16,
144 | "id": "1be29e4e-c7a5-4aad-b7a3-4f0536981767",
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "x:-> [290, 4920, 2241, 287]\n",
152 | "y:-> [4920, 2241, 287, 257]\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "context_size=4 #length of input\n",
158 | "\n",
159 | "x= enc_sample[:context_size]\n",
160 | "y=enc_sample[1:context_size+1]\n",
161 | "\n",
162 | "print(\"x:->\",x)\n",
163 | "print(\"y:-> \",y)"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 20,
169 | "id": "63cdac35-34d9-48a0-9add-545e88dbcee9",
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "[290] -> 4920\n",
177 | "[290, 4920] -> 2241\n",
178 | "[290, 4920, 2241] -> 287\n",
179 | "[290, 4920, 2241, 287] -> 257\n"
180 | ]
181 | }
182 | ],
183 | "source": [
184 | "# Same thing as above\n",
185 | "for i in range(1,context_size+1):\n",
186 | " context=enc_sample[:i]\n",
187 | " desired=enc_sample[i]\n",
188 | " print(context,\"->\",desired)"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 22,
194 | "id": "30a82a21-2497-4acd-a2dc-a8c703a644d9",
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | " and -> established\n",
202 | " and established -> himself\n",
203 | " and established himself -> in\n",
204 | " and established himself in -> a\n"
205 | ]
206 | }
207 | ],
208 | "source": [
209 | "for i in range(1,context_size+1):\n",
210 | " context=enc_sample[:i]\n",
211 | " desired=enc_sample[i]\n",
212 | " print(tokenizer.decode(context),\"->\",tokenizer.decode([desired]))"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 31,
218 | "id": "611a1d55-fbc1-4830-8f69-f9a99e5f9011",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
223 | "\n",
"class GPTDataset(Dataset):\n",
225 | " def __init__(self,txt,tokenizer,max_length,stride):\n",
226 | " self.input_ids=[]\n",
227 | " self.target_ids=[]\n",
228 | "\n",
229 | " token_ids=tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
230 | "\n",
231 | " for i in range(0,len(token_ids)-max_length, stride):\n",
232 | " input_chunk=token_ids[i:i+max_length]\n",
233 | " target_chunk=token_ids[i+1:i+max_length+1]\n",
234 | " self.input_ids.append(torch.tensor(input_chunk))\n",
235 | " self.target_ids.append(torch.tensor(target_chunk))\n",
236 | " def __len__(self):\n",
237 | " return len(self.input_ids)\n",
238 | "\n",
239 | " def __getitem__(self, idx):\n",
240 | " return self.input_ids[idx], self.target_ids[idx]\n",
241 | " "
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "id": "9ea5ca76-1877-46b5-b611-720bab8a1f2c",
247 | "metadata": {},
248 | "source": [
249 | "## Creating Dataloader"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 32,
255 | "id": "0b6eeaba-881a-47c8-9902-5941e55eda4a",
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,num_workers=0):\n",
260 | "\n",
261 | " #Initialise Tokenizer\n",
262 | " tokenizer=tiktoken.get_encoding(\"gpt2\")\n",
263 | "\n",
264 | " #Create Dataset\n",
265 | " dataset=GPTDataset(txt, tokenizer, max_length, stride)\n",
266 | "\n",
267 | " dataloader=DataLoader(\n",
268 | " dataset,\n",
269 | " batch_size=batch_size,\n",
270 | " shuffle=shuffle,\n",
271 | " drop_last=drop_last,\n",
272 | " num_workers=num_workers\n",
273 | " )\n",
274 | "\n",
275 | " return dataloader"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 35,
281 | "id": "bdc045ed-8a15-4b72-b3fe-0ad4a8cd309d",
282 | "metadata": {},
283 | "outputs": [
284 | {
285 | "name": "stdout",
286 | "output_type": "stream",
287 | "text": [
288 | "[tensor([[ 40, 367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]\n"
289 | ]
290 | }
291 | ],
292 | "source": [
293 | "import torch\n",
294 | "\n",
295 | "dataloader=create_dataloader(raw_text, batch_size=1,max_length=4,stride=1, shuffle=False)\n",
296 | "\n",
297 | "data_iter= iter(dataloader)\n",
298 | "first_batch=next(data_iter)\n",
299 | "print(first_batch)"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 36,
305 | "id": "55889b79-fb03-488b-bc4b-171227141de6",
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "name": "stdout",
310 | "output_type": "stream",
311 | "text": [
312 | "[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]\n"
313 | ]
314 | }
315 | ],
316 | "source": [
317 | "second_batch=next(data_iter)\n",
318 | "print(second_batch)"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 42,
324 | "id": "81d0318b-1eec-404c-b68a-79d603d5a545",
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "Input:\n",
332 | " tensor([[ 40, 367, 2885, 1464],\n",
333 | " [ 1807, 3619, 402, 271],\n",
334 | " [10899, 2138, 257, 7026],\n",
335 | " [15632, 438, 2016, 257],\n",
336 | " [ 922, 5891, 1576, 438],\n",
337 | " [ 568, 340, 373, 645],\n",
338 | " [ 1049, 5975, 284, 502],\n",
339 | " [ 284, 3285, 326, 11]])\n",
340 | "\n",
341 | "Target:\n",
342 | " tensor([[ 367, 2885, 1464, 1807],\n",
343 | " [ 3619, 402, 271, 10899],\n",
344 | " [ 2138, 257, 7026, 15632],\n",
345 | " [ 438, 2016, 257, 922],\n",
346 | " [ 5891, 1576, 438, 568],\n",
347 | " [ 340, 373, 645, 1049],\n",
348 | " [ 5975, 284, 502, 284],\n",
349 | " [ 3285, 326, 11, 287]])\n"
350 | ]
351 | }
352 | ],
353 | "source": [
354 | "import torch\n",
355 | "\n",
356 | "dataloader=create_dataloader(raw_text, batch_size=8,max_length=4,stride=4, shuffle=False)\n",
357 | "\n",
358 | "data_iter= iter(dataloader)\n",
"inputs,targets=next(data_iter)\n",
"print(\"Input:\\n\",inputs)\n",
"print(\"\\nTarget:\\n\",targets)"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "id": "a3af799b-4b26-4a9e-97b9-eeba99487b3b",
368 | "metadata": {},
369 | "outputs": [],
370 | "source": []
371 | }
372 | ],
373 | "metadata": {
374 | "kernelspec": {
375 | "display_name": "Python 3 (ipykernel)",
376 | "language": "python",
377 | "name": "python3"
378 | },
379 | "language_info": {
380 | "codemirror_mode": {
381 | "name": "ipython",
382 | "version": 3
383 | },
384 | "file_extension": ".py",
385 | "mimetype": "text/x-python",
386 | "name": "python",
387 | "nbconvert_exporter": "python",
388 | "pygments_lexer": "ipython3",
389 | "version": "3.11.5"
390 | }
391 | },
392 | "nbformat": 4,
393 | "nbformat_minor": 5
394 | }
395 |
--------------------------------------------------------------------------------
/9. Input Target Pairs/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "78e735f4-1f1f-419f-84c8-9d96b456daf8",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "Defaulting to user installation because normal site-packages is not writeable\n",
14 | "Collecting tiktoken\n",
15 | " Obtaining dependency information for tiktoken from https://files.pythonhosted.org/packages/1e/86/eea2309dc258fb86c7d9b10db536434fc16420feaa3b6113df18b23db7c2/tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata\n",
16 | " Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)\n",
17 | "Requirement already satisfied: regex>=2022.1.18 in c:\\users\\hp\\appdata\\roaming\\python\\python311\\site-packages (from tiktoken) (2023.10.3)\n",
18 | "Requirement already satisfied: requests>=2.26.0 in c:\\program files\\python311\\lib\\site-packages (from tiktoken) (2.31.0)\n",
19 | "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\program files\\python311\\lib\\site-packages (from requests>=2.26.0->tiktoken) (3.3.0)\n",
20 | "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\hp\\appdata\\roaming\\python\\python311\\site-packages (from requests>=2.26.0->tiktoken) (2.10)\n",
21 | "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\program files\\python311\\lib\\site-packages (from requests>=2.26.0->tiktoken) (2.0.6)\n",
22 | "Requirement already satisfied: certifi>=2017.4.17 in c:\\program files\\python311\\lib\\site-packages (from requests>=2.26.0->tiktoken) (2023.7.22)\n",
23 | "Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl (884 kB)\n",
24 | " ---------------------------------------- 0.0/884.5 kB ? eta -:--:--\n",
25 | " ---------------------------------------- 0.0/884.5 kB ? eta -:--:--\n",
26 | " ---------------------------------------- 10.2/884.5 kB ? eta -:--:--\n",
27 | " - ------------------------------------- 30.7/884.5 kB 435.7 kB/s eta 0:00:02\n",
28 | " -- ------------------------------------ 61.4/884.5 kB 465.5 kB/s eta 0:00:02\n",
29 | " ---- --------------------------------- 112.6/884.5 kB 652.2 kB/s eta 0:00:02\n",
30 | " ------- ------------------------------ 174.1/884.5 kB 748.1 kB/s eta 0:00:01\n",
31 | " ---------- --------------------------- 245.8/884.5 kB 942.1 kB/s eta 0:00:01\n",
32 | " ---------------- ----------------------- 368.6/884.5 kB 1.1 MB/s eta 0:00:01\n",
33 | " ----------------------- ---------------- 522.2/884.5 kB 1.5 MB/s eta 0:00:01\n",
34 | " --------------------------- ------------ 614.4/884.5 kB 1.5 MB/s eta 0:00:01\n",
35 | " -------------------------------------- - 860.2/884.5 kB 1.9 MB/s eta 0:00:01\n",
36 | " ---------------------------------------- 884.5/884.5 kB 1.9 MB/s eta 0:00:00\n",
37 | "Installing collected packages: tiktoken\n",
38 | "Successfully installed tiktoken-0.8.0\n"
39 | ]
40 | },
41 | {
42 | "name": "stderr",
43 | "output_type": "stream",
44 | "text": [
45 | "DEPRECATION: Loading egg at c:\\program files\\python311\\lib\\site-packages\\vboxapi-1.0-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..\n",
46 | "\n",
47 | "[notice] A new release of pip is available: 23.2.1 -> 24.3.1\n",
48 | "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
49 | ]
50 | }
51 | ],
52 | "source": [
53 | "!pip install tiktoken"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "id": "933cd27d-f5e1-4e6d-899d-dbeb6da4a62c",
59 | "metadata": {},
60 | "source": [
61 | "## Input Target Pairs"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 1,
67 | "id": "a48c3ce9-ef5f-4070-9221-e7c6a860fc4d",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "import tiktoken"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 2,
77 | "id": "acc25096-bf75-46c1-a8cc-3c0156127fda",
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "with open(\"the-verdict.txt\",\"r\",encoding=\"utf-8\") as f:\n",
82 | " raw_text=f.read()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 3,
88 | "id": "2be408fe-c99a-4714-bb13-c0ee6235d240",
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "tokenizer=tiktoken.get_encoding(\"gpt2\")"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 4,
98 | "id": "06fe5989-120e-4348-84b1-cdb2ad26a633",
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "enc_text=tokenizer.encode(raw_text)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 5,
108 | "id": "7119824a-2c34-4320-bc97-e1823ee2cc81",
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "name": "stdout",
113 | "output_type": "stream",
114 | "text": [
115 | "5145\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "print(len(enc_text))"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 6,
126 | "id": "7f35af5a-d1d2-47f0-aabc-33b668d2751c",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "enc_sample=enc_text[50:]"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "id": "f26a6c88-6fe2-492e-89dc-a995f0dffe2a",
136 | "metadata": {},
137 | "source": [
138 | "## Context Size"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 7,
144 | "id": "1be29e4e-c7a5-4aad-b7a3-4f0536981767",
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "x:-> [290, 4920, 2241, 287]\n",
152 | "y:-> [4920, 2241, 287, 257]\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "context_size=4 #length of input\n",
158 | "\n",
159 | "x=enc_sample[:context_size]\n",
160 | "y=enc_sample[1:context_size+1]\n",
161 | "\n",
162 | "print(\"x:->\",x)\n",
163 | "print(\"y:-> \",y)"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 8,
169 | "id": "63cdac35-34d9-48a0-9add-545e88dbcee9",
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "[290] -> 4920\n",
177 | "[290, 4920] -> 2241\n",
178 | "[290, 4920, 2241] -> 287\n",
179 | "[290, 4920, 2241, 287] -> 257\n"
180 | ]
181 | }
182 | ],
183 | "source": [
184 | "# Same thing as above\n",
185 | "for i in range(1,context_size+1):\n",
186 | " context=enc_sample[:i]\n",
187 | " desired=enc_sample[i]\n",
188 | " print(context,\"->\",desired)"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 9,
194 | "id": "30a82a21-2497-4acd-a2dc-a8c703a644d9",
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | " and -> established\n",
202 | " and established -> himself\n",
203 | " and established himself -> in\n",
204 | " and established himself in -> a\n"
205 | ]
206 | }
207 | ],
208 | "source": [
209 | "for i in range(1,context_size+1):\n",
210 | " context=enc_sample[:i]\n",
211 | " desired=enc_sample[i]\n",
212 | " print(tokenizer.decode(context),\"->\",tokenizer.decode([desired]))"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 10,
218 | "id": "611a1d55-fbc1-4830-8f69-f9a99e5f9011",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
223 | "\n",
"class GPTDataset(Dataset):\n",
225 | " def __init__(self,txt,tokenizer,max_length,stride):\n",
226 | " self.input_ids=[]\n",
227 | " self.target_ids=[]\n",
228 | "\n",
229 | " token_ids=tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
230 | "\n",
231 | " for i in range(0,len(token_ids)-max_length, stride):\n",
232 | " input_chunk=token_ids[i:i+max_length]\n",
233 | " target_chunk=token_ids[i+1:i+max_length+1]\n",
234 | " self.input_ids.append(torch.tensor(input_chunk))\n",
235 | " self.target_ids.append(torch.tensor(target_chunk))\n",
236 | " def __len__(self):\n",
237 | " return len(self.input_ids)\n",
238 | "\n",
239 | " def __getitem__(self, idx):\n",
240 | " return self.input_ids[idx], self.target_ids[idx]\n",
241 | " "
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "id": "9ea5ca76-1877-46b5-b611-720bab8a1f2c",
247 | "metadata": {},
248 | "source": [
249 | "## Creating Dataloader"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 11,
255 | "id": "0b6eeaba-881a-47c8-9902-5941e55eda4a",
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,num_workers=0):\n",
260 | "\n",
261 | " #Initialise Tokenizer\n",
262 | " tokenizer=tiktoken.get_encoding(\"gpt2\")\n",
263 | "\n",
264 | " #Create Dataset\n",
265 | " dataset=GPTDataset(txt, tokenizer, max_length, stride)\n",
266 | "\n",
267 | " dataloader=DataLoader(\n",
268 | " dataset,\n",
269 | " batch_size=batch_size,\n",
270 | " shuffle=shuffle,\n",
271 | " drop_last=drop_last,\n",
272 | " num_workers=num_workers\n",
273 | " )\n",
274 | "\n",
275 | " return dataloader"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 12,
281 | "id": "bdc045ed-8a15-4b72-b3fe-0ad4a8cd309d",
282 | "metadata": {},
283 | "outputs": [
284 | {
285 | "name": "stdout",
286 | "output_type": "stream",
287 | "text": [
288 | "[tensor([[ 40, 367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]\n"
289 | ]
290 | }
291 | ],
292 | "source": [
293 | "import torch\n",
294 | "\n",
295 | "dataloader=create_dataloader(raw_text, batch_size=1,max_length=4,stride=1, shuffle=False)\n",
296 | "\n",
297 | "data_iter= iter(dataloader)\n",
298 | "first_batch=next(data_iter)\n",
299 | "print(first_batch)"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 13,
305 | "id": "55889b79-fb03-488b-bc4b-171227141de6",
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "name": "stdout",
310 | "output_type": "stream",
311 | "text": [
312 | "[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]\n"
313 | ]
314 | }
315 | ],
316 | "source": [
317 | "second_batch=next(data_iter)\n",
318 | "print(second_batch)"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 14,
324 | "id": "81d0318b-1eec-404c-b68a-79d603d5a545",
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "Input:\n",
332 | " tensor([[ 40, 367, 2885, 1464],\n",
333 | " [ 1807, 3619, 402, 271],\n",
334 | " [10899, 2138, 257, 7026],\n",
335 | " [15632, 438, 2016, 257],\n",
336 | " [ 922, 5891, 1576, 438],\n",
337 | " [ 568, 340, 373, 645],\n",
338 | " [ 1049, 5975, 284, 502],\n",
339 | " [ 284, 3285, 326, 11]])\n",
340 | "\n",
341 | "Target:\n",
342 | " tensor([[ 367, 2885, 1464, 1807],\n",
343 | " [ 3619, 402, 271, 10899],\n",
344 | " [ 2138, 257, 7026, 15632],\n",
345 | " [ 438, 2016, 257, 922],\n",
346 | " [ 5891, 1576, 438, 568],\n",
347 | " [ 340, 373, 645, 1049],\n",
348 | " [ 5975, 284, 502, 284],\n",
349 | " [ 3285, 326, 11, 287]])\n"
350 | ]
351 | }
352 | ],
353 | "source": [
354 | "import torch\n",
355 | "\n",
356 | "dataloader=create_dataloader(raw_text, batch_size=8,max_length=4,stride=4, shuffle=False)\n",
357 | "\n",
358 | "data_iter= iter(dataloader)\n",
"inputs,targets=next(data_iter)\n",
"print(\"Input:\\n\",inputs)\n",
"print(\"\\nTarget:\\n\",targets)"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "id": "a3af799b-4b26-4a9e-97b9-eeba99487b3b",
368 | "metadata": {},
369 | "outputs": [],
370 | "source": []
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "id": "14b6bf6f-9a6d-44de-84f7-10c7e3c16701",
376 | "metadata": {},
377 | "outputs": [],
378 | "source": []
379 | }
380 | ],
381 | "metadata": {
382 | "kernelspec": {
383 | "display_name": "Python 3 (ipykernel)",
384 | "language": "python",
385 | "name": "python3"
386 | },
387 | "language_info": {
388 | "codemirror_mode": {
389 | "name": "ipython",
390 | "version": 3
391 | },
392 | "file_extension": ".py",
393 | "mimetype": "text/x-python",
394 | "name": "python",
395 | "nbconvert_exporter": "python",
396 | "pygments_lexer": "ipython3",
397 | "version": "3.11.5"
398 | }
399 | },
400 | "nbformat": 4,
401 | "nbformat_minor": 5
402 | }
403 |
--------------------------------------------------------------------------------
/9. Input Target Pairs/the-verdict.txt:
--------------------------------------------------------------------------------
1 | I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)
2 |
3 | "The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?
4 |
5 | Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him. Among his own sex fewer regrets were heard, and in his own trade hardly a murmur. Professional jealousy? Perhaps. If it were, the honour of the craft was vindicated by little Claude Nutley, who, in all good faith, brought out in the Burlington a very handsome "obituary" on Jack--one of those showy articles stocked with random technicalities that I have heard (I won't say by whom) compared to Gisburn's painting. And so--his resolve being apparently irrevocable--the discussion gradually died out, and, as Mrs. Thwing had predicted, the price of "Gisburns" went up.
6 |
7 | It was not till three years later that, in the course of a few weeks' idling on the Riviera, it suddenly occurred to me to wonder why Gisburn had given up his painting. On reflection, it really was a tempting problem. To accuse his wife would have been too easy--his fair sitters had been denied the solace of saying that Mrs. Gisburn had "dragged him down." For Mrs. Gisburn--as such--had not existed till nearly a year after Jack's resolve had been taken. It might be that he had married her--since he liked his ease--because he didn't want to go on painting; but it would have been hard to prove that he had given up his painting because he had married her.
8 |
9 | Of course, if she had not dragged him down, she had equally, as Miss Croft contended, failed to "lift him up"--she had not led him back to the easel. To put the brush into his hand again--what a vocation for a wife! But Mrs. Gisburn appeared to have disdained it--and I felt it might be interesting to find out why.
10 |
11 | The desultory life of the Riviera lends itself to such purely academic speculations; and having, on my way to Monte Carlo, caught a glimpse of Jack's balustraded terraces between the pines, I had myself borne thither the next day.
12 |
13 | I found the couple at tea beneath their palm-trees; and Mrs. Gisburn's welcome was so genial that, in the ensuing weeks, I claimed it frequently. It was not that my hostess was "interesting": on that point I could have given Miss Croft the fullest reassurance. It was just because she was _not_ interesting--if I may be pardoned the bull--that I found her so. For Jack, all his life, had been surrounded by interesting women: they had fostered his art, it had been reared in the hot-house of their adulation. And it was therefore instructive to note what effect the "deadening atmosphere of mediocrity" (I quote Miss Croft) was having on him.
14 |
15 | I have mentioned that Mrs. Gisburn was rich; and it was immediately perceptible that her husband was extracting from this circumstance a delicate but substantial satisfaction. It is, as a rule, the people who scorn money who get most out of it; and Jack's elegant disdain of his wife's big balance enabled him, with an appearance of perfect good-breeding, to transmute it into objects of art and luxury. To the latter, I must add, he remained relatively indifferent; but he was buying Renaissance bronzes and eighteenth-century pictures with a discrimination that bespoke the amplest resources.
16 |
17 | "Money's only excuse is to put beauty into circulation," was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed luncheon-table, when, on a later day, I had again run over from Monte Carlo; and Mrs. Gisburn, beaming on him, added for my enlightenment: "Jack is so morbidly sensitive to every form of beauty."
18 |
19 | Poor Jack! It had always been his fate to have women say such things of him: the fact should be set down in extenuation. What struck me now was that, for the first time, he resented the tone. I had seen him, so often, basking under similar tributes--was it the conjugal note that robbed them of their savour? No--for, oddly enough, it became apparent that he was fond of Mrs. Gisburn--fond enough not to see her absurdity. It was his own absurdity he seemed to be wincing under--his own attitude as an object for garlands and incense.
20 |
21 | "My dear, since I've chucked painting people don't say that stuff about me--they say it about Victor Grindle," was his only protest, as he rose from the table and strolled out onto the sunlit terrace.
22 |
23 | I glanced after him, struck by his last word. Victor Grindle was, in fact, becoming the man of the moment--as Jack himself, one might put it, had been the man of the hour. The younger artist was said to have formed himself at my friend's feet, and I wondered if a tinge of jealousy underlay the latter's mysterious abdication. But no--for it was not till after that event that the _rose Dubarry_ drawing-rooms had begun to display their "Grindles."
24 |
25 | I turned to Mrs. Gisburn, who had lingered to give a lump of sugar to her spaniel in the dining-room.
26 |
27 | "Why _has_ he chucked painting?" I asked abruptly.
28 |
29 | She raised her eyebrows with a hint of good-humoured surprise.
30 |
31 | "Oh, he doesn't _have_ to now, you know; and I want him to enjoy himself," she said quite simply.
32 |
33 | I looked about the spacious white-panelled room, with its _famille-verte_ vases repeating the tones of the pale damask curtains, and its eighteenth-century pastels in delicate faded frames.
34 |
35 | "Has he chucked his pictures too? I haven't seen a single one in the house."
36 |
37 | A slight shade of constraint crossed Mrs. Gisburn's open countenance. "It's his ridiculous modesty, you know. He says they're not fit to have about; he's sent them all away except one--my portrait--and that I have to keep upstairs."
38 |
39 | His ridiculous modesty--Jack's modesty about his pictures? My curiosity was growing like the bean-stalk. I said persuasively to my hostess: "I must really see your portrait, you know."
40 |
41 | She glanced out almost timorously at the terrace where her husband, lounging in a hooded chair, had lit a cigar and drawn the Russian deerhound's head between his knees.
42 |
43 | "Well, come while he's not looking," she said, with a laugh that tried to hide her nervousness; and I followed her between the marble Emperors of the hall, and up the wide stairs with terra-cotta nymphs poised among flowers at each landing.
44 |
45 | In the dimmest corner of her boudoir, amid a profusion of delicate and distinguished objects, hung one of the familiar oval canvases, in the inevitable garlanded frame. The mere outline of the frame called up all Gisburn's past!
46 |
47 | Mrs. Gisburn drew back the window-curtains, moved aside a _jardiniere_ full of pink azaleas, pushed an arm-chair away, and said: "If you stand here you can just manage to see it. I had it over the mantel-piece, but he wouldn't let it stay."
48 |
49 | Yes--I could just manage to see it--the first portrait of Jack's I had ever had to strain my eyes over! Usually they had the place of honour--say the central panel in a pale yellow or _rose Dubarry_ drawing-room, or a monumental easel placed so that it took the light through curtains of old Venetian point. The more modest place became the picture better; yet, as my eyes grew accustomed to the half-light, all the characteristic qualities came out--all the hesitations disguised as audacities, the tricks of prestidigitation by which, with such consummate skill, he managed to divert attention from the real business of the picture to some pretty irrelevance of detail. Mrs. Gisburn, presenting a neutral surface to work on--forming, as it were, so inevitably the background of her own picture--had lent herself in an unusual degree to the display of this false virtuosity. The picture was one of Jack's "strongest," as his admirers would have put it--it represented, on his part, a swelling of muscles, a congesting of veins, a balancing, straddling and straining, that reminded one of the circus-clown's ironic efforts to lift a feather. It met, in short, at every point the demand of lovely woman to be painted "strongly" because she was tired of being painted "sweetly"--and yet not to lose an atom of the sweetness.
50 |
51 | "It's the last he painted, you know," Mrs. Gisburn said with pardonable pride. "The last but one," she corrected herself--"but the other doesn't count, because he destroyed it."
52 |
53 | "Destroyed it?" I was about to follow up this clue when I heard a footstep and saw Jack himself on the threshold.
54 |
55 | As he stood there, his hands in the pockets of his velveteen coat, the thin brown waves of hair pushed back from his white forehead, his lean sunburnt cheeks furrowed by a smile that lifted the tips of a self-confident moustache, I felt to what a degree he had the same quality as his pictures--the quality of looking cleverer than he was.
56 |
57 | His wife glanced at him deprecatingly, but his eyes travelled past her to the portrait.
58 |
59 | "Mr. Rickham wanted to see it," she began, as if excusing herself. He shrugged his shoulders, still smiling.
60 |
61 | "Oh, Rickham found me out long ago," he said lightly; then, passing his arm through mine: "Come and see the rest of the house."
62 |
63 | He showed it to me with a kind of naive suburban pride: the bath-rooms, the speaking-tubes, the dress-closets, the trouser-presses--all the complex simplifications of the millionaire's domestic economy. And whenever my wonder paid the expected tribute he said, throwing out his chest a little: "Yes, I really don't see how people manage to live without that."
64 |
65 | Well--it was just the end one might have foreseen for him. Only he was, through it all and in spite of it all--as he had been through, and in spite of, his pictures--so handsome, so charming, so disarming, that one longed to cry out: "Be dissatisfied with your leisure!" as once one had longed to say: "Be dissatisfied with your work!"
66 |
67 | But, with the cry on my lips, my diagnosis suffered an unexpected check.
68 |
69 | "This is my own lair," he said, leading me into a dark plain room at the end of the florid vista. It was square and brown and leathery: no "effects"; no bric-a-brac, none of the air of posing for reproduction in a picture weekly--above all, no least sign of ever having been used as a studio.
70 |
71 | The fact brought home to me the absolute finality of Jack's break with his old life.
72 |
73 | "Don't you ever dabble with paint any more?" I asked, still looking about for a trace of such activity.
74 |
75 | "Never," he said briefly.
76 |
77 | "Or water-colour--or etching?"
78 |
79 | His confident eyes grew dim, and his cheeks paled a little under their handsome sunburn.
80 |
81 | "Never think of it, my dear fellow--any more than if I'd never touched a brush."
82 |
83 | And his tone told me in a flash that he never thought of anything else.
84 |
85 | I moved away, instinctively embarrassed by my unexpected discovery; and as I turned, my eye fell on a small picture above the mantel-piece--the only object breaking the plain oak panelling of the room.
86 |
87 | "Oh, by Jove!" I said.
88 |
89 | It was a sketch of a donkey--an old tired donkey, standing in the rain under a wall.
90 |
91 | "By Jove--a Stroud!" I cried.
92 |
93 | He was silent; but I felt him close behind me, breathing a little quickly.
94 |
95 | "What a wonder! Made with a dozen lines--but on everlasting foundations. You lucky chap, where did you get it?"
96 |
97 | He answered slowly: "Mrs. Stroud gave it to me."
98 |
99 | "Ah--I didn't know you even knew the Strouds. He was such an inflexible hermit."
100 |
101 | "I didn't--till after. . . . She sent for me to paint him when he was dead."
102 |
103 | "When he was dead? You?"
104 |
105 | I must have let a little too much amazement escape through my surprise, for he answered with a deprecating laugh: "Yes--she's an awful simpleton, you know, Mrs. Stroud. Her only idea was to have him done by a fashionable painter--ah, poor Stroud! She thought it the surest way of proclaiming his greatness--of forcing it on a purblind public. And at the moment I was _the_ fashionable painter."
106 |
107 | "Ah, poor Stroud--as you say. Was _that_ his history?"
108 |
109 | "That was his history. She believed in him, gloried in him--or thought she did. But she couldn't bear not to have all the drawing-rooms with her. She couldn't bear the fact that, on varnishing days, one could always get near enough to see his pictures. Poor woman! She's just a fragment groping for other fragments. Stroud is the only whole I ever knew."
110 |
111 | "You ever knew? But you just said--"
112 |
113 | Gisburn had a curious smile in his eyes.
114 |
115 | "Oh, I knew him, and he knew me--only it happened after he was dead."
116 |
117 | I dropped my voice instinctively. "When she sent for you?"
118 |
119 | "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"
120 |
121 | He laughed again, and threw back his head to look up at the sketch of the donkey. "There were days when I couldn't look at that thing--couldn't face it. But I forced myself to put it here; and now it's cured me--cured me. That's the reason why I don't dabble any more, my dear Rickham; or rather Stroud himself is the reason."
122 |
123 | For the first time my idle curiosity about my companion turned into a serious desire to understand him better.
124 |
125 | "I wish you'd tell me how it happened," I said.
126 |
127 | He stood looking up at the sketch, and twirling between his fingers a cigarette he had forgotten to light. Suddenly he turned toward me.
128 |
129 | "I'd rather like to tell you--because I've always suspected you of loathing my work."
130 |
131 | I made a deprecating gesture, which he negatived with a good-humoured shrug.
132 |
133 | "Oh, I didn't care a straw when I believed in myself--and now it's an added tie between us!"
134 |
135 | He laughed slightly, without bitterness, and pushed one of the deep arm-chairs forward. "There: make yourself comfortable--and here are the cigars you like."
136 |
137 | He placed them at my elbow and continued to wander up and down the room, stopping now and then beneath the picture.
138 |
139 | "How it happened? I can tell you in five minutes--and it didn't take much longer to happen. . . . I can remember now how surprised and pleased I was when I got Mrs. Stroud's note. Of course, deep down, I had always _felt_ there was no one like him--only I had gone with the stream, echoed the usual platitudes about him, till I half got to think he was a failure, one of the kind that are left behind. By Jove, and he _was_ left behind--because he had come to stay! The rest of us had to let ourselves be swept along or go under, but he was high above the current--on everlasting foundations, as you say.
140 |
141 | "Well, I went off to the house in my most egregious mood--rather moved, Lord forgive me, at the pathos of poor Stroud's career of failure being crowned by the glory of my painting him! Of course I meant to do the picture for nothing--I told Mrs. Stroud so when she began to stammer something about her poverty. I remember getting off a prodigious phrase about the honour being _mine_--oh, I was princely, my dear Rickham! I was posing to myself like one of my own sitters.
142 |
143 | "Then I was taken up and left alone with him. I had sent all my traps in advance, and I had only to set up the easel and get to work. He had been dead only twenty-four hours, and he died suddenly, of heart disease, so that there had been no preliminary work of destruction--his face was clear and untouched. I had met him once or twice, years before, and thought him insignificant and dingy. Now I saw that he was superb.
144 |
145 | "I was glad at first, with a merely aesthetic satisfaction: glad to have my hand on such a 'subject.' Then his strange life-likeness began to affect me queerly--as I blocked the head in I felt as if he were watching me do it. The sensation was followed by the thought: if he _were_ watching me, what would he say to my way of working? My strokes began to go a little wild--I felt nervous and uncertain.
146 |
147 | "Once, when I looked up, I seemed to see a smile behind his close grayish beard--as if he had the secret, and were amusing himself by holding it back from me. That exasperated me still more. The secret? Why, I had a secret worth twenty of his! I dashed at the canvas furiously, and tried some of my bravura tricks. But they failed me, they crumbled. I saw that he wasn't watching the showy bits--I couldn't distract his attention; he just kept his eyes on the hard passages between. Those were the ones I had always shirked, or covered up with some lying paint. And how he saw through my lies!
148 |
149 | "I looked up again, and caught sight of that sketch of the donkey hanging on the wall near his bed. His wife told me afterward it was the last thing he had done--just a note taken with a shaking hand, when he was down in Devonshire recovering from a previous heart attack. Just a note! But it tells his whole history. There are years of patient scornful persistence in every line. A man who had swum with the current could never have learned that mighty up-stream stroke. . . .
150 |
151 | "I turned back to my work, and went on groping and muddling; then I looked at the donkey again. I saw that, when Stroud laid in the first stroke, he knew just what the end would be. He had possessed his subject, absorbed it, recreated it. When had I done that with any of my things? They hadn't been born of me--I had just adopted them. . . .
152 |
153 | "Hang it, Rickham, with that face watching me I couldn't do another stroke. The plain truth was, I didn't know where to put it--_I had never known_. Only, with my sitters and my public, a showy splash of colour covered up the fact--I just threw paint into their faces. . . . Well, paint was the one medium those dead eyes could see through--see straight to the tottering foundations underneath. Don't you know how, in talking a foreign language, even fluently, one says half the time not what one wants to but what one can? Well--that was the way I painted; and as he lay there and watched me, the thing they called my 'technique' collapsed like a house of cards. He didn't sneer, you understand, poor Stroud--he just lay there quietly watching, and on his lips, through the gray beard, I seemed to hear the question: 'Are you sure you know where you're coming out?'
154 |
155 | "If I could have painted that face, with that question on it, I should have done a great thing. The next greatest thing was to see that I couldn't--and that grace was given me. But, oh, at that minute, Rickham, was there anything on earth I wouldn't have given to have Stroud alive before me, and to hear him say: 'It's not too late--I'll show you how'?
156 |
157 | "It _was_ too late--it would have been, even if he'd been alive. I packed up my traps, and went down and told Mrs. Stroud. Of course I didn't tell her _that_--it would have been Greek to her. I simply said I couldn't paint him, that I was too moved. She rather liked the idea--she's so romantic! It was that that made her give me the donkey. But she was terribly upset at not getting the portrait--she did so want him 'done' by some one showy! At first I was afraid she wouldn't let me off--and at my wits' end I suggested Grindle. Yes, it was I who started Grindle: I told Mrs. Stroud he was the 'coming' man, and she told somebody else, and so it got to be true. . . . And he painted Stroud without wincing; and she hung the picture among her husband's things. . . ."
158 |
159 | He flung himself down in the arm-chair near mine, laid back his head, and clasping his arms beneath it, looked up at the picture above the chimney-piece.
160 |
161 | "I like to fancy that Stroud himself would have given it to me, if he'd been able to say what he thought that day."
162 |
163 | And, in answer to a question I put half-mechanically--"Begin again?" he flashed out. "When the one thing that brings me anywhere near him is that I knew enough to leave off?"
164 |
165 | He stood up and laid his hand on my shoulder with a laugh. "Only the irony of it is that I _am_ still painting--since Grindle's doing it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Victor
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Nuts-and-Bolts-of-LLM
2 | Credit: Andrej Karpathy & Sebastian Raschka
3 |
4 |
5 |
6 | Letsss Go
--------------------------------------------------------------------------------
/assets/1. Genesis/market.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/1. Genesis/market.jpg
--------------------------------------------------------------------------------
/assets/1. Genesis/open-close.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/1. Genesis/open-close.png
--------------------------------------------------------------------------------
/assets/10. Embedding/nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/10. Embedding/nn.png
--------------------------------------------------------------------------------
/assets/10. Embedding/size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/10. Embedding/size.png
--------------------------------------------------------------------------------
/assets/10. Embedding/summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/10. Embedding/summary.png
--------------------------------------------------------------------------------
/assets/10. Embedding/vector.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/10. Embedding/vector.png
--------------------------------------------------------------------------------
/assets/11. Positional Encoding/abs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/11. Positional Encoding/abs.png
--------------------------------------------------------------------------------
/assets/11. Positional Encoding/pos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/11. Positional Encoding/pos.png
--------------------------------------------------------------------------------
/assets/13. Intro Attention/att.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/13. Intro Attention/att.png
--------------------------------------------------------------------------------
/assets/13. Intro Attention/attwt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/13. Intro Attention/attwt.png
--------------------------------------------------------------------------------
/assets/13. Intro Attention/endec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/13. Intro Attention/endec.png
--------------------------------------------------------------------------------
/assets/13. Intro Attention/im.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/13. Intro Attention/im.gif
--------------------------------------------------------------------------------
/assets/13. Intro Attention/jump.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/13. Intro Attention/jump.gif
--------------------------------------------------------------------------------
/assets/13. Intro Attention/trans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/13. Intro Attention/trans.png
--------------------------------------------------------------------------------
/assets/14. SimAttention/dotprod.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/14. SimAttention/dotprod.gif
--------------------------------------------------------------------------------
/assets/14. SimAttention/final.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/14. SimAttention/final.png
--------------------------------------------------------------------------------
/assets/14. SimAttention/sim.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/14. SimAttention/sim.png
--------------------------------------------------------------------------------
/assets/15. Self Attention/multipl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/15. Self Attention/multipl.png
--------------------------------------------------------------------------------
/assets/15. Self Attention/only.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/15. Self Attention/only.gif
--------------------------------------------------------------------------------
/assets/15. Self Attention/selfatt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/15. Self Attention/selfatt.png
--------------------------------------------------------------------------------
/assets/15. Self Attention/var.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/15. Self Attention/var.gif
--------------------------------------------------------------------------------
/assets/16. Causal Attention/dropout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/16. Causal Attention/dropout.png
--------------------------------------------------------------------------------
/assets/16. Causal Attention/mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/16. Causal Attention/mask.png
--------------------------------------------------------------------------------
/assets/16. Causal Attention/strategy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/16. Causal Attention/strategy.png
--------------------------------------------------------------------------------
/assets/17. Multihead Attention/change.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/17. Multihead Attention/change.png
--------------------------------------------------------------------------------
/assets/17. Multihead Attention/dim.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/17. Multihead Attention/dim.png
--------------------------------------------------------------------------------
/assets/17. Multihead Attention/flip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/17. Multihead Attention/flip.png
--------------------------------------------------------------------------------
/assets/17. Multihead Attention/flip2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/17. Multihead Attention/flip2.png
--------------------------------------------------------------------------------
/assets/17. Multihead Attention/multi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/17. Multihead Attention/multi.png
--------------------------------------------------------------------------------
/assets/17. Multihead Attention/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/17. Multihead Attention/out.png
--------------------------------------------------------------------------------
/assets/17. Multihead Attention/prev.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/17. Multihead Attention/prev.png
--------------------------------------------------------------------------------
/assets/19. Birds Eye View/sofar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/19. Birds Eye View/sofar.png
--------------------------------------------------------------------------------
/assets/19. Birds Eye View/trans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/19. Birds Eye View/trans.png
--------------------------------------------------------------------------------
/assets/19. Birds Eye View/view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/19. Birds Eye View/view.png
--------------------------------------------------------------------------------
/assets/2. Basics/parm2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/2. Basics/parm2.jpg
--------------------------------------------------------------------------------
/assets/2. Basics/parms.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/2. Basics/parms.jpg
--------------------------------------------------------------------------------
/assets/2. Basics/trans.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/2. Basics/trans.gif
--------------------------------------------------------------------------------
/assets/2. Basics/trans2.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/2. Basics/trans2.webp
--------------------------------------------------------------------------------
/assets/2. Basics/vs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/2. Basics/vs.png
--------------------------------------------------------------------------------
/assets/3. Stages/cost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/3. Stages/cost.png
--------------------------------------------------------------------------------
/assets/3. Stages/data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/3. Stages/data.png
--------------------------------------------------------------------------------
/assets/3. Stages/fine-hervy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/3. Stages/fine-hervy.png
--------------------------------------------------------------------------------
/assets/3. Stages/nomoney.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/3. Stages/nomoney.gif
--------------------------------------------------------------------------------
/assets/3. Stages/notice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/3. Stages/notice.png
--------------------------------------------------------------------------------
/assets/4. LLM Basic/attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/4. LLM Basic/attention.png
--------------------------------------------------------------------------------
/assets/4. LLM Basic/word.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/4. LLM Basic/word.png
--------------------------------------------------------------------------------
/assets/5. Working/borrow.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/5. Working/borrow.gif
--------------------------------------------------------------------------------
/assets/5. Working/gpt-work.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/5. Working/gpt-work.png
--------------------------------------------------------------------------------
/assets/6. Steps/stage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/6. Steps/stage.png
--------------------------------------------------------------------------------
/assets/6. Steps/step.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/6. Steps/step.png
--------------------------------------------------------------------------------
/assets/8. BPE/paper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/8. BPE/paper.png
--------------------------------------------------------------------------------
/assets/9. Input Target Pairs/dataloader.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/9. Input Target Pairs/dataloader.png
--------------------------------------------------------------------------------
/assets/9. Input Target Pairs/inp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/9. Input Target Pairs/inp.png
--------------------------------------------------------------------------------
/assets/9. Input Target Pairs/stride.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/9. Input Target Pairs/stride.png
--------------------------------------------------------------------------------
/assets/screw.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/assets/screw.gif
--------------------------------------------------------------------------------
/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ictorv/Large-Language-Pretraining/b58c017e4cc37c298c9671b7386623da7553dab3/image.png
--------------------------------------------------------------------------------